[Likwid-commit] [likwid] 01/04: Imported Upstream version 4.1.0+dfsg1

Christoph Martin chrism at debian.org
Tue Jun 21 10:34:18 UTC 2016


This is an automated email from the git hooks/post-receive script.

chrism pushed a commit to branch upstream
in repository likwid.

commit 2d3370410ca0bd86b781267242f8d33efc2a9da8
Author: Christoph Martin <martin at uni-mainz.de>
Date:   Wed Jun 15 18:18:09 2016 +0200

    Imported Upstream version 4.1.0+dfsg1
---
 .travis.yml                                        |   15 +
 CHANGELOG                                          |   59 +
 INSTALL                                            |  175 +-
 Makefile                                           |  690 ++-
 README                                             |   29 -
 README.md                                          |   58 +
 bench/Makefile                                     |  157 +
 bench/includes/allocator.h                         |   50 +
 bench/includes/allocator_types.h                   |   46 +
 bench/includes/barrier.h                           |   58 +
 bench/includes/barrier_types.h                     |   49 +
 bench/includes/bstrlib.h                           |    1 +
 bench/includes/likwid.h                            |    1 +
 bench/includes/strUtil.h                           |   60 +
 bench/includes/test_types.h                        |  113 +
 bench/includes/threads.h                           |  114 +
 bench/includes/threads_types.h                     |   56 +
 bench/likwid-bench.c                               |  521 ++
 bench/perl/AsmGen.pl                               |  284 ++
 {perl => bench/perl}/Parse/RecDescent.pm           |    0
 {perl => bench/perl}/Template.pm                   |    0
 {perl => bench/perl}/Template/Base.pm              |    0
 {perl => bench/perl}/Template/Config.pm            |    0
 {perl => bench/perl}/Template/Constants.pm         |    0
 {perl => bench/perl}/Template/Context.pm           |    0
 {perl => bench/perl}/Template/Directive.pm         |    0
 {perl => bench/perl}/Template/Document.pm          |    0
 {perl => bench/perl}/Template/Exception.pm         |    0
 {perl => bench/perl}/Template/Filters.pm           |    0
 {perl => bench/perl}/Template/Grammar.pm           |    0
 {perl => bench/perl}/Template/Iterator.pm          |    0
 .../perl}/Template/Namespace/Constants.pm          |    0
 {perl => bench/perl}/Template/Parser.pm            |    0
 {perl => bench/perl}/Template/Plugin.pm            |    0
 {perl => bench/perl}/Template/Plugin/Assert.pm     |    0
 {perl => bench/perl}/Template/Plugin/CGI.pm        |    0
 {perl => bench/perl}/Template/Plugin/Datafile.pm   |    0
 {perl => bench/perl}/Template/Plugin/Date.pm       |    0
 {perl => bench/perl}/Template/Plugin/Directory.pm  |    0
 {perl => bench/perl}/Template/Plugin/Dumper.pm     |    0
 {perl => bench/perl}/Template/Plugin/File.pm       |    0
 {perl => bench/perl}/Template/Plugin/Filter.pm     |    0
 {perl => bench/perl}/Template/Plugin/Format.pm     |    0
 {perl => bench/perl}/Template/Plugin/HTML.pm       |    0
 {perl => bench/perl}/Template/Plugin/Image.pm      |    0
 {perl => bench/perl}/Template/Plugin/Iterator.pm   |    0
 {perl => bench/perl}/Template/Plugin/Math.pm       |    0
 {perl => bench/perl}/Template/Plugin/Pod.pm        |    0
 {perl => bench/perl}/Template/Plugin/Procedural.pm |    0
 {perl => bench/perl}/Template/Plugin/Scalar.pm     |    0
 {perl => bench/perl}/Template/Plugin/String.pm     |    0
 {perl => bench/perl}/Template/Plugin/Table.pm      |    0
 {perl => bench/perl}/Template/Plugin/URL.pm        |    0
 {perl => bench/perl}/Template/Plugin/View.pm       |    0
 {perl => bench/perl}/Template/Plugin/Wrap.pm       |    0
 {perl => bench/perl}/Template/Plugins.pm           |    0
 {perl => bench/perl}/Template/Provider.pm          |    0
 {perl => bench/perl}/Template/Service.pm           |    0
 {perl => bench/perl}/Template/Stash.pm             |    0
 {perl => bench/perl}/Template/Stash/Context.pm     |    0
 {perl => bench/perl}/Template/Stash/XS.pm          |    0
 {perl => bench/perl}/Template/Test.pm              |    0
 {perl => bench/perl}/Template/VMethods.pm          |    0
 {perl => bench/perl}/Template/View.pm              |    0
 bench/perl/gas.pm                                  |  211 +
 bench/perl/generatePas.pl                          |  198 +
 {perl => bench/perl}/isax86.pm                     |    0
 {perl => bench/perl}/isax86_64.pm                  |    0
 {perl => bench/perl}/templates/bench.tt            |    0
 bench/perl/templates/group.tt                      |  157 +
 {perl => bench/perl}/templates/group_types.tt      |    0
 bench/perl/templates/testcases.tt                  |   19 +
 bench/phi/store.ptt                                |    8 +-
 bench/phi/store_mem.ptt                            |    8 +-
 bench/src/allocator.c                              |  209 +
 bench/src/barrier.c                                |  167 +
 bench/src/bench.c                                  |  770 +++
 bench/src/bstrlib.c                                | 2955 +++++++++++
 bench/src/strUtil.c                                |  319 ++
 bench/src/threads.c                                |  293 ++
 bench/x86-64/branch.ptt                            |   36 -
 bench/x86-64/clcopy.ptt                            |    6 +
 bench/x86-64/clload.ptt                            |    6 +
 bench/x86-64/clstore.ptt                           |   14 +-
 bench/x86-64/copy.ptt                              |   24 +-
 bench/x86-64/copy_avx.ptt                          |    6 +
 bench/x86-64/copy_mem.ptt                          |   24 +-
 bench/x86-64/copy_mem_avx.ptt                      |    8 +-
 bench/x86-64/copy_mem_sse.ptt                      |    6 +
 bench/x86-64/copy_plain.ptt                        |   16 -
 bench/x86-64/copy_sse.ptt                          |    6 +
 bench/x86-64/daxpy.ptt                             |   28 +
 bench/x86-64/daxpy_avx.ptt                         |   31 +
 bench/x86-64/daxpy_avx_fma.ptt                     |   25 +
 bench/x86-64/daxpy_mem_avx.ptt                     |   30 +
 bench/x86-64/daxpy_mem_avx_fma.ptt                 |   24 +
 bench/x86-64/daxpy_mem_sse.ptt                     |   28 +
 bench/x86-64/daxpy_mem_sse_fma.ptt                 |   24 +
 bench/x86-64/daxpy_sp.ptt                          |   44 +
 bench/x86-64/daxpy_sp_avx.ptt                      |   19 +
 bench/x86-64/daxpy_sp_avx_fma.ptt                  |   25 +
 bench/x86-64/daxpy_sp_mem_avx.ptt                  |   19 +
 bench/x86-64/daxpy_sp_mem_avx_fma.ptt              |   25 +
 bench/x86-64/daxpy_sp_mem_sse.ptt                  |   20 +
 bench/x86-64/daxpy_sp_mem_sse_fma.ptt              |   24 +
 bench/x86-64/daxpy_sp_sse.ptt                      |   28 +
 bench/x86-64/daxpy_sp_sse_fma.ptt                  |   24 +
 bench/x86-64/daxpy_sse.ptt                         |   28 +
 bench/x86-64/daxpy_sse_fma.ptt                     |   24 +
 bench/x86-64/ddot.ptt                              |   27 +
 bench/x86-64/ddot_avx.ptt                          |   27 +
 bench/x86-64/ddot_sp.ptt                           |   27 +
 bench/x86-64/ddot_sp_avx.ptt                       |   19 +
 bench/x86-64/ddot_sp_sse.ptt                       |   19 +
 bench/x86-64/ddot_sse.ptt                          |   27 +
 bench/x86-64/load.ptt                              |   20 +-
 bench/x86-64/load_avx.ptt                          |    8 +-
 bench/x86-64/load_mem.ptt                          |   15 +
 bench/x86-64/load_plain.ptt                        |   12 -
 bench/x86-64/load_sse.ptt                          |    8 +-
 bench/x86-64/peak.ptt                              |   49 -
 bench/x86-64/peak_avx.ptt                          |   49 -
 bench/x86-64/peak_sse.ptt                          |   49 -
 bench/x86-64/peakflops.ptt                         |   37 -
 bench/x86-64/peakflops_avx.ptt                     |   37 -
 bench/x86-64/peakflops_sse.ptt                     |   37 -
 bench/x86-64/store.ptt                             |   25 +-
 bench/x86-64/store_avx.ptt                         |   15 +-
 bench/x86-64/store_mem.ptt                         |   14 +-
 bench/x86-64/store_mem_avx.ptt                     |   14 +-
 bench/x86-64/store_mem_sse.ptt                     |   14 +-
 bench/x86-64/store_plain.ptt                       |   15 -
 bench/x86-64/store_sse.ptt                         |   15 +-
 bench/x86-64/stream.ptt                            |   42 +-
 bench/x86-64/stream_avx.ptt                        |   49 +-
 bench/x86-64/stream_avx_fma.ptt                    |   24 +
 bench/x86-64/stream_mem.ptt                        |   31 +-
 bench/x86-64/stream_mem_avx.ptt                    |   17 +
 bench/x86-64/stream_mem_avx_fma.ptt                |   24 +
 bench/x86-64/stream_mem_sse.ptt                    |   17 +
 bench/x86-64/stream_mem_sse_fma.ptt                |   15 +
 bench/x86-64/stream_sp.ptt                         |   45 +
 bench/x86-64/stream_sp_avx.ptt                     |   28 +
 bench/x86-64/stream_sp_avx_fma.ptt                 |   24 +
 bench/x86-64/stream_sp_mem_avx.ptt                 |   28 +
 bench/x86-64/stream_sp_mem_avx_fma.ptt             |   24 +
 bench/x86-64/stream_sp_mem_sse.ptt                 |   16 +
 bench/x86-64/stream_sp_mem_sse_fma.ptt             |   15 +
 bench/x86-64/stream_sp_sse.ptt                     |   16 +
 bench/x86-64/stream_sp_sse_fma.ptt                 |   15 +
 bench/x86-64/stream_sse.ptt                        |   29 +
 bench/x86-64/stream_sse_fma.ptt                    |   24 +
 bench/x86-64/striad_avx.ptt                        |   23 -
 bench/x86-64/striad_mem_avx.ptt                    |   11 -
 bench/x86-64/striad_mem_sse.ptt                    |   11 -
 bench/x86-64/striad_plain.ptt                      |   23 -
 bench/x86-64/striad_sse.ptt                        |   23 -
 bench/x86-64/sum.ptt                               |   44 +-
 bench/x86-64/sum_avx.ptt                           |   36 +-
 bench/x86-64/sum_plain.ptt                         |   15 -
 bench/x86-64/sum_sp.ptt                            |   21 +
 bench/x86-64/sum_sp_avx.ptt                        |   20 +
 bench/x86-64/sum_sp_sse.ptt                        |   29 +
 bench/x86-64/sum_sse.ptt                           |    6 +
 bench/x86-64/triad.ptt                             |   40 +-
 bench/x86-64/triad_avx.ptt                         |   32 +-
 bench/x86-64/triad_avx_fma.ptt                     |   27 +
 bench/x86-64/triad_mem.ptt                         |   10 -
 bench/x86-64/triad_mem_avx.ptt                     |   18 +
 bench/x86-64/triad_mem_avx_fma.ptt                 |   20 +
 bench/x86-64/triad_mem_sse.ptt                     |   27 +
 bench/x86-64/triad_mem_sse_fma.ptt                 |   27 +
 bench/x86-64/triad_sp.ptt                          |   43 +
 bench/x86-64/triad_sp_avx.ptt                      |   18 +
 bench/x86-64/triad_sp_avx_fma.ptt                  |   16 +
 bench/x86-64/triad_sp_mem_avx.ptt                  |   16 +
 bench/x86-64/triad_sp_mem_avx_fma.ptt              |   16 +
 bench/x86-64/triad_sp_mem_sse.ptt                  |   27 +
 bench/x86-64/triad_sp_mem_sse_fma.ptt              |   27 +
 bench/x86-64/triad_sp_sse.ptt                      |   27 +
 bench/x86-64/triad_sp_sse_fma.ptt                  |   27 +
 bench/x86-64/triad_split.ptt                       |   30 -
 bench/x86-64/triad_sse.ptt                         |   28 +
 bench/x86-64/triad_sse_fma.ptt                     |   27 +
 bench/x86-64/update.ptt                            |   24 +-
 bench/x86-64/update_avx.ptt                        |    6 +
 bench/x86-64/update_plain.ptt                      |   15 -
 bench/x86-64/update_sse.ptt                        |    6 +
 bench/x86-64/vtriad_avx.ptt                        |   22 -
 bench/x86-64/vtriad_mem_avx.ptt                    |   10 -
 bench/x86-64/vtriad_mem_sse.ptt                    |   10 -
 bench/x86-64/vtriad_plain.ptt                      |   22 -
 bench/x86-64/vtriad_sse.ptt                        |   22 -
 bench/x86/copy.ptt                                 |   24 +-
 bench/x86/load.ptt                                 |   19 +-
 bench/x86/store.ptt                                |   24 +-
 bench/x86/stream.ptt                               |   42 +-
 config.mk                                          |   59 +-
 doc/Doxyfile                                       | 1781 +++++++
 doc/applications/likwid-accessD.md                 |   55 +
 doc/applications/likwid-agent.md                   |   94 +
 doc/applications/likwid-bench.md                   |   93 +
 doc/applications/likwid-genTopoCfg.md              |   29 +
 doc/applications/likwid-memsweeper.md              |   34 +
 doc/applications/likwid-mpirun.md                  |   83 +
 doc/applications/likwid-perfctr.md                 |  260 +
 doc/applications/likwid-perfscope.md               |  107 +
 doc/applications/likwid-pin.md                     |  170 +
 doc/applications/likwid-powermeter.md              |   75 +
 doc/applications/likwid-setFreq.md                 |   13 +
 doc/applications/likwid-setFrequencies.md          |   50 +
 doc/applications/likwid-topology.md                |   68 +
 doc/archs/atom.md                                  |  104 +
 doc/archs/broadwell.md                             |  203 +
 doc/archs/core2.md                                 |  103 +
 doc/archs/haswell.md                               |  203 +
 doc/archs/haswellep.md                             |  896 ++++
 doc/archs/interlagos.md                            |  107 +
 doc/archs/ivybridge.md                             |  190 +
 doc/archs/ivybridgeep.md                           |  790 +++
 doc/archs/k10.md                                   |   68 +
 doc/archs/k8.md                                    |   68 +
 doc/archs/kabini.md                                |  162 +
 doc/archs/nehalem.md                               |  237 +
 doc/archs/nehalemex.md                             |  554 +++
 doc/archs/pentiumm.md                              |   63 +
 doc/archs/phi.md                                   |   78 +
 doc/archs/sandybridge.md                           |  189 +
 doc/archs/sandybridgeep.md                         |  775 +++
 doc/archs/silvermont.md                            |  175 +
 doc/archs/westmere.md                              |  239 +
 doc/archs/westmereex.md                            |  555 +++
 doc/bstrlib.txt                                    | 3201 ++++++++++++
 doc/likwid-accessD.1                               |   10 +-
 doc/likwid-agent.1                                 |   94 +
 doc/likwid-bench.1                                 |  145 +-
 doc/likwid-doxygen.md                              |  262 +
 doc/likwid-features.1                              |   54 +-
 doc/likwid-genCfg.1                                |   30 -
 doc/likwid-genTopoCfg.1                            |   30 +
 doc/likwid-lua.1                                   |  111 +
 doc/likwid-memsweeper.1                            |   16 +-
 doc/likwid-mpirun.1                                |   77 +-
 doc/likwid-perfctr.1                               |  260 +-
 doc/likwid-perfscope.1                             |  178 +-
 doc/likwid-pin.1                                   |  226 +-
 doc/likwid-powermeter.1                            |   69 +-
 doc/likwid-setFreq.1                               |   10 +-
 doc/likwid-setFrequencies.1                        |   40 +-
 doc/likwid-topology.1                              |   46 +-
 doc/likwid.cfg.md                                  |   38 +
 doc/logo.png                                       |  Bin 0 -> 6776 bytes
 doc/lua-doxygen.md                                 | 2592 ++++++++++
 examples/C-internalMarkerAPI.c                     |  152 +
 examples/C-likwidAPI.c                             |  149 +
 examples/C-markerAPI.c                             |   87 +
 examples/F-markerAPI.F90                           |   79 +
 examples/Lua-likwidAPI.lua                         |   93 +
 examples/Makefile                                  |   64 +
 examples/monitoring.c                              |  118 +
 ext/hwloc/AUTHORS                                  |    8 +
 ext/hwloc/COPYING                                  |   28 +
 ext/hwloc/Makefile                                 |   73 +
 ext/hwloc/hwloc/base64.c                           |  306 ++
 ext/hwloc/hwloc/bind.c                             |  781 +++
 ext/hwloc/hwloc/bitmap.c                           | 1492 ++++++
 ext/hwloc/hwloc/components.c                       |  792 +++
 ext/hwloc/hwloc/diff.c                             |  426 ++
 ext/hwloc/hwloc/distances.c                        |  995 ++++
 ext/hwloc/hwloc/dolib.c                            |   47 +
 ext/hwloc/hwloc/misc.c                             |  166 +
 ext/hwloc/hwloc/pci-common.c                       |  482 ++
 ext/hwloc/hwloc/topology-bgq.cb                    |  246 +
 ext/hwloc/hwloc/topology-darwin.cb                 |  307 ++
 ext/hwloc/hwloc/topology-fake.c                    |   61 +
 ext/hwloc/hwloc/topology-freebsd.cb                |  255 +
 ext/hwloc/hwloc/topology-linux.c                   | 5133 ++++++++++++++++++++
 ext/hwloc/hwloc/topology-noos.c                    |   58 +
 ext/hwloc/hwloc/topology-opencl.cb                 |  346 ++
 ext/hwloc/hwloc/topology-osf.cb                    |  392 ++
 ext/hwloc/hwloc/topology-synthetic.c               | 1128 +++++
 ext/hwloc/hwloc/topology-x86.c                     | 1386 ++++++
 ext/hwloc/hwloc/topology.c                         | 3436 +++++++++++++
 ext/hwloc/hwloc/traversal.c                        |  701 +++
 ext/hwloc/include/hwloc.h                          | 2206 +++++++++
 ext/hwloc/include/hwloc/autogen/config.h           |  202 +
 ext/hwloc/include/hwloc/autogen/config.h.in        |  201 +
 ext/hwloc/include/hwloc/autogen/stamp-h2           |    1 +
 ext/hwloc/include/hwloc/bitmap.h                   |  359 ++
 ext/hwloc/include/hwloc/cuda.h                     |  224 +
 ext/hwloc/include/hwloc/cudart.h                   |  184 +
 ext/hwloc/include/hwloc/deprecated.h               |  114 +
 ext/hwloc/include/hwloc/diff.h                     |  299 ++
 ext/hwloc/include/hwloc/export.h                   |  221 +
 ext/hwloc/include/hwloc/gl.h                       |  135 +
 ext/hwloc/include/hwloc/glibc-sched.h              |  125 +
 ext/hwloc/include/hwloc/helper.h                   | 1249 +++++
 ext/hwloc/include/hwloc/inlines.h                  |  154 +
 ext/hwloc/include/hwloc/intel-mic.h                |  143 +
 ext/hwloc/include/hwloc/linux-libnuma.h            |  273 ++
 ext/hwloc/include/hwloc/linux.h                    |   77 +
 ext/hwloc/include/hwloc/myriexpress.h              |  127 +
 ext/hwloc/include/hwloc/nvml.h                     |  176 +
 ext/hwloc/include/hwloc/opencl.h                   |  199 +
 ext/hwloc/include/hwloc/openfabrics-verbs.h        |  155 +
 ext/hwloc/include/hwloc/plugins.h                  |  433 ++
 ext/hwloc/include/hwloc/rename.h                   |  651 +++
 ext/hwloc/include/numa.h                           |  468 ++
 ext/hwloc/include/pci/config.h                     |   16 +
 ext/hwloc/include/pci/header.h                     | 1195 +++++
 ext/hwloc/include/pci/pci.h                        |  240 +
 ext/hwloc/include/pci/types.h                      |   65 +
 ext/hwloc/include/private/autogen/README.txt       |    3 +
 ext/hwloc/include/private/autogen/config.h         |  772 +++
 ext/hwloc/include/private/components.h             |   40 +
 ext/hwloc/include/private/cpuid-x86.h              |   89 +
 ext/hwloc/include/private/cpuid.h                  |   80 +
 ext/hwloc/include/private/debug.h                  |   57 +
 ext/hwloc/include/private/map.h                    |  110 +
 ext/hwloc/include/private/misc.h                   |  382 ++
 ext/hwloc/include/private/private.h                |  335 ++
 ext/hwloc/include/private/solaris-chiptype.h       |   59 +
 ext/hwloc/include/private/xml.h                    |   98 +
 ext/hwloc/include/static-components.h              |   17 +
 filters/csv                                        |  114 -
 filters/xml                                        |  184 +-
 groups/atom/BRANCH.txt                             |   16 +-
 groups/atom/DATA.txt                               |   12 +-
 groups/atom/FLOPS_DP.txt                           |    6 +-
 groups/atom/FLOPS_SP.txt                           |    6 +-
 groups/atom/FLOPS_X87.txt                          |    6 +-
 groups/atom/MEM.txt                                |   12 +-
 groups/atom/TLB.txt                                |    3 +-
 groups/broadwell/BRANCH.txt                        |   31 +
 groups/broadwell/CLOCK.txt                         |   23 +
 groups/broadwell/DATA.txt                          |   22 +
 groups/broadwell/ENERGY.txt                        |   39 +
 groups/broadwell/FALSE_SHARE.txt                   |   25 +
 groups/broadwell/FLOPS_AVX.txt                     |   24 +
 groups/broadwell/FLOPS_DP.txt                      |   29 +
 groups/broadwell/FLOPS_SP.txt                      |   29 +
 groups/broadwell/ICACHE.txt                        |   25 +
 groups/broadwell/L2.txt                            |   37 +
 groups/broadwell/L2CACHE.txt                       |   34 +
 groups/broadwell/L3.txt                            |   36 +
 groups/broadwell/L3CACHE.txt                       |   35 +
 groups/broadwell/RECOVERY.txt                      |   22 +
 groups/broadwell/TLB_DATA.txt                      |   35 +
 groups/broadwell/TLB_INSTR.txt                     |   28 +
 groups/broadwellD/BRANCH.txt                       |   31 +
 groups/broadwellD/CACHES.txt                       |  123 +
 groups/broadwellD/CLOCK.txt                        |   23 +
 groups/broadwellD/DATA.txt                         |   22 +
 groups/broadwellD/ENERGY.txt                       |   39 +
 groups/broadwellD/FALSE_SHARE.txt                  |   25 +
 groups/broadwellD/FLOPS_AVX.txt                    |   24 +
 groups/broadwellD/FLOPS_DP.txt                     |   29 +
 groups/broadwellD/FLOPS_SP.txt                     |   29 +
 groups/broadwellD/HA.txt                           |   40 +
 groups/broadwellD/ICACHE.txt                       |   25 +
 groups/broadwellD/L2.txt                           |   37 +
 groups/broadwellD/L2CACHE.txt                      |   34 +
 groups/broadwellD/L3.txt                           |   36 +
 groups/broadwellD/L3CACHE.txt                      |   35 +
 groups/broadwellD/MEM.txt                          |   52 +
 groups/broadwellD/MEM_DP.txt                       |   66 +
 groups/broadwellD/MEM_SP.txt                       |   68 +
 groups/broadwellD/RECOVERY.txt                     |   22 +
 groups/broadwellD/TLB_DATA.txt                     |   35 +
 groups/broadwellD/TLB_INSTR.txt                    |   28 +
 groups/broadwellEP/BRANCH.txt                      |   31 +
 groups/broadwellEP/CACHES.txt                      |  123 +
 groups/broadwellEP/CLOCK.txt                       |   23 +
 groups/broadwellEP/DATA.txt                        |   22 +
 groups/broadwellEP/ENERGY.txt                      |   35 +
 groups/broadwellEP/FALSE_SHARE.txt                 |   29 +
 groups/broadwellEP/FLOPS_AVX.txt                   |   24 +
 groups/broadwellEP/FLOPS_DP.txt                    |   29 +
 groups/broadwellEP/FLOPS_SP.txt                    |   29 +
 groups/broadwellEP/HA.txt                          |   40 +
 groups/broadwellEP/ICACHE.txt                      |   25 +
 groups/broadwellEP/L2.txt                          |   37 +
 groups/broadwellEP/L2CACHE.txt                     |   34 +
 groups/broadwellEP/L3.txt                          |   36 +
 groups/broadwellEP/L3CACHE.txt                     |   35 +
 groups/broadwellEP/MEM.txt                         |   52 +
 groups/broadwellEP/MEM_DP.txt                      |   66 +
 groups/broadwellEP/MEM_SP.txt                      |   68 +
 groups/broadwellEP/NUMA.txt                        |   41 +
 groups/broadwellEP/QPI.txt                         |   49 +
 groups/broadwellEP/TLB_DATA.txt                    |   35 +
 groups/broadwellEP/TLB_INSTR.txt                   |   28 +
 groups/core2/BRANCH.txt                            |   16 +-
 groups/core2/CACHE.txt                             |   29 +-
 groups/core2/CLOCK.txt                             |   19 +
 groups/core2/DATA.txt                              |    6 +-
 groups/core2/FLOPS_DP.txt                          |    9 +-
 groups/core2/FLOPS_SP.txt                          |    9 +-
 groups/core2/FLOPS_X87.txt                         |    9 +-
 groups/core2/L2.txt                                |   19 +-
 groups/core2/L2CACHE.txt                           |   13 +-
 groups/core2/MEM.txt                               |    9 +-
 groups/core2/TLB.txt                               |    9 +-
 groups/core2/UOPS.txt                              |   22 +
 groups/core2/UOPS_RETIRE.txt                       |   25 +
 groups/haswell/BRANCH.txt                          |   14 +-
 groups/haswell/CACHES.txt                          |   71 +
 groups/haswell/CLOCK.txt                           |    2 +-
 groups/haswell/DATA.txt                            |   13 +-
 groups/haswell/ENERGY.txt                          |   11 +-
 groups/haswell/FALSE_SHARE.txt                     |   28 +
 groups/haswell/FLOPS_AVX.txt                       |   28 +
 groups/haswell/ICACHE.txt                          |   14 +-
 groups/haswell/L2.txt                              |   20 +-
 groups/haswell/L2CACHE.txt                         |   21 +-
 groups/haswell/L3.txt                              |   22 +-
 groups/haswell/L3CACHE.txt                         |   24 +-
 groups/haswell/RECOVERY.txt                        |   22 +
 groups/haswell/TLB_DATA.txt                        |   20 +-
 groups/haswell/TLB_INSTR.txt                       |   10 +-
 groups/haswell/UOPS.txt                            |   35 +
 groups/haswell/UOPS_EXEC.txt                       |   31 +
 groups/haswell/UOPS_ISSUE.txt                      |   31 +
 groups/haswell/UOPS_RETIRE.txt                     |   31 +
 groups/haswellEP/BRANCH.txt                        |   31 +
 groups/haswellEP/CACHES.txt                        |  123 +
 groups/haswellEP/CBOX.txt                          |   61 +
 groups/haswellEP/CLOCK.txt                         |   23 +
 groups/haswellEP/DATA.txt                          |   22 +
 groups/haswellEP/ENERGY.txt                        |   35 +
 groups/haswellEP/FALSE_SHARE.txt                   |   34 +
 groups/haswellEP/FLOPS_AVX.txt                     |   28 +
 groups/haswellEP/HA.txt                            |   40 +
 groups/haswellEP/ICACHE.txt                        |   33 +
 groups/haswellEP/L2.txt                            |   37 +
 groups/haswellEP/L2CACHE.txt                       |   34 +
 groups/haswellEP/L3.txt                            |   36 +
 groups/haswellEP/L3CACHE.txt                       |   35 +
 groups/haswellEP/MEM.txt                           |   52 +
 groups/haswellEP/NUMA.txt                          |   33 +
 groups/haswellEP/QPI.txt                           |   49 +
 groups/haswellEP/RECOVERY.txt                      |   22 +
 groups/haswellEP/SBOX.txt                          |   28 +
 groups/haswellEP/TLB_DATA.txt                      |   35 +
 groups/haswellEP/TLB_INSTR.txt                     |   28 +
 groups/haswellEP/UOPS.txt                          |   35 +
 groups/haswellEP/UOPS_EXEC.txt                     |   31 +
 groups/haswellEP/UOPS_ISSUE.txt                    |   31 +
 groups/haswellEP/UOPS_RETIRE.txt                   |   31 +
 groups/interlagos/BRANCH.txt                       |   18 +-
 groups/interlagos/CACHE.txt                        |   30 +-
 groups/interlagos/CPI.txt                          |    5 +
 groups/interlagos/DATA.txt                         |    4 +-
 groups/interlagos/FLOPS_DP.txt                     |    8 +-
 groups/interlagos/FLOPS_SP.txt                     |    8 +-
 groups/interlagos/FPU_EXCEPTION.txt                |    2 +-
 groups/interlagos/ICACHE.txt                       |   16 +-
 groups/interlagos/L2.txt                           |   12 +-
 groups/interlagos/L2CACHE.txt                      |   18 +-
 groups/interlagos/L3.txt                           |   23 +-
 groups/interlagos/L3CACHE.txt                      |   18 +-
 groups/interlagos/LINKS.txt                        |    4 +-
 groups/interlagos/MEM.txt                          |    2 +-
 groups/interlagos/NUMA.txt                         |    4 +-
 groups/interlagos/NUMA_0_3.txt                     |   28 +
 groups/interlagos/NUMA_4_7.txt                     |   28 +
 groups/ivybridge/BRANCH.txt                        |   14 +-
 groups/ivybridge/CLOCK.txt                         |    2 +-
 groups/ivybridge/DATA.txt                          |    8 +-
 groups/ivybridge/ENERGY.txt                        |   12 +-
 groups/ivybridge/FALSE_SHARE.txt                   |   25 +
 groups/ivybridge/FLOPS_AVX.txt                     |   12 +-
 groups/ivybridge/FLOPS_DP.txt                      |   17 +-
 groups/ivybridge/FLOPS_SP.txt                      |   18 +-
 groups/ivybridge/ICACHE.txt                        |   14 +-
 groups/ivybridge/L2.txt                            |   28 +-
 groups/ivybridge/L2CACHE.txt                       |   19 +-
 groups/ivybridge/L3.txt                            |   22 +-
 groups/ivybridge/L3CACHE.txt                       |   25 +-
 groups/ivybridge/MEM.txt                           |   32 -
 groups/ivybridge/MEM_DP.txt                        |   57 -
 groups/ivybridge/MEM_SP.txt                        |   57 -
 groups/ivybridge/RECOVERY.txt                      |   22 +
 groups/ivybridge/TLB_DATA.txt                      |   20 +-
 groups/ivybridge/TLB_INSTR.txt                     |   10 +-
 groups/ivybridge/UOPS.txt                          |   35 +
 groups/ivybridge/UOPS_EXEC.txt                     |   31 +
 groups/ivybridge/UOPS_ISSUE.txt                    |   31 +
 groups/ivybridge/UOPS_RETIRE.txt                   |   31 +
 groups/ivybridgeEP/BRANCH.txt                      |   31 +
 groups/ivybridgeEP/CACHES.txt                      |  121 +
 groups/ivybridgeEP/CBOX.txt                        |   55 +
 groups/ivybridgeEP/CLOCK.txt                       |   23 +
 groups/ivybridgeEP/DATA.txt                        |   22 +
 groups/ivybridgeEP/ENERGY.txt                      |   33 +
 groups/ivybridgeEP/FALSE_SHARE.txt                 |   32 +
 groups/ivybridgeEP/FLOPS_AVX.txt                   |   26 +
 groups/ivybridgeEP/FLOPS_DP.txt                    |   31 +
 groups/ivybridgeEP/FLOPS_SP.txt                    |   31 +
 groups/ivybridgeEP/ICACHE.txt                      |   33 +
 groups/ivybridgeEP/L2.txt                          |   38 +
 groups/ivybridgeEP/L2CACHE.txt                     |   34 +
 groups/ivybridgeEP/L3.txt                          |   36 +
 groups/ivybridgeEP/L3CACHE.txt                     |   36 +
 groups/ivybridgeEP/MEM.txt                         |   49 +
 groups/ivybridgeEP/MEM_DP.txt                      |   68 +
 groups/ivybridgeEP/MEM_SP.txt                      |   70 +
 groups/ivybridgeEP/NUMA.txt                        |   33 +
 groups/ivybridgeEP/QPI.txt                         |   52 +
 groups/ivybridgeEP/RECOVERY.txt                    |   22 +
 groups/ivybridgeEP/TLB_DATA.txt                    |   35 +
 groups/ivybridgeEP/TLB_INSTR.txt                   |   28 +
 groups/ivybridgeEP/UNCORECLOCK.txt                 |   84 +
 groups/ivybridgeEP/UOPS.txt                        |   35 +
 groups/ivybridgeEP/UOPS_EXEC.txt                   |   31 +
 groups/ivybridgeEP/UOPS_ISSUE.txt                  |   31 +
 groups/ivybridgeEP/UOPS_RETIRE.txt                 |   31 +
 groups/k10/BRANCH.txt                              |   20 +-
 groups/k10/CACHE.txt                               |   30 +-
 groups/k10/CPI.txt                                 |    5 +
 groups/k10/FLOPS_DP.txt                            |   16 +-
 groups/k10/FLOPS_SP.txt                            |   16 +-
 groups/k10/FLOPS_X87.txt                           |   18 +-
 groups/k10/FPU_EXCEPTION.txt                       |    2 +-
 groups/k10/ICACHE.txt                              |   16 +-
 groups/k10/L2.txt                                  |   20 +-
 groups/k10/L2CACHE.txt                             |   12 +-
 groups/k10/L3CACHE.txt                             |   20 +-
 groups/k10/MEM.txt                                 |   19 +-
 groups/k10/NUMA.txt                                |   25 -
 groups/k10/NUMA2.txt                               |   24 -
 groups/k10/NUMA_0_3.txt                            |   27 +
 groups/k10/NUMA_4_7.txt                            |   27 +
 groups/k10/TLB.txt                                 |    6 +-
 groups/k8/BRANCH.txt                               |   20 +-
 groups/k8/CACHE.txt                                |   30 +-
 groups/k8/CPI.txt                                  |    5 +
 groups/k8/ICACHE.txt                               |   16 +-
 groups/k8/L2.txt                                   |    4 +-
 groups/kabini/BRANCH.txt                           |   18 +-
 groups/kabini/CACHE.txt                            |   30 +-
 groups/kabini/CPI.txt                              |    5 +
 groups/kabini/DATA.txt                             |    4 +-
 groups/kabini/FLOPS_DP.txt                         |   11 +-
 groups/kabini/FLOPS_SP.txt                         |   11 +-
 groups/kabini/FPU_EXCEPTION.txt                    |    2 +-
 groups/kabini/ICACHE.txt                           |   16 +-
 groups/kabini/L2.txt                               |   20 +-
 groups/kabini/MEM.txt                              |    2 +-
 groups/kabini/NUMA.txt                             |   28 -
 groups/kabini/NUMA2.txt                            |   28 -
 groups/kabini/NUMA_0_3.txt                         |   28 +
 groups/kabini/NUMA_4_7.txt                         |   28 +
 groups/kabini/TLB.txt                              |    9 +-
 groups/nehalem/BRANCH.txt                          |   14 +-
 groups/nehalem/CACHE.txt                           |   29 +-
 groups/nehalem/DATA.txt                            |    6 +-
 groups/nehalem/FLOPS_DP.txt                        |   16 +-
 groups/nehalem/FLOPS_SP.txt                        |   16 +-
 groups/nehalem/FLOPS_X87.txt                       |    6 +-
 groups/nehalem/ICACHE.txt                          |   25 +
 groups/nehalem/L2.txt                              |   30 +-
 groups/nehalem/L2CACHE.txt                         |   20 +-
 groups/nehalem/L3.txt                              |   18 +-
 groups/nehalem/L3CACHE.txt                         |   30 +-
 groups/nehalem/MEM.txt                             |   49 +-
 groups/nehalem/SCHEDULER.txt                       |    8 +-
 groups/nehalem/TLB.txt                             |   10 +-
 groups/nehalem/VIEW.txt                            |   50 -
 groups/nehalemEX/BRANCH.txt                        |   14 +-
 groups/nehalemEX/CACHE.txt                         |   29 +-
 groups/nehalemEX/DATA.txt                          |    6 +-
 groups/nehalemEX/FLOPS_DP.txt                      |   16 +-
 groups/nehalemEX/FLOPS_SP.txt                      |   16 +-
 groups/nehalemEX/FLOPS_X87.txt                     |    6 +-
 groups/nehalemEX/ICACHE.txt                        |   25 +
 groups/nehalemEX/L2.txt                            |   31 +-
 groups/nehalemEX/L2CACHE.txt                       |   21 +-
 groups/nehalemEX/L3.txt                            |   37 +
 groups/nehalemEX/L3CACHE.txt                       |   48 +
 groups/nehalemEX/MEM.txt                           |   53 +-
 groups/nehalemEX/SCHEDULER.txt                     |    8 +-
 groups/nehalemEX/TLB.txt                           |    8 +-
 groups/pentiumm/BRANCH.txt                         |   17 +
 groups/pentiumm/CPI.txt                            |   22 +
 groups/pentiumm/FLOPS_DP.txt                       |   20 +
 groups/pentiumm/FLOPS_SP.txt                       |   18 +
 groups/pentiumm/L3.txt                             |   30 +
 groups/phi/CACHE.txt                               |   15 +-
 groups/phi/COMPUTE_TO_DATA_RATIO.txt               |   22 +
 groups/phi/CPI.txt                                 |    4 +
 groups/phi/L2CACHE.txt                             |   19 -
 groups/phi/MEM.txt                                 |   18 +
 groups/phi/MEM1.txt                                |   13 +-
 groups/phi/MEM2.txt                                |   12 +-
 groups/phi/MEM3.txt                                |   10 +-
 groups/phi/MEM4.txt                                |   12 +-
 groups/phi/MEM5.txt                                |   14 +-
 groups/phi/MEM6.txt                                |   12 +-
 groups/phi/MEM_READ.txt                            |   20 +
 groups/phi/MEM_WRITE.txt                           |   20 +
 groups/phi/PAIRING.txt                             |   14 +-
 groups/phi/READ_MISS_RATIO.txt                     |    9 +-
 groups/phi/TLB.txt                                 |   23 +
 groups/phi/TLB_L1.txt                              |   23 +
 groups/phi/TLB_L2.txt                              |   21 +
 groups/phi/VECTOR.txt                              |   10 +-
 groups/phi/VECTOR2.txt                             |   10 +-
 groups/phi/VPU_FILL_RATIO_DBL.txt                  |   12 +-
 groups/phi/VPU_PAIRING.txt                         |   15 +-
 groups/phi/VPU_READ_MISS_RATIO.txt                 |   10 +-
 groups/phi/VPU_WRITE_MISS_RATIO.txt                |   10 +-
 groups/phi/WRITE_MISS_RATIO.txt                    |    9 +-
 groups/sandybridge/BRANCH.txt                      |   14 +-
 groups/sandybridge/CLOCK.txt                       |    2 +-
 groups/sandybridge/DATA.txt                        |    8 +-
 groups/sandybridge/ENERGY.txt                      |   14 +-
 groups/sandybridge/FALSE_SHARE.txt                 |   25 +
 groups/sandybridge/FLOPS_AVX.txt                   |   13 +-
 groups/sandybridge/FLOPS_DP.txt                    |   18 +-
 groups/sandybridge/FLOPS_SP.txt                    |   18 +-
 groups/sandybridge/ICACHE.txt                      |   33 +
 groups/sandybridge/L2.txt                          |   28 +-
 groups/sandybridge/L2CACHE.txt                     |   19 +-
 groups/sandybridge/L3.txt                          |   22 +-
 groups/sandybridge/L3CACHE.txt                     |   24 +-
 groups/sandybridge/MEM.txt                         |   32 -
 groups/sandybridge/MEM_DP.txt                      |   55 -
 groups/sandybridge/MEM_SP.txt                      |   56 -
 groups/sandybridge/RECOVERY.txt                    |   22 +
 groups/sandybridge/TLB_DATA.txt                    |   20 +-
 groups/sandybridge/TLB_INSTR.txt                   |   10 +-
 groups/sandybridge/UOPS.txt                        |   35 +
 groups/sandybridge/UOPS_EXEC.txt                   |   31 +
 groups/sandybridge/UOPS_ISSUE.txt                  |   31 +
 groups/sandybridge/UOPS_RETIRE.txt                 |   31 +
 groups/sandybridgeEP/BRANCH.txt                    |   31 +
 groups/sandybridgeEP/CACHES.txt                    |   97 +
 groups/sandybridgeEP/CLOCK.txt                     |   27 +
 groups/sandybridgeEP/DATA.txt                      |   22 +
 groups/sandybridgeEP/ENERGY.txt                    |   33 +
 groups/sandybridgeEP/FALSE_SHARE.txt               |   27 +
 groups/sandybridgeEP/FLOPS_AVX.txt                 |   26 +
 groups/sandybridgeEP/FLOPS_DP.txt                  |   31 +
 groups/sandybridgeEP/FLOPS_SP.txt                  |   31 +
 groups/sandybridgeEP/ICACHE.txt                    |   33 +
 groups/sandybridgeEP/L2.txt                        |   38 +
 groups/sandybridgeEP/L2CACHE.txt                   |   34 +
 groups/sandybridgeEP/L3.txt                        |   36 +
 groups/sandybridgeEP/L3CACHE.txt                   |   36 +
 groups/sandybridgeEP/MEM.txt                       |   40 +
 groups/sandybridgeEP/MEM_DP.txt                    |   59 +
 groups/sandybridgeEP/MEM_SP.txt                    |   61 +
 groups/sandybridgeEP/NUMA.txt                      |   33 +
 groups/sandybridgeEP/QPI.txt                       |   35 +
 groups/sandybridgeEP/RECOVERY.txt                  |   22 +
 groups/sandybridgeEP/TLB_DATA.txt                  |   35 +
 groups/sandybridgeEP/TLB_INSTR.txt                 |   28 +
 groups/sandybridgeEP/UOPS.txt                      |   35 +
 groups/sandybridgeEP/UOPS_EXEC.txt                 |   31 +
 groups/sandybridgeEP/UOPS_ISSUE.txt                |   31 +
 groups/sandybridgeEP/UOPS_RETIRE.txt               |   31 +
 groups/silvermont/BRANCH.txt                       |   14 +-
 groups/silvermont/CLOCK.txt                        |   23 +
 groups/silvermont/DATA.txt                         |   22 +
 groups/silvermont/ENERGY.txt                       |    6 +-
 groups/silvermont/ICACHE.txt                       |    6 +-
 groups/silvermont/L1TOL2.txt                       |   28 -
 groups/silvermont/L2CACHE.txt                      |   34 +
 groups/silvermont/L2TOMEM.txt                      |   26 -
 groups/silvermont/MEM.txt                          |   37 +
 groups/silvermont/MEM_LAT.txt                      |   23 +
 groups/silvermont/TLB_DATA.txt                     |   27 +
 groups/silvermont/TLB_INSTR.txt                    |   27 +
 groups/skylake/BRANCH.txt                          |   31 +
 groups/skylake/CLOCK.txt                           |   27 +
 groups/skylake/DATA.txt                            |   22 +
 groups/skylake/ENERGY.txt                          |   39 +
 groups/skylake/FALSE_SHARE.txt                     |   25 +
 groups/skylake/FLOPS_AVX.txt                       |   24 +
 groups/skylake/FLOPS_DP.txt                        |   29 +
 groups/skylake/FLOPS_SP.txt                        |   29 +
 groups/skylake/ICACHE.txt                          |   30 +
 groups/skylake/L2.txt                              |   38 +
 groups/skylake/L2CACHE.txt                         |   34 +
 groups/skylake/L3.txt                              |   36 +
 groups/skylake/L3CACHE.txt                         |   35 +
 groups/skylake/RECOVERY.txt                        |   22 +
 groups/skylake/TLB_DATA.txt                        |   35 +
 groups/skylake/TLB_INSTR.txt                       |   28 +
 groups/skylake/UOPS.txt                            |   29 +
 groups/skylake/UOPS_EXEC.txt                       |   31 +
 groups/skylake/UOPS_ISSUE.txt                      |   31 +
 groups/skylake/UOPS_RETIRE.txt                     |   31 +
 groups/westmere/BRANCH.txt                         |   16 +-
 groups/westmere/CACHE.txt                          |   13 +-
 groups/westmere/CLOCK.txt                          |   18 +
 groups/westmere/DATA.txt                           |    6 +-
 groups/westmere/FLOPS_DP.txt                       |   14 +-
 groups/westmere/FLOPS_SP.txt                       |   14 +-
 groups/westmere/FLOPS_X87.txt                      |    6 +-
 groups/westmere/ICACHE.txt                         |   25 +
 groups/westmere/L2.txt                             |   28 +-
 groups/westmere/L2CACHE.txt                        |   21 +-
 groups/westmere/L3.txt                             |   23 +-
 groups/westmere/L3CACHE.txt                        |   26 +-
 groups/westmere/MEM.txt                            |   53 +-
 groups/westmere/TLB.txt                            |   22 -
 groups/westmere/TLB_DATA.txt                       |   35 +
 groups/westmere/TLB_INSTR.txt                      |   27 +
 groups/westmere/UOPS.txt                           |   35 +
 groups/westmere/VIEW.txt                           |   14 +-
 groups/westmereEX/BRANCH.txt                       |   16 +-
 groups/westmereEX/CACHE.txt                        |   11 +-
 groups/westmereEX/DATA.txt                         |    6 +-
 groups/westmereEX/FLOPS_DP.txt                     |   16 +-
 groups/westmereEX/FLOPS_SP.txt                     |   16 +-
 groups/westmereEX/FLOPS_X87.txt                    |    6 +-
 groups/westmereEX/ICACHE.txt                       |   25 +
 groups/westmereEX/L2.txt                           |   28 +-
 groups/westmereEX/L2CACHE.txt                      |   21 +-
 groups/westmereEX/L3.txt                           |   20 +-
 groups/westmereEX/L3CACHE.txt                      |   52 +
 groups/westmereEX/MEM.txt                          |   49 +-
 groups/westmereEX/NUMA.txt                         |   33 +
 groups/westmereEX/TLB.txt                          |   22 -
 groups/westmereEX/TLB_DATA.txt                     |   35 +
 groups/westmereEX/TLB_INSTR.txt                    |   27 +
 groups/westmereEX/UOPS.txt                         |   35 +
 kernel/Makefile                                    |    3 +-
 kernel/README                                      |    3 +
 make/config_checks.mk                              |   49 +
 make/config_defines.mk                             |  117 +
 make/include_CLANG.mk                              |   28 +
 make/include_GCC.mk                                |   15 +-
 make/include_GCCX86.mk                             |   22 +-
 make/include_ICC.mk                                |   14 +-
 make/include_MIC.mk                                |   22 +-
 monitoring/README.agent                            |   66 +
 monitoring/groups/atom/BW_MEM.txt                  |   10 +
 monitoring/groups/atom/FLOPS_DP.txt                |   13 +
 monitoring/groups/atom/FLOPS_SP.txt                |   12 +
 monitoring/groups/broadwell/BW.txt                 |   13 +
 monitoring/groups/broadwell/ENERGY.txt             |   18 +
 monitoring/groups/broadwell/FLOPS_DP.txt           |   22 +
 monitoring/groups/broadwell/FLOPS_SP.txt           |   22 +
 monitoring/groups/broadwellEP/BW.txt               |   13 +
 monitoring/groups/broadwellEP/ENERGY.txt           |   18 +
 monitoring/groups/core2/BW_L2.txt                  |   11 +
 monitoring/groups/core2/BW_MEM.txt                 |   10 +
 monitoring/groups/haswell/BW.txt                   |   13 +
 monitoring/groups/haswell/ENERGY.txt               |   18 +
 monitoring/groups/haswellEP/BW.txt                 |   32 +
 monitoring/groups/haswellEP/ENERGY.txt             |   18 +
 monitoring/groups/interlagos/BW.txt                |   16 +
 monitoring/groups/interlagos/CPI.txt               |   19 +
 monitoring/groups/interlagos/FLOPS.txt             |   18 +
 monitoring/groups/ivybridge/BW.txt                 |   13 +
 monitoring/groups/ivybridge/ENERGY.txt             |   18 +
 monitoring/groups/ivybridge/FLOPS_DP.txt           |   23 +
 monitoring/groups/ivybridge/FLOPS_SP.txt           |   24 +
 monitoring/groups/ivybridgeEP/BW.txt               |   32 +
 monitoring/groups/ivybridgeEP/ENERGY.txt           |   18 +
 monitoring/groups/ivybridgeEP/FLOPS_DP.txt         |   23 +
 monitoring/groups/ivybridgeEP/FLOPS_SP.txt         |   24 +
 monitoring/groups/kabini/BW.txt                    |   14 +
 monitoring/groups/kabini/CPI.txt                   |   19 +
 monitoring/groups/kabini/FLOPS.txt                 |   14 +
 monitoring/groups/nehalem/BW.txt                   |   20 +
 monitoring/groups/nehalem/CPI.txt                  |   14 +
 monitoring/groups/nehalem/FLOPS.txt                |   20 +
 monitoring/groups/nehalemEX/BW.txt                 |   29 +
 monitoring/groups/nehalemEX/CPI.txt                |   12 +
 monitoring/groups/nehalemEX/FLOPS.txt              |   20 +
 monitoring/groups/pentiumm/BW.txt                  |   12 +
 monitoring/groups/pentiumm/CPI.txt                 |   17 +
 monitoring/groups/phi/CPI.txt                      |   17 +
 monitoring/groups/sandybridge/BW.txt               |   13 +
 monitoring/groups/sandybridge/ENERGY.txt           |   18 +
 monitoring/groups/sandybridge/FLOPS_DP.txt         |   24 +
 monitoring/groups/sandybridge/FLOPS_SP.txt         |   24 +
 monitoring/groups/sandybridgeEP/BW.txt             |   24 +
 monitoring/groups/sandybridgeEP/ENERGY.txt         |   18 +
 monitoring/groups/sandybridgeEP/FLOPS_DP.txt       |   24 +
 monitoring/groups/sandybridgeEP/FLOPS_SP.txt       |   24 +
 monitoring/groups/silvermont/BW.txt                |   12 +
 monitoring/groups/silvermont/CPI.txt               |   14 +
 monitoring/groups/silvermont/ENERGY.txt            |   16 +
 monitoring/groups/westmere/BW.txt                  |   19 +
 monitoring/groups/westmere/CPI.txt                 |   14 +
 monitoring/groups/westmere/FLOPS.txt               |   20 +
 monitoring/groups/westmereEX/BW.txt                |   20 +
 monitoring/groups/westmereEX/CPI.txt               |   14 +
 monitoring/groups/westmereEX/FLOPS.txt             |   20 +
 monitoring/likwid-agent.conf                       |   52 +
 perl/AsmGen.pl                                     |  284 --
 perl/feedGnuplot                                   | 1543 ++++--
 perl/gas.pm                                        |  211 -
 perl/gen_events.pl                                 |   77 +-
 perl/generateGroups.pl                             |  142 -
 perl/generatePas.pl                                |  163 -
 perl/likwid-mpirun                                 |  456 --
 perl/likwid-perfscope                              |  110 -
 perl/likwid-setFrequencies                         |  185 -
 perl/set_license.pl                                |  226 +-
 perl/templates/group.tt                            |  208 -
 perl/templates/testcases.tt                        |   19 -
 src/access-daemon/Makefile                         |   20 +-
 src/access-daemon/accessDaemon.c                   |  908 ++--
 src/access-daemon/setFreq.c                        |  291 +-
 src/access.c                                       |  221 +
 src/accessClient.c                                 |  257 -
 src/access_client.c                                |  343 ++
 src/access_x86.c                                   |   91 +
 src/access_x86_msr.c                               |  288 ++
 src/access_x86_pci.c                               |  313 ++
 src/affinity.c                                     |  335 +-
 src/allocator.c                                    |  199 -
 src/applications/likwid-agent.lua                  |  559 +++
 src/applications/likwid-bench.c                    |  536 --
 src/applications/likwid-features.c                 |  191 -
 src/applications/likwid-features.lua               |  191 +
 src/applications/likwid-genCfg.c                   |  122 -
 src/applications/likwid-genTopoCfg.lua             |  153 +
 src/applications/likwid-memsweeper.c               |  138 -
 src/applications/likwid-memsweeper.lua             |   89 +
 src/applications/likwid-mpirun.lua                 | 1967 ++++++++
 src/applications/likwid-perfctr.c                  |  528 --
 src/applications/likwid-perfctr.lua                |  775 +++
 src/applications/likwid-perfscope.lua              |  560 +++
 src/applications/likwid-pin.c                      |  346 --
 src/applications/likwid-pin.lua                    |  275 ++
 src/applications/likwid-powermeter.c               |  507 --
 src/applications/likwid-powermeter.lua             |  388 ++
 src/applications/likwid-setFrequencies.lua         |  396 ++
 src/applications/likwid-topology.c                 |  509 --
 src/applications/likwid-topology.lua               |  394 ++
 src/applications/likwid.lua                        | 1142 +++++
 src/asciiBoxes.c                                   |  256 -
 src/asciiTable.c                                   |  236 -
 src/barrier.c                                      |  155 -
 src/bench.c                                        |  537 --
 src/bitUtil.c                                      |   14 +-
 src/bstrlib.c                                      | 3072 ++++++------
 src/calculator.c                                   |  926 ++++
 src/calculator_stack.c                             |   77 +
 src/configuration.c                                |  339 ++
 src/cpuFeatures.c                                  |  659 ++-
 src/cpuid.c                                        | 1244 -----
 src/cpustring.c                                    |  577 +++
 src/daemon.c                                       |  123 -
 src/ghash.c                                        |   52 +-
 src/hashTable.c                                    |   94 +-
 src/includes/access.h                              |   44 +
 src/includes/accessClient.h                        |   55 -
 src/includes/accessClient_types.h                  |   87 -
 src/includes/access_client.h                       |   11 +
 src/includes/access_client_types.h                 |   65 +
 src/includes/access_x86.h                          |   13 +
 src/includes/access_x86_msr.h                      |   12 +
 src/includes/access_x86_pci.h                      |   12 +
 src/includes/affinity.h                            |   24 +-
 src/includes/affinity_types.h                      |   42 -
 src/includes/allocator.h                           |   48 -
 src/includes/asciiBoxes.h                          |   42 -
 src/includes/asciiBoxes_types.h                    |   47 -
 src/includes/asciiTable.h                          |   45 -
 src/includes/asciiTable_types.h                    |   48 -
 src/includes/barrier.h                             |   62 -
 src/includes/barrier_types.h                       |   49 -
 src/includes/bitUtil.h                             |    8 +-
 src/includes/bstrlib.h                             |   46 +-
 src/includes/calculator.h                          |   38 +
 src/includes/calculator_stack.h                    |   48 +
 src/includes/configuration.h                       |   46 +
 src/includes/cpuFeatures.h                         |    8 +-
 src/includes/cpuFeatures_types.h                   |   42 +-
 src/includes/cpuid.h                               |  141 +-
 src/includes/cpuid_types.h                         |  115 -
 src/includes/daemon.h                              |   42 -
 src/includes/error.h                               |   70 +-
 src/includes/ghash.h                               |   42 +-
 src/includes/hashTable.h                           |   13 +-
 src/includes/libperfctr_types.h                    |   15 +-
 src/includes/likwid.h                              | 1389 +++++-
 src/includes/lock.h                                |    8 +-
 src/includes/memsweep.h                            |   15 +-
 src/includes/msr.h                                 |   47 -
 src/includes/multiplex.h                           |   40 -
 src/includes/multiplex_types.h                     |   42 -
 src/includes/numa.h                                |   43 +-
 src/includes/numa_hwloc.h                          |   40 +
 src/includes/numa_proc.h                           |   39 +
 src/includes/numa_types.h                          |   52 -
 src/includes/pci.h                                 |   49 -
 src/includes/pci_hwloc.h                           |   37 +
 src/includes/pci_proc.h                            |   37 +
 src/includes/pci_types.h                           |   69 +-
 src/includes/perfgroup.h                           |   94 +
 src/includes/perfmon.h                             |   88 +-
 src/includes/perfmon_atom.h                        |   11 +-
 src/includes/perfmon_atom_events.txt               |   17 +-
 src/includes/perfmon_broadwell.h                   | 1793 +++++++
 src/includes/perfmon_broadwellEP_counters.h        |  362 ++
 src/includes/perfmon_broadwellEP_events.txt        | 2569 ++++++++++
 src/includes/perfmon_broadwell_counters.h          |   83 +
 src/includes/perfmon_broadwell_events.txt          |  665 +++
 src/includes/perfmon_broadwelld_counters.h         |  252 +
 src/includes/perfmon_broadwelld_events.txt         | 1984 ++++++++
 src/includes/perfmon_core2.h                       |  341 +-
 src/includes/perfmon_core2_counters.h              |   31 +-
 src/includes/perfmon_core2_events.txt              |  217 +-
 src/includes/perfmon_haswell.h                     | 1973 +++++++-
 src/includes/perfmon_haswellEP_counters.h          |  330 ++
 src/includes/perfmon_haswellEP_events.txt          | 2616 ++++++++++
 src/includes/perfmon_haswell_counters.h            |   71 +-
 src/includes/perfmon_haswell_events.txt            |  651 ++-
 src/includes/perfmon_interlagos.h                  |  335 +-
 src/includes/perfmon_interlagos_counters.h         |   35 +-
 src/includes/perfmon_interlagos_events.txt         |  130 +-
 src/includes/perfmon_ivybridge.h                   | 1892 +++++---
 src/includes/perfmon_ivybridgeEP_counters.h        |  316 ++
 src/includes/perfmon_ivybridgeEP_events.txt        | 2072 ++++++++
 src/includes/perfmon_ivybridge_counters.h          |   90 +-
 src/includes/perfmon_ivybridge_events.txt          |  821 ++--
 src/includes/perfmon_k10.h                         |  231 +-
 src/includes/perfmon_k10_counters.h                |   26 +-
 src/includes/perfmon_k10_events.txt                |   53 +-
 src/includes/perfmon_k8.h                          |   17 +-
 src/includes/perfmon_k8_events.txt                 |   42 +-
 src/includes/perfmon_kabini.h                      |  398 +-
 src/includes/perfmon_kabini_counters.h             |   39 +-
 src/includes/perfmon_kabini_events.txt             |   30 +-
 src/includes/perfmon_nehalem.h                     |  698 ++-
 src/includes/perfmon_nehalemEX.h                   | 1828 ++++---
 src/includes/perfmon_nehalemEX_counters.h          |  185 +
 src/includes/perfmon_nehalemEX_events.txt          |  425 +-
 src/includes/perfmon_nehalemEX_westmereEX_common.h |   94 +
 src/includes/perfmon_nehalem_counters.h            |   58 +-
 src/includes/perfmon_nehalem_events.txt            |   33 +-
 src/includes/perfmon_p6_events.txt                 |   19 +-
 src/includes/perfmon_perf.h                        |   60 +
 src/includes/perfmon_phi.h                         |  241 +-
 src/includes/perfmon_phi_counters.h                |   23 +-
 src/includes/perfmon_phi_events.txt                |   17 +-
 src/includes/perfmon_pm.h                          |  249 +-
 src/includes/perfmon_pm_counters.h                 |   22 +-
 src/includes/perfmon_pm_events.txt                 |   36 +-
 src/includes/perfmon_sandybridge.h                 | 2129 ++++++--
 src/includes/perfmon_sandybridgeEP_counters.h      |  214 +
 src/includes/perfmon_sandybridgeEP_events.txt      | 1342 +++++
 src/includes/perfmon_sandybridge_counters.h        |   95 +-
 src/includes/perfmon_sandybridge_events.txt        |  652 +--
 src/includes/perfmon_silvermont.h                  |  527 +-
 src/includes/perfmon_silvermont_counters.h         |   37 +-
 src/includes/perfmon_silvermont_events.txt         |  424 +-
 src/includes/perfmon_skylake.h                     |  753 +++
 src/includes/perfmon_skylake_counters.h            |   84 +
 src/includes/perfmon_skylake_events.txt            |  599 +++
 src/includes/perfmon_types.h                       |  314 +-
 src/includes/perfmon_westmere.h                    |   13 +-
 src/includes/perfmon_westmereEX.h                  | 1943 +++++---
 src/includes/perfmon_westmereEX_counters.h         |  274 +-
 src/includes/perfmon_westmereEX_events.txt         |  405 +-
 src/includes/perfmon_westmere_events.txt           |  168 +-
 src/includes/power.h                               |  179 +-
 src/includes/power_types.h                         |   39 +-
 src/includes/registers.h                           |  554 ++-
 src/includes/registers_types.h                     |  209 +
 src/includes/strUtil.h                             |   55 -
 src/includes/strUtil_types.h                       |   61 -
 src/includes/test_types.h                          |  108 -
 src/includes/textcolor.h                           |    8 +-
 src/includes/thermal.h                             |   51 +-
 src/includes/thermal_types.h                       |   18 +-
 src/includes/threads.h                             |  107 -
 src/includes/threads_types.h                       |   57 -
 src/includes/timer.h                               |   76 +-
 src/includes/timer_types.h                         |    8 +-
 src/includes/tlb-info.h                            |   89 +
 src/includes/topology.h                            |  144 +
 src/includes/topology_cpuid.h                      |   43 +
 src/includes/topology_hwloc.h                      |   52 +
 src/includes/topology_proc.h                       |   51 +
 src/includes/topology_types.h                      |   73 +
 src/includes/tree.h                                |    9 +-
 src/includes/tree_types.h                          |   34 +-
 src/includes/types.h                               |   30 +-
 src/libperfctr.c                                   |  816 ++--
 src/likwid.f90                                     |  102 +-
 src/likwid_f90_interface.c                         |   57 +-
 src/loadData.S                                     |   44 +
 src/loadData.s                                     |   22 -
 src/loadData.s.tmp                                 |    0
 src/luawid.c                                       | 2334 +++++++++
 src/memsweep.c                                     |   59 +-
 src/msr.c                                          |  307 --
 src/multiplex.c                                    |  165 -
 src/numa.c                                         |  424 +-
 src/numa_hwloc.c                                   |  415 ++
 src/numa_proc.c                                    |  383 ++
 src/pci.c                                          |  398 --
 src/pci_hwloc.c                                    |   81 +
 src/pci_proc.c                                     |  125 +
 src/perfgroup.c                                    | 1285 +++++
 src/perfmon.c                                      | 3639 +++++++++-----
 src/perfmon_perf.c                                 |  260 +
 src/power.c                                        |  507 +-
 src/pthread-overload/Makefile                      |   25 +-
 src/pthread-overload/pthread-overload.c            |  108 +-
 src/strUtil.c                                      |  975 ----
 src/thermal.c                                      |   22 +-
 src/threads.c                                      |  217 -
 src/timer.c                                        |  399 +-
 src/topology.c                                     | 1041 ++++
 src/topology_cpuid.c                               |  939 ++++
 src/topology_hwloc.c                               |  327 ++
 src/topology_proc.c                                |  626 +++
 src/tree.c                                         |  116 +-
 test/MPI_pin_test.c                                |   53 +-
 test/Makefile                                      |   73 +-
 test/accuracy/Makefile                             |   39 +-
 test/accuracy/README                               |    7 +-
 test/accuracy/TESTS/BRANCH.txt                     |   42 +
 test/accuracy/TESTS/CLOCK.txt                      |   53 +
 test/accuracy/TESTS/DATA.txt                       |   34 +
 test/accuracy/TESTS/FLOPS_AVX.txt                  |   25 +-
 test/accuracy/TESTS/FLOPS_DP.txt                   |  105 +-
 test/accuracy/TESTS/FLOPS_SP.txt                   |   92 +-
 test/accuracy/TESTS/HA.txt                         |   58 +
 test/accuracy/TESTS/L2.txt                         |   62 +-
 test/accuracy/TESTS/L3.txt                         |   62 +-
 test/accuracy/TESTS/MEM.txt                        |   62 +-
 test/accuracy/TESTS/UOPS.txt                       |   30 +
 test/accuracy/likwid-accuracy.py                   |  540 +-
 test/accuracy/likwid-adjust-test-sizes.py          |  105 +
 test/accuracy/likwid-tester                        |  220 -
 test/accuracy/likwid-tester-plot                   |   78 -
 test/executable_tests/Makefile                     |   14 +-
 test/executable_tests/README                       |    3 +
 test/executable_tests/likwid-bench.txt             |   39 +-
 test/executable_tests/likwid-features.txt          |    9 -
 test/executable_tests/likwid-genCfg.txt            |    5 -
 test/executable_tests/likwid-genTopoCfg.txt        |    5 +
 test/executable_tests/likwid-memsweeper.txt        |    8 +-
 test/executable_tests/likwid-mpirun.txt            |   39 +
 test/executable_tests/likwid-perfctr.txt           |   73 +-
 test/executable_tests/likwid-pin.txt               |   12 +-
 test/executable_tests/likwid-powermeter.txt        |   28 +-
 test/executable_tests/likwid-setFreq.txt           |    6 -
 test/executable_tests/likwid-setFrequencies.txt    |   14 +
 test/executable_tests/likwid-topology.txt          |    9 +-
 test/executable_tests/tester.sh                    |   24 +-
 test/serial.c                                      |   43 +
 test/stream.c                                      |  423 --
 test/test-likwidAPI.c                              | 2099 ++++++++
 test/test-msr-access.c                             |  101 +
 test/testTBB.cc                                    |   67 +
 1058 files changed, 129398 insertions(+), 27430 deletions(-)

diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..3877f0e
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,15 @@
+language: c
+compiler: gcc
+install: true
+dist: trusty
+sudo: required
+script:
+  - make && sudo make install
+  - /usr/local/bin/likwid-topology -c -C -g
+  - /usr/local/bin/likwid-pin -p
+  - sudo modprobe msr
+  - ls -la /dev/cpu/*
+  - ls -la /usr/local/sbin/*
+  - make -C test streamGCC
+  - /usr/local/bin/likwid-perfctr -i
+  - /usr/local/bin/likwid-bench -t copy -w N:100MB:2
diff --git a/CHANGELOG b/CHANGELOG
new file mode 100644
index 0000000..6c84df0
--- /dev/null
+++ b/CHANGELOG
@@ -0,0 +1,59 @@
+
+
+# Changelog 4.1.0
+- Support for Intel Skylake (Core + Uncore)
+- Support for Intel Broadwell (Core + Uncore)
+- Support for Intel Broadwell D (Core + Uncore)
+- Support for Intel Broadwell EP/EN/EX (Core + Uncore)
+- Support for Intel Airmont (Core)
+- Uncore support for Intel SandyBridge, IvyBridge and Haswell
+- Performance group and event set handling in library
+- Internal calculator for derived metrics
+- Improvement of Marker API
+- Get results/metrics of last measurement cycle
+- Fixed most memory leaks
+- Respect 'Intel PMU sharing guide'
+- Update of internal Lua to 5.3
+- More examples (C++11 threads, Cilk+, TBB)
+- Test suite for executables and library
+- Accuracy checker supports multiple CPUs
+- Security checked access daemon
+- Likwid-bench supports Integer benchmarks
+- Likwid-bench selects iteration count automatically
+- Likwid-bench has new FMA related benchmarks
+- Likwid-mpirun supports SLURM job scheduler
+- New tool likwid-features
+
+# Changelog 4.0.1
+- likwid-bench: Iteration determination is done serially
+- likwid-bench: Manual selection of iterations possible
+- likwid-perfctr: Set cpuset to all CPUs not only the first
+- likwid-pin: Set cpuset to all CPUs not only the first
+- likwid-accuracy.py: Enhanced plotting functions, use only instrumented likwid-bench
+- likwid-accessD: Check for allowed register for PCI accesses
+- Add models HASWELL_M1 (0x45) and HASWELL_M2 (0x46) to likwid-powermeter and likwid-accessD
+- New test application using Cilk and Marker API
+- New test application using C++11 threads and Marker API
+- likwid-agent: gmetric version check for --group option and s/\s*/_/ in metric names
+- likwid-powermeter: Print RAPL domain name
+- Marker API: Initialize access already at likwid_markerInit()
+- Marker API: likwid_markerThreadInit() only pins if not already pinned
+
+# Changelog 4.0.0
+
+- Support for Intel Broadwell
+- Uncore support for all Uncore-aware architectures
+    - Nehalem (EX)
+    - Westmere (EX)
+    - SandyBridge EP
+    - IvyBridge EP
+    - Haswell EP
+- Measure multiple event sets in a round-robin fashion (no multiplexing!)
+- Event options to filter the counter increments
+- Whole LIKWID functionality is exposed as API for C/C++ and Lua
+- New functions in the Marker API to switch event sets and get intermediate results
+- Topology code relies on hwloc. CPUID is still included but only as fallback
+- Most LIKWID applications are written in Lua (only exception likwid-bench)
+- Monitoring daemon likwid-agent with multiple output backends
+- More performance groups
+
diff --git a/INSTALL b/INSTALL
index 5939aa9..c4bfb05 100644
--- a/INSTALL
+++ b/INSTALL
@@ -1,106 +1,140 @@
 == Basic build ==
 
 1. Edit config.mk. Follow the comments there.
-   Optionally you can change compiler settings in include_[GCC|ICC|GCCX86].mk.
-   Please note that only the default compiler flags are supported and tested.
-2. make
-3. make install (required)
-4. setup access to the msr device files (see end of this document)
+   Optionally you can change compiler settings in include_[GCC|CLANG|ICC|MIC].mk.
+   Please note that only the default compiler flags for GCC are supported and tested.
+2. make (Builds hwloc, lua, Likwid libraries, access daemons and likwid-bench)
+3. make install (this is required for likwid-pin and if you use the accessDaemon)
 
 Only the default flags set are tested. As it is not possible to test all
-compiler setting variants the Intel icc compiler is only build tested. A basic
-function test is done for the icc binary. The only variant fully tested is gcc
-with default compiler flags. It is therefore recommended to use gcc with the
-default flags. If you want to use and build the Fortran interface you can mix
-GCC with the Intel Fortran Compiler. More information on this can be found in
-the WIKI. On 32bit systems you have to pick the GCCX86 compiler target.
+compiler setting variants the Intel icc compiler and Clang are only build tested.
+A basic function test is done for the icc binary. The only variant fully tested
+is gcc with default compiler flags. It is therefore recommended to use gcc with
+the default flags. If you want to use and build the Fortran interface you can mix
+GCC with the Intel Fortran Compiler (default setup). You can change the Fortran
+compiler in make/include_[GCC|CLANG|ICC|MIC].mk.
 
 *NOTICE*
 
-All generated files are located in the [GCC|ICC|GCCX86] build directory.
-This includes the dependency files, object files and also the
-generated source files and the pas and assembly files for likwid-bench.
+All generated files are located in the [GCC|ICC|CLANG|MIC] build directory.
+This includes the dependency files and object files. The
+generated source files and the pas and assembly files for likwid-bench are built
+in bench/[GCC|ICC|CLANG|MIC].
 If you debug your likwid-bench benchmarks you can look at all
 intermediate build files and also the final assembly code.
 
+== Build on Xeon Phi ==
+For builds for the Xeon Phi coprocessor, the accessDaemon and the frequency
+daemon are disabled. Moreover, the access mode is set to 'direct'. This was done
+because it is important to run as few processes as possible on the Xeon Phi and
+the accessDaemon would start one process per hardware thread.
+In order to build Likwid for the Xeon Phi processor, you have to change the
+RPATHS variable in make/include_MIC.mk to point to the folder with the Intel
+libraries like libimf.so. This is crucial because when using an suid-root
+executable, the LD_LIBRARY_PATH gets lost but Likwid still needs to know where
+the Intel libraries reside.
+After installation change the owner of likwid-lua to root and set the suid-root
+bit for likwid-lua:
+chown root <BINPATH>/likwid-lua
+chmod u+s <BINPATH>/likwid-lua
+Afterwards Likwid can be used as anywhere else.
+
 == Known problems ==
 
 On very old systems with old kernels (< 2.6.7) or old glibc versions likwid
 is build with reduced funtionality. This includes missing support for NUMA
 and pinning.
+likwid-setFrequencies can only be used if the acpi_cpufreq module is loaded. It
+is not possible to fix the frequency with the intel_pstate module.
 
 == Additional Targets ==
 
 make clean     -  clean the object directory
 make distclean -  clean also the executables/libraries
 make uninstall -  delete installed files
+make docs      -  generate html documentation using doxygen
+make local     -  set paths in Lua files to work from current directory
+                  (for testing only! Uses already installed access daemons and
+                  libraries. Often you have to set the LD_LIBRARY_PATH to
+                  contain the current folder)
 
-== Build  accessDaemon ==
+== Dependencies ==
+Most parts of the Likwid suite do not have external dependencies that need to be
+installed before you can build Likwid. If external libraries are used, they are
+shipped with Likwid.
 
-To build the accessDaemon:
+Included dependencies:
+- hwloc
+- Lua
+- Perl Template toolkit
 
-1. Set the desired default ACCESSMODE. You can overwrite this on the command line.
-2. make will also build the accessDaemon
-3. Install with
-   make install
+Build dependencies:
+- C compiler (commonly gcc, but clang and icc are also possible)
+- make
+- Perl
 
-With the standard  make install target the daemon will also be installed in
-${PREFIX}/sbin . Don't forget to copy the dameon if you configured a different
-path in ACCESSDAEMON.
+Runtime dependencies for likwid-perfscope:
+- gnuplot
 
-== Setup of msr module ==
+Runtime dependencies for likwid-agent (if enabled in configfile):
+- gmetric (Output to Ganglia Monitoring System)
+- rrdtool (Output to RRDs)
+- logger (Output to syslog)
 
-likwid-perfctr, likwid-powermeter and likwid-features require the Linux msr kernel module. This module
-is part of most standard distro kernels. You have to be root to do the initial setup.
+For the HTML documentation you further need doxygen.
 
-Check if msr device files are there with 'ls /dev/cpu/0/'. If msr device files are not there try:
+== Build  accessDaemon ==
 
-1. Check if the msr module is loaded with  'lsmod | grep msr' . There should be an output.
-2. It the module is not loaded load it with  'modprobe msr' . For automatic loading at startup
-consult your distros documentation how to do so.
+Change path for the accessDaemon:
 
-Once you have the msr device files avilable:
-3. Adopt access rights on the msr device files for normal user. To allow everybody access you can
-use 'chmod o+rw /dev/cpu/*/msr' . This is only recommended on save single user desktop systems.
+1. Edit config.mk and configure path in ACCESSDAEMON variable. You can overwrite
+   it later in likwid.cfg
+2. Set the desired default ACCESSMODE. You can overwrite this on the command
+   line or likwid.cfg.
+3. make will also build the accessDaemon
+4. Install with (sudo) make install
+
+With the standard make install target the daemon will also be installed
+to the path in $ACCESSDAEMON. It also sets the owner to root and sets the suid bit.
+
+== Setup of msr module ==
+
+likwid-perfctr, likwid-powermeter and likwid-agent require the Linux msr kernel
+module. This module is part of most standard distro kernels. You have to be root
+to do the initial setup.
+
+1. Check if the msr module is loaded with 'lsmod | grep msr'.
+   There should be an output.
+2. If the module is not loaded, load it with 'modprobe msr'. For automatic
+   loading at startup consult your distros documentation how to do so, commonly
+   by adding 'msr' to /etc/modules.
+3. Adapt access rights on the msr device files for normal user. To allow
+   everybody access you can use 'chmod o+rw /dev/cpu/*/msr'.
+   This is only recommended on safe single user desktop systems and might not be
+   enough to grant access to anybody because of POSIX capabilities or other
+   security features of your distro.
 
 As a general access to the msr registers is not desired on security sensitive
-systems you can either implement a more sophisticated access rights settings
+systems, you can either implement a more sophisticated access rights settings
 with e.g. setgid. A common solution used on many other device files, e.g. for
 audio, is to introduce a group and make a chown on the msr device files to that
-group. Now if you execute likwid-perfctr with setgid on that group the
-executing user can use the tool but cannot directly write or read the msr
-device files.
+group or use dbus rules. Now if you execute likwid-perfctr with setgid on that
+group the executing user can use the tool but cannot directly write or read the
+msr device files.
 
 A secure solution is to use the accessDaemon, which encapsulates the access to
-the msr device files and performs a address check for allowed registers. For
-more information how to setup and use this solution have a look at the WIKI
-page:
-
-http://code.google.com/p/likwid/wiki/MSRDaemon
+the msr device files and performs an address check for allowed registers. For
+more information on how to set it up, look at the HTML documentation.
 
-A common solution to give access is to use the likwid-accessD and make it suid root.
-Starting with version 3.1.3 make install will do those steps. Of course this will only
-work as long as you are root while calling make install.
-
-If for you are not root and someone else needs to install the daemon the
-following steps need to be carried out:
-
-1. Go to the directory where you installed the likwid tools.
-2. Change to the sbin directory there.
-3. Execute (as root): chown root.<some user group>  likwid-accessD
-4. Execute (as root): chmod u+s likwid-accessD
-
-
-This should be sufficient on many machines.
-You need to perform the same procedure for likwid-setFreq.
-
-=== THIS IS USUALLY NOT NECESSARY ANYMORE ==
 A demo for a root exploit involving the msr device files was published. As
 a consequence the security settings for access to the msr device files are
-tightened in recent kernels.
+tightened in recent kernels. The exploit used a specific register to alter the
+entry point for the current process to a malware. The daemon grants access only
+to hardware performance counter related registers.
+
 Just setting the file access rights or using suid root on the access daemon is
-not sufficient anymore. You have to register your binary now to get access.
-This is only necessary if above setup dos not work.
+not sufficient anymore for some distros. You have to register your binary at the
+libcap now to get access. This is only necessary if above setup does not work.
 
 You register the necessary capability by calling
 
@@ -108,27 +142,10 @@ sudo setcap cap_sys_rawio+ep EXECUTABLE
 
 on the executables. This is only possible on local file systems.
 The only feasable way is to register the likwid-accessD and proxy all access over it.
-=== SNIP ==
 
 If you have still problems please let me know on the likwid mailing list:
-
 http://groups.google.com/group/likwid-users
 
-== NOTICE for Intel Xeon Phi (KNC) ==
-
-If you want to use LIKWID on a Xeon Phi you have to use set MIC as COMPILER in
-config.mk. This build of LIKWID won't be binary compatible with other X86
-processors. It is required to set the default access mode to direct in
-and disable the build of likwid-accessD in config.mk.
-
-To use LIKWID you have to turn of power management on the MIC. LIKWID relies on
-RDTSC being used for wallclock time. On the MIC this is only given if power
-management is turned off. This can be configured in
-/etc/sysconfig/mic/default.conf.
-
-At the end of this file the power management is configured. The following configuration worked:
-
-    PowerManagement "cpufreq_off;corec6_off;pc3_off;pc6_off"
 
 
 
diff --git a/Makefile b/Makefile
index eecd4e9..e1f959b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,16 +1,17 @@
+#
 # =======================================================================================
 #
 #      Filename:  Makefile
 #
 #      Description:  Central Makefile
 #
-#      Version:   3.1.3
-#      Released:  4.11.2014
+#      Version:   <VERSION>
+#      Released:  <DATE>
 #
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2013 Jan Treibig
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -32,25 +33,8 @@ GROUP_DIR   = ./groups
 FILTER_DIR  = ./filters
 MAKE_DIR    = ./make
 
-#DO NOT EDIT BELOW
-
-# determine kernel Version
-KERNEL_VERSION_MAJOR := $(shell uname -r | awk '{split($$1,a,"."); print a[1]}' | cut -d '-' -f1)
-KERNEL_VERSION := $(shell uname -r | awk  '{split($$1,a,"."); print a[2]}' | cut -d '-' -f1)
-KERNEL_VERSION_MINOR := $(shell uname -r | awk '{split($$1,a,"."); print a[3]}' | cut -d '-' -f1)
-
-HAS_MEMPOLICY = $(shell if [ $(KERNEL_VERSION) -lt 7 -a $(KERNEL_VERSION_MAJOR) -lt 3 -a $(KERNEL_VERSION_MINOR) -lt 7 ]; then \
-               echo 0;  else echo 1; \
-			   fi; )
 
-HAS_RDTSCP = $(shell  /bin/bash -c "cat /proc/cpuinfo | grep -c rdtscp")
-
-# determine glibc Version
-GLIBC_VERSION := $(shell ldd --version | grep ldd |  awk '{ print $$NF }' | awk -F. '{ print $$2 }')
-
-HAS_SCHEDAFFINITY = $(shell if [ $(GLIBC_VERSION) -lt 4 ]; then \
-               echo 0;  else echo 1; \
-			   fi; )
+#DO NOT EDIT BELOW
 
 # Dependency chains:
 # *.[ch] -> *.o -> executables
@@ -59,161 +43,105 @@ HAS_SCHEDAFFINITY = $(shell if [ $(GLIBC_VERSION) -lt 4 ]; then \
 
 include ./config.mk
 include $(MAKE_DIR)/include_$(COMPILER).mk
-INCLUDES  += -I./src/includes  -I$(BUILD_DIR)
-LIBS      +=
-DEFINES   += -DVERSION=$(VERSION)         \
-		 -DRELEASE=$(RELEASE)                 \
-		 -DCFGFILE=$(CFG_FILE_PATH)           \
-		 -DMAX_NUM_THREADS=$(MAX_NUM_THREADS) \
-		 -DMAX_NUM_NODES=$(MAX_NUM_NODES)     \
-		 -DHASH_TABLE_SIZE=$(HASH_TABLE_SIZE) \
-		 -DLIBLIKWIDPIN=$(LIBLIKWIDPIN)       \
-		 -DLIKWIDFILTERPATH=$(LIKWIDFILTERPATH)
+include $(MAKE_DIR)/config_checks.mk
+include $(MAKE_DIR)/config_defines.mk
+
+INCLUDES  += -I./src/includes -I$(LUA_FOLDER)/includes -I$(HWLOC_FOLDER)/include -I$(BUILD_DIR)
+LIBS      += -ldl
 
 #CONFIGURE BUILD SYSTEM
 BUILD_DIR  = ./$(COMPILER)
 Q         ?= @
 GENGROUPLOCK = .gengroup
 
-ifeq ($(COMPILER),MIC)
-BENCH_DIR   = ./bench/phi
-else
-ifeq ($(COMPILER),GCCX86)
-BENCH_DIR   = ./bench/x86
-else
-BENCH_DIR   = ./bench/x86-64
-endif
-endif
-
-LIKWID_LIB = liblikwid
-ifeq ($(SHARED_LIBRARY),true)
-CFLAGS += $(SHARED_CFLAGS) -ggdb
-DYNAMIC_TARGET_LIB := $(LIKWID_LIB).so
-TARGET_LIB := $(DYNAMIC_TARGET_LIB)
-LIBS += -L. -llikwid
-SHARED_LFLAGS += -lm -lpthread
-else
-STATIC_TARGET_LIB := $(LIKWID_LIB).a
-TARGET_LIB := $(STATIC_TARGET_LIB)
-endif
-
-ifneq ($(COLOR),NONE)
-DEFINES += -DCOLOR=$(COLOR)
-endif
-
-ifneq ($(COMPILER),MIC)
-    DAEMON_TARGET = likwid-accessD
-else
-    $(info Info: Compiling for Xeon Phi. Disabling build of likwid-accessD.);
-endif
-
-ifeq ($(INSTRUMENT_BENCH),true)
-DEFINES += -DPERFMON
-endif
-
-ifeq ($(HAS_MEMPOLICY),1)
-DEFINES += -DHAS_MEMPOLICY
-else
-$(info Kernel $(KERNEL_VERSION_MAJOR).$(KERNEL_VERSION).$(KERNEL_VERSION_MINOR) has no mempolicy support! First Linux kernel with memory policies has version 2.6.7);
-endif
-
-ifeq ($(HAS_RDTSCP),0)
-$(info Building without RDTSCP timing support!);
-else
-ifneq ($(COMPILER),MIC)
-DEFINES += -DHAS_RDTSCP
-else
-    $(info Info: Compiling for Xeon Phi. Disabling RDTSCP support.);
-endif
-endif
-
-ifeq ($(HAS_SCHEDAFFINITY),1)
-DEFINES += -DHAS_SCHEDAFFINITY
-PINLIB  = liblikwidpin.so
-else
-$(info GLIBC version 2.$(GLIBC_VERSION) has no pthread_setaffinity_np support!);
-PINLIB  =
-endif
-
-DEFINES += -DACCESSDAEMON=$(ACCESSDAEMON)
-
-ifeq ($(ACCESSMODE),accessdaemon)
-ifneq ($(COMPILER),MIC)
-    DEFINES += -DACCESSMODE=1
-else
-    $(info Info: Compiling for Xeon Phi. Set accessmode to direct.);
-    DEFINES += -DACCESSMODE=0
-endif
-else
-    DEFINES += -DACCESSMODE=0
-endif
-
-SETFREQ_TARGET = likwid-setFreq
-
 VPATH     = $(SRC_DIR)
 OBJ       = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c))
-OBJ      += $(patsubst $(SRC_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.s))
 OBJ      += $(patsubst $(SRC_DIR)/%.cc, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cc))
+OBJ      += $(patsubst $(SRC_DIR)/%.S, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.S))
+ifeq ($(FILTER_HWLOC_OBJ),yes)
+OBJ := $(filter-out $(BUILD_DIR)/topology_hwloc.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/numa_hwloc.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/pci_hwloc.o,$(OBJ))
+endif
+ifneq ($(FORTRAN_INTERFACE),true)
+OBJ := $(filter-out $(BUILD_DIR)/likwid_f90_interface.o,$(OBJ))
+endif
 PERFMONHEADERS  = $(patsubst $(SRC_DIR)/includes/%.txt, $(BUILD_DIR)/%.h,$(wildcard $(SRC_DIR)/includes/*.txt))
-OBJ_BENCH  =  $(patsubst $(BENCH_DIR)/%.ptt, $(BUILD_DIR)/%.o,$(wildcard $(BENCH_DIR)/*.ptt))
-
-APPS      = likwid-perfctr    \
-            likwid-features   \
-            likwid-powermeter \
-            likwid-memsweeper \
-            likwid-topology   \
-            likwid-genCfg     \
-            likwid-pin        \
-            likwid-bench
-
-PERL_APPS = likwid-mpirun         \
-            likwid-setFrequencies \
-            likwid-perfscope
-
-DAEMON_APPS = $(SETFREQ_TARGET) \
-			$(DAEMON_TARGET)
+OBJ_LUA    =  $(wildcard ./ext/lua/$(COMPILER)/*.o)
+OBJ_HWLOC  =  $(wildcard ./ext/hwloc/$(COMPILER)/*.o)
+
+
+L_APPS      =   likwid-perfctr \
+				likwid-pin \
+				likwid-powermeter \
+				likwid-topology \
+				likwid-memsweeper \
+				likwid-agent \
+				likwid-mpirun \
+				likwid-features \
+				likwid-perfscope \
+				likwid-genTopoCfg
+C_APPS      =   bench/likwid-bench
+L_HELPER    =   likwid.lua
+ifeq ($(BUILDFREQ),true)
+	L_APPS += likwid-setFrequencies
+endif
 
 CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES)
 
-ifneq ($(FORTRAN_INTERFACE),false)
-HAS_FORTRAN_COMPILER = $(shell $(FC) --version 2>/dev/null || echo 'NOFORTRAN' )
-ifeq ($(HAS_FORTRAN_COMPILER),NOFORTRAN)
-FORTRAN_INTERFACE=
-$(info Warning: You have selected the fortran interface in config.mk, but there seems to be no fortran compiler - not compiling it!)
-else
-FORTRAN_INTERFACE = likwid.mod
-FORTRAN_INSTALL =  @cp -f likwid.mod  $(PREFIX)/include/
-endif
-else
-FORTRAN_INTERFACE =
-FORTRAN_INSTALL =
-endif
-
-all: $(BUILD_DIR) $(GENGROUPLOCK) $(PERFMONHEADERS) $(OBJ) $(OBJ_BENCH) $(TARGET_LIB) $(APPS) $(FORTRAN_INTERFACE)  $(PINLIB)  $(DAEMON_TARGET) $(SETFREQ_TARGET)
+all: $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ) $(TARGET_LIB) $(FORTRAN_IF)  $(PINLIB) $(L_APPS) $(L_HELPER) $(DAEMON_TARGET) $(FREQ_TARGET) $(BENCH_TARGET)
 
 tags:
 	@echo "===>  GENERATE  TAGS"
 	$(Q)ctags -R
 
-$(APPS):  $(addprefix $(SRC_DIR)/applications/,$(addsuffix  .c,$(APPS))) $(BUILD_DIR) $(GENGROUPLOCK)  $(OBJ) $(OBJ_BENCH)
-	@echo "===>  LINKING  $@"
-	$(Q)${CC} $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) ${LFLAGS} -o $@  $(addprefix $(SRC_DIR)/applications/,$(addsuffix  .c,$@)) $(OBJ_BENCH) $(STATIC_TARGET_LIB) $(LIBS)
-
-$(STATIC_TARGET_LIB): $(OBJ)
-	@echo "===>  CREATE STATIC LIB  $(STATIC_TARGET_LIB)"
-	$(Q)${AR} -crus $(STATIC_TARGET_LIB) $(OBJ)
-
-$(DYNAMIC_TARGET_LIB): $(OBJ)
-	@echo "===>  CREATE SHARED LIB  $(DYNAMIC_TARGET_LIB)"
-	$(Q)${CC} $(SHARED_CFLAGS) -o $(DYNAMIC_TARGET_LIB) $(OBJ) -lm $(SHARED_LFLAGS)
+docs:
+	@echo "===>  GENERATE DOXYGEN DOCS"
+	@cp doc/lua-doxygen.md doc/lua-doxygen.md.safe
+	@cp doc/likwid-doxygen.md doc/likwid-doxygen.md.safe
+	@sed -i -e s+'<PREFIX>'+$(PREFIX)+g -e s+'<VERSION>'+$(VERSION)+g -e s+'<DATE>'+'$(DATE)'+g -e s+'<RELEASE>'+$(RELEASE)+g doc/lua-doxygen.md
+	@sed -i -e s+'<PREFIX>'+$(PREFIX)+g -e s+'<VERSION>'+$(VERSION)+g -e s+'<DATE>'+'$(DATE)'+g -e s+'<RELEASE>'+$(RELEASE)+g doc/likwid-doxygen.md
+	$(Q)doxygen doc/Doxyfile
+	@mv doc/lua-doxygen.md.safe doc/lua-doxygen.md
+	@mv doc/likwid-doxygen.md.safe doc/likwid-doxygen.md
+
+$(L_APPS):  $(addprefix $(SRC_DIR)/applications/,$(addsuffix  .lua,$(L_APPS)))
+	@echo "===>  ADJUSTING  $@"
+	@if [ "$(ACCESSMODE)" = "direct" ]; then sed -i -e s/"access_mode = 1"/"access_mode = 0"/g $(SRC_DIR)/applications/$@.lua;fi
+	@sed -e s/'<INSTALLED_BINPREFIX>'/$(subst /,\\/,$(INSTALLED_BINPREFIX))/g \
+		-e s/'<INSTALLED_PREFIX>'/$(subst /,\\/,$(INSTALLED_PREFIX))/g \
+		-e s/'<VERSION>'/$(VERSION).$(RELEASE)/g \
+		-e s/'<DATE>'/$(DATE)/g \
+		$(addprefix $(SRC_DIR)/applications/,$(addsuffix  .lua,$@)) > $@
+	@if [ "$(ACCESSMODE)" = "direct" ]; then sed -i -e s/"access_mode = 0"/"access_mode = 1"/g $(SRC_DIR)/applications/$@.lua;fi
+
+$(L_HELPER):
+	@echo "===>  ADJUSTING  $@"
+	@sed -e s/'<PREFIX>'/$(subst /,\\/,$(PREFIX))/g \
+		-e s/'<INSTALLED_LIBPREFIX>'/$(subst /,\\/,$(INSTALLED_LIBPREFIX))/g \
+		-e s/'<INSTALLED_PREFIX>'/$(subst /,\\/,$(INSTALLED_PREFIX))/g \
+		-e s/'<LIKWIDGROUPPATH>'/$(subst /,\\/,$(LIKWIDGROUPPATH))/g \
+		-e s/'<LIBLIKWIDPIN>'/$(subst /,\\/,$(LIBLIKWIDPIN))/g \
+		-e s/'<VERSION>'/$(VERSION)/g \
+		-e s/'<RELEASE>'/$(RELEASE)/g \
+		$(SRC_DIR)/applications/$@ > $@
+
+$(STATIC_TARGET_LIB): $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ) $(TARGET_HWLOC_LIB) $(TARGET_LUA_LIB)
+	@echo "===>  CREATE STATIC LIB  $(TARGET_LIB)"
+	$(Q)${AR} -crus $(STATIC_TARGET_LIB) $(OBJ) $(TARGET_HWLOC_LIB) $(TARGET_LUA_LIB)
+
+
+$(DYNAMIC_TARGET_LIB): $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ) $(TARGET_HWLOC_LIB) $(TARGET_LUA_LIB)
+	@echo "===>  CREATE SHARED LIB  $(TARGET_LIB)"
+	$(Q)${CC} $(DEBUG_FLAGS) $(SHARED_LFLAGS) -Wl,-soname,$(TARGET_LIB).$(VERSION).$(RELEASE) $(SHARED_CFLAGS) -o $(DYNAMIC_TARGET_LIB) $(OBJ) $(LIBS) $(TARGET_HWLOC_LIB) $(TARGET_LUA_LIB) $(RPATHS)
 
 $(DAEMON_TARGET): $(SRC_DIR)/access-daemon/accessDaemon.c
-	@echo "===>  Build access daemon $(DAEMON_TARGET)"
-	$(Q)$(MAKE) -s -C  $(SRC_DIR)/access-daemon $(DAEMON_TARGET)
+	@echo "===>  BUILD access daemon likwid-accessD"
+	$(Q)$(MAKE) -s -C  $(SRC_DIR)/access-daemon likwid-accessD
 
-$(SETFREQ_TARGET): $(SRC_DIR)/access-daemon/setFreq.c
-	@echo "===>  Build frequency daemon $(SETFREQ_TARGET)"
-	$(Q)$(MAKE) -s -C  $(SRC_DIR)/access-daemon $(SETFREQ_TARGET)
+$(FREQ_TARGET): $(SRC_DIR)/access-daemon/setFreq.c
+	@echo "===>  BUILD frequency daemon likwid-setFreq"
+	$(Q)$(MAKE) -s -C  $(SRC_DIR)/access-daemon likwid-setFreq
 
 $(BUILD_DIR):
 	@mkdir $(BUILD_DIR)
@@ -227,135 +155,435 @@ $(GENGROUPLOCK): $(foreach directory,$(shell ls $(GROUP_DIR)), $(wildcard $(GROU
 	$(Q)$(GEN_GROUPS) ./groups  $(BUILD_DIR) ./perl/templates
 	$(Q)touch $(GENGROUPLOCK)
 
-$(FORTRAN_INTERFACE): $(SRC_DIR)/likwid.f90
+$(FORTRAN_IF): $(SRC_DIR)/likwid.f90
 	@echo "===>  COMPILE FORTRAN INTERFACE  $@"
 	$(Q)$(FC) -c  $(FCFLAGS) $<
 	@rm -f likwid.o
 
+$(TARGET_LUA_LIB):
+	@echo "===>  ENTER  $(LUA_FOLDER)"
+	$(Q)$(MAKE) -s --no-print-directory -C $(LUA_FOLDER) $(MAKECMDGOALS)
+
+$(TARGET_HWLOC_LIB):
+	@echo "===>  ENTER  $(HWLOC_FOLDER)"
+	$(Q)$(MAKE) -s --no-print-directory -C $(HWLOC_FOLDER) $(MAKECMDGOALS)
+
+$(BENCH_TARGET):
+	@echo "===>  ENTER  $(BENCH_FOLDER)"
+	$(Q)$(MAKE) -s --no-print-directory -C $(BENCH_FOLDER) $(MAKECMDGOALS)
+
 #PATTERN RULES
 $(BUILD_DIR)/%.o:  %.c
 	@echo "===>  COMPILE  $@"
-	$(Q)$(CC) -c  $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@
-	$(Q)$(CC) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
-
-$(BUILD_DIR)/%.o:  %.s
-	@echo "===>  ASSEMBLE  $@"
-	$(Q)$(AS) $(ASFLAGS)  $< -o $@
+	$(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@
+	$(Q)$(CC) $(DEBUG_FLAGS) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
 
 $(BUILD_DIR)/%.o:  %.cc
 	@echo "===>  COMPILE  $@"
-	$(Q)$(CXX) -c  $(CXXFLAGS) $(CPPFLAGS) $< -o $@
-	$(Q)$(CXX) $(CXXFLAGS) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
+	$(Q)$(CXX) -c $(DEBUG_FLAGS) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
+	$(Q)$(CXX) $(DEBUG_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
 
+$(BUILD_DIR)/%.o:  %.S
+	@echo "===>  COMPILE  $@"
+	$(Q)$(CPP) $(CPPFLAGS) $< -o $@.tmp
+	$(Q)$(AS) $(ASFLAGS) $@.tmp -o $@
+	@rm $@.tmp
 
-$(BUILD_DIR)/%.pas:  $(BENCH_DIR)/%.ptt
-	@echo "===>  GENERATE BENCHMARKS"
-	$(Q)$(GEN_PAS)  $(BENCH_DIR) $(BUILD_DIR) ./perl/templates
 
 $(BUILD_DIR)/%.h:  $(SRC_DIR)/includes/%.txt
 	@echo "===>  GENERATE HEADER $@"
 	$(Q)$(GEN_PMHEADER) $< $@
 
-$(BUILD_DIR)/%.o:  $(BUILD_DIR)/%.pas
-	@echo "===>  ASSEMBLE  $@"
-	$(Q)$(PAS) -i $(PASFLAGS) -o $(BUILD_DIR)/$*.s $<  '$(DEFINES)'
-	$(Q)$(AS) $(ASFLAGS)  $(BUILD_DIR)/$*.s -o $@
 
 ifeq ($(findstring $(MAKECMDGOALS),clean),)
 -include $(OBJ:.o=.d)
 endif
 
-.PHONY: clean distclean install uninstall
+.PHONY: clean distclean install uninstall help $(TARGET_LUA_LIB) $(TARGET_HWLOC_LIB) $(BENCH_TARGET)
+
 
 .PRECIOUS: $(BUILD_DIR)/%.pas
 
 .NOTPARALLEL:
 
 
-clean:
+clean: $(TARGET_LUA_LIB) $(TARGET_HWLOC_LIB) $(BENCH_TARGET)
 	@echo "===>  CLEAN"
-	@rm -rf $(BUILD_DIR)
-	@rm -f $(GENGROUPLOCK)
+	@for APP in $(L_APPS); do \
+		rm -f $$APP; \
+	done
+	@rm -f likwid.lua
+	@rm -f $(STATIC_TARGET_LIB)
+	@rm -f $(DYNAMIC_TARGET_LIB)
+	@rm -f $(PINLIB)
+	@rm -f $(FORTRAN_IF_NAME)
+	@rm -f $(FREQ_TARGET) $(DAEMON_TARGET)
 
-distclean: clean
+distclean: $(TARGET_LUA_LIB) $(TARGET_HWLOC_LIB) $(BENCH_TARGET)
 	@echo "===>  DIST CLEAN"
-	@rm -f likwid-*
-	@rm -f $(LIKWID_LIB)*
-	@rm -f $(FORTRAN_INTERFACE)
+	@for APP in $(L_APPS); do \
+		rm -f $$APP; \
+	done
+	@rm -f likwid.lua
+	@rm -f $(STATIC_TARGET_LIB)
+	@rm -f $(DYNAMIC_TARGET_LIB)
 	@rm -f $(PINLIB)
+	@rm -f $(FORTRAN_IF_NAME)
+	@rm -f $(FREQ_TARGET) $(DAEMON_TARGET)
+	@rm -rf $(BUILD_DIR)
+	@rm -f $(GENGROUPLOCK)
+	@rm -rf doc/html
 	@rm -f tags
 
-install:
-	@echo "===> INSTALL applications to $(PREFIX)/bin"
-	@mkdir -p $(PREFIX)/bin
-	@for app in $(APPS); do \
-		cp -f $$app $(PREFIX)/bin; \
-	done
-	@cp -f perl/feedGnuplot  $(PREFIX)/bin
-	@for app in $(PERL_APPS); do \
-		sed -e "s+<PREFIX>+$(PREFIX)+g" perl/$$app > $(PREFIX)/bin/$$app; \
-	done
-	@chmod 755 $(PREFIX)/bin/likwid-*
-	@echo "===> INSTALL daemon applications to $(PREFIX)/sbin"
+ifeq ($(BUILDDAEMON),true)
+ifneq ($(COMPILER),MIC)
+install_daemon:
+	@echo "===> INSTALL access daemon to $(ACCESSDAEMON)"
+	@mkdir -p `dirname $(ACCESSDAEMON)`
+	@install -m 4775 $(INSTALL_CHOWN) $(DAEMON_TARGET) $(ACCESSDAEMON)
+move_daemon:
+	@echo "===> MOVE access daemon from $(ACCESSDAEMON) to $(INSTALLED_ACCESSDAEMON)"
+	@mkdir -p `dirname $(INSTALLED_ACCESSDAEMON)`
+	@install -m 4775 $(INSTALL_CHOWN) $(ACCESSDAEMON) $(INSTALLED_ACCESSDAEMON)
+uninstall_daemon:
+	@echo "===> REMOVING access daemon from $(ACCESSDAEMON)"
+	@rm -f $(ACCESSDAEMON)
+uninstall_daemon_moved:
+	@echo "===> REMOVING access daemon from $(INSTALLED_ACCESSDAEMON)"
+	@rm -f $(INSTALLED_ACCESSDAEMON)
+else
+install_daemon:
+	@echo "===> No INSTALL of the access daemon"
+move_daemon:
+	@echo "===> No MOVE of the access daemon"
+uninstall_daemon:
+	@echo "===> No UNINSTALL of the access daemon"
+uninstall_daemon_moved:
+	@echo "===> No UNINSTALL of the access daemon"
+endif
+else
+install_daemon:
+	@echo "===> No INSTALL of the access daemon"
+move_daemon:
+	@echo "===> No MOVE of the access daemon"
+uninstall_daemon:
+	@echo "===> No UNINSTALL of the access daemon"
+uninstall_daemon_moved:
+	@echo "===> No UNINSTALL of the access daemon"
+endif
+
+ifeq ($(BUILDFREQ),true)
+ifneq ($(COMPILER),MIC)
+install_freq:
+	@echo "===> INSTALL setFrequencies tool to $(PREFIX)/sbin/$(FREQ_TARGET)"
 	@mkdir -p $(PREFIX)/sbin
-	@for app in $(DAEMON_APPS); do \
-		cp -f $$app $(PREFIX)/sbin; \
-		if [ $(shell id -u) = "0" ]; then \
-			chown root $(PREFIX)/sbin/$$app; \
-			chmod 4775 $(PREFIX)/sbin/$$app; \
-		else \
-			echo "Only root can adjust the privileges of the daemon applications in $(PREFIX)/sbin"; \
-		fi; \
+	@install -m 4775 $(INSTALL_CHOWN) $(FREQ_TARGET) $(PREFIX)/sbin/$(FREQ_TARGET)
+move_freq:
+	@echo "===> MOVE setFrequencies tool from $(PREFIX)/sbin/$(FREQ_TARGET) to $(INSTALLED_PREFIX)/sbin/$(FREQ_TARGET)"
+	@mkdir -p $(INSTALLED_PREFIX)/sbin
+	@install -m 4775 $(INSTALL_CHOWN) $(PREFIX)/sbin/$(FREQ_TARGET) $(INSTALLED_PREFIX)/sbin/$(FREQ_TARGET)
+uninstall_freq:
+	@echo "===> REMOVING setFrequencies tool from $(PREFIX)/sbin/$(FREQ_TARGET)"
+	@rm -f $(PREFIX)/sbin/$(FREQ_TARGET)
+uninstall_freq_moved:
+	@echo "===> REMOVING setFrequencies tool from $(INSTALLED_PREFIX)/sbin/$(FREQ_TARGET)"
+	@rm -f $(INSTALLED_PREFIX)/sbin/$(FREQ_TARGET)
+else
+install_freq:
+	@echo "===> No INSTALL of setFrequencies tool"
+move_freq:
+	@echo "===> No MOVE of setFrequencies tool"
+uninstall_freq:
+	@echo "===> No UNINSTALL of setFrequencies tool"
+uninstall_freq_moved:
+	@echo "===> No UNINSTALL of setFrequencies tool"
+endif
+else
+install_freq:
+	@echo "===> No INSTALL of setFrequencies tool"
+move_freq:
+	@echo "===> No MOVE of setFrequencies tool"
+uninstall_freq:
+	@echo "===> No UNINSTALL of setFrequencies tool"
+uninstall_freq_moved:
+	@echo "===> No UNINSTALL of setFrequencies tool"
+endif
+
+install: install_daemon install_freq
+	@echo "===> INSTALL applications to $(BINPREFIX)"
+	@mkdir -p $(BINPREFIX)
+	@chmod 775 $(BINPREFIX)
+	@for APP in $(L_APPS); do \
+		install -m 755 $$APP  $(BINPREFIX); \
 	done
+	@for APP in $(C_APPS); do \
+		install -m 755 $$APP  $(BINPREFIX); \
+	done
+	@install -m 755 ext/lua/lua $(BINPREFIX)/likwid-lua
+	@echo "===> INSTALL helper applications to $(BINPREFIX)"
+	@install -m 755 perl/feedGnuplot $(BINPREFIX)
+	@echo "===> INSTALL lua to likwid interface to $(PREFIX)/share/lua"
+	@mkdir -p $(PREFIX)/share/lua
+	@chmod 775 $(PREFIX)/share/lua
+	@install -m 755 likwid.lua $(PREFIX)/share/lua
+	@echo "===> INSTALL libraries to $(LIBPREFIX)"
+	@mkdir -p $(LIBPREFIX)
+	@chmod 775 $(LIBPREFIX)
+	@install -m 755 $(TARGET_LIB) $(LIBPREFIX)/$(TARGET_LIB).$(VERSION).$(RELEASE)
+	@install -m 755 liblikwidpin.so $(LIBPREFIX)/liblikwidpin.so.$(VERSION).$(RELEASE)
+	@install -m 755 $(TARGET_HWLOC_LIB) $(LIBPREFIX)/$(shell basename $(TARGET_HWLOC_LIB)).$(VERSION).$(RELEASE)
+	@install -m 755 $(TARGET_LUA_LIB) $(LIBPREFIX)/$(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE)
+	@cd $(LIBPREFIX) && ln -fs $(TARGET_LIB).$(VERSION).$(RELEASE) $(TARGET_LIB)
+	@cd $(LIBPREFIX) && ln -fs $(TARGET_LIB).$(VERSION).$(RELEASE) $(TARGET_LIB).$(VERSION)
+	@cd $(LIBPREFIX) && ln -fs $(PINLIB).$(VERSION).$(RELEASE) $(PINLIB)
+	@cd $(LIBPREFIX) && ln -fs $(PINLIB).$(VERSION).$(RELEASE) $(PINLIB).$(VERSION)
+	@cd $(LIBPREFIX) && ln -fs $(shell basename $(TARGET_HWLOC_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_HWLOC_LIB))
+	@cd $(LIBPREFIX) && ln -fs $(shell basename $(TARGET_HWLOC_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_HWLOC_LIB)).$(VERSION)
+	@cd $(LIBPREFIX) && ln -fs $(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_LUA_LIB))
+	@cd $(LIBPREFIX) && ln -fs $(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_LUA_LIB)).$(VERSION)
 	@echo "===> INSTALL man pages to $(MANPREFIX)/man1"
 	@mkdir -p $(MANPREFIX)/man1
+	@chmod 775 $(MANPREFIX)/man1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-topology.1 > $(MANPREFIX)/man1/likwid-topology.1
-	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-features.1 > $(MANPREFIX)/man1/likwid-features.1
-	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-perfctr.1 > $(MANPREFIX)/man1/likwid-perfctr.1
+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" -e "s+<PREFIX>+$(PREFIX)+g" < $(DOC_DIR)/likwid-perfctr.1 > $(MANPREFIX)/man1/likwid-perfctr.1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-powermeter.1 > $(MANPREFIX)/man1/likwid-powermeter.1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-pin.1 > $(MANPREFIX)/man1/likwid-pin.1
-	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-setFrequencies.1 > $(MANPREFIX)/man1/likwid-setFrequencies.1
-	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-bench.1 > $(MANPREFIX)/man1/likwid-bench.1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/feedGnuplot.1 > $(MANPREFIX)/man1/feedGnuplot.1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-accessD.1 > $(MANPREFIX)/man1/likwid-accessD.1
-	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-genCfg.1 > $(MANPREFIX)/man1/likwid-genCfg.1
+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-genTopoCfg.1 > $(MANPREFIX)/man1/likwid-genTopoCfg.1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-memsweeper.1 > $(MANPREFIX)/man1/likwid-memsweeper.1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-mpirun.1 > $(MANPREFIX)/man1/likwid-mpirun.1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-perfscope.1 > $(MANPREFIX)/man1/likwid-perfscope.1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-setFreq.1 > $(MANPREFIX)/man1/likwid-setFreq.1
+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-features.1 > $(MANPREFIX)/man1/likwid-features.1
+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-bench.1 > $(MANPREFIX)/man1/likwid-bench.1
+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-agent.1 > $(MANPREFIX)/man1/likwid-agent.1
+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-setFrequencies.1 > $(MANPREFIX)/man1/likwid-setFrequencies.1
+	@sed -e "s/.TH LUA/.TH LIKWID-LUA/g" -e "s/lua - Lua interpreter/likwid-lua - Lua interpreter included in LIKWID/g" -e "s/.B lua/.B likwid-lua/g" -e "s/.BR luac (1)//g" $(DOC_DIR)/likwid-lua.1 > $(MANPREFIX)/man1/likwid-lua.1
 	@chmod 644 $(MANPREFIX)/man1/likwid-*
 	@echo "===> INSTALL headers to $(PREFIX)/include"
-	@mkdir -p $(PREFIX)/include/likwid
-	@cp -f src/includes/likwid*.h  $(PREFIX)/include/
-	@cp -f src/includes/*  $(PREFIX)/include/likwid
-	@cp -f GCC/perfmon_group_types.h  $(PREFIX)/include/likwid
+	@mkdir -p $(PREFIX)/include
+	@chmod 775 $(PREFIX)/include
+	@install -m 644 src/includes/likwid.h  $(PREFIX)/include/
+	@install -m 644 src/includes/bstrlib.h  $(PREFIX)/include/
 	$(FORTRAN_INSTALL)
-	@echo "===> INSTALL libraries to $(PREFIX)/lib"
-	@mkdir -p $(PREFIX)/lib
-	@cp -f $(LIKWID_LIB)*  $(PREFIX)/lib
-	@chmod 755 $(PREFIX)/lib/$(PINLIB)
-	@echo "===> INSTALL filters to $(LIKWIDFILTERPATH)"
+	@echo "===> INSTALL groups to $(PREFIX)/share/likwid/perfgroups"
+	@mkdir -p $(PREFIX)/share/likwid/perfgroups
+	@chmod 775 $(PREFIX)/share/likwid
+	@chmod 775 $(PREFIX)/share/likwid/perfgroups
+	@cp -rf groups/* $(PREFIX)/share/likwid/perfgroups
+	@chmod 775 $(PREFIX)/share/likwid/perfgroups/*
+	@find $(PREFIX)/share/likwid/perfgroups -name "*.txt" -exec chmod 644 {} \;
+	@echo "===> INSTALL monitoring groups to $(PREFIX)/share/likwid/mongroups"
+	@mkdir -p $(PREFIX)/share/likwid/mongroups
+	@chmod 775 $(PREFIX)/share/likwid/mongroups
+	@cp -rf monitoring/groups/* $(PREFIX)/share/likwid/mongroups
+	@chmod 775 $(PREFIX)/share/likwid/mongroups/*
+	@find $(PREFIX)/share/likwid/mongroups -name "*.txt" -exec chmod 644 {} \;
+	@mkdir -p $(PREFIX)/share/likwid/docs
+	@chmod 775 $(PREFIX)/share/likwid/docs
+	@install -m 644 doc/bstrlib.txt $(PREFIX)/share/likwid/docs
+	@mkdir -p $(PREFIX)/share/likwid/examples
+	@chmod 775 $(PREFIX)/share/likwid/examples
+	@install -m 644 examples/* $(PREFIX)/share/likwid/examples
+	@echo "===> INSTALL default likwid-agent.conf to $(PREFIX)/share/likwid/mongroups"
+	@sed -e "s+<PREFIX>+$(PREFIX)+g" monitoring/likwid-agent.conf > $(PREFIX)/share/likwid/mongroups/likwid-agent.conf
+	@chmod 644 $(PREFIX)/share/likwid/mongroups/likwid-agent.conf
+	@echo "===> INSTALL filters to $(abspath $(PREFIX)/share/likwid/filter)"
+	@mkdir -p $(abspath $(PREFIX)/share/likwid/filter)
+	@chmod 755 $(abspath $(PREFIX)/share/likwid/filter)
+	@cp -f filters/*  $(abspath $(PREFIX)/share/likwid/filter)
+	@chmod 755 $(abspath $(PREFIX)/share/likwid/filter)/*
+
+
+move: move_daemon move_freq
+	@echo "===> MOVE applications from $(BINPREFIX) to $(INSTALLED_BINPREFIX)"
+	@mkdir -p $(INSTALLED_BINPREFIX)
+	@chmod 775 $(INSTALLED_BINPREFIX)
+	@for APP in $(L_APPS); do \
+		install -m 755 $(BINPREFIX)/$$APP  $(INSTALLED_BINPREFIX); \
+	done
+	@for APP in $(C_APPS); do \
+		install -m 755 $(BINPREFIX)/`basename $$APP`  $(INSTALLED_BINPREFIX); \
+	done
+	@install -m 755 $(BINPREFIX)/likwid-lua $(INSTALLED_BINPREFIX)/likwid-lua
+	@echo "===> MOVE helper applications from $(BINPREFIX) to $(INSTALLED_BINPREFIX)"
+	@install -m 755 $(BINPREFIX)/feedGnuplot $(INSTALLED_BINPREFIX)
+	@echo "===> MOVE lua to likwid interface from $(PREFIX)/share/lua to $(INSTALLED_PREFIX)/share/lua"
+	@mkdir -p $(INSTALLED_PREFIX)/share/lua
+	@chmod 775 $(INSTALLED_PREFIX)/share/lua
+	@install -m 755 $(PREFIX)/share/lua/likwid.lua $(INSTALLED_PREFIX)/share/lua
+	@echo "===> MOVE libraries from $(LIBPREFIX) to $(INSTALLED_LIBPREFIX)"
+	@mkdir -p $(INSTALLED_LIBPREFIX)
+	@chmod 775 $(INSTALLED_LIBPREFIX)
+	@install -m 755 $(LIBPREFIX)/$(TARGET_LIB).$(VERSION).$(RELEASE) $(INSTALLED_LIBPREFIX)/$(TARGET_LIB).$(VERSION).$(RELEASE)
+	@install -m 755 $(LIBPREFIX)/$(PINLIB).$(VERSION).$(RELEASE) $(INSTALLED_LIBPREFIX)/$(PINLIB).$(VERSION).$(RELEASE)
+	@install -m 755 $(LIBPREFIX)/$(shell basename $(TARGET_HWLOC_LIB)).$(VERSION).$(RELEASE) $(INSTALLED_LIBPREFIX)/$(shell basename $(TARGET_HWLOC_LIB)).$(VERSION).$(RELEASE)
+	@install -m 755 $(LIBPREFIX)/$(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE) $(INSTALLED_LIBPREFIX)/$(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE)
+	@cd $(INSTALLED_LIBPREFIX) && ln -fs $(TARGET_LIB).$(VERSION).$(RELEASE) $(TARGET_LIB)
+	@cd $(INSTALLED_LIBPREFIX) && ln -fs $(TARGET_LIB).$(VERSION).$(RELEASE) $(TARGET_LIB).$(VERSION)
+	@cd $(INSTALLED_LIBPREFIX) && ln -fs $(PINLIB).$(VERSION).$(RELEASE) $(PINLIB)
+	@cd $(INSTALLED_LIBPREFIX) && ln -fs $(PINLIB).$(VERSION).$(RELEASE) $(PINLIB).$(VERSION)
+	@cd $(INSTALLED_LIBPREFIX) && ln -fs $(shell basename $(TARGET_HWLOC_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_HWLOC_LIB))
+	@cd $(INSTALLED_LIBPREFIX) && ln -fs $(shell basename $(TARGET_HWLOC_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_HWLOC_LIB)).$(VERSION)
+	@cd $(INSTALLED_LIBPREFIX) && ln -fs $(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_LUA_LIB))
+	@cd $(INSTALLED_LIBPREFIX) && ln -fs $(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_LUA_LIB)).$(VERSION)
+	@echo "===> MOVE man pages from $(MANPREFIX)/man1 to $(INSTALLED_MANPREFIX)/man1"
+	@mkdir -p $(INSTALLED_MANPREFIX)/man1
+	@chmod 775 $(INSTALLED_MANPREFIX)/man1
+	@install -m 644 $(MANPREFIX)/man1/*.1 $(INSTALLED_MANPREFIX)/man1
+	@echo "===> MOVE headers from $(PREFIX)/include to $(INSTALLED_PREFIX)/include"
+	@mkdir -p $(INSTALLED_PREFIX)/include
+	@chmod 775 $(INSTALLED_PREFIX)/include
+	@install -m 644 $(PREFIX)/include/likwid.h $(INSTALLED_PREFIX)/include/likwid.h
+	@install -m 644 $(PREFIX)/include/bstrlib.h $(INSTALLED_PREFIX)/include/bstrlib.h
+	@if [ -e $(PREFIX)/include/likwid.mod ]; then install $(PREFIX)/include/likwid.mod $(INSTALLED_PREFIX)/include/likwid.mod; fi
+	@echo "===> MOVE groups from $(PREFIX)/share/likwid/perfgroups to $(INSTALLED_PREFIX)/share/likwid/perfgroups"
+	@mkdir -p $(INSTALLED_PREFIX)/share/likwid/perfgroups
+	@chmod 775 $(INSTALLED_PREFIX)/share/likwid
+	@chmod 775 $(INSTALLED_PREFIX)/share/likwid/perfgroups
+	@cp -rf $(PREFIX)/share/likwid/perfgroups/* $(INSTALLED_PREFIX)/share/likwid/perfgroups
+	@chmod 775 $(INSTALLED_PREFIX)/share/likwid/perfgroups/*
+	@find $(INSTALLED_PREFIX)/share/likwid/perfgroups -name "*.txt" -exec chmod 644 {} \;
+	@echo "===> MOVE monitoring groups from $(PREFIX)/share/likwid/mongroups to $(INSTALLED_PREFIX)/share/likwid/mongroups"
+	@mkdir -p $(INSTALLED_PREFIX)/share/likwid/mongroups
+	@chmod 775 $(INSTALLED_PREFIX)/share/likwid/mongroups
+	@cp -rf $(PREFIX)/share/likwid/mongroups/* $(INSTALLED_PREFIX)/share/likwid/mongroups
+	@chmod 775 $(INSTALLED_PREFIX)/share/likwid/mongroups/*
+	@find $(INSTALLED_PREFIX)/share/likwid/mongroups -name "*.txt" -exec chmod 644 {} \;
+	@mkdir -p $(INSTALLED_PREFIX)/share/likwid/docs
+	@chmod 775 $(INSTALLED_PREFIX)/share/likwid/docs
+	@install -m 644 $(PREFIX)/share/likwid/docs/bstrlib.txt $(INSTALLED_PREFIX)/share/likwid/docs
+	@mkdir -p $(INSTALLED_PREFIX)/share/likwid/examples
+	@chmod 775 $(INSTALLED_PREFIX)/share/likwid/examples
+	@install -m 644 examples/* $(INSTALLED_PREFIX)/share/likwid/examples
+	@echo "===> MOVE default likwid-agent.conf from $(PREFIX)/share/likwid/mongroups to $(INSTALLED_PREFIX)/share/likwid/mongroups"
+	@install $(PREFIX)/share/likwid/mongroups/likwid-agent.conf $(INSTALLED_PREFIX)/share/likwid/mongroups/likwid-agent.conf
+	@chmod 644 $(INSTALLED_PREFIX)/share/likwid/mongroups/likwid-agent.conf
+	@echo "===> MOVE filters from $(abspath $(PREFIX)/share/likwid/filter) to $(LIKWIDFILTERPATH)"
 	@mkdir -p $(LIKWIDFILTERPATH)
-	@cp -f filters/*  $(LIKWIDFILTERPATH)
+	@chmod 755 $(LIKWIDFILTERPATH)
+	@cp -f $(abspath $(PREFIX)/share/likwid/filter)/* $(LIKWIDFILTERPATH)
 	@chmod 755 $(LIKWIDFILTERPATH)/*
 
-uninstall:
+
+uninstall: uninstall_daemon uninstall_freq
 	@echo "===> REMOVING applications from $(PREFIX)/bin"
-	@rm -f $(addprefix $(PREFIX)/bin/,$(APPS))
-	@rm -f $(addprefix $(PREFIX)/bin/,$(PERL_APPS))
-	@rm -f $(PREFIX)/bin/feedGnuplot
-	@echo "===> REMOVING daemon applications from $(PREFIX)/sbin"
-	@rm -f $(addprefix $(PREFIX)/sbin/,$(DAEMON_APPS))
+	@rm -f $(addprefix $(BINPREFIX)/,$(addsuffix  .lua,$(L_APPS)))
+	@for APP in $(L_APPS); do \
+		rm -f $(BINPREFIX)/$$APP; \
+	done
+	@for APP in $(C_APPS); do \
+		rm -f $(BINPREFIX)/$$APP; \
+	done
+	@rm -f $(BINPREFIX)/feedGnuplot
+	@rm -f $(BINPREFIX)/likwid-lua
+	@rm -f $(BINPREFIX)/likwid-bench
+	@echo "===> REMOVING Lua to likwid interface from $(PREFIX)/share/lua"
+	@rm -rf  $(PREFIX)/share/lua/likwid.lua
+	@echo "===> REMOVING libs from $(LIBPREFIX)"
+	@rm -f $(LIBPREFIX)/liblikwid*
 	@echo "===> REMOVING man pages from $(MANPREFIX)/man1"
-	@rm -f $(MANPREFIX)/man1/likwid-*
+	@rm -f $(addprefix $(MANPREFIX)/man1/,$(addsuffix  .1,$(L_APPS)))
 	@rm -f $(MANPREFIX)/man1/feedGnuplot.1
-	@echo "===> REMOVING headers from $(PREFIX)/include"
-	@rm -f $(PREFIX)/include/likwid*.h
-	@rm -rf $(PREFIX)/include/likwid
-	@echo "===> REMOVING libs from $(PREFIX)/lib"
-	@rm -f $(PREFIX)/lib/$(LIKWID_LIB)*
-	@echo "===> REMOVING filter from $(PREFIX)/share"
-	@rm -rf  $(PREFIX)/share/likwid
-
-
-
+	@rm -f $(MANPREFIX)/man1/likwid-setFreq.1
+	@rm -f $(MANPREFIX)/man1/likwid-accessD.1
+	@rm -f $(MANPREFIX)/man1/likwid-lua.1
+	@rm -f $(MANPREFIX)/man1/likwid-bench.1
+	@echo "===> REMOVING header from $(PREFIX)/include"
+	@rm -f $(PREFIX)/include/likwid.h
+	@rm -f $(PREFIX)/include/bstrlib.h
+	$(FORTRAN_REMOVE)
+	@echo "===> REMOVING filter, groups and default configs from $(PREFIX)/share/likwid"
+	@rm -rf $(abspath $(PREFIX)/share/likwid/filter)
+	@rm -rf $(PREFIX)/share/likwid/mongroups
+	@rm -rf $(PREFIX)/share/likwid/perfgroups
+	@rm -rf $(PREFIX)/share/likwid/docs
+	@rm -rf $(PREFIX)/share/likwid/examples
+	@rm -rf $(PREFIX)/share/likwid
+
+uninstall_moved: uninstall_daemon_moved uninstall_freq_moved
+	@echo "===> REMOVING applications from $(INSTALLED_PREFIX)/bin"
+	@rm -f $(addprefix $(INSTALLED_BINPREFIX)/,$(addsuffix  .lua,$(L_APPS)))
+	@for APP in $(L_APPS); do \
+		rm -f $(INSTALLED_BINPREFIX)/$$APP; \
+	done
+	@for APP in $(C_APPS); do \
+		rm -f $(INSTALLED_BINPREFIX)/$$APP; \
+	done
+	@rm -f $(INSTALLED_BINPREFIX)/feedGnuplot
+	@rm -f $(INSTALLED_BINPREFIX)/likwid-lua
+	@rm -f $(INSTALLED_BINPREFIX)/likwid-bench
+	@echo "===> REMOVING Lua to likwid interface from $(INSTALLED_PREFIX)/share/lua"
+	@rm -rf  $(INSTALLED_PREFIX)/share/lua/likwid.lua
+	@echo "===> REMOVING libs from $(INSTALLED_LIBPREFIX)"
+	@rm -f $(INSTALLED_LIBPREFIX)/liblikwid*
+	@echo "===> REMOVING man pages from $(INSTALLED_MANPREFIX)/man1"
+	@rm -f $(addprefix $(INSTALLED_MANPREFIX)/man1/,$(addsuffix  .1,$(L_APPS)))
+	@rm -f $(INSTALLED_MANPREFIX)/man1/feedGnuplot.1
+	@rm -f $(INSTALLED_MANPREFIX)/man1/likwid-setFreq.1
+	@rm -f $(INSTALLED_MANPREFIX)/man1/likwid-accessD.1
+	@rm -f $(INSTALLED_MANPREFIX)/man1/likwid-lua.1
+	@rm -f $(INSTALLED_MANPREFIX)/man1/likwid-bench.1
+	@echo "===> REMOVING header from $(INSTALLED_PREFIX)/include"
+	@rm -f $(INSTALLED_PREFIX)/include/likwid.h
+	@rm -f $(INSTALLED_PREFIX)/include/bstrlib.h
+	$(FORTRAN_REMOVE)
+	@echo "===> REMOVING filter, groups and default configs from $(INSTALLED_PREFIX)/share/likwid"
+	@rm -rf $(LIKWIDFILTERPATH)
+	@rm -rf $(INSTALLED_PREFIX)/share/likwid/mongroups
+	@rm -rf $(INSTALLED_PREFIX)/share/likwid/perfgroups
+	@rm -rf $(INSTALLED_PREFIX)/share/likwid/docs
+	@rm -rf $(INSTALLED_PREFIX)/share/likwid/examples
+	@rm -rf $(INSTALLED_PREFIX)/share/likwid
+
+local: $(L_APPS) likwid.lua
+	@echo "===> Setting Lua scripts to run from current directory"
+	@PWD=$(shell pwd)
+	@for APP in $(L_APPS); do \
+		sed -i -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" -e "s/<RELEASE>/$(RELEASE)/g" -e "s+$(PREFIX)/bin/likwid-lua+$(PWD)/ext/lua/lua+" -e "s+$(PREFIX)/share/lua/?.lua+$(PWD)/?.lua+" $$APP; \
+		chmod +x $$APP; \
+	done
+	@sed -i -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" -e "s/<RELEASE>/$(RELEASE)/g" -e "s+$(PREFIX)/lib+$(PWD)+g" -e "s+$(PREFIX)/share/likwid/perfgroups+$(PWD)/groups+g" likwid.lua;
+	@sed -i -e "s+$(PREFIX)/share/likwid/mongroups+$(PWD)/monitoring/groups+g" likwid-agent
+	@ln -sf liblikwid.so liblikwid.so.$(VERSION)
+	@ln -sf ext/hwloc/liblikwid-hwloc.so liblikwid-hwloc.so.$(VERSION)
+	@ln -sf ext/lua/liblikwid-lua.so liblikwid-lua.so.$(VERSION)
+	@ln -sf liblikwid.so liblikwid.so.$(VERSION).$(RELEASE)
+	@ln -sf ext/hwloc/liblikwid-hwloc.so liblikwid-hwloc.so.$(VERSION).$(RELEASE)
+	@ln -sf ext/lua/liblikwid-lua.so liblikwid-lua.so.$(VERSION).$(RELEASE)
+	@echo "export LD_LIBRARY_PATH=$(PWD):$$LD_LIBRARY_PATH"
+
+testit: test/test-likwidAPI.c
+	make -C test test-likwidAPI
+	test/test-likwidAPI
+	make -C test/executable_tests
+
+help:
+	@echo "Help for building LIKWID:"
+	@echo
+	@echo "Common make targets:"
+	@echo "- make : build anything (integrate already compiled files)"
+	@echo "- make clean : clean library and executables, keep compiled files"
+	@echo "- make distclean : clean anything"
+	@echo "- make docs : Build documentation (requires Doxygen)"
+	@echo "- make install : Copy compiled files to $(PREFIX)"
+	@echo "- make move : Copy files from $(PREFIX) to $(INSTALLED_PREFIX)"
+	@echo "- make uninstall : Delete files from $(PREFIX)"
+	@echo "- make uninstall_moved : Delete files from $(INSTALLED_PREFIX)"
+	@echo
+	@echo "Compiler selection can be done in config.mk at COMPILER:"
+	@echo "- GCC : Use GCC for C code and Intel Fortran compiler for Fortran interface (default)"
+	@echo "- GCCX86 : Use GCC for C code. No Fortran compiler set (only for 32 bit builds)"
+	@echo "- CLANG: Use CLANG for C code and Intel Fortran compiler for Fortran interface (unsupported, may fail)"
+	@echo "- ICC: Use Intel C compiler for C code and Intel Fortran compiler for Fortran interface (unsupported, may fail)"
+	@echo "- MIC: Build for Intel Xeon Phi. Use Intel C compiler for C code and\n       Intel Fortran compiler for Fortran interface (unsupported)"
+	@echo
+	@echo "LIKWID runs only in INSTALLED_PREFIX = $(INSTALLED_PREFIX)"
+	@echo "You can change it in config.mk, but it is recommended to keep INSTALLED_PREFIX = PREFIX"
+	@echo "The PREFIX is used for temporary install directories (e.g. for packaging)."
+	@echo "LIKWID will not run in PREFIX, it has to be in INSTALLED_PREFIX."
+	@echo "The common configuration is INSTALLED_PREFIX = PREFIX, so changing PREFIX is enough."
+	@echo "If PREFIX and INSTALLED_PREFIX differ, you have to move anything after 'make install' to"
+	@echo "the INSTALLED_PREFIX. You can also use 'make move' which does the job for you."
+	
diff --git a/README b/README
deleted file mode 100644
index f47ac01..0000000
--- a/README
+++ /dev/null
@@ -1,29 +0,0 @@
-Likwid is a simple to install and use toolsuite of command line applications
-for performance oriented programmers. It works for Intel and AMD processors
-on the Linux operating system.
-
-It consists of:
-
-likwid-topology       - print thread and cache topology
-likwid-features       - view and toggle feature reagister on Intel processors
-likwid-perfctr        - configure and read out hardware performance counters on Intel and AMD processors
-likwid-powermeter     - read out RAPL Energy information and get info about Turbo Mode steps
-likwid-setFrequencies - read out RAPL Energy information and get info about Turbo Mode steps
-likwid-memsweeper     - cleans up filled NUMA memory domains and evicts dirty cacheline from cache hierarchy
-likwid-pin            - pin your threaded application (pthread, Intel and gcc OpenMP to dedicated processors
-likwid-bench          - Micro benchmarking platform
-likwid-gencfg         - Dumps topology information to a file
-likwid-mpirun         - Wrapper to start MPI and Hybrid MPI/OpenMP applications (Supports Intel MPI and OpenMPI)
-likwid-scope          - Frontend to the timeline mode of likwid-perfctr, plots live graphs of performance metrics
-
-For a detailed  documentation on the usage of the tools have a look at the
-likwid wiki pages at:
-
-http://code.google.com/p/likwid/wiki/Introduction
-
-If you have problems or suggestions please let us know on the likwid mailing list:
-
-http://groups.google.com/group/likwid-users
-
-
-
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..838883c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,58 @@
+--------------------------------------------------------------------------------
+Introduction
+--------------------------------------------------------------------------------
+Likwid is a simple to install and use toolsuite of command line applications
+for performance oriented programmers. It works for Intel and AMD processors
+on the Linux operating system.
+
+[![Build Status](https://travis-ci.org/RRZE-HPC/likwid.svg?branch=master)](https://travis-ci.org/RRZE-HPC/likwid)
+
+It consists of:
+
+- likwid-topology: print thread, cache and NUMA topology
+- likwid-perfctr: configure and read out hardware performance counters on Intel and AMD processors
+- likwid-powermeter: read out RAPL Energy information and get info about Turbo mode steps
+- likwid-pin: pin your threaded application (pthread, Intel and gcc OpenMP) to dedicated processors
+- likwid-bench: Micro benchmarking platform
+- likwid-genTopoCfg: Dumps topology information to a file
+- likwid-mpirun: Wrapper to start MPI and Hybrid MPI/OpenMP applications (Supports Intel MPI, OpenMPI and MPICH)
+- likwid-perfscope: Frontend to the timeline mode of likwid-perfctr, plots live graphs of performance metrics using gnuplot
+- likwid-agent: Monitoring agent for hardware performance counters
+- likwid-memsweeper: Sweep memory of NUMA domains and evict cachelines from the last level cache
+- likwid-setFrequencies: Tool to control the CPU frequency
+
+--------------------------------------------------------------------------------
+Download, Build and Install
+--------------------------------------------------------------------------------
+You can get the releases of LIKWID at:
+http://ftp.fau.de/pub/likwid/
+
+For build and installation hints see INSTALL file
+
+--------------------------------------------------------------------------------
+Documentation
+--------------------------------------------------------------------------------
+For detailed documentation on the usage of the tools, have a look at the
+HTML documentation built with Doxygen. Call
+
+make docs
+
+or after installation, look at the man pages.
+
+There is also a wiki at the github page:
+https://github.com/rrze-likwid/likwid/wiki
+
+If you have problems or suggestions please let me know on the likwid mailing list:
+http://groups.google.com/group/likwid-users
+
+or, if it is a bug, add an issue at:
+https://github.com/rrze-likwid/likwid/issues
+
+--------------------------------------------------------------------------------
+Extras
+--------------------------------------------------------------------------------
+- If you want to use the Marker API with Java, you can find the Java module here:
+https://github.com/jlewandowski/likwid-java-api
+- For Python you can find an interface to the LIKWID API here:
+https://github.com/TomTheBear/likwid-python-api
+
diff --git a/bench/Makefile b/bench/Makefile
new file mode 100644
index 0000000..da883ef
--- /dev/null
+++ b/bench/Makefile
@@ -0,0 +1,157 @@
+#
+# =======================================================================================
+#
+#      Filename:  Makefile
+#
+#      Description:  likwid-bench Makefile
+#
+#      Version:   <VERSION>
+#      Released:  <DATE>
+#
+#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#      Project:  likwid
+#
+#      Copyright (C) 2013 Jan Treibig
+#
+#      This program is free software: you can redistribute it and/or modify it under
+#      the terms of the GNU General Public License as published by the Free Software
+#      Foundation, either version 3 of the License, or (at your option) any later
+#      version.
+#
+#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+#      You should have received a copy of the GNU General Public License along with
+#      this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+SRC_DIR     = ./src
+MAKE_DIR    = ../make
+
+#DO NOT EDIT BELOW
+
+
+# Dependency chains:
+# *.[ch] -> *.o -> executables
+# *.ptt -> *.pas -> *.s -> *.o -> executables
+# *.txt -> *.h (generated)
+
+include ../config.mk
+include $(MAKE_DIR)/include_$(COMPILER).mk
+include $(MAKE_DIR)/config_checks.mk
+include $(MAKE_DIR)/config_defines.mk
+
+#INCLUDES  += -I./includes -I../src/includes -I../ext/hwloc/include -I../$(COMPILER) -I$(BUILD_DIR)
+INCLUDES  += -I./includes -I$(BUILD_DIR) -I../src/includes
+LIBS      +=
+CFLAGS := $(filter-out -fvisibility=hidden, $(CFLAGS))
+
+#CONFIGURE BUILD SYSTEM
+BUILD_DIR  = ./$(COMPILER)
+Q         ?= @
+
+ifeq ($(COMPILER),MIC)
+BENCH_DIR   = ./phi
+else
+ifeq ($(COMPILER),GCCX86)
+BENCH_DIR   = ./x86
+else
+BENCH_DIR   = ./x86-64
+endif
+endif
+
+SHARED_TARGET_LIB := -L.. -L../ext/hwloc/ -L../ext/lua -llikwid -llikwid-hwloc -llikwid-lua
+STATIC_TARGET_LIB := ../liblikwid.a ../ext/hwloc/liblikwid-hwloc.a ../ext/lua/liblikwid-lua.a
+TARGET_LIB = $(SHARED_TARGET_LIB)
+
+BENCH_LIBS :=
+ifeq ($(INSTRUMENT_BENCH),true)
+	DEFINES += -DLIKWID_PERFMON
+endif
+
+ifeq ($(DEBUG),true)
+DEBUG_FLAGS = -g
+DEFINES += -DDEBUG_LIKWID
+else
+DEBUG_FLAGS =
+endif
+
+
+VPATH     = $(SRC_DIR)
+OBJ       = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c))
+ifeq ($(SHARED_LIBRARY),false)
+OBJ := $(filter-out $(BUILD_DIR)/bstrlib.o,$(OBJ))
+TARGET_LIB = $(STATIC_TARGET_LIB)
+endif
+OBJ_BENCH  =  $(patsubst $(BENCH_DIR)/%.ptt, $(BUILD_DIR)/%.o,$(wildcard $(BENCH_DIR)/*.ptt))
+BENCH = $(shell basename $(BENCH_TARGET))
+
+CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES)
+
+all: $(BUILD_DIR) $(OBJ) $(OBJ_BENCH) $(BENCH_TARGET)
+
+
+$(BENCH_TARGET): $(BENCH)
+$(BENCH): likwid-bench.c $(BUILD_DIR) $(OBJ) $(OBJ_BENCH)
+	@echo "===>  LINKING  $(BENCH)"
+	$(Q)${CC} $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) ${LFLAGS}  likwid-bench.c $(BENCH_LIBS) $(OBJ_BENCH) $(OBJ) -o $(BENCH) $(TARGET_LIB) $(LIBS) $(RPATHS)
+
+
+$(BUILD_DIR):
+	@mkdir $(BUILD_DIR)
+
+
+#PATTERN RULES
+$(BUILD_DIR)/%.o:  %.c
+	@echo "===>  COMPILE C $@"
+	$(Q)$(CC) -g -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $(TARGET_LIB) $< -o $@
+	$(Q)$(CC) -g $(DEBUG_FLAGS) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
+
+
+$(BUILD_DIR)/%.pas:  $(BENCH_DIR)/%.ptt
+	@echo "===>  GENERATE BENCHMARKS"
+	$(Q)$(GEN_PAS) $(BENCH_DIR) $(BUILD_DIR) ./perl/templates
+
+
+$(BUILD_DIR)/%.o:  $(BUILD_DIR)/%.pas
+	@echo "===>  ASSEMBLE  $@"
+	$(Q)$(PAS) -i $(PASFLAGS) -o $(BUILD_DIR)/$*.s $<  '$(DEFINES)'
+	$(Q)$(AS) $(ASFLAGS)  $(BUILD_DIR)/$*.s -o $@
+
+ifeq ($(findstring $(MAKECMDGOALS),clean),)
+-include $(OBJ:.o=.d)
+endif
+
+.PHONY: clean distclean install uninstall
+
+
+.PRECIOUS: $(BUILD_DIR)/%.pas
+
+.NOTPARALLEL:
+
+
+clean:
+	@rm -rf likwid-bench
+
+distclean:
+	@rm -rf $(BUILD_DIR)
+	@rm -rf likwid-bench
+
+install:
+	@echo "===> INSTALL applications to $(BINPREFIX)"
+	cp -f likwid-bench $(BINPREFIX)
+	@echo "===> INSTALL man pages to $(MANPREFIX)/man1"
+	@mkdir -p $(MANPREFIX)/man1
+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-bench.1 > $(MANPREFIX)/man1/likwid-bench.1
+
+
+uninstall:
+	@echo "===> REMOVING applications from $(BINPREFIX)"
+	rm -rf $(BINPREFIX)/likwid-bench
+	@echo "===> REMOVING man pages from $(MANPREFIX)/man1"
+	@rm -f $(MANPREFIX)/man1/likwid-bench.1
+
+
+
diff --git a/bench/includes/allocator.h b/bench/includes/allocator.h
new file mode 100644
index 0000000..f7eae06
--- /dev/null
+++ b/bench/includes/allocator.h
@@ -0,0 +1,50 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  allocator.h
+ *
+ *      Description:  Header File allocator Module. 
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  none
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef ALLOCATOR_H
+#define ALLOCATOR_H
+
+#include <stdint.h>
+#include <bstrlib.h>
+#include <test_types.h>
+
+#define LLU_CAST (unsigned long long)
+
+extern void allocator_init(int numVectors);
+extern void allocator_finalize();
+extern void allocator_allocateVector(void** ptr,
+        int alignment,
+        uint64_t size,
+        int offset,
+        DataType type,
+        bstring domain);
+
+#endif /*ALLOCATOR_H*/
+
diff --git a/bench/includes/allocator_types.h b/bench/includes/allocator_types.h
new file mode 100644
index 0000000..43ad3c0
--- /dev/null
+++ b/bench/includes/allocator_types.h
@@ -0,0 +1,46 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  allocator_types.h
+ *
+ *      Description:  Header File types of allocator Module.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:  Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  none
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef ALLOCATOR_TYPES_H
+#define ALLOCATOR_TYPES_H
+
+#include <stdint.h>
+#include <test_types.h>
+
+typedef struct {
+    void* ptr;
+    size_t size;
+    off_t offset;
+    DataType type;
+} allocation;
+
+
+
+#endif
diff --git a/bench/includes/barrier.h b/bench/includes/barrier.h
new file mode 100644
index 0000000..6427c4a
--- /dev/null
+++ b/bench/includes/barrier.h
@@ -0,0 +1,58 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  barrier.h
+ *
+ *      Description:  Header File barrier Module
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef BARRIER_H
+#define BARRIER_H
+
+#include <barrier_types.h>
+
+/**
+ * @brief  Initialize the barrier module
+ * @param  numberOfGroups The number of barrier groups
+ */
+extern void barrier_init(int numberOfGroups);
+
+/**
+ * @brief  Register a thread for a barrier
+ * @param  threadId The id of the thread to register
+ */
+extern int barrier_registerGroup(int numThreads);
+extern void barrier_registerThread(BarrierData* barr, int groupsId, int threadId);
+
+/**
+ * @brief  Synchronize threads
+ * @param  barr Pointer to the BarrierData structure
+ *              to synchronize on
+ */
+extern void  barrier_synchronize(BarrierData* barr);
+extern void  barrier_destroy(BarrierData* barr);
+
+
+#endif /*BARRIER_H*/
diff --git a/bench/includes/barrier_types.h b/bench/includes/barrier_types.h
new file mode 100644
index 0000000..9fc6e30
--- /dev/null
+++ b/bench/includes/barrier_types.h
@@ -0,0 +1,49 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  barrier_types.h
+ *
+ *      Description:  Type Definitions for barrier Module
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef BARRIER_TYPES_H
+#define BARRIER_TYPES_H
+
+#include <stdint.h>
+
+typedef struct {
+    int        numberOfThreads;
+    int        offset;
+    int        val;
+    int*       index;
+    volatile int*  bval;
+} BarrierData;
+
+typedef struct {
+    int*       groupBval;
+    int        numberOfThreads;
+} BarrierGroup;
+
+#endif /*BARRIER_TYPES_H*/
diff --git a/bench/includes/bstrlib.h b/bench/includes/bstrlib.h
new file mode 120000
index 0000000..daa8a68
--- /dev/null
+++ b/bench/includes/bstrlib.h
@@ -0,0 +1 @@
+../../src/includes/bstrlib.h
\ No newline at end of file
diff --git a/bench/includes/likwid.h b/bench/includes/likwid.h
new file mode 120000
index 0000000..d2020f7
--- /dev/null
+++ b/bench/includes/likwid.h
@@ -0,0 +1 @@
+../../src/includes/likwid.h
\ No newline at end of file
diff --git a/bench/includes/strUtil.h b/bench/includes/strUtil.h
new file mode 100644
index 0000000..a16790c
--- /dev/null
+++ b/bench/includes/strUtil.h
@@ -0,0 +1,60 @@
+/*
+ * =======================================================================================
+ *      Filename:  strUtil.h
+ *
+ *      Description:  Some string functions
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#ifndef STRUTIL_H
+#define STRUTIL_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <bstrlib.h>
+#include <likwid.h>
+
+#include <test_types.h>
+
+typedef struct {
+    bstring domain;
+    int offset;
+    void* ptr;
+} Stream;
+
+typedef struct {
+    uint32_t numberOfThreads;
+    int* processorIds;
+    uint64_t size;
+    Stream* streams;
+} Workgroup;
+
+
+extern int bstr_to_workgroup(Workgroup* group, const_bstring str, DataType type, int numberOfStreams);
+extern void workgroups_destroy(Workgroup** groupList, int numberOfGroups, int numberOfStreams);
+
+#endif
diff --git a/bench/includes/test_types.h b/bench/includes/test_types.h
new file mode 100644
index 0000000..18627fc
--- /dev/null
+++ b/bench/includes/test_types.h
@@ -0,0 +1,113 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  test_types.h
+ *
+ *      Description:  Type definitions for benchmarking framework
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef TEST_TYPES_H
+#define TEST_TYPES_H
+
+#include <stdint.h>
+#include <bstrlib.h>
+
+typedef void (*FuncPrototype)();
+
+typedef enum {
+    SINGLE = 0,
+    DOUBLE,
+    INT} DataType;
+
+typedef enum {
+    STREAM_1 = 1,
+    STREAM_2,
+    STREAM_3,
+    STREAM_4,
+    STREAM_5,
+    STREAM_6,
+    STREAM_7,
+    STREAM_8,
+    STREAM_9,
+    STREAM_10,
+    STREAM_11,
+    STREAM_12,
+    STREAM_13,
+    STREAM_14,
+    STREAM_15,
+    STREAM_16,
+    STREAM_17,
+    STREAM_18,
+    STREAM_19,
+    STREAM_20,
+    STREAM_21,
+    STREAM_22,
+    STREAM_23,
+    STREAM_24,
+    STREAM_25,
+    STREAM_26,
+    STREAM_27,
+    STREAM_28,
+    STREAM_29,
+    STREAM_30,
+    STREAM_31,
+    STREAM_32,
+    STREAM_33,
+    STREAM_34,
+    STREAM_35,
+    STREAM_36,
+    STREAM_37,
+    STREAM_38,
+    MAX_STREAMS} Pattern;
+
+typedef struct {        /* Static description of one benchmark kernel */
+    char* name;         /* kernel name, compared with the -t / -l argument */
+    Pattern streams;    /* number of data streams (STREAM_n has the value n) */
+    DataType type ;     /* element type of the streams: SINGLE, DOUBLE or INT */
+    int stride;         /* loop stride; stream offsets must be a multiple of it */
+    FuncPrototype kernel; /* NOTE(review): presumably the kernel entry point - confirm */
+    int  flops;         /* flop factor: reported flops = iter * size * flops */
+    int  bytes;         /* byte factor: reported volume = iter * size * bytes */
+    char* desc;         /* NOTE(review): presumably a description text - confirm */
+    int loads;          /* load ops per update; negative hides it in -l output */
+    int stores;         /* store ops per update; negative hides it in -l output */
+    int branches;       /* branch count; negative hides it in -l output */
+    int instr_const;    /* instructions added once to the instruction total */
+    int instr_loop;     /* instructions counted per inner loop execution */
+    int uops;           /* uops per inner loop execution; reported when > 0 */
+} TestCase;
+
+typedef struct {               /* Benchmark parameters registered per thread */
+    uint64_t   size;           /* workgroup size; times TestCase.bytes when reported */
+    uint64_t   iter;           /* iteration count (default 100 or the -i value) */
+    uint32_t   min_runtime;    /* minimal runtime in seconds (-s option) */
+    const TestCase* test;      /* the selected test case */
+    uint64_t   cycles;         /* set to 0 by main before registration */
+    uint32_t numberOfThreads;  /* number of entries in processors */
+    int* processors;           /* processor ids, numberOfThreads entries */
+    void** streams;            /* one pointer per stream of the test */
+} ThreadUserData;
+
+#endif /*TEST_TYPES_H*/
diff --git a/bench/includes/threads.h b/bench/includes/threads.h
new file mode 100644
index 0000000..d92bbc9
--- /dev/null
+++ b/bench/includes/threads.h
@@ -0,0 +1,114 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  threads.h
+ *
+ *      Description:  Header file of pthread interface module
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef THREADS_H
+#define THREADS_H
+
+#include <pthread.h>
+#include <threads_types.h>
+
+#define THREADS_BARRIER pthread_barrier_wait(&threads_barrier)
+#define MIN_ITERATIONS 10
+
+extern pthread_barrier_t threads_barrier;
+extern ThreadData* threads_data;
+extern ThreadGroup* threads_groups;
+
+
+/**
+ * @brief  Test the maximal possible thread count
+ * @return  numberOfThreads  The number of available threads
+ */
+extern int threads_test(void);
+
+/**
+ * @brief  Initialization of the thread module
+ * @param  numberOfThreads  The total number of threads
+ */
+extern void threads_init(int numberOfThreads);
+
+/**
+ * @brief  Create all threads
+ * @param  startRoutine thread entry function pointer
+ */
+extern void threads_create(void *(*startRoutine)(void*));
+
+/**
+ * @brief  Register User thread data for all threads
+ * @param  data  Reference to the user data structure
+ * @param  func  Optional function pointer to copy data
+ */
+extern void threads_registerDataAll(
+        ThreadUserData* data,
+        threads_copyDataFunc func);
+
+/**
+ * @brief  Register User thread data for one thread
+ * @param  threadId thread Id
+ * @param  data  Reference to the user data structure
+ * @param  func  Optional function pointer to copy data
+ */
+extern void threads_registerDataThread(
+        int threadId,
+        ThreadUserData* data,
+        threads_copyDataFunc func);
+
+/**
+ * @brief  Register User thread data for a thread group
+ * @param  groupId  group Id
+ * @param  data  Reference to the user data structure
+ * @param  func  Optional function pointer to copy data
+ */
+extern void threads_registerDataGroup(
+        int groupId,
+        ThreadUserData* data,
+        threads_copyDataFunc func);
+
+extern size_t threads_updateIterations(int groupId, size_t demandIter);
+
+/**
+ * @brief  Join the threads and free pthread related data structures
+ * @param
+ */
+extern void threads_join(void);
+
+/**
+ * @brief  Free memory of thread data structures
+ * @param  numberOfGroups The number of groups to destroy
+ */
+extern void threads_destroy(int numberOfGroups, int numberOfStreams);
+
+/**
+ * @brief  Create Thread groups
+ * @param  numberOfGroups The number of groups to create
+ */
+extern void threads_createGroups(int numberOfGroups);
+
+#endif /* THREADS_H */
diff --git a/bench/includes/threads_types.h b/bench/includes/threads_types.h
new file mode 100644
index 0000000..68f0af3
--- /dev/null
+++ b/bench/includes/threads_types.h
@@ -0,0 +1,56 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  threads_types.h
+ *
+ *      Description:  Types file for threads module.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef THREADS_TYPES_H
+#define THREADS_TYPES_H
+
+#include <stdint.h>
+#include <test_types.h>
+
+typedef struct {
+    int        globalNumberOfThreads;
+    int        numberOfThreads;
+    int        globalThreadId;
+    int        threadId;
+    int        numberOfGroups;
+    int        groupId;
+    double     time;
+    uint64_t   cycles;
+    ThreadUserData data;
+} ThreadData;
+
+typedef struct {
+    int        numberOfThreads;
+    int*       threadIds;
+} ThreadGroup;
+
+typedef void (*threads_copyDataFunc)(ThreadUserData* src,ThreadUserData* dst);
+
+#endif /*THREADS_TYPES_H*/
diff --git a/bench/likwid-bench.c b/bench/likwid-bench.c
new file mode 100644
index 0000000..02d0ced
--- /dev/null
+++ b/bench/likwid-bench.c
@@ -0,0 +1,521 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  likwid-bench.c
+ *
+ *      Description:  A flexible and extensible benchmarking toolbox
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#include <stdlib.h>
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <inttypes.h>
+
+#include <bstrlib.h>
+#include <errno.h>
+#include <threads.h>
+#include <barrier.h>
+#include <testcases.h>
+#include <strUtil.h>
+#include <allocator.h>
+
+#include <likwid.h>
+
+extern void* runTest(void* arg);
+extern void* getIterSingle(void* arg);
+
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+
+#define HELP_MSG printf("Threaded Memory Hierarchy Benchmark --  Version  %d.%d \n\n",VERSION,RELEASE); \
+    printf("\n"); \
+    printf("Supported Options:\n"); \
+    printf("-h\t\t Help message\n"); \
+    printf("-a\t\t List available benchmarks \n"); \
+    printf("-d\t\t Delimiter used for physical core list (default ,) \n"); \
+    printf("-p\t\t List available thread domains\n"); \
+    printf("\t\t or the physical ids of the cores selected by the -c expression \n"); \
+    printf("-s <TIME>\t Seconds to run the test minimally (default 1)\n");\
+    printf("\t\t If resulting iteration count is below 10, it is normalized to 10.\n");\
+    printf("-i <ITERS>\t Specify the number of iterations per thread manually. \n"); \
+    printf("-l <TEST>\t list properties of benchmark \n"); \
+    printf("-t <TEST>\t type of test \n"); \
+    printf("-w\t\t <thread_domain>:<size>[:<num_threads>[:<chunk size>:<stride>]-<streamId>:<domain_id>[:<offset>]\n"); \
+    printf("\t\t <size> in kB, MB or GB  (mandatory)\n"); \
+    printf("\n"); \
+    printf("Usage: \n"); \
+    printf("# Run the store benchmark on all CPUs of the system with a vector size of 1 GB\n"); \
+    printf("likwid-bench -t store -w S0:1GB\n"); \
+    printf("# Run the copy benchmark on one CPU at CPU socket 0 with a vector size of 100kB\n"); \
+    printf("likwid-bench -t copy -w S0:100kB:1\n"); \
+    printf("# Run the copy benchmark on one CPU at CPU socket 0 with a vector size of 100MB but place one stream on CPU socket 1\n"); \
+    printf("likwid-bench -t copy -w S0:100MB:1-0:S0,1:S1\n"); \
+
+#define VERSION_MSG \
+    printf("likwid-bench   %d.%d \n\n",VERSION,RELEASE)
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE  ############ */
+
+    void
+copyThreadData(ThreadUserData* src,ThreadUserData* dst)
+{
+    /* Copy *src into *dst and give dst its own processors/streams arrays
+     * (the stream buffers they point to stay shared). Exits on allocation failure. */
+    uint32_t i;
+
+    *dst = *src;
+    dst->processors = (int*) malloc(src->numberOfThreads*sizeof(int));
+    dst->streams = (void**) malloc(src->test->streams*sizeof(void*));
+    if ((dst->streams == NULL) || ((dst->processors == NULL) && (src->numberOfThreads > 0)))
+    {
+        fprintf(stderr, "Error: Cannot allocate memory for thread data\n");
+        exit(EXIT_FAILURE);
+    }
+
+    for (i=0; i<  src->test->streams; i++) { dst->streams[i] = src->streams[i]; }
+    for (i=0; i<src->numberOfThreads; i++) { dst->processors[i] = src->processors[i]; }
+}
+
+
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+
+int main(int argc, char** argv)
+{
+    uint64_t iter = 100;
+    uint32_t i;
+    uint32_t j;
+    int globalNumberOfThreads = 0;
+    int optPrintDomains = 0;
+    int c;
+    ThreadUserData myData;
+    bstring testcase = bfromcstr("none");
+    uint64_t numberOfWorkgroups = 0;
+    int tmp = 0;
+    double time;
+    double cycPerUp = 0.0;
+    const TestCase* test = NULL;
+    uint64_t realSize = 0;
+    uint64_t realIter = 0;
+    uint64_t maxCycles = 0;
+    uint64_t minCycles = UINT64_MAX;
+    uint64_t cyclesClock = 0;
+    uint64_t demandIter = 0;
+    TimerData itertime;
+    Workgroup* currentWorkgroup = NULL;
+    Workgroup* groups = NULL;
+    uint32_t min_runtime = 1; /* 1s */
+    bstring HLINE = bfromcstr("");
+    binsertch(HLINE, 0, 80, '-');
+    binsertch(HLINE, 80, 1, '\n');
+    int (*ownprintf)(const char *format, ...);
+    ownprintf = &printf;
+    /* Flow: scan options, build the workgroups, run the kernel threads, print statistics. */
+    /* Handling of command line options */
+    if (argc ==  1)
+    {
+        HELP_MSG;
+        exit(EXIT_SUCCESS);
+    }
+
+    while ((c = getopt (argc, argv, "w:t:s:l:aphvi:")) != -1) {
+        switch (c)
+        {
+            case 'h':
+                HELP_MSG;
+                exit (EXIT_SUCCESS);
+            case 'v':
+                VERSION_MSG;
+                exit (EXIT_SUCCESS);
+            case 'a':
+                ownprintf(TESTS"\n");
+                exit (EXIT_SUCCESS);
+            case 'w':
+                numberOfWorkgroups++;
+                break;
+            case 's':
+                min_runtime = atoi(optarg);
+                break;
+            case 'i':
+                demandIter = strtoul(optarg, NULL, 10);
+                if (demandIter <= 0)
+                {
+                    fprintf (stderr, "Error: Iterations must be greater than 0\n");
+                    return EXIT_FAILURE;
+                }
+                break;
+            case 'l':
+                bdestroy(testcase);
+                testcase = bfromcstr(optarg);
+                test = NULL; /* forget an earlier -t so that an unknown name is reported */
+                for (i=0; i<NUMKERNELS; i++)
+                {
+                    if (biseqcstr(testcase, kernels[i].name))
+                    {
+                        test = kernels+i;
+                        break;
+                    }
+                }
+
+                if (test == NULL)
+                {
+                    fprintf (stderr, "Error: Unknown test case %s\n",optarg);
+                    return EXIT_FAILURE;
+                }
+                else
+                {
+                    ownprintf("Name: %s\n",test->name);
+                    ownprintf("Number of streams: %d\n",test->streams);
+                    ownprintf("Loop stride: %d\n",test->stride);
+                    ownprintf("Flops: %d\n",test->flops);
+                    ownprintf("Bytes: %d\n",test->bytes);
+                    switch (test->type)
+                    {
+                        case INT:
+                            ownprintf("Data Type: Integer\n");
+                            break;
+                        case SINGLE:
+                            ownprintf("Data Type: Single precision float\n");
+                            break;
+                        case DOUBLE:
+                            ownprintf("Data Type: Double precision float\n");
+                            break;
+                    }
+                    if (test->loads >= 0)
+                    {
+                        ownprintf("Load Ops: %d\n",test->loads);
+                    }
+                    if (test->stores >= 0)
+                    {
+                        ownprintf("Store Ops: %d\n",test->stores);
+                    }
+                    if (test->branches >= 0)
+                    {
+                        ownprintf("Branches: %d\n",test->branches);
+                    }
+                    if (test->instr_const >= 0)
+                    {
+                        ownprintf("Constant instructions: %d\n",test->instr_const);
+                    }
+                    if (test->instr_loop >= 0)
+                    {
+                        ownprintf("Loop instructions: %d\n",test->instr_loop);
+                    }
+                }
+                bdestroy(testcase);
+                exit (EXIT_SUCCESS);
+                break;
+            case 'p':
+                optPrintDomains = 1;
+                break;
+            case 'g':
+                numberOfWorkgroups = LLU_CAST atol(optarg);
+
+                tmp = numberOfWorkgroups;
+
+                break;
+            case 't':
+                bdestroy(testcase);
+                testcase = bfromcstr(optarg);
+                test = NULL; /* a repeated -t must not silently keep the previous kernel */
+                for (i=0; i<NUMKERNELS; i++)
+                {
+                    if (biseqcstr(testcase, kernels[i].name))
+                    {
+                        test = kernels+i;
+                        break;
+                    }
+                }
+
+                if (test == NULL)
+                {
+                    fprintf (stderr, "Error: Unknown test case %s\n",optarg);
+                    return EXIT_FAILURE;
+                }
+                /* testcase stays valid here: a later -t/-l/-p or the end of main destroys it once */
+                break;
+            case '?':
+                if (isprint (optopt))
+                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
+                else
+                    fprintf (stderr,
+                            "Unknown option character `\\x%x'.\n",
+                            optopt);
+                return EXIT_FAILURE;
+            default:
+                HELP_MSG;
+        }
+    }
+    if ((numberOfWorkgroups == 0) && (!optPrintDomains))
+    {
+        fprintf(stderr, "Error: At least one workgroup (-w) must be set on commandline\n");
+        exit (EXIT_FAILURE);
+    }
+
+    if (topology_init() != EXIT_SUCCESS)
+    {
+        fprintf(stderr, "Error: Unsupported processor!\n");
+        exit(EXIT_FAILURE);
+    }
+
+    if ((test == NULL) && (!optPrintDomains))
+    {
+        fprintf(stderr, "Unknown test case. Please check likwid-bench -a for available tests\n");
+        fprintf(stderr, "and select one using the -t commandline option\n");
+        exit(EXIT_FAILURE);
+    }
+
+    numa_init();
+    affinity_init();
+    timer_init();
+
+    if (optPrintDomains)
+    {
+        bdestroy(testcase);
+        AffinityDomains_t affinity = get_affinityDomains();
+        ownprintf("Number of Domains %d\n",affinity->numberOfAffinityDomains);
+        for (i=0; i < affinity->numberOfAffinityDomains; i++ )
+        {
+            ownprintf("Domain %d:\n",i);
+            ownprintf("\tTag %s:",bdata(affinity->domains[i].tag));
+
+            for ( uint32_t j=0; j < affinity->domains[i].numberOfProcessors; j++ )
+            {
+                ownprintf(" %d",affinity->domains[i].processorList[j]);
+            }
+            ownprintf("\n");
+        }
+        exit (EXIT_SUCCESS);
+    }
+
+    allocator_init(numberOfWorkgroups * MAX_STREAMS);
+    groups = (Workgroup*) malloc(numberOfWorkgroups*sizeof(Workgroup));
+    tmp = 0;
+    /* Second pass over argv: restart getopt and create one Workgroup per -w option */
+    optind = 0;
+    while ((c = getopt (argc, argv, "w:t:s:l:i:aphv")) != -1)
+    {
+        switch (c)
+        {
+            case 'w':
+                currentWorkgroup = groups+tmp;
+                bstring groupstr = bfromcstr(optarg);
+                i = bstr_to_workgroup(currentWorkgroup, groupstr, test->type, test->streams);
+                bdestroy(groupstr);
+                if (i == 0)
+                {
+                    for (i=0; i<  test->streams; i++)
+                    {
+                        if (currentWorkgroup->streams[i].offset%test->stride)
+                        {
+                            fprintf (stderr, "Error: Stream %d: offset is not a multiple of stride!\n",i);
+                            return EXIT_FAILURE;
+                        }
+                        allocator_allocateVector(&(currentWorkgroup->streams[i].ptr),
+                                PAGE_ALIGNMENT,
+                                currentWorkgroup->size,
+                                currentWorkgroup->streams[i].offset,
+                                test->type,
+                                currentWorkgroup->streams[i].domain);
+                    }
+                    tmp++;
+                }
+                else
+                {
+                    exit(EXIT_FAILURE);
+                }
+                break;
+            default:
+                continue;
+                break;
+        }
+    }
+
+    /* :WARNING:05/04/2010 08:58:05 AM:jt: At the moment the thread
+     * module only allows equally sized thread groups*/
+    for (i=0; i<numberOfWorkgroups; i++)
+    {
+        globalNumberOfThreads += groups[i].numberOfThreads;
+    }
+
+    ownprintf("%s", bdata(HLINE));
+    ownprintf("LIKWID MICRO BENCHMARK\n");
+    ownprintf("Test: %s\n",test->name);
+    ownprintf("%s", bdata(HLINE));
+    ownprintf("Using %" PRIu64 " work groups\n",numberOfWorkgroups);
+    ownprintf("Using %d threads\n",globalNumberOfThreads);
+    ownprintf("%s", bdata(HLINE));
+
+
+    threads_init(globalNumberOfThreads);
+    threads_createGroups(numberOfWorkgroups);
+
+    /* we configure global barriers only */
+    barrier_init(1);
+    barrier_registerGroup(globalNumberOfThreads);
+    cyclesClock = timer_getCycleClock();
+
+#ifdef LIKWID_PERFMON
+    if (getenv("LIKWID_FILEPATH") != NULL)
+    {
+        ownprintf("Using Likwid Marker API\n");
+    }
+    LIKWID_MARKER_INIT;
+    ownprintf("%s", bdata(HLINE));
+#endif
+
+
+    /* initialize data structures for threads */
+    for (i=0; i<numberOfWorkgroups; i++)
+    {
+        myData.iter = iter;
+        if (demandIter > 0)
+        {
+            myData.iter = demandIter;
+        }
+        myData.min_runtime = min_runtime;
+        myData.size = groups[i].size;
+        myData.test = test;
+        myData.cycles = 0;
+        myData.numberOfThreads = groups[i].numberOfThreads;
+        myData.processors = (int*) malloc(myData.numberOfThreads * sizeof(int));
+        myData.streams = (void**) malloc(test->streams * sizeof(void*));
+
+        for (j=0; j<groups[i].numberOfThreads; j++)
+        {
+            myData.processors[j] = groups[i].processorIds[j];
+        }
+
+        for (j=0; j<  test->streams; j++)
+        {
+            myData.streams[j] = groups[i].streams[j].ptr;
+        }
+
+        threads_registerDataGroup(i, &myData, copyThreadData);
+
+        free(myData.processors);
+        free(myData.streams);
+    }
+
+    if (demandIter == 0)
+    {
+        getIterSingle((void*) &threads_data[0]);
+        for (i=0; i<numberOfWorkgroups; i++)
+        {
+            iter = threads_updateIterations(i, demandIter);
+        }
+    }
+#ifdef DEBUG_LIKWID
+    else
+    {
+        ownprintf("Using manually selected iterations per thread\n");
+    }
+#endif
+
+    threads_create(runTest);
+    threads_join();
+
+    for (int i=0; i<globalNumberOfThreads; i++)
+    {
+        realSize += threads_data[i].data.size;
+        realIter += threads_data[i].data.iter;
+        if (threads_data[i].cycles > maxCycles)
+        {
+            maxCycles = threads_data[i].cycles;
+        }
+        if (threads_data[i].cycles < minCycles)
+        {
+            minCycles = threads_data[i].cycles;
+        }
+    }
+
+
+    /* The reported runtime is derived from the largest per-thread cycle count */
+    time = (double) maxCycles / (double) cyclesClock;
+    ownprintf("%s", bdata(HLINE));
+    ownprintf("Cycles:\t\t\t%" PRIu64 "\n", maxCycles);
+    ownprintf("CPU Clock:\t\t%" PRIu64 "\n", timer_getCpuClock());
+    ownprintf("Cycle Clock:\t\t%" PRIu64 "\n", cyclesClock);
+    ownprintf("Time:\t\t\t%e sec\n", time);
+    ownprintf("Iterations:\t\t%" PRIu64 "\n", realIter);
+    ownprintf("Iterations per thread:\t%" PRIu64 "\n",threads_data[0].data.iter);
+    ownprintf("Inner loop executions:\t%.0f\n", ((double)realSize)/((double)test->stride));
+    ownprintf("Size:\t\t\t%" PRIu64 "\n",  realSize*test->bytes );
+    ownprintf("Size per thread:\t%" PRIu64 "\n", threads_data[0].data.size*test->bytes);
+    ownprintf("Number of Flops:\t%" PRIu64 "\n", (threads_data[0].data.iter * realSize *  test->flops));
+    ownprintf("MFlops/s:\t\t%.2f\n",
+            1.0E-06 * ((double) threads_data[0].data.iter * realSize *  test->flops/  time));
+    
+    ownprintf("Data volume (Byte):\t%llu\n", LLU_CAST (threads_data[0].data.iter * realSize *  test->bytes));
+    ownprintf("MByte/s:\t\t%.2f\n",
+            1.0E-06 * ( (double) threads_data[0].data.iter * realSize *  test->bytes/ time));
+
+    cycPerUp = ((double) maxCycles / (double) (threads_data[0].data.iter * realSize));
+    ownprintf("Cycles per update:\t%f\n", cycPerUp);
+
+    switch ( test->type )
+    {
+        case INT:
+        case SINGLE:
+            ownprintf("Cycles per cacheline:\t%f\n", (16.0 * cycPerUp));
+            break;
+        case DOUBLE:
+            ownprintf("Cycles per cacheline:\t%f\n", (8.0 * cycPerUp));
+            break;
+    }
+    ownprintf("Loads per update:\t%d\n", test->loads );
+    ownprintf("Stores per update:\t%d\n", test->stores );
+    if ((test->loads > 0) && (test->stores > 0))
+    {
+        ownprintf("Load/store ratio:\t%.2f\n", ((double)test->loads)/((double)test->stores) );
+    }
+    if ((test->instr_loop > 0) && (test->instr_const > 0))
+    {
+        ownprintf("Instructions:\t\t%llu\n", LLU_CAST ((double)realSize/test->stride)*test->instr_loop*threads_data[0].data.iter + test->instr_const );
+    }
+    if (test->uops > 0)
+    {
+        ownprintf("UOPs:\t\t\t%llu\n", LLU_CAST ((double)realSize/test->stride)*test->uops*threads_data[0].data.iter);
+    }
+
+    ownprintf("%s", bdata(HLINE));
+    threads_destroy(numberOfWorkgroups, test->streams);
+    allocator_finalize();
+    workgroups_destroy(&groups, numberOfWorkgroups, test->streams);
+
+#ifdef LIKWID_PERFMON
+    if (getenv("LIKWID_FILEPATH") != NULL)
+    {
+        ownprintf("Writing Likwid Marker API results to file %s\n", getenv("LIKWID_FILEPATH"));
+    }
+    LIKWID_MARKER_CLOSE;
+#endif
+    bdestroy(testcase);
+    bdestroy(HLINE);
+    return EXIT_SUCCESS;
+}
+
diff --git a/bench/perl/AsmGen.pl b/bench/perl/AsmGen.pl
new file mode 100755
index 0000000..7fee506
--- /dev/null
+++ b/bench/perl/AsmGen.pl
@@ -0,0 +1,284 @@
+#!/usr/bin/perl -w
+use strict;
+no strict "refs";
+use warnings;
+use lib './perl';
+use Parse::RecDescent;
+use Data::Dumper;
+use Getopt::Std;
+use Cwd 'abs_path';
+
+use gas;
+
+my $ROOT = abs_path('./');
+my $DEBUG=0;
+my $VERBOSE=0;
+our $ISA = 'x86';
+our $AS  = 'gas';
+my $OPT_STRING = 'hpvda:i:o:';
+my %OPT;
+my $INPUTFILE;
+my $OUTPUTFILE;
+my $CPP_ARGS='';
+
+# Enable warnings within the Parse::RecDescent module.
+$::RD_ERRORS = 1; # Make sure the parser dies when it encounters an error
+#$::RD_WARN   = 1; # Enable warnings. This will warn on unused rules &c.
+#$::RD_HINT   = 1; # Give out hints to help fix problems.
+#$::RD_TRACE  = 1;     # if defined, also trace parsers' behaviour
+$::RD_AUTOACTION = q { [@item[0..$#item]] };
+
+sub init  # Parse the command line into the file-level settings, then call as::isa_init().
+{
+	getopts( "$OPT_STRING", \%OPT ) or usage();
+	if ($OPT{h}) { usage(); };
+	if ($OPT{v}) { $VERBOSE = 1;}
+	if ($OPT{d}) { $DEBUG = 1;}
+	# First positional argument: input file. Optional second one: extra arguments for cpp.
+	if (! $ARGV[0]) {
+		die "ERROR: Please specify a input file!\n\nCall script with argument -h for help.\n";
+	}
+
+	$INPUTFILE = $ARGV[0];
+	$CPP_ARGS = $ARGV[1] if ($ARGV[1]);
+	# Keep only the stem. The dot is escaped and both patterns are anchored, so only a real trailing ".pas" is accepted and removed.
+	if ($INPUTFILE =~ /\.pas$/) {
+		$INPUTFILE =~ s/\.pas$//;
+	} else {
+		die "ERROR: Input file must have pas ending!\n";
+	}
+	if ($OPT{o}) {
+		$OUTPUTFILE = $OPT{o};
+	}else {
+		$OUTPUTFILE = "$INPUTFILE.s";  # default: <stem>.s next to the input
+	}
+	if ($OPT{i}) {
+		$ISA = $OPT{i};
+		print "INFO: Using isa $ISA.\n\n" if ($VERBOSE);
+	} else {
+		print "INFO: No isa specified.\n Using default $ISA.\n\n" if ($VERBOSE);
+	}
+	if ($OPT{a}) {
+		$AS = $OPT{a};
+		print "INFO: Using as $AS.\n\n" if ($VERBOSE);
+	} else {
+		print "INFO: No as specified.\n Using default $AS.\n\n" if ($VERBOSE);
+	}
+	# $ISA must be final before this call: isa_init() reads $main::ISA.
+  as::isa_init();
+}
+
+sub usage  # Print the help text below to STDOUT and end the script with exit status 0.
+{
+    print <<END;
+usage: $0 [-$OPT_STRING]  <INFILE>
+
+Required:
+<INFILE>  : Input pas file
+
+Optional:
+-h        : this (help) message
+-v        : verbose output
+-d        : debug mode: prints out the parse tree
+-p        : Print out intermediate preprocessed output
+-o <FILE> : Output file
+-a <ASM>  : Specify different assembler (Default: gas)
+-i <ISA>  : Specify different isa (Default: x86)
+
+Example: 
+$0 -i x86-64  -a masm -o out.s  myfile.pas
+
+END
+
+exit(0);  # NOTE(review): also reached when getopts() fails in init(), so bad options still exit with 0
+}
+
+#=======================================
+# GRAMMAR
+#=======================================
+$main::grammar = <<'_EOGRAMMAR_';
+# Terminals
+FUNC        : /func/i
+LOOP        : /loop/i
+ALLOCATE    : /allocate/i
+FACTOR      : /factor/i
+DEFINE      : /define/i
+USE         : /use/i
+STOP        : /stop/i
+START       : /start/i
+LOCAL       : /local/i
+TIMER       : /timer/i
+INCREMENT   : /increment/i
+ALIGN       : /align/i
+INT         : /int/i
+SINGLE      : /single/i
+DOUBLE      : /double/i
+INUMBER     : NUMBER
+UNUMBER     : NUMBER
+SNUMBER     : NUMBER
+FNUMBER     : NUMBER
+OFFSET      : /([0-9]+\,){15}[0-9]+/
+NUMBER      : /[-+]?[0-9]*\.?[0-9]+/
+SYMBOL      : /[.A-Z-a-z_][A-Za-z0-9_]*/
+REG         : /GPR[0-9]+/i
+SREG         : /GPR[0-9]+/i
+COMMENT     : /#.*/
+{'skip'}
+
+type: SINGLE
+     |DOUBLE
+     |INT
+
+align: ALIGN <commit> NUMBER
+{
+{FUNC => 'as::align',
+ ARGS => ["$item{NUMBER}[1]"]}
+}
+
+ASMCODE     : /[A-Za-z1-9.:]+.*/
+{
+{FUNC => 'as::emit_code',
+ ARGS => [$item[1]]}
+}
+
+function:  FUNC SYMBOL block
+{[
+ {FUNC => 'as::function_entry',
+  ARGS => [$item{SYMBOL}[1],0]},
+ $item{block},
+ {FUNC => 'as::function_exit',
+  ARGS => [$item{SYMBOL}[1]]}
+]}
+
+function_allocate:  FUNC SYMBOL ALLOCATE NUMBER block
+{[
+ {FUNC => 'as::function_entry',
+  ARGS => [$item{SYMBOL}[1],$item{NUMBER}[1]]},
+ $item{block},
+ {FUNC => 'as::function_exit',
+  ARGS => [$item{SYMBOL}[1]]}
+]}
+
+loop:  LOOP SYMBOL INUMBER SNUMBER block
+{[
+{FUNC => 'as::loop_entry',
+ ARGS => [$item{SYMBOL}[1],$item{SNUMBER}[1][1]]},
+ $item{block},
+{FUNC => 'as::loop_exit',
+ ARGS => [$item{SYMBOL}[1],$item{INUMBER}[1][1]]}
+]}
+| LOOP SYMBOL INUMBER SREG block
+{[
+{FUNC => 'as::loop_entry',
+ ARGS => [$item{SYMBOL}[1],$item{SREG}[1]]},
+ $item{block},
+{FUNC => 'as::loop_exit',
+ ARGS => [$item{SYMBOL}[1],$item{INUMBER}[1][1]]}
+]}
+
+timer: START TIMER
+{
+{FUNC => 'isa::start_timer',
+ ARGS => []}
+}
+| STOP TIMER
+{
+{FUNC => 'isa::stop_timer',
+ ARGS => []}
+}
+
+mode:  START LOCAL
+{
+{FUNC => 'as::mode',
+ ARGS => [$item[1][1]]}
+}
+| STOP LOCAL
+{
+{FUNC => 'as::mode',
+ ARGS => [$item[1][1]]}
+}
+
+block: '{' expression(s) '}'
+{ $item[2] }
+
+define_data: DEFINE type  SYMBOL  OFFSET
+{
+{FUNC => 'as::define_offset',
+ ARGS => [$item{SYMBOL}[1], $item{type}[1][1],"$item{OFFSET}[1]"]}
+}
+
+define_data: DEFINE type  SYMBOL  NUMBER
+{
+{FUNC => 'as::define_data',
+ ARGS => [$item{SYMBOL}[1], $item{type}[1][1],"$item{NUMBER}[1]"]}
+}
+
+
+expression:  align
+            |COMMENT
+            |loop
+            |timer
+            |mode
+			|ASMCODE
+{ $item[1] }
+
+instruction : define_data
+            | align
+            | COMMENT
+            | mode
+            | function
+            | function_allocate
+{ $item[1] }
+
+startrule: instruction(s)
+{ $item[1] }
+
+_EOGRAMMAR_
+
+
+#=======================================
+# MAIN
+#=======================================
+init();
+print "INFO: Calling cpp with arguments $CPP_ARGS.\n" if ($VERBOSE);
+my $text = `cpp -x assembler-with-cpp $CPP_ARGS $INPUTFILE.pas`;  # preprocessed source; both variables reach the shell unquoted
+
+if ($OPT{p}) {  # -p: keep the preprocessed text as <stem>.Pas
+	open(FILE, '>', "$INPUTFILE.Pas") or die "ERROR: Cannot write $INPUTFILE.Pas: $!\n";
+	print FILE $text;
+	close FILE;
+}
+
+open(STDOUT, '>', $OUTPUTFILE) or die "ERROR: Cannot write $OUTPUTFILE: $!\n";  # from here on every plain print goes to the output file
+print "$as::AS->{HEADER}\n";
+
+my $parser = new Parse::RecDescent ($main::grammar)  or die "ERROR: Bad grammar!\n";
+my $parse_tree = $parser->startrule($text) or die "ERROR: Syntax Error\n";  # fail the run instead of leaving an empty assembly file behind
+tree_exec($parse_tree);
+
+if ($DEBUG) {  # -d: dump the parse tree for inspection
+	open(FILE, '>', 'parse_tree.txt') or die "ERROR: Cannot write parse_tree.txt: $!\n";
+	print FILE Dumper $parse_tree,"\n";
+	close FILE;
+}
+
+print "$as::AS->{FOOTER}\n";
+
+sub tree_exec  # Walk the parse tree depth-first and invoke every action node found in it.
+{
+	my ($tree) = @_;
+
+	for my $entry (@{$tree}) {
+		# entries whose text starts with one of these words are never executed
+		next if ($entry =~ /^skip|^instruction|^expression|^loop/);
+		my $kind = ref($entry);
+		if ($kind eq 'ARRAY') {
+			tree_exec($entry);
+		} elsif ($kind eq 'HASH') {
+			# action node: call the sub named in FUNC with the list stored in ARGS
+			&{$entry->{FUNC}}(@{$entry->{ARGS}});
+		}
+	}
+}
+
+
diff --git a/perl/Parse/RecDescent.pm b/bench/perl/Parse/RecDescent.pm
similarity index 100%
rename from perl/Parse/RecDescent.pm
rename to bench/perl/Parse/RecDescent.pm
diff --git a/perl/Template.pm b/bench/perl/Template.pm
similarity index 100%
rename from perl/Template.pm
rename to bench/perl/Template.pm
diff --git a/perl/Template/Base.pm b/bench/perl/Template/Base.pm
similarity index 100%
rename from perl/Template/Base.pm
rename to bench/perl/Template/Base.pm
diff --git a/perl/Template/Config.pm b/bench/perl/Template/Config.pm
similarity index 100%
rename from perl/Template/Config.pm
rename to bench/perl/Template/Config.pm
diff --git a/perl/Template/Constants.pm b/bench/perl/Template/Constants.pm
similarity index 100%
rename from perl/Template/Constants.pm
rename to bench/perl/Template/Constants.pm
diff --git a/perl/Template/Context.pm b/bench/perl/Template/Context.pm
similarity index 100%
rename from perl/Template/Context.pm
rename to bench/perl/Template/Context.pm
diff --git a/perl/Template/Directive.pm b/bench/perl/Template/Directive.pm
similarity index 100%
rename from perl/Template/Directive.pm
rename to bench/perl/Template/Directive.pm
diff --git a/perl/Template/Document.pm b/bench/perl/Template/Document.pm
similarity index 100%
rename from perl/Template/Document.pm
rename to bench/perl/Template/Document.pm
diff --git a/perl/Template/Exception.pm b/bench/perl/Template/Exception.pm
similarity index 100%
rename from perl/Template/Exception.pm
rename to bench/perl/Template/Exception.pm
diff --git a/perl/Template/Filters.pm b/bench/perl/Template/Filters.pm
similarity index 100%
rename from perl/Template/Filters.pm
rename to bench/perl/Template/Filters.pm
diff --git a/perl/Template/Grammar.pm b/bench/perl/Template/Grammar.pm
similarity index 100%
rename from perl/Template/Grammar.pm
rename to bench/perl/Template/Grammar.pm
diff --git a/perl/Template/Iterator.pm b/bench/perl/Template/Iterator.pm
similarity index 100%
rename from perl/Template/Iterator.pm
rename to bench/perl/Template/Iterator.pm
diff --git a/perl/Template/Namespace/Constants.pm b/bench/perl/Template/Namespace/Constants.pm
similarity index 100%
rename from perl/Template/Namespace/Constants.pm
rename to bench/perl/Template/Namespace/Constants.pm
diff --git a/perl/Template/Parser.pm b/bench/perl/Template/Parser.pm
similarity index 100%
rename from perl/Template/Parser.pm
rename to bench/perl/Template/Parser.pm
diff --git a/perl/Template/Plugin.pm b/bench/perl/Template/Plugin.pm
similarity index 100%
rename from perl/Template/Plugin.pm
rename to bench/perl/Template/Plugin.pm
diff --git a/perl/Template/Plugin/Assert.pm b/bench/perl/Template/Plugin/Assert.pm
similarity index 100%
rename from perl/Template/Plugin/Assert.pm
rename to bench/perl/Template/Plugin/Assert.pm
diff --git a/perl/Template/Plugin/CGI.pm b/bench/perl/Template/Plugin/CGI.pm
similarity index 100%
rename from perl/Template/Plugin/CGI.pm
rename to bench/perl/Template/Plugin/CGI.pm
diff --git a/perl/Template/Plugin/Datafile.pm b/bench/perl/Template/Plugin/Datafile.pm
similarity index 100%
rename from perl/Template/Plugin/Datafile.pm
rename to bench/perl/Template/Plugin/Datafile.pm
diff --git a/perl/Template/Plugin/Date.pm b/bench/perl/Template/Plugin/Date.pm
similarity index 100%
rename from perl/Template/Plugin/Date.pm
rename to bench/perl/Template/Plugin/Date.pm
diff --git a/perl/Template/Plugin/Directory.pm b/bench/perl/Template/Plugin/Directory.pm
similarity index 100%
rename from perl/Template/Plugin/Directory.pm
rename to bench/perl/Template/Plugin/Directory.pm
diff --git a/perl/Template/Plugin/Dumper.pm b/bench/perl/Template/Plugin/Dumper.pm
similarity index 100%
rename from perl/Template/Plugin/Dumper.pm
rename to bench/perl/Template/Plugin/Dumper.pm
diff --git a/perl/Template/Plugin/File.pm b/bench/perl/Template/Plugin/File.pm
similarity index 100%
rename from perl/Template/Plugin/File.pm
rename to bench/perl/Template/Plugin/File.pm
diff --git a/perl/Template/Plugin/Filter.pm b/bench/perl/Template/Plugin/Filter.pm
similarity index 100%
rename from perl/Template/Plugin/Filter.pm
rename to bench/perl/Template/Plugin/Filter.pm
diff --git a/perl/Template/Plugin/Format.pm b/bench/perl/Template/Plugin/Format.pm
similarity index 100%
rename from perl/Template/Plugin/Format.pm
rename to bench/perl/Template/Plugin/Format.pm
diff --git a/perl/Template/Plugin/HTML.pm b/bench/perl/Template/Plugin/HTML.pm
similarity index 100%
rename from perl/Template/Plugin/HTML.pm
rename to bench/perl/Template/Plugin/HTML.pm
diff --git a/perl/Template/Plugin/Image.pm b/bench/perl/Template/Plugin/Image.pm
similarity index 100%
rename from perl/Template/Plugin/Image.pm
rename to bench/perl/Template/Plugin/Image.pm
diff --git a/perl/Template/Plugin/Iterator.pm b/bench/perl/Template/Plugin/Iterator.pm
similarity index 100%
rename from perl/Template/Plugin/Iterator.pm
rename to bench/perl/Template/Plugin/Iterator.pm
diff --git a/perl/Template/Plugin/Math.pm b/bench/perl/Template/Plugin/Math.pm
similarity index 100%
rename from perl/Template/Plugin/Math.pm
rename to bench/perl/Template/Plugin/Math.pm
diff --git a/perl/Template/Plugin/Pod.pm b/bench/perl/Template/Plugin/Pod.pm
similarity index 100%
rename from perl/Template/Plugin/Pod.pm
rename to bench/perl/Template/Plugin/Pod.pm
diff --git a/perl/Template/Plugin/Procedural.pm b/bench/perl/Template/Plugin/Procedural.pm
similarity index 100%
rename from perl/Template/Plugin/Procedural.pm
rename to bench/perl/Template/Plugin/Procedural.pm
diff --git a/perl/Template/Plugin/Scalar.pm b/bench/perl/Template/Plugin/Scalar.pm
similarity index 100%
rename from perl/Template/Plugin/Scalar.pm
rename to bench/perl/Template/Plugin/Scalar.pm
diff --git a/perl/Template/Plugin/String.pm b/bench/perl/Template/Plugin/String.pm
similarity index 100%
rename from perl/Template/Plugin/String.pm
rename to bench/perl/Template/Plugin/String.pm
diff --git a/perl/Template/Plugin/Table.pm b/bench/perl/Template/Plugin/Table.pm
similarity index 100%
rename from perl/Template/Plugin/Table.pm
rename to bench/perl/Template/Plugin/Table.pm
diff --git a/perl/Template/Plugin/URL.pm b/bench/perl/Template/Plugin/URL.pm
similarity index 100%
rename from perl/Template/Plugin/URL.pm
rename to bench/perl/Template/Plugin/URL.pm
diff --git a/perl/Template/Plugin/View.pm b/bench/perl/Template/Plugin/View.pm
similarity index 100%
rename from perl/Template/Plugin/View.pm
rename to bench/perl/Template/Plugin/View.pm
diff --git a/perl/Template/Plugin/Wrap.pm b/bench/perl/Template/Plugin/Wrap.pm
similarity index 100%
rename from perl/Template/Plugin/Wrap.pm
rename to bench/perl/Template/Plugin/Wrap.pm
diff --git a/perl/Template/Plugins.pm b/bench/perl/Template/Plugins.pm
similarity index 100%
rename from perl/Template/Plugins.pm
rename to bench/perl/Template/Plugins.pm
diff --git a/perl/Template/Provider.pm b/bench/perl/Template/Provider.pm
similarity index 100%
rename from perl/Template/Provider.pm
rename to bench/perl/Template/Provider.pm
diff --git a/perl/Template/Service.pm b/bench/perl/Template/Service.pm
similarity index 100%
rename from perl/Template/Service.pm
rename to bench/perl/Template/Service.pm
diff --git a/perl/Template/Stash.pm b/bench/perl/Template/Stash.pm
similarity index 100%
rename from perl/Template/Stash.pm
rename to bench/perl/Template/Stash.pm
diff --git a/perl/Template/Stash/Context.pm b/bench/perl/Template/Stash/Context.pm
similarity index 100%
rename from perl/Template/Stash/Context.pm
rename to bench/perl/Template/Stash/Context.pm
diff --git a/perl/Template/Stash/XS.pm b/bench/perl/Template/Stash/XS.pm
similarity index 100%
rename from perl/Template/Stash/XS.pm
rename to bench/perl/Template/Stash/XS.pm
diff --git a/perl/Template/Test.pm b/bench/perl/Template/Test.pm
similarity index 100%
rename from perl/Template/Test.pm
rename to bench/perl/Template/Test.pm
diff --git a/perl/Template/VMethods.pm b/bench/perl/Template/VMethods.pm
similarity index 100%
rename from perl/Template/VMethods.pm
rename to bench/perl/Template/VMethods.pm
diff --git a/perl/Template/View.pm b/bench/perl/Template/View.pm
similarity index 100%
rename from perl/Template/View.pm
rename to bench/perl/Template/View.pm
diff --git a/bench/perl/gas.pm b/bench/perl/gas.pm
new file mode 100644
index 0000000..c9f3f81
--- /dev/null
+++ b/bench/perl/gas.pm
@@ -0,0 +1,211 @@
+#!/usr/bin/perl 
+
+package as;
+use Data::Dumper;
+use isax86;
+use isax86_64;
+
+$AS = { HEADER     => '.intel_syntax noprefix',
+	    FOOTER     => ''};
+
+$LOCAL = {};
+$MODE = 'GLOBAL';
+
+my $CURRENT_SECTION='NONE';
+my $WORDLENGTH;
+my $STACKPTR;
+my $BASEPTR;
+my $REG;
+my $ARG;
+
+sub emit_code  # Print one line of assembly after expanding GPRn/FPRn, ARGn and LOCALn placeholders.
+{
+	my $code = shift;
+	$code =~ s/([GF]PR[0-9]+)/$REG->{$1}/g;  # register placeholders, looked up in the table chosen by isa_init
+	$code =~ s/(ARG[0-9]+)/$ARG->{$1}/g;  # argument placeholders, table also chosen by isa_init
+	$code =~ s/(LOCAL[0-9]+)/$LOCAL->{$1}/g;  # stack slots registered by function_entry
+	print "$code\n";
+}
+
+sub align  # Emit a ".align <n>" directive for the requested boundary.
+{
+	my ($boundary) = @_;
+	print ".align ", $boundary, "\n";
+
+}
+
+sub mode  # Select the loop-label style: START switches $MODE to LOCAL, STOP back to GLOBAL.
+{
+	my $cmd = uc(shift);  # now lexical; upper-cased because the grammar matches the START/STOP keywords case-insensitively
+
+	if ($cmd eq 'START') {
+		$MODE = 'LOCAL';
+	} elsif ($cmd eq 'STOP') {
+		$MODE = 'GLOBAL';
+	}
+}
+
+sub function_entry  # Emit the label and prologue of a global function and register its LOCALn stack slots.
+{
+	my $symbolname = shift;
+	my $allocate = shift;  # number of words to reserve for locals; the plain "function" grammar rule passes 0
+	my $distance;
+
+	foreach ( (0 .. $allocate) ) {  # defines LOCAL0 .. LOCAL<allocate>; LOCAL0 maps to [BASEPTR-0]
+		$distance =  $_ * $WORDLENGTH;
+		$LOCAL->{"LOCAL$_"} = "[$BASEPTR-$distance]";
+	}
+
+	if($CURRENT_SECTION ne 'text') {  # print the section switch only when not already in .text
+		$CURRENT_SECTION = 'text';
+		print ".text\n";
+	}
+
+	print ".globl $symbolname\n";
+	print ".type $symbolname, \@function\n";
+	print "$symbolname :\n";
+
+	if ($main::ISA eq 'x86') {
+		print "push ebp\n";
+		print "mov ebp, esp\n";
+		$distance = $allocate * $WORDLENGTH;
+		print "sub  esp, $distance\n" if ($allocate);  # stack space for the locals, skipped when none were requested
+		print "push ebx\n";  # these three are popped in reverse order by function_exit
+		print "push esi\n";
+		print "push edi\n";
+	} elsif ($main::ISA eq 'x86-64') {
+		print "push rbp\n";
+		print "mov rbp, rsp\n";
+		$distance = $allocate * $WORDLENGTH;
+		print "sub  rsp, $distance\n" if ($allocate);  # stack space for the locals, skipped when none were requested
+		print "push rbx\n";  # these five are popped in reverse order by function_exit
+		print "push r12\n";
+		print "push r13\n";
+		print "push r14\n";
+		print "push r15\n";
+	}
+}
+
+sub function_exit  # Emit the epilogue, "ret" and the .size directive for the named function.
+{
+	my $symbolname = shift;
+
+	$LOCAL = {};  # forget the LOCALn slots of the function that just ended
+
+	if ($main::ISA eq 'x86') {
+		print "pop edi\n";  # reverse of the pushes in function_entry
+		print "pop esi\n";
+		print "pop ebx\n";
+		print "mov  esp, ebp\n";
+		print "pop ebp\n";
+	} elsif ($main::ISA eq 'x86-64') {
+		print "pop r15\n";  # reverse of the pushes in function_entry
+		print "pop r14\n";
+		print "pop r13\n";
+		print "pop r12\n";
+		print "pop rbx\n";
+		print "mov  rsp, rbp\n";
+		print "pop rbp\n";
+	}
+	print "ret\n";
+	print ".size $symbolname, .-$symbolname\n";
+	print "\n";
+}
+
+sub define_data  # Emit a 64-byte aligned, labelled constant in .data: 8 copies for DOUBLE/SINGLE, 2 for INT.
+{
+	my $symbolname = shift;
+	my $type = uc(shift);  # the grammar matches the type keywords case-insensitively, so normalise before comparing
+	my $value = shift;
+
+	if($CURRENT_SECTION ne 'data') {  # print the section switch only when not already in .data
+		$CURRENT_SECTION = 'data';
+		print ".data\n";
+	}
+	print ".align 64\n";
+	print "$symbolname:\n";
+	if ($type eq 'DOUBLE') {
+		print ".double $value, $value, $value, $value, $value, $value, $value, $value\n";
+	} elsif ($type eq 'SINGLE') {
+		print ".single $value, $value, $value, $value, $value, $value, $value, $value\n";
+	} elsif ($type eq 'INT') {
+		print ".int $value, $value\n";
+	}
+}
+
+sub define_offset  # Emit a 16-byte aligned, labelled ".int" list in .data.
+{
+	my $symbolname = shift;
+	my $type = shift;  # accepted for symmetry with define_data but not used below
+	my $value = shift;  # printed verbatim after ".int"; the grammar's OFFSET token is 16 comma-separated integers
+
+	if($CURRENT_SECTION ne 'data') {  # print the section switch only when not already in .data
+		$CURRENT_SECTION = 'data';
+		print ".data\n";
+	}
+	print ".align 16\n";
+	print "$symbolname:\n";
+  print ".int $value\n";
+}
+
+
+sub loop_entry  # Emit the loop head: zero the counter register, align, and print the loop label.
+{
+  my $symbolname = shift;
+  my $stopping_criterion = shift;
+  $stopping_criterion = $REG->{$stopping_criterion} if( exists $REG->{$stopping_criterion});  # NOTE(review): resolved here but never used afterwards; loop_exit compares against edi/rdi
+
+  if ($main::ISA eq 'x86') {
+    print "xor   eax, eax\n";  # counter starts at 0
+  } elsif ($main::ISA eq 'x86-64') {
+    print "xor   rax, rax\n";  # counter starts at 0
+  }
+  print ".align 16\n";
+  if ($MODE eq 'GLOBAL') {
+    print "$symbolname :\n";  # named label, target of "jl <name>" in loop_exit
+  }else {
+    print "1:\n";  # numeric local label, target of "jl 1b" in loop_exit
+  }
+
+}
+
+
+sub loop_exit  # Emit the loop tail: advance the counter by $step, compare, and jump back while less.
+{
+  my $symbolname = shift;
+  my $step = shift;  # increment added to the counter on every iteration
+
+  if ($main::ISA eq 'x86') {
+    print "add eax, $step\n";
+    print "cmp eax, edi\n";  # the bound is always taken from edi
+  } elsif ($main::ISA eq 'x86-64') {
+    print "addq rax, $step\n";
+    print "cmpq rax, rdi\n";  # the bound is always taken from rdi
+  }
+  if ($MODE eq 'GLOBAL') {
+    print "jl $symbolname\n";  # back to the named label printed by loop_entry
+  }else {
+    print "jl 1b\n";  # back to the nearest preceding "1:" label
+  }
+  print "\n";
+}
+
+sub isa_init  # Copy word length, stack/base pointer names and the REG/ARG tables of the selected ISA into this package.
+{
+  if ($main::ISA eq 'x86') {
+    $WORDLENGTH = $isax86::WORDLENGTH_X86 ;
+    $STACKPTR = $isax86::STACKPTR_X86 ;
+    $BASEPTR = $isax86::BASEPTR_X86 ;
+    $REG = $isax86::REG_X86;
+    $ARG = $isax86::ARG_X86 ;
+  } elsif ($main::ISA eq 'x86-64') {
+    $WORDLENGTH = $isax86_64::WORDLENGTH_X86_64;
+    $STACKPTR = $isax86_64::STACKPTR_X86_64 ;
+    $BASEPTR = $isax86_64::BASEPTR_X86_64 ;
+    $REG = $isax86_64::REG_X86_64;
+    $ARG = $isax86_64::ARG_X86_64 ;
+  } else { die "ERROR: Unsupported isa $main::ISA!\n"; }  # otherwise every table stays undefined and the emitted code is silently wrong
+}
+
+
+1;
diff --git a/bench/perl/generatePas.pl b/bench/perl/generatePas.pl
new file mode 100755
index 0000000..2dcd530
--- /dev/null
+++ b/bench/perl/generatePas.pl
@@ -0,0 +1,198 @@
+#!/usr/bin/perl
+
+use lib 'util';
+use strict;
+use warnings;
+use lib './perl';
+use File::Copy;
+use Cwd 'abs_path';
+use Data::Dumper;
+use Template;
+
+my @Testcases;
+my $name;
+my $streams;
+my $type;
+my $flops;
+my $bytes;
+my $desc;
+my $prolog='';
+my $loop='';
+my $increment;
+my $isLoop=0;
+my $skip=0;
+my $multi=0;
+
+my $BenchRoot = $ARGV[0];
+my $OutputDirectory = $ARGV[1];
+my $TemplateRoot = $ARGV[2];
+my $InputFile = "";
+if (@ARGV == 4)
+{
+    $InputFile = $ARGV[3];
+}
+my $DEBUG = 0;
+
+my $stream_lookup = {  # operand substituted for each STRn name by the "SET STRn GPRm" handling in the loop below
+    STR0 => 'ARG2',  # STR0..STR4 map to the ARG2..ARG6 placeholders
+    STR1 => 'ARG3',
+    STR2 => 'ARG4',
+    STR3 => 'ARG5',
+    STR4 => 'ARG6',
+    STR5 =>  '[rbp+16]',  # from STR5 on: rbp-relative slots, [rbp + 16 + 8*(n-5)]
+    STR6 =>  '[rbp+24]',
+    STR7 =>  '[rbp+32]',
+    STR8 =>  '[rbp+40]',
+    STR9 => '[rbp+48]',
+    STR10 => '[rbp+56]',
+    STR11 => '[rbp+64]',
+    STR12 => '[rbp+72]',
+    STR13 => '[rbp+80]',
+    STR14 => '[rbp+88]',
+    STR15 => '[rbp+96]',
+    STR16 => '[rbp+104]',
+    STR17 => '[rbp+112]',
+    STR18 => '[rbp+120]',
+    STR19 => '[rbp+128]',
+    STR20 => '[rbp+136]',
+    STR21 => '[rbp+144]',
+    STR22 => '[rbp+152]',
+    STR23 => '[rbp+160]',
+    STR24 => '[rbp+168]',
+    STR25 => '[rbp+176]',
+    STR26 => '[rbp+184]',
+    STR27 => '[rbp+192]',
+    STR28 => '[rbp+200]',
+    STR29 => '[rbp+208]',
+    STR30 => '[rbp+216]',
+    STR31 => '[rbp+224]',
+    STR32 => '[rbp+232]',
+    STR33 => '[rbp+240]',
+    STR34 => '[rbp+248]',
+    STR35 => '[rbp+256]',
+    STR36 => '[rbp+264]',
+    STR37 => '[rbp+272]',
+    STR38 => '[rbp+280]',
+    STR39 => '[rbp+288]',
+    STR40 => '[rbp+296]'};
+
+opendir (DIR, "./$BenchRoot") or die "Cannot open bench directory: $!\n";
+my $tpl = Template->new({
+        INCLUDE_PATH => ["$TemplateRoot"]
+        });
+
+while (defined(my $file = readdir(DIR))) {  # one pass per directory entry: parse the .ptt file, render <name>.pas, record the kernel
+    if ($file !~ /^\./) {
+        print "SCANNING $file\n" if ($DEBUG);
+
+        next unless ($file =~ /([A-Za-z_0-9]+)\.ptt$/);  # skip anything that is not a .ptt file instead of reusing a stale $1
+        $name = $1;
+
+        $isLoop = 0;
+        $skip=0;
+        $multi=0;
+        $prolog='';
+        $loop='';
+        $desc='';
+        my $loads=-1;  # -1 marks "keyword not present in the file" for the optional counters
+        my $stores=-1;
+        my $branches=-1;
+        my $instr=-1;
+        my $loop_instr=-1;
+        my $uops = -1;
+        open(FILE, '<', "$BenchRoot/$file") or die "Cannot open $BenchRoot/$file: $!\n";
+        while (<FILE>) {
+            my $line = $_;
+
+            if($line =~ /STREAMS[ ]+([0-9]+)/) {  # branch order matters: e.g. INSTR_LOOP must be tested before LOOP
+                $streams = $1;
+                if ($streams > 10) {
+                    $multi = 1;
+                }
+            } elsif ($line =~ /TYPE[ ]+(SINGLE|DOUBLE|INT)/) {
+                $type = $1;
+            } elsif ($line =~ /FLOPS[ ]+([0-9]+)/) {
+                $flops = $1;
+            } elsif ($line =~ /BYTES[ ]+([0-9]+)/) {
+                $bytes = $1;
+            } elsif ($line =~ /LOADS[ ]+([0-9]+)/) {
+                $loads = $1;
+            } elsif ($line =~ /STORES[ ]+([0-9]+)/) {
+                $stores = $1;
+            } elsif ($line =~ /BRANCHES[ ]+([0-9]+)/) {
+                $branches = $1;
+            } elsif ($line =~ /INSTR_CONST[ ]+([0-9]+)/) {
+                $instr = $1;
+            } elsif ($line =~ /INSTR_LOOP[ ]+([0-9]+)/) {
+                $loop_instr = $1;
+            } elsif ($line =~ /UOPS[ ]+([0-9]+)/) {
+                $uops = $1;
+            } elsif ($line =~ /DESC[ ]+([a-zA-Z ,.\-_\(\)\+\*\/=]+)/) {  # A-Z, not A-z: the old range also let a backslash through into the C string in testcases.tt
+                $desc = $1;
+            } elsif ($line =~ /INC[ ]+([0-9]+)/) {
+                $increment = $1;
+                $skip = 1;
+            } elsif ($line =~ /LOOP[ ]+([0-9]+)/) {
+                $increment = $1;
+                $isLoop = 1;  # every following unrecognised line belongs to the loop body
+            } else {
+                if ($isLoop) {
+                    if($line =~ /SET[ ]+(STR[0-9]+)[ ]+(GPR[0-9]+)/) {
+                        $loop .= "#define $1  $2\n";
+                        $loop .= "mov $2, $stream_lookup->{$1}\n";
+                    } else {
+                        $loop .= $line;
+                    }
+                } else {
+                    $prolog .= $line;  # unrecognised lines before LOOP form the prolog
+                }
+            }
+        }
+        close FILE;
+
+        if (($streams > 5) &&  ($streams < 10)) {  # NOTE(review): exactly 10 streams matches neither this branch nor the multi (> 10) case - confirm intended
+            my $arg = 7;
+            foreach my $stream ( 5 .. $streams ) {
+                $prolog .= "mov STR$stream, ARG$arg\n";
+                $arg++;
+            }
+        }
+
+        $streams = 'STREAM_'.$streams;  # from here on $streams is the text written into the kernels[] table
+        my $Vars;
+        $Vars->{name} = $name;
+        $Vars->{prolog} = $prolog;
+        $Vars->{increment} = $increment;
+        $Vars->{loop} = $loop;
+        $Vars->{skip} = $skip;
+        $Vars->{multi} = $multi;
+        $Vars->{desc} = $desc;
+
+#print Dumper($Vars);
+
+        $tpl->process('bench.tt', $Vars, "$OutputDirectory/$name.pas") or die $tpl->error(), "\n";
+        push(@Testcases,{name    => $name,
+                         streams => $streams,
+                         type    => $type,
+                         stride  => $increment,
+                         flops   => $flops,
+                         bytes   => $bytes,
+                         desc    => $desc,
+                         loads    => $loads,
+                         stores    => $stores,
+                         branches    => $branches,
+                         instr_const    => $instr,
+                         instr_loop    => $loop_instr,
+                         uops    => $uops});
+    }
+}
+#print Dumper(@Testcases);
+my @TestcasesSorted = sort {$a->{name} cmp $b->{name}} @Testcases;  # kernels[] is emitted in name order
+closedir(DIR);  # the directory scan above is finished
+my $Vars;
+$Vars->{Testcases} = \@TestcasesSorted;
+$Vars->{numKernels} = scalar(@TestcasesSorted);
+$Vars->{allTests} = join('\n',map {$_->{name}." - ".$_->{desc}} @TestcasesSorted);  # single quotes on purpose: a literal backslash-n is pasted into the TESTS string of testcases.tt
+$tpl->process('testcases.tt', $Vars, "$OutputDirectory/testcases.h") or die $tpl->error(), "\n";
+
+
diff --git a/perl/isax86.pm b/bench/perl/isax86.pm
similarity index 100%
rename from perl/isax86.pm
rename to bench/perl/isax86.pm
diff --git a/perl/isax86_64.pm b/bench/perl/isax86_64.pm
similarity index 100%
rename from perl/isax86_64.pm
rename to bench/perl/isax86_64.pm
diff --git a/perl/templates/bench.tt b/bench/perl/templates/bench.tt
similarity index 100%
rename from perl/templates/bench.tt
rename to bench/perl/templates/bench.tt
diff --git a/bench/perl/templates/group.tt b/bench/perl/templates/group.tt
new file mode 100644
index 0000000..5676318
--- /dev/null
+++ b/bench/perl/templates/group.tt
@@ -0,0 +1,157 @@
+/* GENERATED FILE: DO NOT EDIT */
+
+#define NUM_GROUPS_[% arch FILTER upper %] [% numGroups %]
+
+static PerfmonGroupMap [% arch %]_group_map[NUM_GROUPS_[% arch FILTER upper %]] = {
+[% FOREACH group IN groups %]
+    {"[% group.name %]",[% group.name %],[% group.isUncore %],"[% group.shortHelp %]","[% group.eventSet %]"},
+[% END %]
+};
+
+/*void
+perfmon_printDerivedMetrics[% arch FILTER ucfirst %](PerfmonGroup group)
+{
+    int threadId;
+    double time = rdtscTime;
+    double inverseClock = 1.0 /(double) timer_getCpuClock();
+    PerfmonResultTable tableData;
+    int numRows;
+    int numColumns = perfmon_numThreads;
+    bstring label;
+    bstrList* fc;
+    double** stat;
+    double tmpValue;
+    uint64_t cpi_instr = 0;
+    uint64_t cpi_cyc  = 0;
+    int cpi_index = 0;
+
+    switch ( group ) 
+    {
+[% FOREACH group IN groups %]
+        case [% group.name %]:
+            numRows = [% group.numRows %];
+            stat = (double**) malloc(numRows * sizeof(double*));
+            for (int i=0; i<numRows; i++)
+            {
+                stat[i] = (double*) malloc(4 * sizeof(double));
+                stat[i][0] = 0;
+                stat[i][1] = 0;
+                stat[i][2] = DBL_MAX;
+            }
+            INIT_BASIC;
+[% FOREACH metric IN group.metrics %]
+            bstrListAdd(fc,[% loop.count %],[% metric.label %]);
+[% END %]
+            initResultTable(&tableData, fc, numRows, numColumns);
+
+            for(threadId=0; threadId < perfmon_numThreads; threadId++)
+            {
+[% FOREACH metric IN group.metrics %]
+                tmpValue = [% metric.rule %];
+                if (!isnan(tmpValue))
+                {
+                    tableData.rows[[% loop.index %]].value[threadId] = tmpValue;
+                }
+                else
+                {
+                    tableData.rows[[% loop.index %]].value[threadId] = 0.0;
+                }
+[% IF metric.label == 'CPI' && arch == 'westmere' %]
+                cpi_instr += perfmon_getResult(threadId,"FIXC0");
+                cpi_cyc += perfmon_getResult(threadId,"FIXC1");
+                cpi_index = [% loop.index %];
+[% ELSE %]
+                stat[[% loop.index %]][0] += (double) tableData.rows[[% loop.index %]].value[threadId];
+[% END %]
+                stat[[% loop.index %]][1] =  MAX(stat[[% loop.index %]][1],(double) tableData.rows[[% loop.index %]].value[threadId]);
+                stat[[% loop.index %]][2] =  MIN(stat[[% loop.index %]][2],(double) tableData.rows[[% loop.index %]].value[threadId]);
+[% END %]
+            }
+
+            if (cpi_instr)
+            {
+                stat[cpi_index][0] = (double) cpi_cyc / (double) cpi_instr;
+            }
+                
+            break;
+[% END %]
+
+        default:
+            fprintf (stderr, "perfmon_printDerivedMetrics[% arch %]: Unknown group! Exiting!\n" );
+            exit (EXIT_FAILURE);
+            break;
+    }
+
+    printResultTable(&tableData);
+    freeResultTable(&tableData);
+
+    // for threaded results print sum, max, min and avg 
+    if (perfmon_numThreads > 1)
+    {
+        initStatisticTable(&tableData, fc, numRows);
+        for (int i=0; i<numRows; i++)
+        {
+            stat[i][3] =  stat[i][0]/perfmon_numThreads;
+            for (int j=0; j<4; j++)
+            {
+                tableData.rows[i].value[j] = stat[i][j];
+            }
+        }
+        printResultTable(&tableData);
+        freeResultTable(&tableData);
+    }
+
+    for (int i=0; i<numRows; i++)
+    {
+        free(stat[i]);
+    }
+    free(stat);
+    bstrListDestroy(fc);
+}
+
+void
+perfmon_logDerivedMetrics[% arch FILTER ucfirst %](PerfmonGroup group, double time,double timeStamp)
+{
+    int threadId;
+    double tmpValue;
+    double inverseClock = 1.0 /(double) timer_getCpuClock();
+
+    switch ( group ) 
+    {
+        [% FOREACH group IN groups %]
+        case [% group.name %]:
+
+                    [% FOREACH metric IN group.metrics %]
+                        printf("[% metric.label %] %e ",timeStamp);
+                        for(threadId=0; threadId < perfmon_numThreads; threadId++)
+                        {
+                            tmpValue = [% metric.rule %];
+                            if (!isnan(tmpValue))
+                            {
+                                printf(" %e  ", tmpValue);
+                            }
+                            else
+                            {
+                                printf(" 0.0  ");
+                            }
+                        }
+                        printf("\n");
+                    [% END %]
+            break;
+            [% END %]
+
+        default:
+                fprintf (stderr, "perfmon_printDerivedMetrics[% arch %]: Unknown group! Exiting!\n" );
+                exit (EXIT_FAILURE);
+                break;
+    }
+}*/
+
+
+
+static PerfmonGroupHelp [% arch %]_group_help[NUM_GROUPS_[% arch FILTER upper %]] = {
+[% FOREACH group IN groups %]
+    {"[% group.name %]","[% group.longHelp %]"},
+[% END %]
+};
+
diff --git a/perl/templates/group_types.tt b/bench/perl/templates/group_types.tt
similarity index 100%
rename from perl/templates/group_types.tt
rename to bench/perl/templates/group_types.tt
diff --git a/bench/perl/templates/testcases.tt b/bench/perl/templates/testcases.tt
new file mode 100644
index 0000000..ceaa23b
--- /dev/null
+++ b/bench/perl/templates/testcases.tt
@@ -0,0 +1,19 @@
+#ifndef TESTCASES_H
+#define TESTCASES_H
+
+#include <test_types.h>
+
+[% FOREACH test IN Testcases %]
+extern void [% test.name %]();
+[% END %]
+
+#define TESTS  "[% allTests %]"
+#define NUMKERNELS [% numKernels %]
+
+static const TestCase kernels[NUMKERNELS] = {
+    [% FOREACH test IN Testcases %]
+    {"[% test.name %]" , [% test.streams %], [% test.type %], [% test.stride %], &[% test.name %], [% test.flops %], [% test.bytes %], "[% test.desc %]", [% test.loads %], [% test.stores %], [% test.branches %], [% test.instr_const %], [% test.instr_loop %], [% test.uops %]},
+    [% END %]
+};
+
+#endif /* TESTCASES_H */
diff --git a/bench/phi/store.ptt b/bench/phi/store.ptt
index 533501c..3aa5bd2 100644
--- a/bench/phi/store.ptt
+++ b/bench/phi/store.ptt
@@ -2,10 +2,10 @@ STREAMS 1
 TYPE DOUBLE
 FLOPS 0
 BYTES 8
-vmovaps zmm0, [SCALAR]
-vmovaps zmm1, [SCALAR]
-vmovaps zmm2, [SCALAR]
-vmovaps zmm3, [SCALAR]
+vmovaps zmm0, [rip+SCALAR]
+vmovaps zmm1, [rip+SCALAR]
+vmovaps zmm2, [rip+SCALAR]
+vmovaps zmm3, [rip+SCALAR]
 LOOP 32
 vprefetch0 [STR0 + GPR1 * 8 + 1024]
 vmovaps    [STR0 + GPR1 * 8]     , zmm0
diff --git a/bench/phi/store_mem.ptt b/bench/phi/store_mem.ptt
index fa8d262..0aeccd6 100644
--- a/bench/phi/store_mem.ptt
+++ b/bench/phi/store_mem.ptt
@@ -2,10 +2,10 @@ STREAMS 1
 TYPE DOUBLE
 FLOPS 0
 BYTES 8
-vmovaps zmm0, [SCALAR]
-vmovaps zmm1, [SCALAR]
-vmovaps zmm2, [SCALAR]
-vmovaps zmm3, [SCALAR]
+vmovaps zmm0, [rip+SCALAR]
+vmovaps zmm1, [rip+SCALAR]
+vmovaps zmm2, [rip+SCALAR]
+vmovaps zmm3, [rip+SCALAR]
 LOOP 32
 vprefetch0 [STR0 + GPR1 * 8 + 1024]
 vmovnrngoaps    [STR0 + GPR1 * 8], zmm0
diff --git a/bench/src/allocator.c b/bench/src/allocator.c
new file mode 100644
index 0000000..ea0be48
--- /dev/null
+++ b/bench/src/allocator.c
@@ -0,0 +1,209 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  allocator.c
+ *
+ *      Description:  Implementation of allocator module.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+/* #####   HEADER FILE INCLUDES   ######################################### */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <allocator_types.h>
+#include <allocator.h>
+#include <likwid.h>
+
+/* #####   EXPORTED VARIABLES   ########################################### */
+
+
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+
+
+/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
+
+static int numberOfAllocatedVectors = 0;
+static allocation* allocList;
+static AffinityDomains_t domains = NULL;
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+
+/*
+ * Set up the allocator module.
+ *
+ * Reserves an allocation list with room for numVectors entries and fetches the
+ * affinity domains that allocator_allocateVector() matches domain tags
+ * against. Terminates the program if the list cannot be allocated.
+ *
+ * numVectors: number of allocation-list entries to reserve.
+ */
+void
+allocator_init(int numVectors)
+{
+    allocList = (allocation*) malloc(numVectors * sizeof(allocation));
+    /* malloc(0) may legitimately return NULL, so only a non-empty request
+     * that yields NULL is treated as a failure. */
+    if (allocList == NULL && numVectors != 0)
+    {
+        fprintf(stderr, "Error: Cannot allocate allocation list for %d vectors\n", numVectors);
+        exit(EXIT_FAILURE);
+    }
+    domains = get_affinityDomains();
+}
+
+
+/*
+ * Release everything the allocator module owns.
+ *
+ * Frees every vector recorded in the allocation list, clears the entries,
+ * resets the vector counter and finally frees the list itself, which was
+ * previously leaked. allocator_init() has to be called again before further
+ * vectors can be allocated.
+ */
+void
+allocator_finalize()
+{
+    int i;
+
+    for (i=0; i<numberOfAllocatedVectors; i++)
+    {
+        /* ptr is the address returned by posix_memalign(), not the
+         * offset-adjusted pointer handed to the caller. */
+        free(allocList[i].ptr);
+        allocList[i].ptr = NULL;
+        allocList[i].size = 0;
+        allocList[i].offset = 0;
+    }
+    numberOfAllocatedVectors = 0;
+
+    free(allocList);
+    allocList = NULL;
+}
+
+/*
+ * Allocate one aligned vector, record it in the allocation list and fill it
+ * with ones.
+ *
+ * ptr:          receives the vector; on return it points `offset` elements
+ *               past the start of the allocated memory.
+ * alignment:    byte alignment passed to posix_memalign().
+ * size:         number of elements to initialize.
+ * offset:       number of extra leading elements; the allocation holds
+ *               size+offset elements.
+ * type:         element type (INT, SINGLE or DOUBLE).
+ * domainString: tag of the affinity domain whose first processor the process
+ *               is pinned to before the vector is written.
+ *
+ * Any failure (unknown domain, allocation error) terminates the program.
+ */
+void
+allocator_allocateVector(
+        void** ptr,
+        int alignment,
+        uint64_t size,
+        int offset,
+        DataType type,
+        bstring domainString)
+{
+    int i;
+    size_t bytesize = 0;
+    const AffinityDomain* domain = NULL;
+    int errorCode;
+    int elements = 0;
+
+    /* Allocation size in bytes and the alignment expressed in elements
+     * (the latter is only used for the status message below). */
+    switch ( type )
+    {
+        case INT:
+            bytesize = (size+offset) * sizeof(int);
+            elements = alignment / sizeof(int);
+            break;
+
+        case SINGLE:
+            bytesize = (size+offset) * sizeof(float);
+            elements = alignment / sizeof(float);
+            break;
+
+        case DOUBLE:
+            bytesize = (size+offset) * sizeof(double);
+            elements = alignment / sizeof(double);
+            break;
+    }
+
+    /* Look up the affinity domain whose tag equals domainString. */
+    for (i=0;i<domains->numberOfAffinityDomains;i++)
+    {
+        if (biseq(domainString, domains->domains[i].tag))
+        {
+            domain = domains->domains + i;
+        }
+    }
+    if (!domain)
+    {
+        fprintf(stderr, "Error: Cannot use desired domain %s for vector placement, Domain %s does not exist.\n",
+                        bdata(domainString), bdata(domainString));
+        exit(EXIT_FAILURE);
+    }
+
+    errorCode =  posix_memalign(ptr, alignment, bytesize);
+
+    if (errorCode)
+    {
+        if (errorCode == EINVAL)
+        {
+            fprintf(stderr,
+                    "Error: Alignment parameter is not a power of two\n");
+            exit(EXIT_FAILURE);
+        }
+        if (errorCode == ENOMEM)
+        {
+            fprintf(stderr,
+                    "Error: Insufficient memory to fulfill the request\n");
+            exit(EXIT_FAILURE);
+        }
+        /* Any other error is just as fatal: *ptr is not valid after a failed
+         * posix_memalign() and must not be inspected or used. */
+        fprintf(stderr, "Error: posix_memalign failed - %s\n", strerror(errorCode));
+        exit(EXIT_FAILURE);
+    }
+
+    if ((*ptr) == NULL)
+    {
+        fprintf(stderr, "Error: posix_memalign failed!\n");
+        exit(EXIT_FAILURE);
+    }
+
+    /* Remember the unadjusted pointer so allocator_finalize() can free it. */
+    allocList[numberOfAllocatedVectors].ptr = *ptr;
+    allocList[numberOfAllocatedVectors].size = bytesize;
+    allocList[numberOfAllocatedVectors].offset = offset;
+    allocList[numberOfAllocatedVectors].type = type;
+    numberOfAllocatedVectors++;
+
+    /* Pin before the initialization loops below write the vector.
+     * NOTE(review): presumably so the first write to each page happens on a
+     * core of the requested domain - confirm. */
+    affinity_pinProcess(domain->processorList[0]);
+    printf("Allocate: Process running on core %d (Domain %s) - Vector length %llu Offset %d Alignment %llu\n",
+            affinity_processGetProcessorId(),
+            bdata(domain->tag),
+            LLU_CAST bytesize,
+            offset,
+            LLU_CAST elements);
+
+    /* Skip the leading offset elements, set the following `size` elements to
+     * one and hand the adjusted pointer back to the caller. */
+    switch ( type )
+    {
+        case INT:
+            {
+                int* sptr = (int*) (*ptr);
+                sptr += offset;
+
+                for ( uint64_t i=0; i < size; i++ )
+                {
+                    sptr[i] = 1;
+                }
+                *ptr = (void*) sptr;
+
+            }
+            break;
+
+        case SINGLE:
+            {
+                float* sptr = (float*) (*ptr);
+                sptr += offset;
+
+                for ( uint64_t i=0; i < size; i++ )
+                {
+                    sptr[i] = 1.0;
+                }
+                *ptr = (void*) sptr;
+
+            }
+            break;
+
+        case DOUBLE:
+            {
+                double* dptr = (double*) (*ptr);
+                dptr += offset;
+
+                for ( uint64_t i=0; i < size; i++ )
+                {
+                    dptr[i] = 1.0;
+                }
+                *ptr = (void*) dptr;
+            }
+            break;
+    }
+}
+
diff --git a/bench/src/barrier.c b/bench/src/barrier.c
new file mode 100644
index 0000000..4b0e344
--- /dev/null
+++ b/bench/src/barrier.c
@@ -0,0 +1,167 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  barrier.c
+ *
+ *      Description:  Implementation of threaded spin loop barrier
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+/* #####   HEADER FILE INCLUDES   ######################################### */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <errno.h>
+#include <barrier.h>
+
+/* #####   EXPORTED VARIABLES   ########################################### */
+
+
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+
+#define CACHELINE_SIZE 64
+
+/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
+
+static BarrierGroup* groups;
+static int currentGroupId = 0;
+static int maxGroupId = 0;
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+
+/*
+ * Register a new barrier group for numThreads threads.
+ *
+ * Stores the thread count in the next free slot of the group table and
+ * allocates the group's flag array: 32 ints per thread, aligned to
+ * CACHELINE_SIZE and zero-initialized. Terminates the program if no slot is
+ * left or the allocation fails.
+ *
+ * Returns the id of the newly registered group.
+ */
+int
+barrier_registerGroup(int numThreads)
+{
+    int ret;
+    size_t flagBytes = (size_t) numThreads * 32 * sizeof(int);
+
+    if (currentGroupId > maxGroupId)
+    {
+        fprintf(stderr, "ERROR: Group ID %d larger than maxGroupID %d\n",currentGroupId,maxGroupId);
+        /* The group table has no slot for this id; continuing would write
+         * past its end. */
+        exit(EXIT_FAILURE);
+    }
+
+    groups[currentGroupId].numberOfThreads = numThreads;
+    ret = posix_memalign(
+            (void**) &groups[currentGroupId].groupBval,
+            CACHELINE_SIZE,
+            flagBytes);
+
+    /* posix_memalign() reports failure through a positive return value and
+     * does not set errno. */
+    if (ret != 0)
+    {
+        fprintf(stderr, "ERROR: Cannot register thread group - %s\n", strerror(ret));
+        exit(EXIT_FAILURE);
+    }
+
+    /* The memory comes back uninitialized, but barrier_synchronize() spins
+     * until a flag equals the barrier value (initially 1). Clear the flags so
+     * stale heap contents cannot release a waiting thread early. */
+    memset((void*) groups[currentGroupId].groupBval, 0, flagBytes);
+
+    return currentGroupId++;
+}
+
+/*
+ * Initialize the per-thread barrier state for one member of a group.
+ *
+ * barr:     per-thread barrier data to fill in.
+ * groupId:  id returned by barrier_registerGroup().
+ * threadId: id of the calling thread inside the group, in the range
+ *           0 .. numberOfThreads-1.
+ *
+ * barr->index is allocated with one entry per thread: entry 0 holds the
+ * caller's own id, the remaining entries list all other thread ids in
+ * ascending order. Invalid ids or a failed allocation terminate the program.
+ */
+void
+barrier_registerThread(BarrierData* barr, int groupId, int threadId)
+{
+    int ret;
+    int i;
+    int j = 1;
+
+    /* currentGroupId is the next unused id, so valid ids lie strictly below. */
+    if (groupId < 0 || groupId >= currentGroupId)
+    {
+        fprintf(stderr, "ERROR: Group not yet registered\n");
+        exit(EXIT_FAILURE);
+    }
+    /* An id outside 0..numberOfThreads-1 would make the fill loop below write
+     * numberOfThreads entries starting at index[1], one past the end. */
+    if (threadId < 0 || threadId >= groups[groupId].numberOfThreads)
+    {
+        fprintf(stderr, "ERROR: Thread ID %d out of range\n",threadId);
+        exit(EXIT_FAILURE);
+    }
+
+    barr->numberOfThreads = groups[groupId].numberOfThreads;
+    barr->offset = 0;
+    barr->val = 1;
+    barr->bval =  groups[groupId].groupBval;
+    ret = posix_memalign(
+            (void**) &(barr->index),
+            CACHELINE_SIZE,
+            barr->numberOfThreads * sizeof(int));
+
+    /* posix_memalign() reports failure through a positive return value and
+     * does not set errno. */
+    if (ret != 0)
+    {
+        fprintf(stderr, "ERROR: Cannot register thread - %s\n", strerror(ret));
+        exit(EXIT_FAILURE);
+    }
+
+
+    barr->index[0] = threadId;
+
+    for (i = 0; i < barr->numberOfThreads; i++)
+    {
+        if (!(i == threadId))
+        {
+            barr->index[j++] = i;
+        }
+    }
+}
+
+
+/*
+ * Create the table that holds numberOfGroups barrier groups and remember the
+ * highest valid group id. Terminates the program if the table cannot be
+ * allocated.
+ */
+void
+barrier_init(int numberOfGroups)
+{
+    groups = (BarrierGroup*) malloc(numberOfGroups * sizeof(BarrierGroup));
+    maxGroupId = numberOfGroups - 1;
+
+    if (groups != NULL)
+    {
+        return;
+    }
+
+    fprintf(stderr, "ERROR: Cannot allocate barrier - %s\n", strerror(errno));
+    exit(EXIT_FAILURE);
+}
+
+/*
+ * Spin until every thread of the group has arrived at this barrier.
+ *
+ * Every thread owns a block of 32 ints in the group's bval array and uses two
+ * slots of it, 16 ints apart, selected by barr->offset. The caller writes
+ * barr->val into its own slot (index[0] is its own thread id) and then polls
+ * the matching slot of every other thread listed in index[1..] until it holds
+ * the same value.
+ *
+ * Afterwards offset flips on every call and val flips on every second call,
+ * so successive barriers cycle through (offset 0, val), (offset 1, val),
+ * (offset 0, !val), (offset 1, !val).
+ */
+void
+barrier_synchronize(BarrierData* barr)
+{
+    int i;
+
+    /* Announce this thread's arrival. */
+    barr->bval[barr->index[0] * 32 +  barr->offset * 16] = barr->val;
+
+    /* Wait for the arrival flag of each other thread in turn. */
+    for (i = 1; i < barr->numberOfThreads; i++)
+    {
+        while (barr->bval[barr->index[i] * 32 + barr->offset * 16] != barr->val)
+        {
+            /* x86 pause instruction while spinning. */
+            __asm__ ("pause");
+        }
+    }
+
+    /* Both slots have been used with the current value: switch the value
+     * before the slots are reused. */
+    if (barr->offset)
+    {
+        barr->val = !barr->val;
+    }
+    barr->offset = !barr->offset;
+}
+
+/*
+ * Release the per-thread resources of a barrier.
+ *
+ * Frees the index array allocated by barrier_registerThread() and clears the
+ * pointer so a repeated call is harmless.
+ *
+ * The group's flag array is deliberately left alone: the previous code freed
+ * groups[currentGroupId].groupBval, but currentGroupId is the next unused id,
+ * so that slot never holds a registered group and the free operated on an
+ * uninitialized or out-of-bounds pointer.
+ * NOTE(review): the flag array is shared through barr->bval by all threads of
+ * a group; releasing it needs a group-level teardown once no thread uses the
+ * barrier any more.
+ */
+void barrier_destroy(BarrierData* barr)
+{
+    free(barr->index);
+    barr->index = NULL;
+}
diff --git a/bench/src/bench.c b/bench/src/bench.c
new file mode 100644
index 0000000..e1e1a97
--- /dev/null
+++ b/bench/src/bench.c
@@ -0,0 +1,770 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  bench.c
+ *
+ *      Description:  Benchmarking framework for likwid-bench
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *               Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+/* #####   HEADER FILE INCLUDES   ######################################### */
+
+#include <pthread.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/syscall.h>
+#include <string.h>
+#include <sched.h>
+#include <unistd.h>
+
+#include <allocator.h>
+#include <threads.h>
+#include <barrier.h>
+#include <likwid.h>
+
+/* #####   EXPORTED VARIABLES   ########################################### */
+
+
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+
+/* Synchronize on the barrier object `barr` of the enclosing function. */
+#define BARRIER   barrier_synchronize(&barr)
+
+
+/*
+ * Timed benchmark run. `func` is the complete kernel call expression; it is
+ * evaluated myData->iter times between a start and a stop barrier, inside the
+ * "bench" marker region. The timer is stopped only after the second barrier,
+ * so the measured interval ends when the last thread has finished. The value
+ * returned by timer_printCycles() is stored in data->cycles.
+ *
+ * Expands to several statements and relies on the locals `i`, `myData`,
+ * `data`, `time` and `barr` of the enclosing function.
+ */
+#define EXECUTE(func)   \
+    BARRIER; \
+    LIKWID_MARKER_START("bench");  \
+    timer_start(&time); \
+    for (i=0; i<myData->iter; i++) \
+    {   \
+        func; \
+    } \
+    BARRIER; \
+    timer_stop(&time); \
+    LIKWID_MARKER_STOP("bench");  \
+    data->cycles = timer_printCycles(&time); \
+    BARRIER
+
+
+
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+
+void* runTest(void* arg)
+{
+    int threadId;
+    int offset;
+    size_t size;
+    size_t vecsize;
+    size_t i;
+    BarrierData barr;
+    ThreadData* data;
+    ThreadUserData* myData;
+    TimerData time;
+    FuncPrototype func;
+
+    data = (ThreadData*) arg;
+    myData = &(data->data);
+    func = myData->test->kernel;
+    threadId = data->threadId;
+    barrier_registerThread(&barr, 0, data->globalThreadId);
+
+    /* Prepare ptrs for thread */
+    vecsize = myData->size;
+    size = myData->size / data->numberOfThreads;
+    myData->size = size;
+    size -= (size % myData->test->stride);
+    offset = data->threadId * size;
+    
+
+    switch ( myData->test->type )
+    {
+        case SINGLE:
+            {
+                float* sptr;
+                for (i=0; i <  myData->test->streams; i++)
+                {
+                    sptr = (float*) myData->streams[i];
+                    sptr +=  offset;
+                    myData->streams[i] = (float*) sptr;
+                }
+            }
+            break;
+        case INT:
+            {
+                int* sptr;
+                for (i=0; i <  myData->test->streams; i++)
+                {
+                    sptr = (int*) myData->streams[i];
+                    sptr +=  offset;
+                    myData->streams[i] = (int*) sptr;
+                }
+            }
+            break;
+        case DOUBLE:
+            {
+                double* dptr;
+                for (i=0; i <  myData->test->streams; i++)
+                {
+                    dptr = (double*) myData->streams[i];
+                    dptr +=  offset;
+                    myData->streams[i] = (double*) dptr;
+                }
+            }
+            break;
+    }
+
+
+    /* pin the thread */
+    likwid_pinThread(myData->processors[threadId]);
+    printf("Group: %d Thread %d Global Thread %d running on core %d - Vector length %llu Offset %d\n",
+            data->groupId,
+            threadId,
+            data->globalThreadId,
+            affinity_threadGetProcessorId(),
+            LLU_CAST vecsize,
+            offset);
+    BARRIER;
+
+    /* Up to 10 streams the following registers are used for Array ptr:
+     * Size rdi
+     * in Registers: rsi  rdx  rcx  r8  r9
+     * passed on stack, then: r10  r11  r12  r13  r14  r15
+     * If more than 10 streams are used first 5 streams are in register, above 5 a macro must be used to
+     * load them from stack
+     * */
+
+    switch ( myData->test->streams ) {
+        case STREAM_1:
+            EXECUTE(func(size,myData->streams[0]));
+            break;
+        case STREAM_2:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1]));
+            break;
+        case STREAM_3:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2]));
+            break;
+        case STREAM_4:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3]));
+            break;
+        case STREAM_5:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4]));
+            break;
+        case STREAM_6:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5]));
+            break;
+        case STREAM_7:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6]));
+            break;
+        case STREAM_8:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7]));
+            break;
+        case STREAM_9:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8]));
+            break;
+        case STREAM_10:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9]));
+            break;
+        case STREAM_11:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10]));
+            break;
+        case STREAM_12:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11]));
+            break;
+        case STREAM_13:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12]));
+            break;
+        case STREAM_14:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13]));
+            break;
+        case STREAM_15:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14]));
+            break;
+        case STREAM_16:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15]));
+            break;
+        case STREAM_17:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16]));
+            break;
+        case STREAM_18:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17]));
+            break;
+        case STREAM_19:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18]));
+            break;
+        case STREAM_20:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19]));
+            break;
+        case STREAM_21:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20]));
+            break;
+        case STREAM_22:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21]));
+            break;
+        case STREAM_23:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22]));
+            break;
+        case STREAM_24:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23]));
+            break;
+        case STREAM_25:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24]));
+            break;
+        case STREAM_26:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25]));
+            break;
+        case STREAM_27:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26]));
+            break;
+        case STREAM_28:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27]));
+            break;
+        case STREAM_29:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28]));
+            break;
+        case STREAM_30:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29]));
+            break;
+        case STREAM_31:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30]));
+            break;
+        case STREAM_32:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31]));
+            break;
+        case STREAM_33:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32]));
+            break;
+        case STREAM_34:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32],myData->streams[33]));
+            break;
+        case STREAM_35:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32],myData->streams[33],myData->streams[34]));
+            break;
+        case STREAM_36:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35]));
+            break;
+        case STREAM_37:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35],
+                        myData->streams[36]));
+            break;
+        case STREAM_38:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35],
+                        myData->streams[36],myData->streams[37]));
+            break;
+        default:
+            break;
+    }
+    free(barr.index);
+    pthread_exit(NULL);
+}
+
+
+#define MEASURE(func) \
+    iterations = 8; \
+    while (1) \
+    { \
+        timer_start(&time); \
+        for (i=0;i<iterations;i++) \
+        { \
+            func; \
+        } \
+        timer_stop(&time); \
+        if (timer_print(&time) < (double)data->data.min_runtime) \
+            iterations = iterations << 1; \
+        else \
+            break; \
+    } \
+
+
+void* getIterSingle(void* arg)
+{
+    /* pthread-style routine: arg is a ThreadData*; pins the thread, calibrates the kernel's iteration count with MEASURE, stores it in data->data.iter and returns NULL. */
+    int threadId = 0;
+    size_t size = 0;
+    size_t i;
+    ThreadData* data;
+    ThreadUserData* myData;
+    TimerData time;
+    FuncPrototype func;
+    size_t iterations = 0;      /* stays 0 when the stream count matches no case below */
+
+    data = (ThreadData*) arg;
+    myData = &(data->data);
+    func = myData->test->kernel;
+    threadId = data->threadId;
+
+    size = myData->size - (myData->size % myData->test->stride);   /* round down to a multiple of the stride */
+    likwid_pinThread(myData->processors[threadId]);
+
+#ifdef DEBUG_LIKWID
+    printf("Automatic iteration count detection:");
+#endif
+
+    switch ( myData->test->streams ) {   /* STREAM_N: call the kernel with size plus streams[0..N-1] */
+        case STREAM_1:
+            MEASURE(func(size,myData->streams[0]));
+            break;
+        case STREAM_2:
+            MEASURE(func(size,myData->streams[0],myData->streams[1]));
+            break;
+        case STREAM_3:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2]));
+            break;
+        case STREAM_4:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3]));
+            break;
+        case STREAM_5:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4]));
+            break;
+        case STREAM_6:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5]));
+            break;
+        case STREAM_7:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6]));
+            break;
+        case STREAM_8:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7]));
+            break;
+        case STREAM_9:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8]));
+            break;
+        case STREAM_10:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9]));
+            break;
+        case STREAM_11:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10]));
+            break;
+        case STREAM_12:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11]));
+            break;
+        case STREAM_13:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12]));
+            break;
+        case STREAM_14:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13]));
+            break;
+        case STREAM_15:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14]));
+            break;
+        case STREAM_16:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15]));
+            break;
+        case STREAM_17:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16]));
+            break;
+        case STREAM_18:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17]));
+            break;
+        case STREAM_19:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18]));
+            break;
+        case STREAM_20:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19]));
+            break;
+        case STREAM_21:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20]));
+            break;
+        case STREAM_22:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21]));
+            break;
+        case STREAM_23:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22]));
+            break;
+        case STREAM_24:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23]));
+            break;
+        case STREAM_25:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24]));
+            break;
+        case STREAM_26:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25]));
+            break;
+        case STREAM_27:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26]));
+            break;
+        case STREAM_28:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27]));
+            break;
+        case STREAM_29:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28]));
+            break;
+        case STREAM_30:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29]));
+            break;
+        case STREAM_31:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30]));
+            break;
+        case STREAM_32:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31]));
+            break;
+        case STREAM_33:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32]));
+            break;
+        case STREAM_34:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32],myData->streams[33]));
+            break;
+        case STREAM_35:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32],myData->streams[33],myData->streams[34]));
+            break;
+        case STREAM_36:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35]));
+            break;
+        case STREAM_37:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35],
+                        myData->streams[36]));
+            break;
+        case STREAM_38:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35],
+                        myData->streams[36],myData->streams[37]));
+            break;
+        default:
+            break;
+    }
+    data->data.iter = iterations;
+#ifdef DEBUG_LIKWID
+    printf(" %zu iterations per thread\n", iterations);   /* iterations is a size_t, so %zu rather than %d */
+    if (iterations < MIN_ITERATIONS)   /* report only: the value stored above is not clamped in this function */
+        printf("Sanitizing iterations count per thread to %d\n",MIN_ITERATIONS);
+#endif
+    return NULL;
+}
diff --git a/bench/src/bstrlib.c b/bench/src/bstrlib.c
new file mode 100644
index 0000000..380269c
--- /dev/null
+++ b/bench/src/bstrlib.c
@@ -0,0 +1,2955 @@
+/*
+ * =======================================================================================
+ * This source file is part of the bstring string library.  This code was
+ * written by Paul Hsieh in 2002-2008, and is covered by the BSD open source 
+ * license and the GPL. Refer to the accompanying documentation for details 
+ * on usage and license.
+ */
+/*
+ * bstrlib.c
+ *
+ * This file is the core module for implementing the bstring functions.
+ */
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include "bstrlib.h"
+
+/* Optionally include a mechanism for debugging memory */
+
+#if defined(MEMORY_DEBUG) || defined(BSTRLIB_MEMORY_DEBUG)
+#include "memdbg.h"
+#endif
+
+#ifndef bstr__alloc
+#define bstr__alloc(x) malloc (x)
+#endif
+
+#ifndef bstr__free
+#define bstr__free(p) free (p)
+#endif
+
+#ifndef bstr__realloc
+#define bstr__realloc(p,x) realloc ((p), (x))
+#endif
+
+#ifndef bstr__memcpy
+#define bstr__memcpy(d,s,l) memcpy ((d), (s), (l))
+#endif
+
+#ifndef bstr__memmove
+#define bstr__memmove(d,s,l) memmove ((d), (s), (l))
+#endif
+
+#ifndef bstr__memset
+#define bstr__memset(d,c,l) memset ((d), (c), (l))
+#endif
+
+#ifndef bstr__memcmp
+#define bstr__memcmp(d,c,l) memcmp ((d), (c), (l))
+#endif
+
+#ifndef bstr__memchr
+#define bstr__memchr(s,c,l) memchr ((s), (c), (l))
+#endif
+
+/* Length-safe wrapper for memmove: copies only when L > 0, else does nothing. */
+
+#define bBlockCopy(D,S,L) { if ((L) > 0) bstr__memmove ((D),(S),(L)); }
+
+/* Round a requested buffer size up to an allocation size: anything below 8
+   becomes 8, otherwise the next power of two strictly above the request. */
+static int snapUpSize (int i) {
+    unsigned int bits;
+
+    /* Small requests all share the minimum size. */
+    if (i < 8) return 8;
+
+    /* Smear the top set bit into every lower position. */
+    bits = (unsigned int) i;
+    bits |= (bits >>  1);
+    bits |= (bits >>  2);
+    bits |= (bits >>  4);
+    bits |= (bits >>  8);      /* Ok, since int >= 16 bits */
+#if (UINT_MAX != 0xffff)
+    bits |= (bits >> 16);      /* For 32 bit int systems */
+#if (UINT_MAX > 0xffffffffUL)
+    bits |= (bits >> 32);      /* For 64 bit int systems */
+#endif
+#endif
+    /* All-ones plus one is the least power of two greater than i. */
+    bits++;
+    /* Keep the request itself if the converted result fell below it. */
+    return ((int) bits >= i) ? (int) bits : i;
+}
+
+/*  int balloc (bstring b, int olen)
+ *  Increase the size of the memory backing the bstring b to at least olen
+ *  bytes.  Returns BSTR_ERR on invalid arguments or allocation failure.
+ */
+int balloc (bstring b, int olen) {
+    int len;
+    if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen <= 0 || 
+        b->mlen < b->slen || olen <= 0) {
+        return BSTR_ERR;
+    }
+    /* Requests below the current capacity need no work: the buffer only grows. */
+    if (olen >= b->mlen) {
+        unsigned char * x;
+
+        if ((len = snapUpSize (olen)) <= b->mlen) return BSTR_OK;
+
+        /* Assume probability of a non-moving realloc is 0.125 */
+        if (7 * b->mlen < 8 * b->slen) {
+
+            /* If slen is close to mlen in size then use realloc to reduce
+               memory fragmentation */
+
+            reallocStrategy:;
+
+            x = (unsigned char *) bstr__realloc (b->data, (size_t) len);
+            if (x == NULL) {
+
+                /* Since that failed, retry with the tightest possible
+                   allocation (exactly olen bytes) */
+
+                if (NULL == (x = (unsigned char *) bstr__realloc (b->data, (size_t) (len = olen)))) {
+                    return BSTR_ERR;
+                }
+            }
+        } else {
+
+            /* If slen is not close to mlen then avoid the penalty of copying
+               the extra bytes that are allocated, but not considered part of
+               the string */
+
+            if (NULL == (x = (unsigned char *) bstr__alloc ((size_t) len))) {
+
+                /* Perhaps there is not enough memory for the old and the new
+                   buffer to exist at once; fall back to realloc */
+
+                goto reallocStrategy;
+
+            } else {
+                if (b->slen) bstr__memcpy ((char *) x, (char *) b->data, (size_t) b->slen);
+                bstr__free (b->data);
+            }
+        }
+        b->data = x;
+        b->mlen = len;
+        b->data[b->slen] = (unsigned char) '\0';
+    }
+
+    return BSTR_OK;
+}
+
+/*  int ballocmin (bstring b, int len)
+ *
+ *  Resize the memory backing the bstring b to exactly len bytes, or to
+ *  b->slen+1 bytes when len is smaller than that.  Calling this repeatedly
+ *  can degrade performance.
+ */
+int ballocmin (bstring b, int len) {
+    unsigned char * resized;
+
+    if (b == NULL || b->data == NULL || (b->slen+1) < 0) return BSTR_ERR;
+    if (b->mlen <= 0 || b->mlen < b->slen || len <= 0) return BSTR_ERR;
+
+    /* Never go below the string contents plus the terminator. */
+    if (len < b->slen + 1) len = b->slen + 1;
+
+    /* Already exactly that size: nothing to do. */
+    if (len == b->mlen) return BSTR_OK;
+
+    resized = (unsigned char *) bstr__realloc (b->data, (size_t) len);
+    if (resized == NULL) return BSTR_ERR;
+
+    resized[b->slen] = (unsigned char) '\0';
+    b->data = resized;
+    b->mlen = len;
+    return BSTR_OK;
+}
+
+/*  bstring bfromcstr (const char * str)
+ *
+ *  Create a bstring holding a copy of the '\0' terminated C string str.
+ *  Returns NULL if str is NULL, too long for an int length, or on no memory.
+ */
+bstring bfromcstr (const char * str) {
+bstring b;
+int i;
+size_t j;
+
+    if (str == NULL) return NULL;
+    j = (strlen) (str);
+    /* A length of INT_MAX or more cannot be held in slen; reject before casting. */
+    if (j >= (size_t) INT_MAX) return NULL;
+    i = snapUpSize ((int) (j + (2 - (j != 0))));
+    if (i <= (int) j) return NULL;
+    b = (bstring) bstr__alloc (sizeof (struct tagbstring));
+    if (NULL == b) return NULL;
+    b->slen = (int) j;
+    if (NULL == (b->data = (unsigned char *) bstr__alloc (b->mlen = i))) {
+        bstr__free (b);
+        return NULL;
+    }
+    bstr__memcpy (b->data, str, j+1);
+    return b;
+}
+
+/*  bstring bfromcstralloc (int mlen, const char * str)
+ *
+ *  Create a bstring holding a copy of the '\0' terminated C string str, with
+ *  a backing buffer of at least mlen bytes.  Returns NULL if str is NULL, too
+ *  long for an int length, or memory cannot be allocated.
+ */
+bstring bfromcstralloc (int mlen, const char * str) {
+bstring b;
+int i;
+size_t j;
+
+    if (str == NULL) return NULL;
+    j = (strlen) (str);
+    /* A length of INT_MAX or more cannot be held in slen; reject before casting. */
+    if (j >= (size_t) INT_MAX) return NULL;
+    i = snapUpSize ((int) (j + (2 - (j != 0))));
+    if (i <= (int) j) return NULL;
+
+    b = (bstring) bstr__alloc (sizeof (struct tagbstring));
+    if (b == NULL) return NULL;
+    b->slen = (int) j;
+    if (i < mlen) i = mlen;    /* honour a larger caller-requested capacity */
+    if (NULL == (b->data = (unsigned char *) bstr__alloc (b->mlen = i))) {
+        bstr__free (b);
+        return NULL;
+    }
+    bstr__memcpy (b->data, str, j+1);
+    return b;
+}
+
+/*  bstring blk2bstr (const void * blk, int len)
+ *
+ *  Create a bstring holding a copy of the len bytes found at blk.  Returns
+ *  NULL if blk is NULL, len is negative, or memory cannot be allocated.
+ */
+bstring blk2bstr (const void * blk, int len) {
+bstring out;
+int cap;
+
+    if (blk == NULL || len < 0) return NULL;
+
+    out = (bstring) bstr__alloc (sizeof (struct tagbstring));
+    if (out == NULL) return NULL;
+
+    /* Room for the block plus a terminator, snapped up by snapUpSize. */
+    cap = snapUpSize (len + (2 - (len != 0)));
+    out->slen = len;
+    out->mlen = cap;
+    out->data = (unsigned char *) bstr__alloc ((size_t) cap);
+    if (out->data == NULL) {
+        bstr__free (out);
+        return NULL;
+    }
+
+    /* Copy the payload (if any) and terminate it. */
+    if (len > 0) bstr__memcpy (out->data, blk, (size_t) len);
+    out->data[len] = (unsigned char) '\0';
+
+    return out;
+}
+
+/*  char * bstr2cstr (const_bstring b, char z)
+ *
+ *  Allocate and return a '\0' terminated C string with the same contents as
+ *  the bstring b, except that every '\0' inside b is replaced by z.  NULL is
+ *  returned if b is invalid or memory cannot be allocated.  The caller
+ *  should release the result with bcstrfree ().
+ */
+char * bstr2cstr (const_bstring b, char z) {
+int pos, n;
+char * out;
+
+    if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
+    n = b->slen;
+    out = (char *) bstr__alloc ((size_t) (n + 1));
+    if (out == NULL) return NULL;
+
+    /* Copy byte by byte, substituting z for every embedded '\0'. */
+    for (pos = 0; pos < n; pos++) {
+        unsigned char c = b->data[pos];
+        out[pos] = (c == '\0') ? z : (char) c;
+    }
+    out[n] = '\0';
+    return out;
+}
+
+/*  int bcstrfree (char * s)
+ *
+ *  Release a C string that was created by bstr2cstr ().  This is only a
+ *  thin wrapper around bstr__free (), but bstr__alloc () and bstr__free ()
+ *  may have been redefined as macros inside the bstrlib module (through
+ *  memdbg.h, after defining BSTRLIB_MEMORY_DEBUG) and then behave
+ *  differently from the standard library functions.  Freeing through this
+ *  function keeps higher level code independent of such redefinitions.
+ *
+ *  Returns BSTR_OK, or BSTR_ERR if s is NULL (nothing is freed).
+ */
+int bcstrfree (char * s) {
+    /* There is nothing to release for a NULL pointer. */
+    if (s == NULL) return BSTR_ERR;
+
+    bstr__free (s);
+    return BSTR_OK;
+}
+
+/*  int bconcat (bstring b0, const_bstring b1)
+ *
+ *  Append the contents of the bstring b1 to the end of the bstring b0.
+ */
+int bconcat (bstring b0, const_bstring b1) {
+int len, d;
+bstring aux = (bstring) b1;
+
+    if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL) return BSTR_ERR;
+
+    d = b0->slen;
+    len = b1->slen;
+    if ((d | (b0->mlen - d) | len | (d + len)) < 0) return BSTR_ERR;
+    /* b0 must grow: if b1's data lies inside b0's buffer, append from a copy. */
+    if (b0->mlen <= d + len + 1) {
+        ptrdiff_t pd = b1->data - b0->data;
+        if (0 <= pd && pd < b0->mlen) {
+            if (NULL == (aux = bstrcpy (b1))) return BSTR_ERR;
+        }
+        if (balloc (b0, d + len + 1) != BSTR_OK) {
+            if (aux != b1) bdestroy (aux);
+            return BSTR_ERR;
+        }
+    }
+
+    bBlockCopy (&b0->data[d], &aux->data[0], (size_t) len);
+    b0->data[d + len] = (unsigned char) '\0';
+    b0->slen = d + len;
+    if (aux != b1) bdestroy (aux);
+    return BSTR_OK;
+}
+
+/*  int bconchar (bstring b, char c)
+ *  Append the single character c to the end of the bstring b.
+ */
+int bconchar (bstring b, char c) {
+int end;
+
+    if (b == NULL) return BSTR_ERR;
+    end = b->slen;
+    if ((end | (b->mlen - end)) < 0) return BSTR_ERR;
+    if (balloc (b, end + 2) != BSTR_OK) return BSTR_ERR;
+    b->data[end + 1] = (unsigned char) '\0';
+    b->data[end] = (unsigned char) c;
+    b->slen = end + 1;
+    return BSTR_OK;
+}
+
+/*  int bcatcstr (bstring b, const char * s)
+ *
+ *  Append the '\0' terminated C string s to the bstring b.
+ */
+int bcatcstr (bstring b, const char * s) {
+char * d;
+int i, l;
+
+    if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen < b->slen
+     || b->mlen <= 0 || s == NULL) return BSTR_ERR;
+
+    /* Optimistically copy straight into the unused tail of the buffer */
+    l = b->mlen - b->slen;
+    d = (char *) &b->data[b->slen];
+    for (i=0; i < l; i++) {
+        if ((*d++ = *s++) == '\0') {
+            b->slen += i;
+            return BSTR_OK;
+        }
+    }
+    b->slen += i;
+
+    /* Buffer is full: let bcatblk resize and append what is left of s */
+    return bcatblk (b, (const void *) s, (int) strlen (s));
+}
+
+/*  int bcatblk (bstring b, const void * s, int len)
+ *
+ *  Append the len bytes found at s to the end of the bstring b.
+ */
+int bcatblk (bstring b, const void * s, int len) {
+int total;
+
+    if (b == NULL || b->data == NULL || s == NULL) return BSTR_ERR;
+    if (b->slen < 0 || b->mlen < b->slen || b->mlen <= 0 || len < 0) return BSTR_ERR;
+
+    total = b->slen + len;
+    if (total < 0) return BSTR_ERR;    /* Overflow? */
+    if (total >= b->mlen && balloc (b, total + 1) < 0) return BSTR_ERR;
+    bBlockCopy (&b->data[b->slen], s, (size_t) len);
+    b->data[total] = (unsigned char) '\0';
+    b->slen = total;
+    return BSTR_OK;
+}
+
+/*  bstring bstrcpy (const_bstring b)
+ *
+ *  Return a newly allocated copy of the bstring b, or NULL on failure.
+ */
+bstring bstrcpy (const_bstring b) {
+bstring copy;
+unsigned char * buf;
+int n, cap;
+
+    /* An invalid source cannot be copied. */
+    if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
+
+    /* Header first; give up if even that cannot be allocated. */
+    copy = (bstring) bstr__alloc (sizeof (struct tagbstring));
+    if (copy == NULL) return NULL;
+
+    /* Prefer a snapped-up capacity for the data buffer ... */
+    n = b->slen;
+    cap = snapUpSize (n + 1);
+    buf = (unsigned char *) bstr__alloc (cap);
+
+    /* ... but retry with the tightest fit if that allocation fails. */
+    if (buf == NULL) {
+        cap = n + 1;
+        buf = (unsigned char *) bstr__alloc (cap);
+    }
+    if (buf == NULL) {
+        /* No memory for the string data: release the header again. */
+        bstr__free (copy);
+        return NULL;
+    }
+
+    copy->data = buf;
+    copy->mlen = cap;
+    copy->slen = n;
+    if (n) bstr__memcpy ((char *) buf, (char *) b->data, n);
+    buf[n] = (unsigned char) '\0';
+    return copy;
+}
+
+/*  int bassign (bstring a, const_bstring b)
+ *
+ *  Overwrite a with the contents of b; BSTR_ERR on invalid input or no memory.
+ */
+int bassign (bstring a, const_bstring b) {
+    if (b == NULL || b->data == NULL || b->slen < 0)
+        return BSTR_ERR;
+    if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+        a->slen < 0 || a->mlen <= 0)
+        return BSTR_ERR;
+    /* Ask for b->slen + 1 bytes so that the terminator written below stays in
+       bounds even when balloc can only obtain the tightest allocation. */
+    if (b->slen >= a->mlen && balloc (a, b->slen + 1) != BSTR_OK)
+        return BSTR_ERR;
+    if (b->slen != 0) bstr__memmove (a->data, b->data, b->slen);
+    a->data[b->slen] = (unsigned char) '\0';
+    a->slen = b->slen;
+    return BSTR_OK;
+}
+
+/*  int bassignmidstr (bstring a, const_bstring b, int left, int len)
+ *
+ *  Overwrite the string a with the part of b that starts at position left
+ *  and runs for len characters.  left and len are clamped to the ends of b
+ *  as in bmidstr.  Returns BSTR_ERR if a or b is invalid or a cannot grow.
+ */
+int bassignmidstr (bstring a, const_bstring b, int left, int len) {
+    if (b == NULL || b->data == NULL || b->slen < 0)
+        return BSTR_ERR;
+    /* Clamp the (left, len) window to the boundaries of b. */
+    if (left < 0) {
+        len += left;
+        left = 0;
+    }
+    if (len > b->slen - left) len = b->slen - left;
+
+    if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+        a->slen < 0 || a->mlen == 0)
+        return BSTR_ERR;
+
+    if (len > 0) {
+        /* Ask for len + 1 so the terminator written below always fits. */
+        if (len >= a->mlen && balloc (a, len + 1) != BSTR_OK) return BSTR_ERR;
+        bstr__memmove (a->data, b->data + left, len);
+        a->slen = len;
+    } else {
+        a->slen = 0;
+    }
+    a->data[a->slen] = (unsigned char) '\0';
+    return BSTR_OK;
+}
+
+/*  int bassigncstr (bstring a, const char * str)
+ *
+ *  Overwrite the string a with the contents of char * string str.  Note that 
+ *  the bstring a must be a well defined and writable bstring.  If an error 
+ *  occurs BSTR_ERR is returned; a may then be partially overwritten.
+ */
+int bassigncstr (bstring a, const char * str) {
+int i;
+size_t len;
+    if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+        a->slen < 0 || a->mlen == 0 || NULL == str) 
+        return BSTR_ERR;
+    /* Copy into the existing buffer until str ends or the buffer is full. */
+    for (i=0; i < a->mlen; i++) {
+        if ('\0' == (a->data[i] = str[i])) {
+            a->slen = i;
+            return BSTR_OK;
+        }
+    }
+    /* Buffer exhausted: grow it, then copy the rest of str with its '\0'. */
+    a->slen = i;
+    len = strlen (str + i);
+    if (len > INT_MAX || i + len + 1 > INT_MAX ||
+        0 > balloc (a, (int) (i + len + 1))) return BSTR_ERR;
+    bBlockCopy (a->data + i, str + i, (size_t) len + 1);
+    a->slen += (int) len;
+    return BSTR_OK;
+}
+
+/*  int bassignblk (bstring a, const void * s, int len)
+ *
+ *  Overwrite the writable bstring a with the len bytes found at s.  On any
+ *  error BSTR_ERR is returned and the contents of a are left untouched.
+ */
+int bassignblk (bstring a, const void * s, int len) {
+    if (a == NULL || a->data == NULL || NULL == s) return BSTR_ERR;
+    if (a->mlen < a->slen || a->slen < 0 || a->mlen == 0) return BSTR_ERR;
+    if (len + 1 < 1) return BSTR_ERR;
+    /* Grow only when the block plus its terminator does not fit already. */
+    if (len + 1 > a->mlen && 0 > balloc (a, len + 1)) return BSTR_ERR;
+    bBlockCopy (a->data, s, (size_t) len);
+    a->slen = len;
+    a->data[len] = (unsigned char) '\0';
+    return BSTR_OK;
+}
+
+/*  int btrunc (bstring b, int n)
+ *
+ *  Shorten the bstring b to at most n characters; shorter strings are kept.
+ */
+int btrunc (bstring b, int n) {
+    if (n < 0 || b == NULL || b->data == NULL) return BSTR_ERR;
+    if (b->mlen < b->slen || b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+    /* Strings that are already short enough are left alone. */
+    if (n >= b->slen) return BSTR_OK;
+    b->data[n] = (unsigned char) '\0';
+    b->slen = n;
+    return BSTR_OK;
+}
+
+#define   upcase(c) (toupper ((unsigned char) (c)))  /* (c) parenthesised so the */
+#define downcase(c) (tolower ((unsigned char) (c)))  /* cast covers the whole    */
+#define   wspace(c) (isspace ((unsigned char) (c)))  /* macro argument           */
+
+/*  int btoupper (bstring b)
+ *
+ *  Convert the first b->slen bytes of b to upper case, in place.
+ */
+int btoupper (bstring b) {
+unsigned char * p, * end;
+    if (b == NULL || b->data == NULL) return BSTR_ERR;
+    if (b->mlen < b->slen || b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+    for (p = b->data, end = p + b->slen; p != end; p++) {
+        *p = (unsigned char) upcase (*p);
+    }
+    return BSTR_OK;
+}
+
+/*  int btolower (bstring b)
+ *
+ *  Convert the first b->slen bytes of b to lower case, in place.
+ */
+int btolower (bstring b) {
+unsigned char * p, * end;
+    if (b == NULL || b->data == NULL) return BSTR_ERR;
+    if (b->mlen < b->slen || b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+    for (p = b->data, end = p + b->slen; p != end; p++) {
+        *p = (unsigned char) downcase (*p);
+    }
+    return BSTR_OK;
+}
+
+/*  int bstricmp (const_bstring b0, const_bstring b1)
+ *
+ *  Case-insensitive comparison of two bstrings.  Returns 0 when they are
+ *  equal and otherwise the difference between the first pair of characters
+ *  that differ once both are converted to lower case.  When one string is
+ *  a prefix of the other, the result is derived from the first extra
+ *  character, with an extra '\0' counted as UCHAR_MAX+1.  SHRT_MIN is
+ *  returned if either argument is invalid.
+ */
+int bstricmp (const_bstring b0, const_bstring b1) {
+int pos, diff, common;
+
+    if (bdata (b0) == NULL || b0->slen < 0) return SHRT_MIN;
+    if (bdata (b1) == NULL || b1->slen < 0) return SHRT_MIN;
+
+    /* Same length and same storage: trivially equal. */
+    if (b0->slen == b1->slen && b0->data == b1->data) return BSTR_OK;
+    common = (b0->slen < b1->slen) ? b0->slen : b1->slen;
+    /* Compare the shared prefix after folding both sides to lower case. */
+    for (pos = 0; pos < common; pos++) {
+        diff = (char) downcase (b0->data[pos]) - (char) downcase (b1->data[pos]);
+        if (diff != 0) return diff;
+    }
+
+    /* One string is a prefix of the other: decide on the first extra byte. */
+    if (b0->slen > common) {
+        diff = (char) downcase (b0->data[common]);
+        return diff ? diff : UCHAR_MAX + 1;
+    }
+    if (b1->slen > common) {
+        diff = - (char) downcase (b1->data[common]);
+        return diff ? diff : - (int) (UCHAR_MAX + 1);
+    }
+    return BSTR_OK;
+}
+
+/*  int bstrnicmp (const_bstring b0, const_bstring b1, int n)
+ *
+ *  Compare two strings without differentiating between case for at most n
+ *  characters.  If the strings differ within that range, the return value
+ *  is the difference of the first differing characters after lower case
+ *  transformation, otherwise 0 is returned.  If the lengths are different
+ *  and less than n characters, then a difference from 0 is given, but if the
+ *  first extra character is '\0', then it is taken to be the value
+ *  UCHAR_MAX+1.  SHRT_MIN is returned for invalid arguments.
+ */
+int bstrnicmp (const_bstring b0, const_bstring b1, int n) {
+int i, v, m;
+
+    if (bdata (b0) == NULL || b0->slen < 0 ||
+        bdata (b1) == NULL || b1->slen < 0 || n < 0) return SHRT_MIN;
+    m = n;
+    if (m > b0->slen) m = b0->slen;
+    if (m > b1->slen) m = b1->slen;
+
+    if (b0->data != b1->data) {
+        for (i = 0; i < m; i ++) {
+            v  = (char) downcase (b0->data[i]);
+            v -= (char) downcase (b1->data[i]);
+            /* Case-folded difference, so the sign matches the ordering. */
+            if (v != 0) return v;
+        }
+    }
+
+    if (n == m || b0->slen == b1->slen) return BSTR_OK;
+    if (b0->slen > m) {
+        v = (char) downcase (b0->data[m]);
+        if (v) return v;
+        return UCHAR_MAX + 1;
+    }
+
+    v = - (char) downcase (b1->data[m]);
+    if (v) return v;
+    return - (int) (UCHAR_MAX + 1);
+}
+
+/*  int biseqcaseless (const_bstring b0, const_bstring b1)
+ *
+ *  Test two strings for equality without differentiating between case.
+ *  Returns 1 if the strings are the same apart from case, 0 if they differ
+ *  in more than case and -1 if there is an error.  Strings of different
+ *  length are rejected in O(1).  '\0' termination characters are not
+ *  treated in any special way.
+ */
+int biseqcaseless (const_bstring b0, const_bstring b1) {
+int pos;
+
+    if (bdata (b0) == NULL || b0->slen < 0) return BSTR_ERR;
+    if (bdata (b1) == NULL || b1->slen < 0) return BSTR_ERR;
+    if (b0->slen != b1->slen) return BSTR_OK;    /* different lengths: unequal */
+    if (b0->data == b1->data || b0->slen == 0) return 1;
+    for (pos = 0; pos < b0->slen; pos++) {
+        unsigned char x = b0->data[pos], y = b1->data[pos];
+        /* Only fold case when the raw bytes disagree. */
+        if (x == y) continue;
+        if ((unsigned char) downcase (x) != (unsigned char) downcase (y)) return 0;
+    }
+    return 1;
+}
+
+/*  int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len)
+ *
+ *  Test, without differentiating between case, whether b0 begins with the
+ *  len bytes found at blk.  Returns 1 if the beginning of b0 matches the
+ *  block apart from case, 0 if it differs in more than case or if b0 is
+ *  shorter than len, and -1 if there is an error.  '\0' characters are not
+ *  treated in any special way.  An empty block (len == 0) always matches.
+ *  The comparison is done byte by byte.
+ */
+int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len) {
+const unsigned char * other = (const unsigned char *) blk;
+int pos;
+
+    if (bdata (b0) == NULL || b0->slen < 0) return BSTR_ERR;
+    if (other == NULL || len < 0) return BSTR_ERR;
+    if (b0->slen < len) return BSTR_OK;    /* b0 is too short to match */
+    if (b0->data == other || len == 0) return 1;
+
+    for (pos = 0; pos < len; pos++) {
+        /* Identical bytes need no folding; otherwise compare lower-cased. */
+        if (b0->data[pos] == other[pos]) continue;
+        if (downcase (b0->data[pos]) != downcase (other[pos])) return 0;
+    }
+    return 1;
+}
+
+/*
+ * int bltrimws (bstring b)
+ *
+ * Remove leading whitespace from b.  BSTR_ERR is returned if b is invalid.
+ */
+int bltrimws (bstring b) {
+int lead;
+
+    if (b == NULL || b->data == NULL) return BSTR_ERR;
+    if (b->mlen < b->slen || b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+
+    /* Count the run of whitespace at the front. */
+    lead = 0;
+    while (lead < b->slen && wspace (b->data[lead])) lead++;
+    /* Some non-whitespace survives: drop just the leading run. */
+    if (lead < b->slen) return bdelete (b, 0, lead);
+
+    b->data[0] = (unsigned char) '\0';
+    b->slen = 0;
+    return BSTR_OK;
+}
+
+/*
+ * int brtrimws (bstring b)
+ *
+ * Remove trailing whitespace from b.  BSTR_ERR if b is invalid, else BSTR_OK.
+ */
+int brtrimws (bstring b) {
+int i;
+
+    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+    /* Cut after the last non-whitespace byte; terminate only if i+1 is in the buffer. */
+    for (i = b->slen - 1; i >= 0; i--) {
+        if (!wspace (b->data[i])) {
+            if (b->mlen > i + 1) b->data[i+1] = (unsigned char) '\0';
+            b->slen = i + 1;
+            return BSTR_OK;
+        }
+    }
+
+    b->data[0] = (unsigned char) '\0';
+    b->slen = 0;
+    return BSTR_OK;
+}
+
+/*
+ * int btrimws (bstring b)
+ *
+ * Remove whitespace from both ends of b.  BSTR_ERR is returned if b is invalid.
+ */
+int btrimws (bstring b) {
+int i, j;
+
+    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+    /* Cut after the last non-whitespace byte; terminate only if i+1 is in the buffer. */
+    for (i = b->slen - 1; i >= 0; i--) {
+        if (!wspace (b->data[i])) {
+            if (b->mlen > i + 1) b->data[i+1] = (unsigned char) '\0';
+            b->slen = i + 1;
+            for (j = 0; wspace (b->data[j]); j++) {}    /* stops at i at the latest */
+            return bdelete (b, 0, j);
+        }
+    }
+
+    b->data[0] = (unsigned char) '\0';
+    b->slen = 0;
+    return BSTR_OK;
+}
+
+/*  int biseq (const_bstring b0, const_bstring b1)
+ *
+ *  Test b0 and b1 for equality: 1 if they are the same, 0 if they differ and
+ *  -1 on error.  Strings of different length are rejected in O(1), and '\0'
+ *  characters are not treated in any special way.
+ */
+int biseq (const_bstring b0, const_bstring b1) {
+    if (b0 == NULL || b1 == NULL) return BSTR_ERR;
+    if (b0->data == NULL || b1->data == NULL) return BSTR_ERR;
+    if (b0->slen < 0 || b1->slen < 0) return BSTR_ERR;
+    if (b0->slen != b1->slen) return BSTR_OK;
+    if (b0->data == b1->data || b0->slen == 0) return 1;
+    return bstr__memcmp (b0->data, b1->data, b0->slen) == 0;
+}
+
+/*  int bisstemeqblk (const_bstring b0, const void * blk, int len)
+ *
+ *  Test whether b0 begins with the len bytes found at blk.  Returns 1 if
+ *  the beginning of b0 is identical to the block, 0 if it differs or if b0
+ *  is shorter than len, and -1 if there is an error.  '\0' characters are
+ *  not treated in any special way.  An empty block (len == 0) always
+ *  matches.
+ */
+int bisstemeqblk (const_bstring b0, const void * blk, int len) {
+const unsigned char * other = (const unsigned char *) blk;
+int pos = 0;
+
+    if (bdata (b0) == NULL || b0->slen < 0) return BSTR_ERR;
+    if (other == NULL || len < 0) return BSTR_ERR;
+    if (b0->slen < len) return BSTR_OK;    /* b0 is too short to match */
+    if (b0->data == other || len == 0) return 1;
+
+    /* Walk both buffers until they disagree or len bytes have matched. */
+    while (pos < len && b0->data[pos] == other[pos]) pos++;
+    return (pos == len) ? 1 : BSTR_OK;
+}
+
+/*  int biseqcstr (const_bstring b, const char *s)
+ *
+ *  Compare the bstring b with the '\0' terminated C string s.  They are
+ *  equal only if s ends at exactly the length of b and every character up
+ *  to there matches, which also means that b holds no '\0' characters.
+ *  Put differently, they are equal when converting either one to the
+ *  other's format would give identical contents.  Returns 1 if equal, 0 if
+ *  unequal and BSTR_ERR if there is a detectable error.
+ */
+int biseqcstr (const_bstring b, const char * s) {
+int pos = 0;
+    if (b == NULL || s == NULL || b->data == NULL || b->slen < 0) return BSTR_ERR;
+    while (pos < b->slen) {
+        if (s[pos] == '\0') return BSTR_OK;    /* s ends before b does */
+        if (b->data[pos] != (unsigned char) s[pos]) return BSTR_OK;
+        pos++;
+    }
+    return s[pos] == '\0';
+}
+
+/*  int biseqcstrcaseless (const_bstring b, const char *s)
+ *
+ *  Compare the bstring b with the '\0' terminated C string s, ignoring
+ *  case.  They are equal only if s ends at exactly the length of b and
+ *  every character up to there matches except for case, which also means
+ *  that b holds no '\0' characters.  Put differently, they are equal
+ *  ignoring case when converting either one to the other's format would
+ *  give the same contents.  Returns 1 if equal except for case, 0 if
+ *  unequal regardless of case and BSTR_ERR on a detectable error.
+ */
+int biseqcstrcaseless (const_bstring b, const char * s) {
+int pos = 0;
+    if (b == NULL || s == NULL || b->data == NULL || b->slen < 0) return BSTR_ERR;
+    while (pos < b->slen) {
+        unsigned char lhs = b->data[pos];
+        if (s[pos] == '\0') return BSTR_OK;    /* s ends before b does */
+        /* Bytes that differ must at least agree once lower-cased. */
+        if (lhs != (unsigned char) s[pos] &&
+            downcase (lhs) != (unsigned char) downcase (s[pos])) return BSTR_OK;
+        pos++;
+    }
+    return s[pos] == '\0';
+}
+
+/*  int bstrcmp (const_bstring b0, const_bstring b1)
+ *
+ *  Compare the strings b0 and b1.  SHRT_MIN is returned on error.
+ *  Otherwise the result is less than or greater than zero when b0 is
+ *  lexicographically less than or greater than b1.  If the lengths differ
+ *  but all characters up to the shorter length are equal, the result is
+ *  less than or greater than zero according to whether b0 is shorter or
+ *  longer than b1.  0 is returned when the two strings are the same.  If
+ *  the lengths differ this function is O(n).  Like the standard C library
+ *  function strcmp, the comparison does not proceed past any '\0'
+ *  termination characters encountered.
+ */
+int bstrcmp (const_bstring b0, const_bstring b1) {
+int pos, diff, common;
+
+    if (b0 == NULL || b1 == NULL) return SHRT_MIN;
+    if (b0->data == NULL || b1->data == NULL) return SHRT_MIN;
+    if (b0->slen < 0 || b1->slen < 0) return SHRT_MIN;
+    /* Equal lengths with shared storage (or no content) need no scan. */
+    if (b0->slen == b1->slen && (b0->data == b1->data || b0->slen == 0))
+        return BSTR_OK;
+    common = (b0->slen < b1->slen) ? b0->slen : b1->slen;
+    for (pos = 0; pos < common; pos++) {
+        diff = ((char) b0->data[pos]) - ((char) b1->data[pos]);
+        if (diff != 0) return diff;
+        /* Both bytes are '\0' here: stop, as strcmp would. */
+        if (b0->data[pos] == (unsigned char) '\0') return BSTR_OK;
+    }
+
+    /* The common prefix matched, so the longer string compares greater. */
+    if (b0->slen != b1->slen) return (b0->slen > b1->slen) ? 1 : -1;
+    return BSTR_OK;
+}
+
+/*  int bstrncmp (const_bstring b0, const_bstring b1, int n)
+ *
+ *  Compare the strings b0 and b1 for at most n characters.  SHRT_MIN is
+ *  returned on error; otherwise the result is what bstrcmp would give if
+ *  b0 and b1 were first truncated to at most n characters and then passed
+ *  to it as parameters.  If the lengths of the strings differ, this
+ *  function is O(n).  Like the standard C library function strncmp, the
+ *  comparison does not proceed past any '\0' termination characters
+ *  encountered.
+ */
+int bstrncmp (const_bstring b0, const_bstring b1, int n) {
+int pos, diff, limit;
+
+    if (b0 == NULL || b1 == NULL) return SHRT_MIN;
+    if (b0->data == NULL || b1->data == NULL) return SHRT_MIN;
+    if (b0->slen < 0 || b1->slen < 0) return SHRT_MIN;
+
+    /* Scan no further than n or the end of the shorter string. */
+    limit = n;
+    if (b0->slen < limit) limit = b0->slen;
+    if (b1->slen < limit) limit = b1->slen;
+
+    /* Shared storage cannot differ inside the scanned range. */
+    for (pos = 0; b0->data != b1->data && pos < limit; pos++) {
+        diff = ((char) b0->data[pos]) - ((char) b1->data[pos]);
+        if (diff != 0) return diff;
+        if (b0->data[pos] == (unsigned char) '\0') return BSTR_OK;
+    }
+
+    if (n == limit || b0->slen == b1->slen) return BSTR_OK;
+    return (b0->slen > limit) ? 1 : -1;
+}
+
+/*  bstring bmidstr (const_bstring b, int left, int len)
+ *
+ *  Create a new bstring from the substring of b that starts at position
+ *  left and runs for len characters.  The section described by (left, len)
+ *  is clamped to the boundaries of b, so the result may be empty.  NULL is
+ *  returned if b is detectably invalid.
+ */
+bstring bmidstr (const_bstring b, int left, int len) {
+int avail;
+
+    if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
+
+    /* A negative start shortens the request and then begins at 0. */
+    if (left < 0) {
+        len += left;
+        left = 0;
+    }
+    avail = b->slen - left;
+    if (len > avail) len = avail;
+    return (len > 0) ? blk2bstr (b->data + left, len) : bfromcstr ("");
+}
+
+/*  int bdelete (bstring b, int pos, int len)
+ *
+ *  Remove the characters pos .. pos+len-1 from b and shift the tail that
+ *  starts at pos+len down to pos.  The section (pos, len) is clamped to the
+ *  boundaries of b, and nothing happens unless the clamped len is positive.
+ *  Returns BSTR_ERR if b is invalid or the clamped len is negative.
+ */
+int bdelete (bstring b, int pos, int len) {
+    /* Clamp to left side of bstring */
+    if (pos < 0) {
+        len += pos;
+        pos = 0;
+    }
+    if (len < 0 || b == NULL || b->data == NULL || b->slen < 0 ||
+        b->mlen < b->slen || b->mlen <= 0)
+        return BSTR_ERR;
+    if (len > 0 && pos < b->slen) {
+        /* Test len against the room left; pos + len can overflow an int. */
+        if (len >= b->slen - pos) {
+            b->slen = pos;
+        } else {
+            bBlockCopy ((char *) (b->data + pos),
+                        (char *) (b->data + pos + len),
+                        b->slen - (pos+len));
+            b->slen -= len;
+        }
+        b->data[b->slen] = (unsigned char) '\0';
+    }
+    return BSTR_OK;
+}
+
+/*  int bdestroy (bstring b)
+ *
+ *  Free the bstring b: both its data buffer and its header.  If b is
+ *  detectably invalid or not writable, nothing is freed and BSTR_ERR is
+ *  returned; otherwise BSTR_OK.  As with any freed allocation, using b in
+ *  any way after it has been destroyed is undefined.
+ */
+int bdestroy (bstring b) {
+    if (b == NULL || b->slen < 0 || b->mlen <= 0 || b->mlen < b->slen ||
+        b->data == NULL)
+        return BSTR_ERR;
+
+    bstr__free (b->data);
+
+    /* Poison the header with values the validity checks reject, so that
+       stale use has one more chance of being noticed. */
+
+    b->slen = -1;
+    b->mlen = -__LINE__;
+    b->data = NULL;
+
+    bstr__free (b);
+    return BSTR_OK;
+}
+
+/*  int binstr (const_bstring b1, int pos, const_bstring b2)
+ *
+ *  Search for the bstring b2 in b1 starting from position pos, and searching 
+ *  forward.  If it is found then return with the first position where it is 
+ *  found, otherwise return BSTR_ERR.  Note that this is just a brute force 
+ *  string searcher that does not attempt clever things like the Boyer-Moore 
+ *  search algorithm.  Because of this there are many degenerate cases where 
+ *  this can take much longer than it needs to.
+ *
+ *  An empty b2 is reported as found at pos for any pos in [0, b1->slen].
+ *  BSTR_ERR is also returned for NULL or malformed arguments and for a pos
+ *  outside of b1.
+ */
+int binstr (const_bstring b1, int pos, const_bstring b2) {
+int j, ii, ll, lf;
+unsigned char * d0;
+unsigned char c0;
+register unsigned char * d1;
+register unsigned char c1;
+register int i;
+
+    if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
+        b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+    /* At the very end of b1 only the empty string can match */
+    if (b1->slen == pos) return (b2->slen == 0)?pos:BSTR_ERR;
+    if (b1->slen < pos || pos < 0) return BSTR_ERR;
+    if (b2->slen == 0) return pos;
+
+    /* No space to find such a string? */
+    /* lf is the number of start positions at which b2 still fits in b1 */
+    if ((lf = b1->slen - b2->slen + 1) <= pos) return BSTR_ERR;
+
+    /* An obvious alias case */
+    if (b1->data == b2->data && pos == 0) return 0;
+
+    i = pos;
+
+    d0 = b2->data;
+    d1 = b1->data;
+    ll = b2->slen;
+
+    /* Peel off the b2->slen == 1 case */
+    c0 = d0[0];
+    if (1 == ll) {
+        for (;i < lf; i++) if (c0 == d1[i]) return i;
+        return BSTR_ERR;
+    }
+
+    /* General case: c1 is the pattern character currently being looked
+       for, j its index in b2, ii the start of the current candidate.
+       From here on lf is the index of the last character of b1. */
+    c1 = c0;
+    j = 0;
+    lf = b1->slen - 1;
+
+    ii = -1;
+    if (i < lf) do {
+        /* Unrolled current character test */
+        /* Looks at d1[i] and d1[i+1]; i < lf keeps 1+i inside b1 */
+        if (c1 != d1[i]) {
+            if (c1 != d1[1+i]) {
+                i += 2;
+                continue;
+            }
+            i++;
+        }
+
+        /* Take note if this is the start of a potential match */
+        if (0 == j) ii = i;
+
+        /* Shift the test character down by one */
+        j++;
+        i++;
+
+        /* If this isn't past the last character continue */
+        if (j < ll) {
+            c1 = d0[j];
+            continue;
+        }
+
+        /* Also entered by the goto after the loop, for a final character
+           found exactly at the last index of b1 */
+        N0:;
+
+        /* If no characters mismatched, then we matched */
+        /* i has advanced by exactly j from ii only if nothing was skipped */
+        if (i == ii+j) return ii;
+
+        /* Shift back to the beginning */
+        i -= j;
+        j  = 0;
+        c1 = c0;
+    } while (i < lf);
+
+    /* Deal with last case if unrolling caused a misalignment */
+    if (i == lf && ll == j+1 && c1 == d1[i]) goto N0;
+
+    return BSTR_ERR;
+}
+
+/*  int binstrr (const_bstring b1, int pos, const_bstring b2)
+ *
+ *  Backward brute force search: look for b2 inside b1, trying start
+ *  positions from pos downwards to 0, and return the first start position
+ *  at which b2 occurs.  A start position too close to the end of b1 for b2
+ *  to fit is snapped back to the last position where it does fit.  An empty
+ *  b2 is reported as found at pos.  BSTR_ERR is returned when nothing is
+ *  found or when the arguments are invalid.
+ */
+int binstrr (const_bstring b1, int pos, const_bstring b2) {
+int start, k, room, patLen;
+const unsigned char * pat, * hay;
+
+    if (NULL == b1 || NULL == b1->data || b1->slen < 0) return BSTR_ERR;
+    if (NULL == b2 || NULL == b2->data || b2->slen < 0) return BSTR_ERR;
+
+    /* pos must lie inside b1 (its end included) */
+    if (pos < 0 || pos > b1->slen) return BSTR_ERR;
+
+    /* The empty string is found wherever the search starts */
+    if (0 == b2->slen) return pos;
+
+    /* Searching a string within itself from its own start */
+    if (0 == pos && b1->data == b2->data && b2->slen <= b1->slen) return 0;
+
+    /* Last start position at which b2 still fits inside b1 */
+    room = b1->slen - b2->slen;
+    if (room < 0) return BSTR_ERR;
+
+    start  = (pos > room) ? room : pos;
+    pat    = b2->data;
+    hay    = b1->data;
+    patLen = b2->slen;
+
+    for (; start >= 0; start--) {
+        /* Count how many leading characters agree at this position */
+        for (k = 0; k < patLen && pat[k] == hay[start + k]; k++) ;
+        if (k >= patLen) return start;
+    }
+
+    return BSTR_ERR;
+}
+
+/*  int binstrcaseless (const_bstring b1, int pos, const_bstring b2)
+ *
+ *  Forward brute force search that ignores case: look for b2 inside b1,
+ *  trying start positions from pos upwards, and return the first start
+ *  position at which b2 occurs.  An empty b2 is reported as found at pos.
+ *  BSTR_ERR is returned when nothing is found or when the arguments are
+ *  invalid.
+ */
+int binstrcaseless (const_bstring b1, int pos, const_bstring b2) {
+int at, k, stop, patLen;
+const unsigned char * pat, * hay;
+
+    if (NULL == b1 || NULL == b1->data || b1->slen < 0) return BSTR_ERR;
+    if (NULL == b2 || NULL == b2->data || b2->slen < 0) return BSTR_ERR;
+
+    /* pos must lie inside b1 (its end included) */
+    if (pos < 0 || pos > b1->slen) return BSTR_ERR;
+
+    /* The empty string is found wherever the search starts */
+    if (0 == b2->slen) return pos;
+
+    /* One past the last start position at which b2 still fits */
+    stop = b1->slen - b2->slen + 1;
+    if (pos >= stop) return BSTR_ERR;
+
+    /* Searching a string within itself from its own start */
+    if (0 == pos && b1->data == b2->data) return BSTR_OK;
+
+    pat    = b2->data;
+    hay    = b1->data;
+    patLen = b2->slen;
+
+    for (at = pos; at < stop; at++) {
+        for (k = 0; k < patLen; k++) {
+            /* Exact comparison first, case folding only if it differs */
+            if (pat[k] != hay[at + k] &&
+                downcase (pat[k]) != downcase (hay[at + k])) break;
+        }
+        if (k >= patLen) return at;
+    }
+
+    return BSTR_ERR;
+}
+
+/*  int binstrrcaseless (const_bstring b1, int pos, const_bstring b2)
+ *
+ *  Backward brute force search that ignores case: look for b2 inside b1,
+ *  trying start positions from pos downwards to 0, and return the first
+ *  start position at which b2 occurs.  A start position too close to the
+ *  end of b1 for b2 to fit is snapped back to the last position where it
+ *  does fit.  An empty b2 is reported as found at pos.  BSTR_ERR is
+ *  returned when nothing is found or when the arguments are invalid.
+ */
+int binstrrcaseless (const_bstring b1, int pos, const_bstring b2) {
+int start, k, room, patLen;
+const unsigned char * pat, * hay;
+
+    if (NULL == b1 || NULL == b1->data || b1->slen < 0) return BSTR_ERR;
+    if (NULL == b2 || NULL == b2->data || b2->slen < 0) return BSTR_ERR;
+
+    /* pos must lie inside b1 (its end included) */
+    if (pos < 0 || pos > b1->slen) return BSTR_ERR;
+
+    /* The empty string is found wherever the search starts */
+    if (0 == b2->slen) return pos;
+
+    /* Searching a string within itself from its own start */
+    if (0 == pos && b1->data == b2->data && b2->slen <= b1->slen) return BSTR_OK;
+
+    /* Last start position at which b2 still fits inside b1 */
+    room = b1->slen - b2->slen;
+    if (room < 0) return BSTR_ERR;
+
+    start  = (pos > room) ? room : pos;
+    pat    = b2->data;
+    hay    = b1->data;
+    patLen = b2->slen;
+
+    for (; start >= 0; start--) {
+        for (k = 0; k < patLen; k++) {
+            /* Exact comparison first, case folding only if it differs */
+            if (pat[k] != hay[start + k] &&
+                downcase (pat[k]) != downcase (hay[start + k])) break;
+        }
+        if (k >= patLen) return start;
+    }
+
+    return BSTR_ERR;
+}
+
+
+/*  int bstrchrp (const_bstring b, int c, int pos)
+ *
+ *  Find the first occurrence of the character c in b at index pos or
+ *  later.  Returns its index, or BSTR_ERR if c does not occur there, if b
+ *  is NULL or has no data, or if pos is outside of b.
+ */
+int bstrchrp (const_bstring b, int c, int pos) {
+const unsigned char * hit;
+
+    if (NULL == b || NULL == b->data) return BSTR_ERR;
+    if (pos < 0 || pos >= b->slen) return BSTR_ERR;
+
+    hit = (const unsigned char *) bstr__memchr ((b->data + pos), (unsigned char) c, (b->slen - pos));
+    return (NULL != hit) ? (int) (hit - b->data) : BSTR_ERR;
+}
+
+/*  int bstrrchrp (const_bstring b, int c, int pos)
+ *
+ *  Find the last occurrence of the character c in b at index pos or
+ *  earlier.  Returns its index, or BSTR_ERR if c does not occur there, if
+ *  b is NULL or has no data, or if pos is outside of b.
+ */
+int bstrrchrp (const_bstring b, int c, int pos) {
+const unsigned char want = (unsigned char) c;
+int at;
+
+    if (NULL == b || NULL == b->data) return BSTR_ERR;
+    if (pos < 0 || pos >= b->slen) return BSTR_ERR;
+
+    /* pos is a valid index here, so at least one character is examined */
+    at = pos;
+    do {
+        if (want == b->data[at]) return at;
+    } while (--at >= 0);
+
+    return BSTR_ERR;
+}
+
+/* A charField records a set of character values.  Two layouts are offered:
+ * by default a packed bit set (one bit per character value), or, when
+ * BSTRLIB_AGGRESSIVE_MEMORY_FOR_SPEED_TRADEOFF is defined, one whole byte
+ * per character value.
+ *
+ *  testInCharField (cf, c)   - non-zero if character c is in the set.
+ *  setInCharField (cf, idx)  - add character idx to the set.
+ */
+#if !defined (BSTRLIB_AGGRESSIVE_MEMORY_FOR_SPEED_TRADEOFF)
+/* Packed form: each LONG_TYPE element holds LONG_BITS_QTY (8) flags, so
+   the element index is c >> 3 and the bit index is c & 7. */
+#define LONG_LOG_BITS_QTY (3)
+#define LONG_BITS_QTY (1 << LONG_LOG_BITS_QTY)
+#define LONG_TYPE unsigned char
+
+/* Number of elements needed to give every character value one bit */
+#define CFCLEN ((1 << CHAR_BIT) / LONG_BITS_QTY)
+struct charField { LONG_TYPE content[CFCLEN]; };
+#define testInCharField(cf,c) ((cf)->content[(c) >> LONG_LOG_BITS_QTY] & (((long)1) << ((c) & (LONG_BITS_QTY-1))))
+/* NOTE: expands to a braced block that declares a local named c, so the
+   idx argument must not itself be an expression involving a variable c. */
+#define setInCharField(cf,idx) { \
+    unsigned int c = (unsigned int) (idx); \
+    (cf)->content[c >> LONG_LOG_BITS_QTY] |= (LONG_TYPE) (1ul << (c & (LONG_BITS_QTY-1))); \
+}
+
+#else
+
+/* Byte-per-character form: content[c] is non-zero when c is in the set */
+#define CFCLEN (1 << CHAR_BIT)
+struct charField { unsigned char content[CFCLEN]; };
+#define testInCharField(cf,c) ((cf)->content[(unsigned char) (c)])
+#define setInCharField(cf,idx) (cf)->content[(unsigned int) (idx)] = ~0
+
+#endif
+
+/* Fill cf with the set of characters occurring in b.  Returns BSTR_ERR
+   (leaving cf untouched) if b is NULL, has no data or is empty; otherwise
+   BSTR_OK. */
+static int buildCharField (struct charField * cf, const_bstring b) {
+const unsigned char * cur, * end;
+
+    if (NULL == b || NULL == b->data || b->slen <= 0) return BSTR_ERR;
+
+    /* Start from the empty set, then add each character of b */
+    memset ((void *) cf->content, 0, sizeof (struct charField));
+    end = b->data + b->slen;
+    for (cur = b->data; cur < end; cur++) {
+        setInCharField (cf, *cur);
+    }
+    return BSTR_OK;
+}
+
+/* Replace the set held in cf by its complement */
+static void invertCharField (struct charField * cf) {
+int k = CFCLEN;
+
+    while (k-- > 0) cf->content[k] = ~cf->content[k];
+}
+
+/* Inner engine for binchr: scan data[pos .. len-1] forwards and return the
+   index of the first character that is a member of cf, or BSTR_ERR if
+   there is none. */
+static int binchrCF (const unsigned char * data, int len, int pos, const struct charField * cf) {
+int at = pos;
+
+    while (at < len) {
+        unsigned char ch = (unsigned char) data[at];
+        if (testInCharField (cf, ch)) return at;
+        at++;
+    }
+    return BSTR_ERR;
+}
+
+/*  int binchr (const_bstring b0, int pos, const_bstring b1);
+ *
+ *  Search for the first position in b0 starting from pos or after, in which 
+ *  one of the characters in b1 is found and return it.  If such a position 
+ *  does not exist in b0, then BSTR_ERR is returned.  BSTR_ERR is also
+ *  returned if b0 or b1 is NULL or has no data, if b1 is empty, or if pos
+ *  is outside of b0.
+ */
+int binchr (const_bstring b0, int pos, const_bstring b1) {
+struct charField chrs;
+    /* b1 must be validated here as well: the single character shortcut
+       below reads b1->slen and b1->data[0] before buildCharField gets a
+       chance to reject a bad b1. */
+    if (pos < 0 || b0 == NULL || b0->data == NULL || b1 == NULL ||
+        b1->data == NULL || b0->slen <= pos) return BSTR_ERR;
+    /* A one character set is just a plain character search */
+    if (1 == b1->slen) return bstrchrp (b0, b1->data[0], pos);
+    if (0 > buildCharField (&chrs, b1)) return BSTR_ERR;
+    return binchrCF (b0->data, b0->slen, pos, &chrs);
+}
+
+/* Inner engine for binchrr: scan data[pos .. 0] backwards and return the
+   index of the first character met that is a member of cf, or BSTR_ERR if
+   there is none. */
+static int binchrrCF (const unsigned char * data, int pos, const struct charField * cf) {
+int at = pos;
+
+    while (at >= 0) {
+        unsigned int ch = (unsigned int) data[at];
+        if (testInCharField (cf, ch)) return at;
+        at--;
+    }
+    return BSTR_ERR;
+}
+
+/*  int binchrr (const_bstring b0, int pos, const_bstring b1);
+ *
+ *  Search for the last position in b0 no greater than pos, in which one of 
+ *  the characters in b1 is found and return it.  If such a position does not 
+ *  exist in b0, then BSTR_ERR is returned.  BSTR_ERR is also returned if b0
+ *  or b1 is NULL or has no data, if b1 is empty, or if pos is outside of
+ *  b0.  pos may be equal to the length of b0, in which case the search
+ *  starts at the last character.
+ */
+int binchrr (const_bstring b0, int pos, const_bstring b1) {
+struct charField chrs;
+    /* b1->data is checked here because the single character shortcut
+       below reads b1->data[0] before buildCharField could reject it. */
+    if (pos < 0 || b0 == NULL || b0->data == NULL || b1 == NULL ||
+        b1->data == NULL || b0->slen < pos) return BSTR_ERR;
+    if (pos == b0->slen) pos--;
+    /* A one character set is just a plain character search */
+    if (1 == b1->slen) return bstrrchrp (b0, b1->data[0], pos);
+    if (0 > buildCharField (&chrs, b1)) return BSTR_ERR;
+    return binchrrCF (b0->data, pos, &chrs);
+}
+
+/*  int bninchr (const_bstring b0, int pos, const_bstring b1);
+ *
+ *  Return the first position in b0, at pos or later, holding a character
+ *  that does NOT occur in b1.  BSTR_ERR is returned if there is no such
+ *  position, if b0 is NULL or has no data, if pos is outside of b0, or if
+ *  b1 is NULL, has no data or is empty.
+ */
+int bninchr (const_bstring b0, int pos, const_bstring b1) {
+struct charField excluded;
+
+    if (NULL == b0 || NULL == b0->data) return BSTR_ERR;
+    if (pos < 0 || pos >= b0->slen) return BSTR_ERR;
+
+    /* Build the set of b1's characters, then search for its complement */
+    if (0 > buildCharField (&excluded, b1)) return BSTR_ERR;
+    invertCharField (&excluded);
+
+    return binchrCF (b0->data, b0->slen, pos, &excluded);
+}
+
+/*  int bninchrr (const_bstring b0, int pos, const_bstring b1);
+ *
+ *  Return the last position in b0, at pos or earlier, holding a character
+ *  that does NOT occur in b1.  pos may equal the length of b0, in which
+ *  case the search starts at the last character.  BSTR_ERR is returned if
+ *  there is no such position, if b0 is NULL or has no data, if pos is
+ *  outside of b0, or if b1 is NULL, has no data or is empty.
+ */
+int bninchrr (const_bstring b0, int pos, const_bstring b1) {
+struct charField excluded;
+int from;
+
+    if (NULL == b0 || NULL == b0->data) return BSTR_ERR;
+    if (pos < 0 || pos > b0->slen) return BSTR_ERR;
+
+    /* Starting one past the end means starting at the last character */
+    from = (pos == b0->slen) ? pos - 1 : pos;
+
+    /* Build the set of b1's characters, then search for its complement */
+    if (0 > buildCharField (&excluded, b1)) return BSTR_ERR;
+    invertCharField (&excluded);
+
+    return binchrrCF (b0->data, from, &excluded);
+}
+
+/*  int bsetstr (bstring b0, int pos, bstring b1, unsigned char fill)
+ *
+ *  Overwrite the string b0 starting at position pos with the string b1. If 
+ *  the position pos is past the end of b0, then the character "fill" is 
+ *  appended as necessary to make up the gap between the end of b0 and pos.
+ *  If b1 is NULL, it behaves as if it were a 0-length string.
+ *
+ *  Returns BSTR_OK on success.  BSTR_ERR is returned if pos is negative, if
+ *  b0 is NULL, malformed or not writable, if a non-NULL b1 is malformed, or
+ *  if a required allocation fails.
+ */
+int bsetstr (bstring b0, int pos, const_bstring b1, unsigned char fill) {
+int d, newlen;
+ptrdiff_t pd;
+bstring aux = (bstring) b1;
+
+    if (pos < 0 || b0 == NULL || b0->slen < 0 || NULL == b0->data || 
+        b0->mlen < b0->slen || b0->mlen <= 0) return BSTR_ERR;
+    if (b1 != NULL && (b1->slen < 0 || b1->data == NULL)) return BSTR_ERR;
+
+    /* d becomes the index just past the last character written */
+    d = pos;
+
+    /* Aliasing case */
+    if (NULL != aux) {
+        /* If b1's data starts inside b0's buffer, work from a private copy
+           because the balloc below may move or overwrite that buffer */
+        if ((pd = (ptrdiff_t) (b1->data - b0->data)) >= 0 && pd < (ptrdiff_t) b0->mlen) {
+            if (NULL == (aux = bstrcpy (b1))) return BSTR_ERR;
+        }
+        d += aux->slen;
+    }
+
+    /* Increase memory size if necessary */
+    if (balloc (b0, d + 1) != BSTR_OK) {
+        /* aux != b1 means aux is the private copy made above */
+        if (aux != b1) bdestroy (aux);
+        return BSTR_ERR;
+    }
+
+    newlen = b0->slen;
+
+    /* Fill in "fill" character as necessary */
+    if (pos > newlen) {
+        bstr__memset (b0->data + b0->slen, (int) fill, (size_t) (pos - b0->slen));
+        newlen = pos;
+    }
+
+    /* Copy b1 to position pos in b0. */
+    if (aux != NULL) {
+        bBlockCopy ((char *) (b0->data + pos), (char *) aux->data, aux->slen);
+        if (aux != b1) bdestroy (aux);
+    }
+
+    /* Indicate the potentially increased size of b0 */
+    if (d > newlen) newlen = d;
+
+    b0->slen = newlen;
+    b0->data[newlen] = (unsigned char) '\0';
+
+    return BSTR_OK;
+}
+
+/*  int binsert (bstring b1, int pos, bstring b2, unsigned char fill)
+ *
+ *  Inserts the string b2 into b1 at position pos.  If the position pos is 
+ *  past the end of b1, then the character "fill" is appended as necessary to 
+ *  make up the gap between the end of b1 and pos.  Unlike bsetstr, binsert
+ *  does not allow b2 to be NULL.
+ *
+ *  Returns BSTR_OK on success.  BSTR_ERR is returned if pos is negative, if
+ *  b1 or b2 is NULL or malformed, if b1 is not writable, if the resulting
+ *  length does not fit in an int, or if a required allocation fails.
+ */
+int binsert (bstring b1, int pos, const_bstring b2, unsigned char fill) {
+int d, l;
+ptrdiff_t pd;
+bstring aux = (bstring) b2;
+
+    if (pos < 0 || b1 == NULL || b2 == NULL || b1->slen < 0 || 
+        b2->slen < 0 || b1->mlen < b1->slen || b1->mlen <= 0) return BSTR_ERR;
+
+    /* Aliasing case */
+    /* If b2's data starts inside b1's buffer, work from a private copy
+       because b1's buffer is about to be reallocated and shifted */
+    if ((pd = (ptrdiff_t) (b2->data - b1->data)) >= 0 && pd < (ptrdiff_t) b1->mlen) {
+        if (NULL == (aux = bstrcpy (b2))) return BSTR_ERR;
+    }
+
+    /* Compute the two possible end pointers */
+    d = b1->slen + aux->slen;
+    l = pos + aux->slen;
+    if ((d|l) < 0) {
+        /* Release the private copy, which would otherwise be leaked */
+        if (aux != b2) bdestroy (aux);
+        return BSTR_ERR;
+    }
+
+    if (l > d) {
+        /* Inserting past the end of the string */
+        if (balloc (b1, l + 1) != BSTR_OK) {
+            if (aux != b2) bdestroy (aux);
+            return BSTR_ERR;
+        }
+        bstr__memset (b1->data + b1->slen, (int) fill, (size_t) (pos - b1->slen));
+        b1->slen = l;
+    } else {
+        /* Inserting in the middle of the string */
+        if (balloc (b1, d + 1) != BSTR_OK) {
+            if (aux != b2) bdestroy (aux);
+            return BSTR_ERR;
+        }
+        /* Open a gap of aux->slen characters at pos */
+        bBlockCopy (b1->data + l, b1->data + pos, d - l);
+        b1->slen = d;
+    }
+    bBlockCopy (b1->data + pos, aux->data, aux->slen);
+    b1->data[b1->slen] = (unsigned char) '\0';
+    if (aux != b2) bdestroy (aux);
+    return BSTR_OK;
+}
+
+/*  int breplace (bstring b1, int pos, int len, bstring b2, 
+ *                unsigned char fill)
+ *
+ *  Replace a section of a string from pos for a length len with the string b2.
+ *  fill is used if pos > b1->slen.
+ *
+ *  Returns BSTR_OK on success.  BSTR_ERR is returned if pos or len is
+ *  negative, if pos + len does not fit in an int, if b1 or b2 is NULL or
+ *  malformed, if b1 is not writable, or if a required allocation fails.
+ */
+int breplace (bstring b1, int pos, int len, const_bstring b2, 
+              unsigned char fill) {
+int pl, ret;
+ptrdiff_t pd;
+bstring aux = (bstring) b2;
+
+    if (pos < 0 || len < 0 || (pl = pos + len) < 0 || b1 == NULL || 
+        b2 == NULL || b1->data == NULL || b2->data == NULL || 
+        b1->slen < 0 || b2->slen < 0 || b1->mlen < b1->slen ||
+        b1->mlen <= 0) return BSTR_ERR;
+
+    /* Straddles the end? */
+    /* Then no tail has to be preserved: overwrite from pos with bsetstr
+       and cut off whatever of the old string remains after b2 */
+    if (pl >= b1->slen) {
+        if ((ret = bsetstr (b1, pos, b2, fill)) < 0) return ret;
+        if (pos + b2->slen < b1->slen) {
+            b1->slen = pos + b2->slen;
+            b1->data[b1->slen] = (unsigned char) '\0';
+        }
+        return ret;
+    }
+
+    /* Aliasing case */
+    /* If b2's data starts inside the used part of b1, work from a copy */
+    if ((pd = (ptrdiff_t) (b2->data - b1->data)) >= 0 && pd < (ptrdiff_t) b1->slen) {
+        if (NULL == (aux = bstrcpy (b2))) return BSTR_ERR;
+    }
+
+    /* The string only grows when the replacement is longer than the
+       section being replaced */
+    if (aux->slen > len) {
+        if (balloc (b1, b1->slen + aux->slen - len) != BSTR_OK) {
+            if (aux != b2) bdestroy (aux);
+            return BSTR_ERR;
+        }
+    }
+
+    /* Move the tail to its final place, then drop the replacement in */
+    if (aux->slen != len) bstr__memmove (b1->data + pos + aux->slen, b1->data + pos + len, b1->slen - (pos + len));
+    bstr__memcpy (b1->data + pos, aux->data, aux->slen);
+    b1->slen += aux->slen - len;
+    b1->data[b1->slen] = (unsigned char) '\0';
+    if (aux != b2) bdestroy (aux);
+    return BSTR_OK;
+}
+
+/*  static int findreplaceengine (bstring b, const_bstring find,
+ *                                const_bstring repl, int pos,
+ *                                instr_fnptr instr)
+ *
+ *  Shared implementation of bfindreplace and bfindreplacecaseless: replace
+ *  all occurrences of a find string with a replace string after a given
+ *  point in a bstring.  Occurrences are located with the search function
+ *  instr, which is called as instr (b, pos, find) and must return the
+ *  position of the next occurrence or a negative value when there is none.
+ *
+ *  Returns BSTR_OK on success (including when nothing was replaced) and
+ *  BSTR_ERR on invalid arguments, an empty find string, or allocation
+ *  failure.
+ */
+
+/* Signature shared by the search functions passed to the engine */
+typedef int (*instr_fnptr) (const_bstring s1, int pos, const_bstring s2);
+
+static int findreplaceengine (bstring b, const_bstring find, const_bstring repl, int pos, instr_fnptr instr) {
+int i, ret, slen, mlen, delta, acc;
+int * d;
+int static_d[32];
+ptrdiff_t pd;
+bstring auxf = (bstring) find;
+bstring auxr = (bstring) repl;
+
+    if (b == NULL || b->data == NULL || find == NULL ||
+        find->data == NULL || repl == NULL || repl->data == NULL || 
+        pos < 0 || find->slen <= 0 || b->mlen < 0 || b->slen > b->mlen || 
+        b->mlen <= 0 || b->slen < 0 || repl->slen < 0) return BSTR_ERR;
+    /* find cannot fit at or after pos: nothing to do */
+    if (pos > b->slen - find->slen) return BSTR_OK;
+
+    /* Alias with find string */
+    /* Work from a copy if find's data overlaps the part of b that may be
+       rewritten */
+    pd = (ptrdiff_t) (find->data - b->data);
+    if ((ptrdiff_t) (pos - find->slen) < pd && pd < (ptrdiff_t) b->slen) {
+        if (NULL == (auxf = bstrcpy (find))) return BSTR_ERR;
+    }
+
+    /* Alias with repl string */
+    pd = (ptrdiff_t) (repl->data - b->data);
+    if ((ptrdiff_t) (pos - repl->slen) < pd && pd < (ptrdiff_t) b->slen) {
+        if (NULL == (auxr = bstrcpy (repl))) {
+            if (auxf != find) bdestroy (auxf);
+            return BSTR_ERR;
+        }
+    }
+
+    /* delta > 0: b shrinks per replacement; delta < 0: b grows */
+    delta = auxf->slen - auxr->slen;
+
+    /* in-place replacement since find and replace strings are of equal 
+       length */
+    if (delta == 0) {
+        while ((pos = instr (b, pos, auxf)) >= 0) {
+            bstr__memcpy (b->data + pos, auxr->data, auxr->slen);
+            pos += auxf->slen;
+        }
+        if (auxf != find) bdestroy (auxf);
+        if (auxr != repl) bdestroy (auxr);
+        return BSTR_OK;
+    }
+
+    /* shrinking replacement since auxf->slen > auxr->slen */
+    if (delta > 0) {
+        /* acc is the number of characters removed so far, i.e. how far
+           the text after the current match has to be moved down */
+        acc = 0;
+
+        while ((i = instr (b, pos, auxf)) >= 0) {
+            /* Move the unmatched text between two matches into place */
+            if (acc && i > pos)
+                bstr__memmove (b->data + pos - acc, b->data + pos, i - pos);
+            if (auxr->slen)
+                bstr__memcpy (b->data + i - acc, auxr->data, auxr->slen);
+            acc += delta;
+            pos = i + auxf->slen;
+        }
+
+        if (acc) {
+            /* Move the text after the last match and fix up the length */
+            i = b->slen;
+            if (i > pos)
+                bstr__memmove (b->data + pos - acc, b->data + pos, i - pos);
+            b->slen -= acc;
+            b->data[b->slen] = (unsigned char) '\0';
+        }
+
+        if (auxf != find) bdestroy (auxf);
+        if (auxr != repl) bdestroy (auxr);
+        return BSTR_OK;
+    }
+
+    /* expanding replacement since find->slen < repl->slen.  Its a lot 
+       more complicated. */
+
+    /* First pass: record the position of every match in d (slen entries,
+       capacity mlen) and the total growth in acc. */
+    mlen = 32;
+    d = (int *) static_d; /* Avoid malloc for trivial cases */
+    acc = slen = 0;
+
+    while ((pos = instr (b, pos, auxf)) >= 0) {
+        /* Keep one spare slot for the end marker stored after the loop */
+        if (slen + 1 >= mlen) {
+            int sl;
+            int * t;
+            mlen += mlen;
+            /* NOTE(review): sized with sizeof (int *) although the
+               elements are int; this looks like a harmless
+               over-allocation - confirm */
+            sl = sizeof (int *) * mlen;
+            /* static_d must never be handed to realloc */
+            if (static_d == d) d = NULL;
+            if (sl < mlen || NULL == (t = (int *) bstr__realloc (d, sl))) {
+                ret = BSTR_ERR;
+                goto done;
+            }
+            /* First move to the heap: carry over the stack entries */
+            if (NULL == d) bstr__memcpy (t, static_d, sizeof (static_d));
+            d = t;
+        }
+        d[slen] = pos;
+        slen++;
+        acc -= delta;
+        pos += auxf->slen;
+        /* Negative values here mean an int overflow occurred */
+        if (pos < 0 || acc < 0) {
+            ret = BSTR_ERR;
+            goto done;
+        }
+    }
+    /* End marker so that d[i+1] is defined for the last match */
+    d[slen] = b->slen;
+
+    /* Second pass: grow b, then walk the matches from last to first so
+       that text is only ever moved towards the end of the buffer. */
+    if (BSTR_OK == (ret = balloc (b, b->slen + acc + 1))) {
+        b->slen += acc;
+        for (i = slen-1; i >= 0; i--) {
+            int s, l;
+            /* s..s+l is the unmatched text following match i */
+            s = d[i] + auxf->slen;
+            l = d[i+1] - s;
+            if (l) {
+                bstr__memmove (b->data + s + acc, b->data + s, l);
+            }
+            if (auxr->slen) {
+                bstr__memmove (b->data + s + acc - auxr->slen, 
+                         auxr->data, auxr->slen);
+            }
+            acc += delta;        
+        }
+        b->data[b->slen] = (unsigned char) '\0';
+    }
+
+    done:;
+    /* Only a heap-allocated table is freed; d == NULL is passed otherwise */
+    if (static_d == d) d = NULL;
+    bstr__free (d);
+    if (auxf != find) bdestroy (auxf);
+    if (auxr != repl) bdestroy (auxr);
+    return ret;
+}
+
+/*  int bfindreplace (bstring b, const_bstring find, const_bstring repl, 
+ *                    int pos)
+ *
+ *  Replace all occurrences of a find string with a replace string after a
+ *  given point in a bstring.  Occurrences are located with binstr, i.e. by
+ *  exact comparison.  Returns the result of findreplaceengine: BSTR_OK on
+ *  success, BSTR_ERR on error.
+ */
+int bfindreplace (bstring b, const_bstring find, const_bstring repl, int pos) {
+    return findreplaceengine (b, find, repl, pos, binstr);
+}
+
+/*  int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, 
+ *                    int pos)
+ *
+ *  Replace all occurrences of a find string, ignoring case, with a replace 
+ *  string after a given point in a bstring.  Occurrences are located with
+ *  binstrcaseless.  Returns the result of findreplaceengine: BSTR_OK on
+ *  success, BSTR_ERR on error.
+ */
+int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, int pos) {
+    return findreplaceengine (b, find, repl, pos, binstrcaseless);
+}
+
+/*  int binsertch (bstring b, int pos, int len, unsigned char fill)
+ *
+ *  Insert len copies of the character fill into b at position pos.  If pos
+ *  lies past the end of b, the gap between the end of b and pos is padded
+ *  with fill as well, so that b ends at position pos + len.  Returns
+ *  BSTR_OK on success and BSTR_ERR if the arguments are invalid, the new
+ *  length does not fit in an int, or the allocation fails.
+ */
+int binsertch (bstring b, int pos, int len, unsigned char fill) {
+int grown, fillEnd, fillStart;
+
+    if (NULL == b || pos < 0 || len < 0) return BSTR_ERR;
+    if (b->slen < 0 || b->mlen <= 0 || b->mlen < b->slen) return BSTR_ERR;
+
+    /* Length after a plain insertion, and end of the filled region */
+    grown   = b->slen + len;
+    fillEnd = pos + len;
+    if (grown < 0 || fillEnd < 0) return BSTR_ERR;
+
+    if (fillEnd > grown) {
+        /* pos is beyond the end: pad from the old end up to pos + len */
+        if (BSTR_OK != balloc (b, fillEnd + 1)) return BSTR_ERR;
+        fillStart = b->slen;
+        b->slen = fillEnd;
+    } else {
+        /* pos is inside the string: shift the tail up by len first */
+        if (BSTR_OK != balloc (b, grown + 1)) return BSTR_ERR;
+        bstr__memmove (b->data + fillEnd, b->data + pos, (size_t) (grown - fillEnd));
+        fillStart = pos;
+        b->slen = grown;
+    }
+
+    bstr__memset (b->data + fillStart, (int) fill, (size_t) (fillEnd - fillStart));
+    b->data[b->slen] = (unsigned char) '\0';
+    return BSTR_OK;
+}
+
+/*  int bpattern (bstring b, int len)
+ *
+ *  Turn b, in place, into its own contents repeated end to end and cut to
+ *  exactly len characters.  Returns BSTR_ERR if b is NULL or empty, if len
+ *  is negative, or if the allocation fails; otherwise BSTR_OK (or, for a
+ *  single character pattern, the result of the bsetstr call that does the
+ *  filling).
+ */
+int bpattern (bstring b, int len) {
+int period, k;
+
+    period = blength (b);
+    if (period <= 0) return BSTR_ERR;
+    if (len < 0) return BSTR_ERR;
+    if (BSTR_OK != balloc (b, len + 1)) return BSTR_ERR;
+
+    /* A one character pattern is simply a fill */
+    if (len > 0 && 1 == period) return bsetstr (b, len, NULL, b->data[0]);
+
+    /* Every further character repeats the one a full period earlier */
+    for (k = period; k < len; k++) b->data[k] = b->data[k - period];
+
+    b->slen = len;
+    b->data[len] = (unsigned char) '\0';
+    return BSTR_OK;
+}
+
+#define BS_BUFF_SZ (1024)
+
+/*  int breada (bstring b, bNread readPtr, void * parm)
+ *
+ *  Use a finite buffer fread-like function readPtr to concatenate to the 
+ *  bstring b the entire contents of file-like source data in a roughly 
+ *  efficient way.
+ *
+ *  Returns BSTR_OK once readPtr delivers fewer characters than requested.
+ *  BSTR_ERR is returned if b is NULL, malformed or not writable, if readPtr
+ *  is NULL, or if an allocation fails; in the last case b keeps everything
+ *  read so far and is still '\0' terminated.
+ */
+int breada (bstring b, bNread readPtr, void * parm) {
+int i, l, n;
+
+    if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
+        readPtr == NULL) return BSTR_ERR;
+
+    /* i is the current length, n the length aimed for in this round.  The
+       target doubles until it reaches BS_BUFF_SZ and then grows linearly. */
+    i = b->slen;
+    for (n=i+16; ; n += ((n < BS_BUFF_SZ) ? n : BS_BUFF_SZ)) {
+        if (BSTR_OK != balloc (b, n + 1)) return BSTR_ERR;
+        l = (int) readPtr ((void *) (b->data + i), 1, n - i, parm);
+        i += l;
+        b->slen = i;
+        /* Terminate after every read so that b is a well formed bstring
+           even if the balloc of the next round fails. */
+        b->data[i] = (unsigned char) '\0';
+        /* A short read means the source is exhausted */
+        if (i < n) break;
+    }
+
+    return BSTR_OK;
+}
+
+/*  bstring bread (bNread readPtr, void * parm)
+ *
+ *  Create a new bstring holding the entire contents of a file-like source,
+ *  read through the fread-like function readPtr.  Returns NULL if the
+ *  bstring cannot be created or if breada reports an error.
+ */
+bstring bread (bNread readPtr, void * parm) {
+bstring result = bfromcstr ("");
+
+    /* breada rejects a NULL result, so a failed allocation ends up below */
+    if (breada (result, readPtr, parm) >= 0) return result;
+
+    bdestroy (result);
+    return NULL;
+}
+
+/*  int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator)
+ *
+ *  Use an fgetc-like single character stream reading function (getcPtr) to 
+ *  obtain a sequence of characters which replace the previous contents of
+ *  the bstring b (writing starts at index 0).  The stream read is
+ *  terminated by the passed in terminator parameter.
+ *
+ *  If getcPtr returns with a negative number, or the terminator character 
+ *  (which is stored as well) is read, then the stream reading is halted and
+ *  the function returns with the result in b.  If no characters were stored
+ *  because getcPtr returned a negative number right away, 1 is returned;
+ *  otherwise 0 is returned.  BSTR_ERR is returned if b is NULL, malformed
+ *  or not writable, if getcPtr is NULL, or if an allocation fails.
+ */
+int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator) {
+int c, d, e;
+
+    if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
+        b->mlen <= 0 || getcPtr == NULL) return BSTR_ERR;
+    /* d is the write index; e is the highest index that can be written
+       while still leaving room for the '\0' terminator */
+    d = 0;
+    e = b->mlen - 2;
+
+    while ((c = getcPtr (parm)) >= 0) {
+        if (d > e) {
+            /* Buffer full: publish the length so far, then grow */
+            b->slen = d;
+            if (balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
+            e = b->mlen - 2;
+        }
+        b->data[d] = (unsigned char) c;
+        d++;
+        if (c == terminator) break;
+    }
+
+    b->data[d] = (unsigned char) '\0';
+    b->slen = d;
+
+    /* 1 only when nothing was stored and the stream ended */
+    return d == 0 && c < 0;
+}
+
+/*  int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator)
+ *
+ *  Append to the bstring b the characters delivered by the fgetc-like
+ *  function getcPtr, stopping after the terminator character has been
+ *  appended or as soon as getcPtr returns a negative number.
+ *
+ *  Returns 1 if b ends up empty because getcPtr returned a negative number
+ *  right away, and 0 otherwise.  BSTR_ERR is returned if b is NULL,
+ *  malformed or not writable, if getcPtr is NULL, or if an allocation
+ *  fails.
+ */
+int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator) {
+int ch, end, limit;
+
+    if (NULL == b || NULL == getcPtr) return BSTR_ERR;
+    if (b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen) return BSTR_ERR;
+
+    /* end is the write index; limit is the highest index that can be
+       written while still leaving room for the '\0' terminator */
+    end   = b->slen;
+    limit = b->mlen - 2;
+
+    for (;;) {
+        ch = getcPtr (parm);
+        if (ch < 0) break;
+
+        if (end > limit) {
+            /* Buffer full: publish the length so far, then grow */
+            b->slen = end;
+            if (BSTR_OK != balloc (b, end + 2)) return BSTR_ERR;
+            limit = b->mlen - 2;
+        }
+
+        b->data[end++] = (unsigned char) ch;
+        if (ch == terminator) break;
+    }
+
+    b->data[end] = (unsigned char) '\0';
+    b->slen = end;
+
+    return (0 == end && ch < 0) ? 1 : 0;
+}
+
+/*  bstring bgets (bNgetc getcPtr, void * parm, char terminator)
+ *
+ *  Read characters with the fgetc-like function getcPtr into a new
+ *  bstring, stopping after the terminator character (which is kept) or
+ *  when getcPtr returns a negative number.  Returns the new bstring, or
+ *  NULL if no characters were read or an error was detected.
+ */
+bstring bgets (bNgetc getcPtr, void * parm, char terminator) {
+bstring line = bfromcstr ("");
+
+    /* bgetsa rejects a NULL line, so line->slen is only read when valid */
+    if (bgetsa (line, getcPtr, parm, terminator) >= 0 && line->slen > 0) {
+        return line;
+    }
+
+    bdestroy (line);
+    return NULL;
+}
+
+/* State of a buffered stream wrapped around an fread-like read function */
+struct bStream {
+    bstring buff;        /* Buffer for over-reads */
+    void * parm;        /* The stream handle for core stream */
+    bNread readFnPtr;    /* fread compatible fnptr for core stream */
+    int isEOF;        /* track file's EOF state */
+    int maxBuffSz;      /* Number of characters requested per read call */
+};
+
+/*  struct bStream * bsopen (bNread readPtr, void * parm)
+ *
+ *  Wrap a given open stream (described by a fread compatible function 
+ *  pointer and stream handle) into an open bStream suitable for the bstring 
+ *  library streaming functions.
+ *
+ *  Returns NULL if readPtr is NULL or if either the stream structure or its
+ *  internal buffer cannot be allocated.
+ */
+struct bStream * bsopen (bNread readPtr, void * parm) {
+struct bStream * s;
+
+    if (readPtr == NULL) return NULL;
+    s = (struct bStream *) bstr__alloc (sizeof (struct bStream));
+    if (s == NULL) return NULL;
+
+    /* The buffer is dereferenced by the other stream functions (bseof
+       for example), so a stream without one must never be handed out. */
+    s->buff = bfromcstr ("");
+    if (s->buff == NULL) {
+        bstr__free (s);
+        return NULL;
+    }
+
+    s->parm = parm;
+    s->readFnPtr = readPtr;
+    s->maxBuffSz = BS_BUFF_SZ;
+    s->isEOF = 0;
+    return s;
+}
+
+/*  int bsbufflength (struct bStream * s, int sz)
+ *
+ *  Query and optionally change the buffer length used by the bStream.  A
+ *  positive sz becomes the new length, sz == 0 leaves it unchanged.  The
+ *  previous length is returned; BSTR_ERR is returned if s is NULL or sz is
+ *  negative.
+ */
+int bsbufflength (struct bStream * s, int sz) {
+int previous;
+
+    if (NULL == s) return BSTR_ERR;
+    if (sz < 0) return BSTR_ERR;
+
+    previous = s->maxBuffSz;
+    if (0 != sz) s->maxBuffSz = sz;
+    return previous;
+}
+
+/* Returns 1 if the stream has hit its end and its buffer holds no more
+   characters, 0 otherwise, and BSTR_ERR if s is NULL or has no read
+   function. */
+int bseof (const struct bStream * s) {
+    if (NULL == s || NULL == s->readFnPtr) return BSTR_ERR;
+    if (!s->isEOF) return 0;
+    return (0 == s->buff->slen) ? 1 : 0;
+}
+
+/*  void * bsclose (struct bStream * s)
+ *
+ *  Dispose of the bStream and hand back the stream handle that was passed
+ *  to bsopen, so that the caller can close the underlying stream.  Returns
+ *  NULL if s is NULL.
+ */
+void * bsclose (struct bStream * s) {
+void * handle;
+
+    if (NULL == s) return NULL;
+
+    /* Remember the handle before the structure is wiped and released */
+    handle = s->parm;
+
+    if (NULL != s->buff) bdestroy (s->buff);
+
+    /* Leave the structure in an unusable state before freeing it */
+    s->buff = NULL;
+    s->parm = NULL;
+    s->readFnPtr = NULL;
+    s->isEOF = 1;
+
+    bstr__free (s);
+    return handle;
+}
+
+/*  int bsreadlna (bstring r, struct bStream * s, char terminator)
+ *
+ *  Read a bstring terminated by the terminator character or the end of the
+ *  stream from the bStream (s) and append it to the parameter r.  This 
+ *  function may read additional characters from the core stream that are not 
+ *  returned, but will be retained for subsequent read operations.
+ *
+ *  Returns BSTR_OK if anything was appended to r.  BSTR_ERR is returned if
+ *  the arguments are invalid, if an allocation or concatenation fails, or
+ *  if the stream ended without delivering any characters.
+ */
+int bsreadlna (bstring r, struct bStream * s, char terminator) {
+int i, l, ret, rlo;
+char * b;
+struct tagbstring x;
+
+    if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0 ||
+        r->slen < 0 || r->mlen < r->slen) return BSTR_ERR;
+    l = s->buff->slen;
+    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+    b = (char *) s->buff->data;
+    /* x is a temporary view onto the buffered characters; only its data
+       and slen members are set */
+    x.data = (unsigned char *) b;
+
+    /* First check if the current buffer holds the terminator */
+    b[l] = terminator; /* Set sentinel */
+    for (i=0; b[i] != terminator; i++) ;
+    if (i < l) {
+        x.slen = i + 1;
+        ret = bconcat (r, &x);
+        s->buff->slen = l;
+        if (BSTR_OK == ret) bdelete (s->buff, 0, i + 1);
+        /* Report a failed concatenation instead of claiming success; the
+           buffered line is kept so that the read can be retried */
+        return ret;
+    }
+
+    /* Length of r before anything is appended, to detect an empty read */
+    rlo = r->slen;
+
+    /* If not then just concatenate the entire buffer to the output */
+    x.slen = l;
+    if (BSTR_OK != bconcat (r, &x)) return BSTR_ERR;
+
+    /* Perform direct in-place reads into the destination to allow for
+       the minimum of data-copies */
+    for (;;) {
+        if (BSTR_OK != balloc (r, r->slen + s->maxBuffSz + 1)) return BSTR_ERR;
+        b = (char *) (r->data + r->slen);
+        l = (int) s->readFnPtr (b, 1, s->maxBuffSz, s->parm);
+        if (l <= 0) {
+            r->data[r->slen] = (unsigned char) '\0';
+            s->buff->slen = 0;
+            s->isEOF = 1;
+            /* If nothing was read return with an error message */
+            return (r->slen == rlo) ? BSTR_ERR : BSTR_OK;
+        }
+        b[l] = terminator; /* Set sentinel */
+        for (i=0; b[i] != terminator; i++) ;
+        if (i < l) break;
+        r->slen += l;
+    }
+
+    /* Terminator found, push over-read back to buffer */
+    i++;
+    r->slen += i;
+    s->buff->slen = l - i;
+    bstr__memcpy (s->buff->data, b + i, l - i);
+    r->data[r->slen] = (unsigned char) '\0';
+    return BSTR_OK;
+}
+
+/*  int bsreadlnsa (bstring r, struct bStream * s, bstring term)
+ *
+ *  Append to r the characters read from the bStream s up to and including
+ *  the first character that occurs in the term string, or up to the end of
+ *  the stream if no such character is found.  This function may read
+ *  additional characters from the core stream that are not returned, but
+ *  will be retained for subsequent read operations.
+ *
+ *  Returns BSTR_OK if characters were appended to r.  Returns BSTR_ERR on
+ *  invalid parameters (including an empty term), if memory could not be
+ *  allocated or the append to r failed, or if the stream ended before
+ *  anything could be appended.
+ */
+int bsreadlnsa (bstring r, struct bStream * s, const_bstring term) {
+int i, l, ret, rlo;
+unsigned char * b;
+struct tagbstring x;
+struct charField cf;
+
+    if (s == NULL || s->buff == NULL || r == NULL || term == NULL ||
+        term->data == NULL || r->mlen <= 0 || r->slen < 0 ||
+        r->mlen < r->slen) return BSTR_ERR;
+    /* A single terminator character is handled by the simpler variant */
+    if (term->slen == 1) return bsreadlna (r, s, term->data[0]);
+    if (term->slen < 1 || buildCharField (&cf, term)) return BSTR_ERR;
+
+    l = s->buff->slen;
+    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+    b = (unsigned char *) s->buff->data;
+    x.data = b;
+
+    /* First check if the current buffer holds the terminator */
+    b[l] = term->data[0]; /* Set sentinel so the scan below always stops */
+    for (i=0; !testInCharField (&cf, b[i]); i++) ;
+    if (i < l) {
+        /* A terminator is inside the buffered data: hand out that prefix */
+        x.slen = i + 1;
+        ret = bconcat (r, &x);
+        s->buff->slen = l;
+        if (BSTR_OK == ret) bdelete (s->buff, 0, i + 1);
+        /* Report a failed append instead of claiming that a line was
+           delivered; the buffered data is left in place in that case. */
+        return ret;
+    }
+
+    /* Length of r before anything from this call is appended */
+    rlo = r->slen;
+
+    /* If not then just concatenate the entire buffer to the output */
+    x.slen = l;
+    if (BSTR_OK != bconcat (r, &x)) return BSTR_ERR;
+
+    /* Perform direct in-place reads into the destination to allow for
+       the minimum of data-copies */
+    for (;;) {
+        if (BSTR_OK != balloc (r, r->slen + s->maxBuffSz + 1)) return BSTR_ERR;
+        b = (unsigned char *) (r->data + r->slen);
+        l = (int) s->readFnPtr (b, 1, s->maxBuffSz, s->parm);
+        if (l <= 0) {
+            r->data[r->slen] = (unsigned char) '\0';
+            s->buff->slen = 0;
+            s->isEOF = 1;
+            /* If nothing was read return with an error message: the
+               expression is BSTR_ERR when r did not grow, 0 otherwise */
+            return BSTR_ERR & -(r->slen == rlo);
+        }
+
+        b[l] = term->data[0]; /* Set sentinel */
+        for (i=0; !testInCharField (&cf, b[i]); i++) ;
+        if (i < l) break;
+        r->slen += l;
+    }
+
+    /* Terminator found, push over-read back to buffer */
+    i++;
+    r->slen += i;
+    s->buff->slen = l - i;
+    bstr__memcpy (s->buff->data, b + i, l - i);
+    r->data[r->slen] = (unsigned char) '\0';
+    return BSTR_OK;
+}
+
+/*  int bsreada (bstring r, struct bStream * s, int n)
+ *
+ *  Append to r a bstring of length n (or, if it is fewer, as many bytes as 
+ *  is remaining) read from the bStream.  This function may read additional 
+ *  characters from the core stream that are not returned, but will be 
+ *  retained for subsequent read operations.  This function will not read
+ *  additional characters from the core stream beyond virtual stream pointer.
+ *
+ *  Returns BSTR_ERR on invalid parameters, on allocation failure, or when
+ *  nothing could be appended to r; otherwise 0 is returned.
+ */
+int bsreada (bstring r, struct bStream * s, int n) {
+int l, ret, orslen;
+char * b;
+struct tagbstring x;
+
+    if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0
+     || r->slen < 0 || r->mlen < r->slen || n <= 0) return BSTR_ERR;
+
+    /* From here on n is the target length of r, not the number of
+       characters to read; a non-positive sum is rejected */
+    n += r->slen;
+    if (n <= 0) return BSTR_ERR;
+
+    /* l tracks how many characters are currently held in the stream buffer */
+    l = s->buff->slen;
+
+    /* Original length of r, used to detect whether anything was appended */
+    orslen = r->slen;
+
+    if (0 == l) {
+        if (s->isEOF) return BSTR_ERR;
+        /* Nothing is buffered and r already has room for the whole request:
+           read straight into r and bypass the stream buffer */
+        if (r->mlen > n) {
+            l = (int) s->readFnPtr (r->data + r->slen, 1, n - r->slen, s->parm);
+            /* A non-positive count, or more than was asked for, is treated 
+               as the end of the stream */
+            if (0 >= l || l > n - r->slen) {
+                s->isEOF = 1;
+                return BSTR_ERR;
+            }
+            r->slen += l;
+            r->data[r->slen] = (unsigned char) '\0';
+            return 0;
+        }
+    }
+
+    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+    b = (char *) s->buff->data;
+    /* x is a temporary bstring header over the stream buffer's storage */
+    x.data = (unsigned char *) b;
+
+    do {
+        /* The buffered data is enough to complete the request: append the
+           needed part and keep the remainder in the stream buffer */
+        if (l + r->slen >= n) {
+            x.slen = n - r->slen;
+            ret = bconcat (r, &x);
+            s->buff->slen = l;
+            if (BSTR_OK == ret) bdelete (s->buff, 0, x.slen);
+            /* BSTR_ERR if r did not grow at all, 0 otherwise */
+            return BSTR_ERR & -(r->slen == orslen);
+        }
+
+        /* Otherwise consume the whole buffer ... */
+        x.slen = l;
+        if (BSTR_OK != bconcat (r, &x)) break;
+
+        /* ... and refill it with at most the number of characters still 
+           missing, capped at the configured buffer size */
+        l = n - r->slen;
+        if (l > s->maxBuffSz) l = s->maxBuffSz;
+
+        l = (int) s->readFnPtr (b, 1, l, s->parm);
+
+    } while (l > 0);
+    /* Reached when the read function returned nothing or an append failed */
+    if (l < 0) l = 0;
+    if (l == 0) s->isEOF = 1;
+    s->buff->slen = l;
+    /* BSTR_ERR if r did not grow at all, 0 otherwise */
+    return BSTR_ERR & -(r->slen == orslen);
+}
+
+/*  int bsreadln (bstring r, struct bStream * s, char terminator)
+ *
+ *  Overwrite r with the characters read from the bStream s up to and
+ *  including the terminator character, or up to the end of the stream.
+ *  This function may read additional characters from the core stream that
+ *  are not returned, but will be retained for subsequent read operations.
+ *  The reading itself is delegated to bsreadlna after r has been emptied.
+ */
+int bsreadln (bstring r, struct bStream * s, char terminator) {
+    if (s == NULL || s->buff == NULL) return BSTR_ERR;
+    if (r == NULL || r->mlen <= 0) return BSTR_ERR;
+    if (balloc (s->buff, s->maxBuffSz + 1) != BSTR_OK) return BSTR_ERR;
+
+    /* Discard the previous contents of r, then append the line. */
+    r->slen = 0;
+    return bsreadlna (r, s, terminator);
+}
+
+/*  int bsreadlns (bstring r, struct bStream * s, bstring term)
+ *
+ *  Overwrite r with the characters read from the bStream s up to and
+ *  including the first character found in the term string, or up to the
+ *  end of the stream.  This function may read additional characters from
+ *  the core stream that are not returned, but will be retained for
+ *  subsequent read operations.  An empty term is rejected with BSTR_ERR.
+ */
+int bsreadlns (bstring r, struct bStream * s, const_bstring term) {
+    if (s == NULL || s->buff == NULL || r == NULL) return BSTR_ERR;
+    if (term == NULL || term->data == NULL || r->mlen <= 0) return BSTR_ERR;
+    if (term->slen < 1) return BSTR_ERR;
+
+    /* A single terminator character takes the single-character path. */
+    if (term->slen == 1) return bsreadln (r, s, term->data[0]);
+
+    if (balloc (s->buff, s->maxBuffSz + 1) != BSTR_OK) return BSTR_ERR;
+
+    /* Discard the previous contents of r, then append the line. */
+    r->slen = 0;
+    return bsreadlnsa (r, s, term);
+}
+
+/*  int bsread (bstring r, struct bStream * s, int n)
+ *
+ *  Overwrite r with a bstring of length n (or, if it is fewer, as many
+ *  bytes as is remaining) read from the bStream.  This function may read
+ *  additional characters from the core stream that are not returned, but
+ *  will be retained for subsequent read operations.  The reading itself is
+ *  delegated to bsreada after r has been emptied.
+ */
+int bsread (bstring r, struct bStream * s, int n) {
+    if (s == NULL || s->buff == NULL) return BSTR_ERR;
+    if (r == NULL || r->mlen <= 0 || n <= 0) return BSTR_ERR;
+    if (balloc (s->buff, s->maxBuffSz + 1) != BSTR_OK) return BSTR_ERR;
+
+    /* Discard the previous contents of r, then append the data. */
+    r->slen = 0;
+    return bsreada (r, s, n);
+}
+
+/*  int bsunread (struct bStream * s, const_bstring b)
+ *
+ *  Push the bstring b back onto the front of the bStream's buffer, so that
+ *  its characters are read before those that actually come from the core
+ *  stream.  BSTR_ERR is returned if s or its buffer is NULL; otherwise the
+ *  result of the insertion is returned.
+ */
+int bsunread (struct bStream * s, const_bstring b) {
+    if (s != NULL && s->buff != NULL) {
+        /* Insert at position 0 of the buffer. */
+        return binsert (s->buff, 0, b, (unsigned char) '?');
+    }
+    return BSTR_ERR;
+}
+
+/*  int bspeek (bstring r, const struct bStream * s)
+ *
+ *  Copy into r the characters currently held in the bStream's buffer, i.e.
+ *  those that will be read prior to reads from the core stream.  BSTR_ERR
+ *  is returned if s or its buffer is NULL; otherwise the result of the
+ *  assignment is returned.
+ */
+int bspeek (bstring r, const struct bStream * s) {
+    if (s != NULL && s->buff != NULL) {
+        return bassign (r, s->buff);
+    }
+    return BSTR_ERR;
+}
+
+/*  bstring bjoin (const struct bstrList * bl, const_bstring sep);
+ *
+ *  Join the entries of a bstrList into one bstring by sequentially 
+ *  concatenating them with the sep string in between.  A NULL sep joins the
+ *  entries without any separator, and an empty list yields an empty 
+ *  bstring.  If there is an error (invalid list, invalid entry, invalid 
+ *  separator, length overflow or out of memory) NULL is returned, otherwise
+ *  a bstring with the correct result is returned.
+ */
+bstring bjoin (const struct bstrList * bl, const_bstring sep) {
+bstring b;
+int i, c, v;
+
+    if (bl == NULL || bl->qty < 0) return NULL;
+    if (sep != NULL && (sep->slen < 0 || sep->data == NULL)) return NULL;
+
+    /* Nothing to join.  Handled up front because (qty - 1) separators would
+       otherwise produce a non-positive allocation size below. */
+    if (bl->qty == 0) return bfromcstr ("");
+    if (bl->entry == NULL) return NULL;
+
+    /* c accumulates the allocation size: all payload plus the final '\0' */
+    for (i = 0, c = 1; i < bl->qty; i++) {
+        if (bl->entry[i] == NULL || bl->entry[i]->data == NULL) return NULL;
+        v = bl->entry[i]->slen;
+        if (v < 0) return NULL;              /* Invalid input */
+        if (v > INT_MAX - c) return NULL;    /* Would wrap around */
+        c += v;
+    }
+
+    /* Add the (qty - 1) separators, again refusing to wrap around */
+    if (sep != NULL && sep->slen > 0 && bl->qty > 1) {
+        if (sep->slen > (INT_MAX - c) / (bl->qty - 1)) return NULL;
+        c += (bl->qty - 1) * sep->slen;
+    }
+
+    b = (bstring) bstr__alloc (sizeof (struct tagbstring));
+    if (NULL == b) return NULL; /* Out of memory */
+    b->data = (unsigned char *) bstr__alloc (c);
+    if (b->data == NULL) {
+        bstr__free (b);
+        return NULL;
+    }
+
+    b->mlen = c;
+    b->slen = c-1;
+
+    /* c is reused as the write position inside the result */
+    for (i = 0, c = 0; i < bl->qty; i++) {
+        if (i > 0 && sep != NULL) {
+            bstr__memcpy (b->data + c, sep->data, sep->slen);
+            c += sep->slen;
+        }
+        v = bl->entry[i]->slen;
+        bstr__memcpy (b->data + c, bl->entry[i]->data, v);
+        c += v;
+    }
+    b->data[c] = (unsigned char) '\0';
+    return b;
+}
+
+#define BSSSC_BUFF_LEN (256)
+
+/*  int bssplitscb (struct bStream * s, const_bstring splitStr, 
+ *    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm)
+ *
+ *  Iterate the set of disjoint sequential substrings read from a stream 
+ *  divided by any of the characters in splitStr.  An empty splitStr causes 
+ *  the whole stream to be iterated once.
+ *
+ *  Note: At the point of calling the cb function, the bStream pointer is 
+ *  pointed exactly at the position right after having read the split 
+ *  character.  The cb function can act on the stream by causing the bStream
+ *  pointer to move, and bssplitscb will continue by starting the next split
+ *  at the position of the pointer after the return from cb.
+ *
+ *  However, if the cb causes the bStream s to be destroyed then the cb must
+ *  return with a negative value, otherwise bssplitscb will continue in an 
+ *  undefined manner.
+ *
+ *  Returns BSTR_ERR on invalid parameters or if an internal allocation or
+ *  the character set construction fails.  A negative value returned by cb 
+ *  (or by the internal unread) stops the iteration and is passed through.
+ */
+int bssplitscb (struct bStream * s, const_bstring splitStr, 
+    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm) {
+struct charField chrs;
+bstring buff;
+int i, p, ret;
+
+    if (cb == NULL || s == NULL || s->readFnPtr == NULL 
+     || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+
+    if (NULL == (buff = bfromcstr (""))) return BSTR_ERR;
+
+    if (splitStr->slen == 0) {
+        /* No split characters: collect the whole stream, report it once */
+        while (bsreada (buff, s, BSSSC_BUFF_LEN) >= 0) ;
+        if ((ret = cb (parm, 0, buff)) > 0) 
+            ret = 0;
+    } else {
+        /* Without a valid character set the scan below would test against
+           uninitialized data, so a failure here aborts the split. */
+        if (buildCharField (&chrs, splitStr)) {
+            bdestroy (buff);
+            return BSTR_ERR;
+        }
+        /* p: stream offset of the current entry, i: scan index in buff */
+        ret = p = i = 0;
+        for (;;) {
+            if (i >= buff->slen) {
+                bsreada (buff, s, BSSSC_BUFF_LEN);
+                if (i >= buff->slen) {
+                    /* No more data: report the final entry */
+                    if (0 < (ret = cb (parm, p, buff))) ret = 0;
+                    break;
+                }
+            }
+            if (testInCharField (&chrs, buff->data[i])) {
+                struct tagbstring t;
+                unsigned char c;
+
+                /* Hand everything behind the split character back to the
+                   stream so that cb sees the stream positioned right
+                   after the split character */
+                blk2tbstr (t, buff->data + i + 1, buff->slen - (i + 1));
+                if ((ret = bsunread (s, &t)) < 0) break;
+                buff->slen = i;
+                c = buff->data[i];
+                buff->data[i] = (unsigned char) '\0';
+                if ((ret = cb (parm, p, buff)) < 0) break;
+                buff->data[i] = c;
+                buff->slen = 0;
+                p += i + 1;
+                i = -1;   /* becomes 0 through the increment below */
+            }
+            i++;
+        }
+    }
+
+    bdestroy (buff);
+    return ret;
+}
+
+/*  int bssplitstrcb (struct bStream * s, const_bstring splitStr, 
+ *    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm)
+ *
+ *  Iterate the set of disjoint sequential substrings read from a stream 
+ *  divided by the entire substring splitStr.  With an empty splitStr the 
+ *  stream is handed to cb in consecutive chunks of up to BSSSC_BUFF_LEN 
+ *  characters, each reported with an offset of 0.
+ *
+ *  Note: if the cb causes the bStream s to be destroyed then the cb must
+ *  return with a negative value, otherwise bssplitstrcb will continue in an
+ *  undefined manner.
+ *
+ *  Returns BSTR_ERR on invalid parameters or if the internal buffer cannot
+ *  be allocated.  A negative value returned by cb stops the iteration and 
+ *  is passed through; otherwise BSTR_OK is returned.
+ */
+int bssplitstrcb (struct bStream * s, const_bstring splitStr, 
+    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm) {
+bstring buff;
+int i, p, ret;
+
+    if (cb == NULL || s == NULL || s->readFnPtr == NULL 
+     || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+
+    /* A one character separator is handled by the character-set variant */
+    if (splitStr->slen == 1) return bssplitscb (s, splitStr, cb, parm);
+
+    if (NULL == (buff = bfromcstr (""))) return BSTR_ERR;
+
+    if (splitStr->slen == 0) {
+        ret = BSTR_OK;
+        while (bsreada (buff, s, BSSSC_BUFF_LEN) >= 0) {
+            if ((ret = cb (parm, 0, buff)) < 0) break;
+            buff->slen = 0;
+        }
+        /* Positive cb results are not errors.  Falling through to the
+           common exit makes sure buff is released on this path as well. */
+        if (ret > 0) ret = BSTR_OK;
+    } else {
+        /* p: stream offset of the current entry */
+        ret = p = 0;
+        for (;;) {
+            if ((ret = binstr (buff, 0, splitStr)) >= 0) {
+                /* Separator found at index ret: report what precedes it,
+                   then drop the entry and the separator from buff */
+                struct tagbstring t;
+                blk2tbstr (t, buff->data, ret);
+                i = ret + splitStr->slen;
+                if ((ret = cb (parm, p, &t)) < 0) break;
+                p += i;
+                bdelete (buff, 0, i);
+            } else {
+                /* No separator buffered yet: fetch more data */
+                bsreada (buff, s, BSSSC_BUFF_LEN);
+                if (bseof (s)) {
+                    /* Stream exhausted: report the final entry */
+                    if ((ret = cb (parm, p, buff)) > 0) ret = 0;
+                    break;
+                }
+            }
+        }
+    }
+
+    bdestroy (buff);
+    return ret;
+}
+
+/*  struct bstrList * bstrListCreate (void)
+ *
+ *  Create an empty bstrList with room for one entry.  NULL is returned if
+ *  either of the two required allocations fails.
+ */
+struct bstrList * bstrListCreate (void) {
+struct bstrList * list;
+
+    list = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+    if (list == NULL) return NULL;
+
+    list->entry = (bstring *) bstr__alloc (1*sizeof (bstring));
+    if (list->entry == NULL) {
+        /* Do not leak the list header when the entry array is missing. */
+        bstr__free (list);
+        return NULL;
+    }
+
+    list->qty = 0;
+    list->mlen = 1;
+    return list;
+}
+
+/*  int bstrListDestroy (struct bstrList * sl)
+ *
+ *  Destroy a bstrList that has been created by bsplit, bsplits or 
+ *  bstrListCreate, including every bstring it still holds.  BSTR_ERR is 
+ *  returned if sl is NULL or its quantity is negative, BSTR_OK otherwise.
+ */
+int bstrListDestroy (struct bstrList * sl) {
+int idx;
+
+    if (sl == NULL) return BSTR_ERR;
+    if (sl->qty < 0) return BSTR_ERR;
+
+    /* Release the individual strings; empty slots are skipped. */
+    idx = 0;
+    while (idx < sl->qty) {
+        if (sl->entry[idx] != NULL) {
+            bdestroy (sl->entry[idx]);
+            sl->entry[idx] = NULL;
+        }
+        idx++;
+    }
+
+    /* Mark the list as invalid, then release the array and the header. */
+    sl->mlen = -1;
+    sl->qty  = -1;
+    bstr__free (sl->entry);
+    sl->entry = NULL;
+    bstr__free (sl);
+    return BSTR_OK;
+}
+
+/*  int bstrListAlloc (struct bstrList * sl, int msz)
+ *
+ *  Make sure the list has memory for at least msz entries.  The capacity 
+ *  is first grown to the size suggested by snapUpSize; if that allocation
+ *  fails, exactly msz entries are requested instead.  BSTR_ERR is returned
+ *  for an invalid list, a non-positive msz, or if no memory is available.
+ */
+int bstrListAlloc (struct bstrList * sl, int msz) {
+bstring * grown;
+int target;
+size_t bytes;
+
+    if (sl == NULL || msz <= 0 || sl->entry == NULL) return BSTR_ERR;
+    if (sl->qty < 0 || sl->mlen <= 0 || sl->qty > sl->mlen) return BSTR_ERR;
+
+    /* Already large enough. */
+    if (msz <= sl->mlen) return BSTR_OK;
+
+    target = snapUpSize (msz);
+    bytes = ((size_t) target) * sizeof (bstring);
+    if (bytes < (size_t) target) return BSTR_ERR;    /* size wrapped */
+
+    grown = (bstring *) bstr__realloc (sl->entry, bytes);
+    if (grown == NULL) {
+        /* Retry with the exact amount that was asked for. */
+        target = msz;
+        bytes = ((size_t) target) * sizeof (bstring);
+        grown = (bstring *) bstr__realloc (sl->entry, bytes);
+        if (grown == NULL) return BSTR_ERR;
+    }
+
+    sl->entry = grown;
+    sl->mlen = target;
+    return BSTR_OK;
+}
+
+/*  int bstrListAllocMin (struct bstrList * sl, int msz)
+ *
+ *  Resize the list's entry array to hold exactly msz entries, or sl->qty
+ *  entries if that is greater.  BSTR_ERR is returned for an invalid list, 
+ *  a non-positive msz, or if the reallocation fails.
+ */
+int bstrListAllocMin (struct bstrList * sl, int msz) {
+bstring * resized;
+size_t bytes;
+
+    if (sl == NULL || msz <= 0 || sl->entry == NULL) return BSTR_ERR;
+    if (sl->qty < 0 || sl->mlen <= 0 || sl->qty > sl->mlen) return BSTR_ERR;
+
+    /* Never shrink below the number of entries in use. */
+    if (sl->qty > msz) msz = sl->qty;
+    if (msz == sl->mlen) return BSTR_OK;
+
+    bytes = ((size_t) msz) * sizeof (bstring);
+    if (bytes < (size_t) msz) return BSTR_ERR;    /* size wrapped */
+
+    resized = (bstring *) bstr__realloc (sl->entry, bytes);
+    if (resized == NULL) return BSTR_ERR;
+
+    sl->entry = resized;
+    sl->mlen = msz;
+    return BSTR_OK;
+}
+
+/*  int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
+ *    int (* cb) (void * parm, int ofs, int len), void * parm)
+ *
+ *  Starting at pos, report to cb the offset and length of each of the 
+ *  disjoint sequential substrings of str that are divided by the character
+ *  splitChar.
+ *
+ *  Note: bsplitcb works in lock step with cb: the length of str is 
+ *  re-examined after every call, so cb may modify str non-destructively. 
+ *  After a cb that returns a non-negative integer, bsplitcb continues 
+ *  1 character after the last detected split character and halts as soon 
+ *  as the length of str falls below this point.  However, if the cb 
+ *  function destroys str, then it *must* return with a negative value, 
+ *  otherwise bsplitcb will continue in an undefined manner.
+ *
+ *  A negative value returned by cb stops the iteration and is passed 
+ *  through; BSTR_ERR is returned on invalid parameters.
+ */
+int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
+    int (* cb) (void * parm, int ofs, int len), void * parm) {
+int start, end, rc;
+
+    if (cb == NULL || str == NULL) return BSTR_ERR;
+    if (pos < 0 || pos > str->slen) return BSTR_ERR;
+
+    start = pos;
+    for (;;) {
+        /* Advance end to the next split character or the end of str. */
+        end = start;
+        while (end < str->slen && str->data[end] != splitChar) end++;
+
+        rc = cb (parm, start, end - start);
+        if (rc < 0) return rc;
+
+        /* Skip the split character itself. */
+        start = end + 1;
+        if (start > str->slen) break;
+    }
+    return BSTR_OK;
+}
+
+/*  int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
+ *    int (* cb) (void * parm, int ofs, int len), void * parm)
+ *
+ *  Iterate the set of disjoint sequential substrings over str divided by any 
+ *  of the characters in splitStr.  An empty splitStr causes the whole str to
+ *  be iterated once.
+ *
+ *  Note: bsplitscb behaves in sequential lock step with calls to cb, so a
+ *  non-destructive modification of str from within the cb function is 
+ *  permitted.  After returning from a cb that returns a non-negative 
+ *  integer, bsplitscb continues from the position 1 character after the 
+ *  last detected split character and it will halt immediately if the 
+ *  length of str falls below this point.  However, if the cb function 
+ *  destroys str, then it *must* return with a negative value, otherwise 
+ *  bsplitscb will continue in an undefined manner.
+ *
+ *  Returns BSTR_ERR on invalid parameters.  A negative value returned by cb
+ *  stops the iteration and is passed through.
+ */
+int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
+    int (* cb) (void * parm, int ofs, int len), void * parm) {
+struct charField chrs;
+int i, p, ret;
+
+    if (cb == NULL || str == NULL || pos < 0 || pos > str->slen 
+     || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+    if (splitStr->slen == 0) {
+        if ((ret = cb (parm, 0, str->slen)) > 0) ret = 0;
+        return ret;
+    }
+
+    /* The split characters are read below, so they must be present */
+    if (splitStr->data == NULL) return BSTR_ERR;
+
+    if (splitStr->slen == 1) 
+        return bsplitcb (str, splitStr->data[0], pos, cb, parm);
+
+    /* Scanning with a character set that failed to build would test
+       against uninitialized data */
+    if (buildCharField (&chrs, splitStr)) return BSTR_ERR;
+
+    p = pos;
+    do {
+        /* Find the next split character, or the end of str */
+        for (i=p; i < str->slen; i++) {
+            if (testInCharField (&chrs, str->data[i])) break;
+        }
+        if ((ret = cb (parm, p, i - p)) < 0) return ret;
+        p = i + 1;
+    } while (p <= str->slen);
+    return BSTR_OK;
+}
+
+/*  int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
+ *    int (* cb) (void * parm, int ofs, int len), void * parm)
+ *
+ *  Iterate the set of disjoint sequential substrings over str divided by the 
+ *  substring splitStr.  An empty splitStr causes each character of str, 
+ *  starting at pos, to be iterated individually.
+ *
+ *  Note: bsplitstrcb behaves in sequential lock step with calls to cb, so a
+ *  non-destructive modification of str from within the cb function is 
+ *  permitted.  After returning from a cb that returns a non-negative 
+ *  integer, bsplitstrcb continues from the position right after the last 
+ *  detected split string.  However, if the cb function destroys str, then
+ *  it *must* return with a negative value, otherwise bsplitstrcb will 
+ *  continue in an undefined manner.
+ *
+ *  Returns BSTR_ERR on invalid parameters.  A negative value returned by cb
+ *  stops the iteration and is passed through.
+ */
+int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
+    int (* cb) (void * parm, int ofs, int len), void * parm) {
+int i, p, ret;
+
+    if (cb == NULL || str == NULL || pos < 0 || pos > str->slen 
+     || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+
+    if (0 == splitStr->slen) {
+        for (i=pos; i < str->slen; i++) {
+            if ((ret = cb (parm, i, 1)) < 0) return ret;
+        }
+        return BSTR_OK;
+    }
+
+    if (splitStr->slen == 1) 
+        return bsplitcb (str, splitStr->data[0], pos, cb, parm);
+
+    /* p: start of the current entry, i: position being tested */
+    for (i=p=pos; i <= str->slen - splitStr->slen; i++) {
+        if (0 == bstr__memcmp (splitStr->data, str->data + i, splitStr->slen)) {
+            if ((ret = cb (parm, p, i - p)) < 0) return ret;
+            /* Continue directly behind the separator.  The loop increment
+               supplies the final step, hence slen - 1 here; stepping by the
+               full slen would skip a character and miss a separator that
+               immediately follows this one. */
+            i += splitStr->slen - 1;
+            p = i + 1;
+        }
+    }
+    /* Whatever follows the last separator is the final entry */
+    if ((ret = cb (parm, p, str->slen - p)) < 0) return ret;
+    return BSTR_OK;
+}
+
+/* Context handed to bscb through the parm pointer of the split callbacks. */
+struct genBstrList {
+    bstring b;               /* string being split; substrings are copied out of it */
+    struct bstrList * bl;    /* list that collects the substrings */
+};
+
+/*  static int bscb (void * parm, int ofs, int len)
+ *
+ *  Split callback used by bsplit, bsplits and bsplitstr.  parm points to a
+ *  struct genBstrList; the substring of its string b starting at ofs with
+ *  length len is copied and appended to its list bl, growing the list's
+ *  entry array when it is full.  Returns BSTR_OK on success and BSTR_ERR if
+ *  the list cannot be grown or the substring cannot be created.
+ */
+static int bscb (void * parm, int ofs, int len) {
+struct genBstrList * g = (struct genBstrList *) parm;
+bstring piece;
+
+    if (g->bl->qty >= g->bl->mlen) {
+        int mlen = g->bl->mlen * 2;
+        bstring * tbl;
+
+        /* Keep doubling until the next entry fits; a capacity that stops
+           growing signals a numeric wrap around */
+        while (g->bl->qty >= mlen) {
+            if (mlen < g->bl->mlen) return BSTR_ERR;
+            mlen += mlen;
+        }
+
+        tbl = (bstring *) bstr__realloc (g->bl->entry, sizeof (bstring) * mlen);
+        if (tbl == NULL) return BSTR_ERR;
+
+        g->bl->entry = tbl;
+        g->bl->mlen = mlen;
+    }
+
+    /* A failed copy must not be stored as a list entry: callers would
+       receive a list containing a NULL bstring although the split was
+       reported as successful. */
+    piece = bmidstr (g->b, ofs, len);
+    if (piece == NULL) return BSTR_ERR;
+
+    g->bl->entry[g->bl->qty] = piece;
+    g->bl->qty++;
+    return BSTR_OK;
+}
+
+/*  struct bstrList * bsplit (const_bstring str, unsigned char splitChar)
+ *
+ *  Build a bstrList holding copies of the sequential substrings of str that
+ *  are divided by the character splitChar.  NULL is returned if str is 
+ *  invalid, if memory cannot be allocated, or if the split fails.
+ */
+struct bstrList * bsplit (const_bstring str, unsigned char splitChar) {
+struct genBstrList ctx;
+struct bstrList * list;
+
+    if (str == NULL) return NULL;
+    if (str->data == NULL || str->slen < 0) return NULL;
+
+    list = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+    if (list == NULL) return NULL;
+
+    /* Start out with room for four entries; bscb grows the array. */
+    list->qty = 0;
+    list->mlen = 4;
+    list->entry = (bstring *) bstr__alloc (list->mlen * sizeof (bstring));
+    if (list->entry == NULL) {
+        bstr__free (list);
+        return NULL;
+    }
+
+    ctx.b = (bstring) str;
+    ctx.bl = list;
+    if (bsplitcb (str, splitChar, 0, bscb, &ctx) >= 0) return ctx.bl;
+
+    /* The split failed part way: discard what was collected so far. */
+    bstrListDestroy (ctx.bl);
+    return NULL;
+}
+
+/*  struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr)
+ *
+ *  Build a bstrList holding copies of the sequential substrings of str that
+ *  are divided by the entire substring splitStr.  NULL is returned if str 
+ *  is invalid, if memory cannot be allocated, or if the split fails.
+ */
+struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr) {
+struct genBstrList ctx;
+struct bstrList * list;
+
+    if (str == NULL) return NULL;
+    if (str->data == NULL || str->slen < 0) return NULL;
+
+    list = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+    if (list == NULL) return NULL;
+
+    /* Start out with room for four entries; bscb grows the array. */
+    list->qty = 0;
+    list->mlen = 4;
+    list->entry = (bstring *) bstr__alloc (list->mlen * sizeof (bstring));
+    if (list->entry == NULL) {
+        bstr__free (list);
+        return NULL;
+    }
+
+    ctx.b = (bstring) str;
+    ctx.bl = list;
+    if (bsplitstrcb (str, splitStr, 0, bscb, &ctx) >= 0) return ctx.bl;
+
+    /* The split failed part way: discard what was collected so far. */
+    bstrListDestroy (ctx.bl);
+    return NULL;
+}
+
+/*  struct bstrList * bsplits (const_bstring str, bstring splitStr)
+ *
+ *  Build a bstrList holding copies of the sequential substrings of str that
+ *  are divided by any of the characters in splitStr.  An empty splitStr 
+ *  causes a single entry bstrList containing a copy of str to be returned.
+ *  NULL is returned if str or splitStr is invalid, if memory cannot be 
+ *  allocated, or if the split fails.
+ */
+struct bstrList * bsplits (const_bstring str, const_bstring splitStr) {
+struct genBstrList ctx;
+struct bstrList * list;
+
+    if (str == NULL || str->slen < 0 || str->data == NULL) return NULL;
+    if (splitStr == NULL || splitStr->slen < 0 || splitStr->data == NULL)
+        return NULL;
+
+    list = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+    if (list == NULL) return NULL;
+
+    /* Start out with room for four entries; bscb grows the array. */
+    list->qty = 0;
+    list->mlen = 4;
+    list->entry = (bstring *) bstr__alloc (list->mlen * sizeof (bstring));
+    if (list->entry == NULL) {
+        bstr__free (list);
+        return NULL;
+    }
+
+    ctx.b = (bstring) str;
+    ctx.bl = list;
+    if (bsplitscb (str, splitStr, 0, bscb, &ctx) >= 0) return ctx.bl;
+
+    /* The split failed part way: discard what was collected so far. */
+    bstrListDestroy (ctx.bl);
+    return NULL;
+}
+
+#if defined (__TURBOC__) && !defined (__BORLANDC__)
+# ifndef BSTRLIB_NOVSNP
+#  define BSTRLIB_NOVSNP
+# endif
+#endif
+
+/* exvsnprintf(r,b,n,f,a) formats the va_list a under control of the format
+   string f into the character buffer b, passing n as the size limit where 
+   the selected function accepts one, and stores the function's result in r.
+   Which function is used depends on the compiler: */
+
+/* Give WATCOM C/C++, MSVC some latitude for their non-support of vsnprintf */
+#if defined(__WATCOMC__) || defined(_MSC_VER)
+#define exvsnprintf(r,b,n,f,a) {r = _vsnprintf (b,n,f,a);}
+#else
+#ifdef BSTRLIB_NOVSNP
+/* This is just a hack.  If you are using a system without a vsnprintf, it is 
+   not recommended that bformat be used at all.  Note that n is ignored here
+   (vsprintf performs no bounds checking) and r is always set to -1. */
+#define exvsnprintf(r,b,n,f,a) {vsprintf (b,f,a); r = -1;}
+#define START_VSNBUFF (256)
+#else
+
+#ifdef __GNUC__
+/* Something is making gcc complain about this prototype not being here, so 
+   I've just gone ahead and put it in. */
+extern int vsnprintf (char *buf, size_t count, const char *format, va_list arg);
+#endif
+
+/* Default: the standard vsnprintf */
+#define exvsnprintf(r,b,n,f,a) {r = vsnprintf (b,n,f,a);}
+#endif
+#endif
+
+#if !defined (BSTRLIB_NOVSNP)
+
+#ifndef START_VSNBUFF
+#define START_VSNBUFF (16)
+#endif
+
+/* On IRIX vsnprintf returns n-1 when the operation would overflow the target 
+   buffer, WATCOM and MSVC both return -1, while C99 requires that the 
+   returned value be exactly what the length would be if the buffer would be
+   large enough.  This leads to the idea that if the return value is larger 
+   than n, then changing n to the return value will reduce the number of
+   iterations required. */
+
+/*  int bformata (bstring b, const char * fmt, ...)
+ *
+ *  After the first parameter, it takes the same parameters as printf (), but 
+ *  rather than outputting results to stdio, it appends the results to 
+ *  a bstring which contains what would have been output. Note that if there 
+ *  is an early generation of a '\0' character, the bstring will be truncated 
+ *  to this end point.
+ *
+ *  Returns BSTR_ERR on invalid parameters or if the temporary buffer cannot
+ *  be allocated; otherwise the result of appending the formatted text to b
+ *  is returned.
+ */
+int bformata (bstring b, const char * fmt, ...) {
+va_list arglist;
+bstring buff;
+int n, r;
+
+    if (b == NULL || fmt == NULL || b->data == NULL || b->mlen <= 0 
+     || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
+
+    /* Since the length is not determinable beforehand, a search is
+       performed using the truncating "vsnprintf" call (to avoid buffer
+       overflows) on increasing potential sizes for the output result. */
+
+    /* First guess: twice the format length, but at least START_VSNBUFF.
+       If that cannot be allocated, retry with the smallest useful size. */
+    if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
+    if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
+        n = 1;
+        if (NULL == (buff = bfromcstralloc (n + 2, ""))) return BSTR_ERR;
+    }
+
+    for (;;) {
+        /* The argument list is restarted for every formatting attempt */
+        va_start (arglist, fmt);
+        exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
+        va_end (arglist);
+
+        /* Force termination, then measure what was actually produced */
+        buff->data[n] = (unsigned char) '\0';
+        buff->slen = (int) (strlen) ((char *) buff->data);
+
+        /* Output shorter than n means nothing was truncated */
+        if (buff->slen < n) break;
+
+        /* Otherwise grow: use the reported length if it is larger than n,
+           else double the buffer */
+        if (r > n) n = r; else n += n;
+
+        if (BSTR_OK != balloc (buff, n + 2)) {
+            bdestroy (buff);
+            return BSTR_ERR;
+        }
+    }
+
+    r = bconcat (b, buff);
+    bdestroy (buff);
+    return r;
+}
+
+/*  int bassignformat (bstring b, const char * fmt, ...)
+ *
+ *  After the first parameter, it takes the same parameters as printf (), but 
+ *  rather than outputting results to stdio, it outputs the results to 
+ *  the bstring parameter b. Note that if there is an early generation of a 
+ *  '\0' character, the bstring will be truncated to this end point.
+ *
+ *  Returns BSTR_ERR on invalid parameters or if the temporary buffer cannot
+ *  be allocated; otherwise the result of assigning the formatted text to b
+ *  is returned.
+ */
+int bassignformat (bstring b, const char * fmt, ...) {
+va_list arglist;
+bstring buff;
+int n, r;
+
+    if (b == NULL || fmt == NULL || b->data == NULL || b->mlen <= 0 
+     || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
+
+    /* Since the length is not determinable beforehand, a search is
+       performed using the truncating "vsnprintf" call (to avoid buffer
+       overflows) on increasing potential sizes for the output result. */
+
+    /* First guess: twice the format length, but at least START_VSNBUFF.
+       If that cannot be allocated, retry with the smallest useful size. */
+    if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
+    if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
+        n = 1;
+        if (NULL == (buff = bfromcstralloc (n + 2, ""))) return BSTR_ERR;
+    }
+
+    for (;;) {
+        /* The argument list is restarted for every formatting attempt */
+        va_start (arglist, fmt);
+        exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
+        va_end (arglist);
+
+        /* Force termination, then measure what was actually produced */
+        buff->data[n] = (unsigned char) '\0';
+        buff->slen = (int) (strlen) ((char *) buff->data);
+
+        /* Output shorter than n means nothing was truncated */
+        if (buff->slen < n) break;
+
+        /* Otherwise grow: use the reported length if it is larger than n,
+           else double the buffer */
+        if (r > n) n = r; else n += n;
+
+        if (BSTR_OK != balloc (buff, n + 2)) {
+            bdestroy (buff);
+            return BSTR_ERR;
+        }
+    }
+
+    r = bassign (b, buff);
+    bdestroy (buff);
+    return r;
+}
+
+/*  bstring bformat (const char * fmt, ...)
+ *
+ *  Takes the same parameters as printf (), but rather than outputting results
+ *  to stdio, it forms a bstring which contains what would have been output.
+ *  Note that if there is an early generation of a '\0' character, the 
+ *  bstring will be truncated to this end point.
+ *
+ *  Returns NULL if fmt is NULL or if memory cannot be allocated; otherwise
+ *  the newly created bstring is returned.
+ */
+bstring bformat (const char * fmt, ...) {
+va_list arglist;
+bstring buff;
+int n, r;
+
+    if (fmt == NULL) return NULL;
+
+    /* Since the length is not determinable beforehand, a search is
+       performed using the truncating "vsnprintf" call (to avoid buffer
+       overflows) on increasing potential sizes for the output result. */
+
+    /* First guess: twice the format length, but at least START_VSNBUFF.
+       If that cannot be allocated, retry with the smallest useful size. */
+    if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
+    if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
+        n = 1;
+        if (NULL == (buff = bfromcstralloc (n + 2, ""))) return NULL;
+    }
+
+    for (;;) {
+        /* The argument list is restarted for every formatting attempt */
+        va_start (arglist, fmt);
+        exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
+        va_end (arglist);
+
+        /* Force termination, then measure what was actually produced */
+        buff->data[n] = (unsigned char) '\0';
+        buff->slen = (int) (strlen) ((char *) buff->data);
+
+        /* Output shorter than n means nothing was truncated */
+        if (buff->slen < n) break;
+
+        /* Otherwise grow: use the reported length if it is larger than n,
+           else double the buffer */
+        if (r > n) n = r; else n += n;
+
+        if (BSTR_OK != balloc (buff, n + 2)) {
+            bdestroy (buff);
+            return NULL;
+        }
+    }
+
+    /* The temporary buffer itself becomes the result */
+    return buff;
+}
+
+/*  int bvcformata (bstring b, int count, const char * fmt, va_list arg)
+ *
+ *  The bvcformata function formats data under control of the format control 
+ *  string fmt and attempts to append the result to b.  The fmt parameter is 
+ *  the same as that of the printf function.  The variable argument list is 
+ *  replaced with arg, which has been initialized by the va_start macro.
+ *  The size of the output is upper bounded by count.  If the required output
+ *  exceeds count, the string b is not augmented with any contents and a value
+ *  below BSTR_ERR is returned.  If a value below -count is returned then it
+ *  is recommended that the negative of this value be used as an update to the
+ *  count in a subsequent pass.  On other errors, such as running out of 
+ *  memory, parameter errors or numeric wrap around BSTR_ERR is returned.  
+ *  BSTR_OK is returned when the output is successfully generated and 
+ *  appended to b.
+ *
+ *  Note: There is no sanity checking of arg, and this function is
+ *  destructive of the contents of b from the b->slen point onward.  If there 
+ *  is an early generation of a '\0' character, the bstring will be truncated 
+ *  to this end point.
+ */
+int bvcformata (bstring b, int count, const char * fmt, va_list arg) {
+int n, r, l;
+
+    if (b == NULL || fmt == NULL || count <= 0 || b->data == NULL
+     || b->mlen <= 0 || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
+
+    /* n is the largest acceptable length of b after the append.  The 
+       comparison is presumably meant to catch numeric wrap around of 
+       b->slen + count -- NOTE(review): confirm */
+    if (count > (n = b->slen + count) + 2) return BSTR_ERR;
+    if (BSTR_OK != balloc (b, n + 2)) return BSTR_ERR;
+
+    /* Format directly behind the current contents of b */
+    exvsnprintf (r, (char *) b->data + b->slen, count + 2, fmt, arg);
+
+    /* Did the operation complete successfully within bounds? */
+
+    if (n >= (l = b->slen + (int) (strlen) ((const char *) b->data + b->slen))) {
+        b->slen = l;
+        return BSTR_OK;
+    }
+
+    /* Abort, since the buffer was not large enough.  The return value 
+       tries to help set what the retry length should be. */
+
+    /* Cut b back to its original contents */
+    b->data[b->slen] = '\0';
+    /* Suggest the reported length if it is usable, otherwise twice the
+       count (or INT_MAX if doubling count wrapped around) */
+    if (r > count+1) l = r; else {
+        l = count+count;
+        if (count > l) l = INT_MAX;
+    }
+    /* The suggestion is returned negated and kept strictly below BSTR_ERR */
+    n = -l;
+    if (n > BSTR_ERR-1) n = BSTR_ERR-1;
+    return n;
+}
+
+#endif
diff --git a/bench/src/strUtil.c b/bench/src/strUtil.c
new file mode 100644
index 0000000..8a4c429
--- /dev/null
+++ b/bench/src/strUtil.c
@@ -0,0 +1,319 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  strUtil.c
+ *
+ *      Description:  Utility string routines building upon bstrlib
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com.
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#include <strUtil.h>
+#include <math.h>
+#include <likwid.h>
+
+/*
+ * Convert a decimal string into a non-negative int.
+ *
+ * str: NUL-terminated string that starts with the number (trailing
+ *      characters after the digits are ignored, as strtoul does).
+ *
+ * Returns the parsed value (0 .. INT_MAX) or -EINVAL if str is NULL,
+ * contains no digits, or holds a value that does not fit into an int.
+ * A message describing the problem is written to stderr on failure.
+ */
+static int str2int(const char* str)
+{
+    char* endptr;
+    unsigned long val;
+
+    if (str == NULL)
+    {
+        fprintf(stderr, "Cannot convert NULL string to integer\n");
+        return -EINVAL;
+    }
+
+    errno = 0;
+    val = strtoul(str, &endptr, 10);
+
+    if (endptr == str)
+    {
+        fprintf(stderr, "No digits were found in %s\n", str);
+        return -EINVAL;
+    }
+
+    /* strtoul signals overflow through errno.  Values above INT_MAX would
+     * be mangled by the cast below, and a leading minus sign makes strtoul
+     * return a huge wrapped value, so both are rejected here as well. */
+    if ((errno != 0) || (val > (unsigned long) INT_MAX))
+    {
+        fprintf(stderr, "Value in string %s out of range\n", str);
+        return -EINVAL;
+    }
+
+    return (int) val;
+}
+
+/*
+ * Translate a size specification ("100kB", "100KB", "2MB", "1GB" or
+ * "4096B") into a number of elements of the given data type.
+ *
+ * str:  size string, a decimal number directly followed by the unit.
+ *       Units are decimal (kB = 1000 bytes, MB = 10^6, GB = 10^9).
+ * type: element type (SINGLE, DOUBLE or INT) used to turn bytes into
+ *       an element count.
+ *
+ * Returns the number of elements, or 0 if the string cannot be parsed,
+ * the unit is unknown or the type is not one of the handled values.
+ */
+uint64_t bstr_to_doubleSize(const_bstring str, DataType type)
+{
+    int value;
+    int length = blength(str);
+    int unitLength = 2;
+    const char* raw = bdata(str);
+    bstring unit;
+    bstring sizeStr;
+    uint64_t sizeU = 0;
+    uint64_t junk = 0;
+    uint64_t bytesize = 0;
+
+    /* A plain byte count like "4096B" has a one character unit.  Taking
+     * the last two characters unconditionally would yield "6B" here and
+     * the byte unit could never match. */
+    if ((raw != NULL) && (length >= 2) && (raw[length-1] == 'B')
+        && (raw[length-2] >= '0') && (raw[length-2] <= '9'))
+    {
+        unitLength = 1;
+    }
+
+    unit = bmidstr(str, length-unitLength, unitLength);
+    sizeStr = bmidstr(str, 0, length-unitLength);
+    if ((unit == NULL) || (blength(sizeStr) == 0))
+    {
+        goto cleanup;
+    }
+
+    value = str2int(bdata(sizeStr));
+    if (value < 0)
+    {
+        goto cleanup;
+    }
+    sizeU = (uint64_t) value;
+
+    switch (type)
+    {
+        case SINGLE:
+            bytesize = sizeof(float);
+            break;
+
+        case DOUBLE:
+            bytesize = sizeof(double);
+            break;
+
+        case INT:
+            bytesize = sizeof(int);
+            break;
+    }
+    /* Unhandled type: bail out instead of dividing by zero below. */
+    if (bytesize == 0)
+    {
+        goto cleanup;
+    }
+
+    /* biseqcstr returns 1 on equality; compare explicitly so that its
+     * error code is not mistaken for a match. */
+    if ((biseqcstr(unit, "kB") == 1)||(biseqcstr(unit, "KB") == 1))
+    {
+        junk = (sizeU *1000)/bytesize;
+    }
+    else if (biseqcstr(unit, "MB") == 1)
+    {
+        junk = (sizeU *1000000)/bytesize;
+    }
+    else if (biseqcstr(unit, "GB") == 1)
+    {
+        junk = (sizeU *1000000000)/bytesize;
+    }
+    else if (biseqcstr(unit, "B") == 1)
+    {
+        junk = (sizeU)/bytesize;
+    }
+
+cleanup:
+    /* Single exit so both temporaries are released on every path. */
+    bdestroy(unit);
+    bdestroy(sizeStr);
+    return junk;
+}
+
+
+/*
+ * Parse the workgroup part of a likwid-bench work description and fill
+ * the size, thread count and processor list of group.
+ *
+ * Accepted layouts (fields separated by ':'):
+ *   <domain>:<size>                     all active HW threads are used
+ *   <domain>:<size>:<threads>
+ *   <domain>:<size>:<threads>:<a>:<b>   the two extra fields are passed
+ *                                       on unchanged in the CPU expression
+ *
+ * group: workgroup to fill; processorIds is allocated here.
+ * str:   workgroup string.
+ * type:  data type used to convert the size into an element count.
+ *
+ * Returns a newly allocated copy of the domain name (owned by the caller)
+ * or NULL on error.  On error group->processorIds is NULL or untouched.
+ */
+bstring parse_workgroup(Workgroup* group, const_bstring str, DataType type)
+{
+    CpuTopology_t topo;
+    struct bstrList* tokens;
+    bstring cpustr = NULL;
+    bstring domain = NULL;
+    int numThreads = 0;
+
+    tokens = bsplit(str,':');
+    if (tokens == NULL)
+    {
+        fprintf(stderr, "Misformated workgroup string\n");
+        return NULL;
+    }
+
+    if (tokens->qty == 2)
+    {
+        topo = get_cpuTopology();
+        numThreads = topo->activeHWThreads;
+        cpustr = bformat("E:%s:%d", bdata(tokens->entry[0]), numThreads );
+    }
+    else if ((tokens->qty == 3) || (tokens->qty == 5))
+    {
+        /* Validate the thread count first so that no CPU expression has
+         * to be released when it is invalid. */
+        numThreads = str2int(bdata(tokens->entry[2]));
+        if (numThreads < 0)
+        {
+            fprintf(stderr, "Cannot convert %s to integer\n", bdata(tokens->entry[2]));
+            goto cleanup;
+        }
+        if (tokens->qty == 3)
+        {
+            cpustr = bformat("E:%s:%s", bdata(tokens->entry[0]), bdata(tokens->entry[2]));
+        }
+        else
+        {
+            cpustr = bformat("E:%s:%s:%s:%s", bdata(tokens->entry[0]),
+                                              bdata(tokens->entry[2]),
+                                              bdata(tokens->entry[3]),
+                                              bdata(tokens->entry[4]));
+        }
+    }
+    else
+    {
+        fprintf(stderr, "Misformated workgroup string\n");
+        goto cleanup;
+    }
+
+    if (cpustr == NULL)
+    {
+        fprintf(stderr, "No more memory to allocate CPU expression\n");
+        goto cleanup;
+    }
+
+    group->size = bstr_to_doubleSize(tokens->entry[1], type);
+    if (group->size == 0)
+    {
+        fprintf(stderr, "Stream size cannot be read, should look like <domain>:<size>\n");
+        goto cleanup;
+    }
+    group->processorIds = (int*) malloc(numThreads * sizeof(int));
+    if (group->processorIds == NULL)
+    {
+        fprintf(stderr, "No more memory to allocate list of processors\n");
+        goto cleanup;
+    }
+    group->numberOfThreads = numThreads;
+    if (cpustr_to_cpulist(bdata(cpustr),group->processorIds, numThreads) < 0 )
+    {
+        /* Reset the pointer so a later cleanup cannot free it twice. */
+        free(group->processorIds);
+        group->processorIds = NULL;
+        goto cleanup;
+    }
+    domain = bstrcpy(tokens->entry[0]);
+
+cleanup:
+    /* Common exit: the temporary CPU expression and the token list are
+     * released on success and on every error path. */
+    bdestroy(cpustr);
+    bstrListDestroy(tokens);
+    return domain;
+}
+
+/*
+ * Parse the stream part of a work description and allocate
+ * group->streams accordingly.
+ *
+ * str holds comma separated definitions <index>:<domain>[:<offset>].
+ * Only the first numberOfStreams definitions are evaluated.
+ *
+ * Returns 0 on success and -1 on error.  On error all stream data
+ * allocated here is released again and group->streams is set to NULL.
+ */
+int parse_streams(Workgroup* group, const_bstring str, int numberOfStreams)
+{
+    int ret = 0;
+    struct bstrList* tokens;
+    struct bstrList* subtokens;
+
+    tokens = bsplit(str,',');
+    if (tokens == NULL)
+    {
+        return -1;
+    }
+
+    if (tokens->qty < numberOfStreams)
+    {
+        fprintf(stderr, "Error: Testcase requires at least %d streams\n", numberOfStreams);
+        bstrListDestroy(tokens);
+        return -1;
+    }
+
+    group->streams = (Stream*) malloc(numberOfStreams * sizeof(Stream));
+    if (group->streams == NULL)
+    {
+        bstrListDestroy(tokens);
+        return -1;
+    }
+    /* Start from a defined state: an index may be missing from the
+     * definitions, and cleanup must be able to tell which domains exist. */
+    for (int i=0; i<numberOfStreams; i++)
+    {
+        group->streams[i].domain = NULL;
+        group->streams[i].offset = 0;
+    }
+
+    for (int i=0; (i<numberOfStreams) && (ret == 0); i++)
+    {
+        subtokens = bsplit(tokens->entry[i],':');
+        if (subtokens == NULL)
+        {
+            ret = -1;
+            break;
+        }
+        if (subtokens->qty >= 2)
+        {
+            int index = str2int(bdata(subtokens->entry[0]));
+            /* Either condition alone makes the index unusable. */
+            if ((index < 0) || (index >= numberOfStreams))
+            {
+                fprintf(stderr, "Error in parsing stream definition %s\n", bdata(tokens->entry[i]));
+                ret = -1;
+            }
+            else
+            {
+                /* Drop a domain stored by an earlier definition that
+                 * used the same index. */
+                bdestroy(group->streams[index].domain);
+                group->streams[index].domain = bstrcpy(subtokens->entry[1]);
+                group->streams[index].offset = 0;
+                if (subtokens->qty == 3)
+                {
+                    group->streams[index].offset = str2int(bdata(subtokens->entry[2]));
+                    if (group->streams[index].offset < 0)
+                    {
+                        ret = -1;
+                    }
+                }
+            }
+        }
+        else
+        {
+            fprintf(stderr, "Error in parsing stream definition %s\n", bdata(tokens->entry[i]));
+            ret = -1;
+        }
+        bstrListDestroy(subtokens);
+    }
+
+    if (ret != 0)
+    {
+        /* Undo everything allocated so far. */
+        for (int i=0; i<numberOfStreams; i++)
+        {
+            bdestroy(group->streams[i].domain);
+        }
+        free(group->streams);
+        group->streams = NULL;
+    }
+
+    bstrListDestroy(tokens);
+    return ret;
+}
+
+/*
+ * Parse a complete work description "<workgroup>[-<streams>]" into group.
+ *
+ * Without a stream part every stream is placed in the workgroup's own
+ * domain with offset 0.  After parsing, group->size is divided by
+ * numberOfStreams.
+ *
+ * Returns 0 on success and 1 on error.  On error the processor list
+ * allocated while parsing the workgroup part is released again.
+ */
+int bstr_to_workgroup(Workgroup* group, const_bstring str, DataType type, int numberOfStreams)
+{
+    int ret = 1;
+    struct bstrList* tokens;
+    bstring domain = NULL;
+
+    /* The size is divided by the stream count at the end. */
+    if (numberOfStreams <= 0)
+    {
+        fprintf(stderr, "Error: Invalid number of streams %d\n", numberOfStreams);
+        return 1;
+    }
+
+    tokens = bsplit(str,'-');
+    if (tokens == NULL)
+    {
+        fprintf(stderr, "Error in parsing workgroup string %s\n", bdata(str));
+        return 1;
+    }
+
+    if ((tokens->qty == 1) || (tokens->qty == 2))
+    {
+        domain = parse_workgroup(group, tokens->entry[0], type);
+        if (domain == NULL)
+        {
+            goto cleanup;
+        }
+        if (tokens->qty == 2)
+        {
+            /* A failed stream definition must not be reported as success. */
+            if (parse_streams(group, tokens->entry[1], numberOfStreams) != 0)
+            {
+                free(group->processorIds);
+                group->processorIds = NULL;
+                goto cleanup;
+            }
+        }
+        else
+        {
+            group->streams = (Stream*) malloc(numberOfStreams * sizeof(Stream));
+            if (group->streams == NULL)
+            {
+                free(group->processorIds);
+                group->processorIds = NULL;
+                goto cleanup;
+            }
+            for (int i = 0; i< numberOfStreams; i++)
+            {
+                group->streams[i].domain = bstrcpy(domain);
+                group->streams[i].offset = 0;
+            }
+        }
+        group->size /= numberOfStreams;
+        ret = 0;
+    }
+    else
+    {
+        fprintf(stderr, "Error in parsing workgroup string %s\n", bdata(str));
+    }
+
+cleanup:
+    /* domain is only a temporary copy; the streams hold their own. */
+    bdestroy(domain);
+    bstrListDestroy(tokens);
+    return ret;
+}
+
+/*
+ * Release an array of workgroups together with the processor lists and
+ * stream domains stored in each entry.
+ *
+ * groupList:       address of the array pointer; set to NULL afterwards.
+ * numberOfGroups:  number of entries in the array.
+ * numberOfStreams: number of streams stored per workgroup.
+ */
+void workgroups_destroy(Workgroup** groupList, int numberOfGroups, int numberOfStreams)
+{
+    int i = 0, j = 0;
+    if ((groupList == NULL) || (*groupList == NULL))
+        return;
+    Workgroup* list = *groupList;
+    for (i = 0; i < numberOfGroups; i++)
+    {
+        free(list[i].processorIds);
+        /* A workgroup whose stream allocation failed has no stream array. */
+        if (list[i].streams != NULL)
+        {
+            for (j = 0; j < numberOfStreams; j++)
+            {
+                bdestroy(list[i].streams[j].domain);
+            }
+            free(list[i].streams);
+        }
+    }
+    free(list);
+    /* Do not leave the caller with a pointer to released memory. */
+    *groupList = NULL;
+}
diff --git a/bench/src/threads.c b/bench/src/threads.c
new file mode 100644
index 0000000..70a90ec
--- /dev/null
+++ b/bench/src/threads.c
@@ -0,0 +1,293 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  threads.c
+ *
+ *      Description:  High level interface to pthreads
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+/* #####   HEADER FILE INCLUDES   ######################################### */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+
+#include <errno.h>
+#include <threads.h>
+
+
+
+/* #####   EXPORTED VARIABLES   ########################################### */
+
+pthread_barrier_t threads_barrier;
+ThreadData* threads_data;
+ThreadGroup* threads_groups;
+
+/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
+
+static pthread_t* threads = NULL;
+static pthread_attr_t attr;
+static int numThreads = 0;
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE  ################## */
+/*
+ * Count the occurrences of character in str.  A NULL string yields 0.
+ * The terminating NUL byte takes part in the comparison, so asking for
+ * '\0' reports one hit for any non-NULL string.
+ */
+static int count_characters(const char *str, char character)
+{
+    int hits = 0;
+    const char *cursor;
+
+    if (!str)
+    {
+        return 0;
+    }
+
+    for (cursor = str; ; cursor++)
+    {
+        if (*cursor == character)
+        {
+            hits++;
+        }
+        /* Stop only after the terminator itself has been compared. */
+        if (*cursor == '\0')
+        {
+            break;
+        }
+    }
+
+    return hits;
+}
+
+/* Thread start routine that performs no work and returns a null pointer. */
+void* dummy_function(void* arg)
+{
+    (void) arg;
+    return NULL;
+}
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+
+
+
+/*
+ * Start a series of threads that run dummy_function.
+ *
+ * The number of creation attempts is the number of commas in the
+ * LIKWID_PIN environment variable or, if that is zero (or the variable
+ * is not set), the value of sysconf(_SC_NPROCESSORS_CONF).
+ *
+ * Returns the number of creation attempts.
+ */
+int threads_test()
+{
+    int cnt = 0;
+    int err;
+    pthread_t pid;
+    int likwid_pin = count_characters(getenv("LIKWID_PIN"), ',');
+    int max_cpus = sysconf(_SC_NPROCESSORS_CONF);
+    int max = likwid_pin;
+    if (likwid_pin == 0)
+    {
+        max = max_cpus;
+    }
+    while (cnt < max) {
+        err = pthread_create(&pid, NULL, dummy_function, NULL);
+        /* The threads are never joined; detach them so that their
+         * resources are released once dummy_function returns. */
+        if (err == 0)
+        {
+            pthread_detach(pid);
+        }
+        cnt++;
+    }
+    return cnt;
+}
+
+
+/*
+ * Set up the thread module for numberOfThreads threads: allocates the
+ * thread handles and the per-thread data, numbers the threads and
+ * initialises the shared barrier and the thread attributes (joinable).
+ *
+ * Terminates the program if the memory cannot be allocated.
+ */
+void
+threads_init(int numberOfThreads)
+{
+    int i;
+    numThreads = numberOfThreads;
+
+    threads = (pthread_t*) malloc(numThreads * sizeof(pthread_t));
+    /* malloc(0) may legitimately return NULL, so only a failed request
+     * for at least one element is treated as an error. */
+    if ((threads == NULL) && (numThreads > 0))
+    {
+        fprintf(stderr, "ERROR: Cannot allocate thread handles - %s\n", strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+    threads_data = (ThreadData*) malloc(numThreads * sizeof(ThreadData));
+    if ((threads_data == NULL) && (numThreads > 0))
+    {
+        fprintf(stderr, "ERROR: Cannot allocate thread data - %s\n", strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+
+    for(i = 0; i < numThreads; i++)
+    {
+        threads_data[i].numberOfThreads = numThreads;
+        threads_data[i].globalNumberOfThreads = numThreads;
+        threads_data[i].globalThreadId = i;
+        threads_data[i].threadId = i;
+    }
+
+    pthread_barrier_init(&threads_barrier, NULL, numThreads);
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+}
+
+
+/*
+ * Start all threads set up by threads_init.  Every thread runs
+ * startRoutine and receives a pointer to its own ThreadData entry.
+ *
+ * Terminates the program if a thread cannot be created, because
+ * threads_join would otherwise join an uninitialised thread handle.
+ */
+void
+threads_create(void *(*startRoutine)(void*))
+{
+    int i;
+    int err;
+
+    for(i = 0; i < numThreads; i++)
+    {
+        err = pthread_create(&threads[i],
+                &attr,
+                startRoutine,
+                (void*) &threads_data[i]);
+        if (err != 0)
+        {
+            /* pthread_create returns the error number directly. */
+            fprintf(stderr, "ERROR: Cannot create thread %d - %s\n", i, strerror(err));
+            exit(EXIT_FAILURE);
+        }
+    }
+}
+
+/*
+ * Partition the threads into numberOfGroups equally sized groups.
+ *
+ * Allocates threads_groups, stores the global thread ids of each group
+ * and updates the group related fields of every ThreadData entry.
+ * Terminates the program if the threads cannot be split evenly or if
+ * memory cannot be allocated.
+ */
+void
+threads_createGroups(int numberOfGroups)
+{
+    int i;
+    int j;
+    int numThreadsPerGroup;
+    int globalId = 0;
+
+    /* Check the group count first: it is the divisor of the modulo. */
+    if ((numberOfGroups <= 0) || (numThreads % numberOfGroups))
+    {
+        fprintf(stderr, "ERROR: Not enough threads %d to create %d groups\n",numThreads,numberOfGroups);
+        /* Continuing would use an undefined group size below. */
+        exit(EXIT_FAILURE);
+    }
+    numThreadsPerGroup = numThreads / numberOfGroups;
+
+    threads_groups = (ThreadGroup*) malloc(numberOfGroups * sizeof(ThreadGroup));
+    if (!threads_groups)
+    {
+        fprintf(stderr, "ERROR: Cannot allocate thread groups - %s\n", strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+
+    for (i = 0; i < numberOfGroups; i++)
+    {
+        threads_groups[i].numberOfThreads = numThreadsPerGroup;
+        threads_groups[i].threadIds = (int*) malloc(numThreadsPerGroup * sizeof(int));
+        if (!threads_groups[i].threadIds)
+        {
+            fprintf(stderr, "ERROR: Cannot allocate threadID list for thread groups - %s\n", strerror(errno));
+            exit(EXIT_FAILURE);
+        }
+
+        /* Global ids are handed out consecutively, group after group;
+         * threadId becomes the index inside the group. */
+        for (j = 0; j < numThreadsPerGroup; j++)
+        {
+            threads_data[globalId].threadId = j;
+            threads_data[globalId].groupId = i;
+            threads_data[globalId].numberOfGroups = numberOfGroups;
+            threads_data[globalId].numberOfThreads = numThreadsPerGroup;
+            threads_groups[i].threadIds[j] = globalId++;
+        }
+    }
+}
+
+
+/*
+ * Hand the same user data to every thread.  Without a copy function the
+ * structure is assigned directly, otherwise func(data, destination) is
+ * called once per thread.
+ */
+void
+threads_registerDataAll(ThreadUserData* data, threads_copyDataFunc func)
+{
+    int tid;
+
+    for (tid = 0; tid < numThreads; tid++)
+    {
+        ThreadUserData* target = &threads_data[tid].data;
+
+        if (func != NULL)
+        {
+            func(data, target);
+        }
+        else
+        {
+            *target = *data;
+        }
+    }
+}
+
+/*
+ * Hand user data to the single thread threadId, either by plain
+ * structure assignment or through func(data, destination) if given.
+ */
+void
+threads_registerDataThread(int threadId,
+        ThreadUserData* data,
+        threads_copyDataFunc func)
+{
+    ThreadUserData* target = &threads_data[threadId].data;
+
+    if (func != NULL)
+    {
+        func(data, target);
+        return;
+    }
+    *target = *data;
+}
+
+/*
+ * Hand user data to every thread of group groupId, either by plain
+ * structure assignment or through func(data, destination) if given.
+ */
+void
+threads_registerDataGroup(int groupId,
+        ThreadUserData* data,
+        threads_copyDataFunc func)
+{
+    int member;
+    ThreadGroup* grp = &threads_groups[groupId];
+
+    for (member = 0; member < grp->numberOfThreads; member++)
+    {
+        /* threadIds maps the position inside the group to the index
+         * into threads_data. */
+        ThreadUserData* target = &threads_data[grp->threadIds[member]].data;
+
+        if (func == NULL)
+        {
+            *target = *data;
+        }
+        else
+        {
+            func(data, target);
+        }
+    }
+}
+
+/*
+ * Set the iteration count for all threads of group groupId and reset
+ * their cycle and time values.
+ *
+ * The count is demandIter if that is greater than zero, otherwise the
+ * value currently stored for thread 0; it is raised to MIN_ITERATIONS
+ * if it is smaller.  Returns the count that was applied.
+ */
+size_t
+threads_updateIterations(int groupId, size_t demandIter)
+{
+    int member;
+    size_t iterations = threads_data[0].data.iter;
+
+    if (demandIter > 0)
+    {
+        iterations = demandIter;
+    }
+    if (iterations < MIN_ITERATIONS)
+    {
+        iterations = MIN_ITERATIONS;
+    }
+
+    for (member = 0; member < threads_groups[groupId].numberOfThreads; member++)
+    {
+        ThreadData* td = &threads_data[threads_groups[groupId].threadIds[member]];
+
+        td->data.iter = iterations;
+        td->data.cycles = 0;
+        td->cycles = 0;
+        td->time = 0;
+    }
+    return iterations;
+}
+
+/* Wait for all threads in creation order; their return values are discarded. */
+void
+threads_join(void)
+{
+    int tid = 0;
+
+    while (tid < numThreads)
+    {
+        pthread_join(threads[tid], NULL);
+        tid++;
+    }
+}
+
+/*
+ * Tear down the thread module: destroys the attribute object and the
+ * barrier, frees the processor and stream lists stored in the user data
+ * of every thread, the group tables, the per-thread data and the thread
+ * handles.  The module pointers are reset to NULL afterwards.
+ *
+ * numberOfGroups:  number of groups created by threads_createGroups.
+ * numberOfStreams: not used by this function.
+ */
+void
+threads_destroy(int numberOfGroups, int numberOfStreams)
+{
+    int i = 0, j = 0;
+    pthread_attr_destroy(&attr);
+    pthread_barrier_destroy(&threads_barrier);
+
+    if (threads_groups != NULL)
+    {
+        for(i=0;i<numberOfGroups;i++)
+        {
+            for (j = 0; j < threads_groups[i].numberOfThreads; j++)
+            {
+                free(threads_data[threads_groups[i].threadIds[j]].data.processors);
+                free(threads_data[threads_groups[i].threadIds[j]].data.streams);
+            }
+            free(threads_groups[i].threadIds);
+        }
+    }
+    free(threads_groups);
+    threads_groups = NULL;
+    /* The per-thread data allocated in threads_init is released here. */
+    free(threads_data);
+    threads_data = NULL;
+    free(threads);
+    threads = NULL;
+}
diff --git a/bench/x86-64/branch.ptt b/bench/x86-64/branch.ptt
deleted file mode 100644
index e15086d..0000000
--- a/bench/x86-64/branch.ptt
+++ /dev/null
@@ -1,36 +0,0 @@
-STREAMS 4
-TYPE DOUBLE_RAND
-FLOPS 2
-BYTES 32
-LOOP 8
-movaps    FPR1, [STR1 + GPR1*8]
-movaps    FPR2, [STR1 + GPR1*8+16]
-movaps    FPR3, [STR1 + GPR1*8+32]
-movaps    FPR4, [STR1 + GPR1*8+48]
-cvtsd2si  GPR2, FPR1
-cmp		  GPR2, 0
-jl sub
-mulpd     FPR1, [STR2 + GPR1*8]
-addpd     FPR1, [STR3 + GPR1*8]
-mulpd     FPR2, [STR2 + GPR1*8+16]
-addpd     FPR2, [STR3 + GPR1*8+16]
-mulpd     FPR3, [STR2 + GPR1*8+32]
-addpd     FPR3, [STR3 + GPR1*8+32]
-mulpd     FPR4, [STR2 + GPR1*8+48]
-addpd     FPR4, [STR3 + GPR1*8+48]
-jmp end
-sub:
-mulpd     FPR1, [STR2 + GPR1*8]
-subpd     FPR1, [STR3 + GPR1*8]
-mulpd     FPR2, [STR2 + GPR1*8+16]
-subpd     FPR2, [STR3 + GPR1*8+16]
-mulpd     FPR3, [STR2 + GPR1*8+32]
-subpd     FPR3, [STR3 + GPR1*8+32]
-mulpd     FPR4, [STR2 + GPR1*8+48]
-subpd     FPR4, [STR3 + GPR1*8+48]
-end:
-movaps    [STR0 + GPR1*8], FPR1
-movaps    [STR0 + GPR1*8+16], FPR2
-movaps    [STR0 + GPR1*8+32], FPR3
-movaps    [STR0 + GPR1*8+48], FPR4
-
diff --git a/bench/x86-64/clcopy.ptt b/bench/x86-64/clcopy.ptt
index b59c2be..3d95760 100644
--- a/bench/x86-64/clcopy.ptt
+++ b/bench/x86-64/clcopy.ptt
@@ -2,6 +2,12 @@ STREAMS 2
 TYPE DOUBLE
 FLOPS 0
 BYTES 16
+DESC Double-precision cache line copy, only touches first element of each cache line.
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
 LOOP  32
 movaps       FPR1, [STR0 + GPR1 * 8 ]
 movaps       FPR2, [STR0 + GPR1 * 8 + 64 ]
diff --git a/bench/x86-64/clload.ptt b/bench/x86-64/clload.ptt
index 8c3ddc2..7cd9c38 100644
--- a/bench/x86-64/clload.ptt
+++ b/bench/x86-64/clload.ptt
@@ -2,6 +2,12 @@ STREAMS 1
 TYPE DOUBLE
 FLOPS 0
 BYTES 8
+DESC Double-precision cache line load, only loads first element of each cache line.
+LOADS 1
+STORES 0
+INSTR_CONST 16
+INSTR_LOOP 7
+UOPS 6
 LOOP 32
 movaps       FPR1, [STR0 + GPR1 * 8]
 movaps       FPR2, [STR0 + GPR1 * 8 + 64]
diff --git a/bench/x86-64/clstore.ptt b/bench/x86-64/clstore.ptt
index 5541b8e..1b70c45 100644
--- a/bench/x86-64/clstore.ptt
+++ b/bench/x86-64/clstore.ptt
@@ -2,10 +2,16 @@ STREAMS 1
 TYPE DOUBLE
 FLOPS 0
 BYTES 8
-movaps FPR1, [SCALAR]
-movaps FPR2, [SCALAR]
-movaps FPR3, [SCALAR]
-movaps FPR4, [SCALAR]
+DESC Double-precision cache line store, only stores first element of each cache line.
+LOADS 0
+STORES 1
+INSTR_CONST 20
+INSTR_LOOP 7
+UOPS 10
+movaps FPR1, [rip+SCALAR]
+movaps FPR2, [rip+SCALAR]
+movaps FPR3, [rip+SCALAR]
+movaps FPR4, [rip+SCALAR]
 LOOP 32
 movaps    [STR0 + GPR1 * 8], FPR1
 movaps    [STR0 + GPR1 * 8 + 64], FPR2
diff --git a/bench/x86-64/copy.ptt b/bench/x86-64/copy.ptt
index ffca4f5..b47e322 100644
--- a/bench/x86-64/copy.ptt
+++ b/bench/x86-64/copy.ptt
@@ -2,14 +2,20 @@ STREAMS 2
 TYPE DOUBLE
 FLOPS 0
 BYTES 16
-LOOP 8
-movaps    FPR1, [STR0 + GPR1 * 8]
-movaps    FPR2, [STR0 + GPR1 * 8 + 16]
-movaps    FPR3, [STR0 + GPR1 * 8 + 32]
-movaps    FPR4, [STR0 + GPR1 * 8 + 48]
-movaps    [STR1 + GPR1 * 8]     , FPR1
-movaps    [STR1 + GPR1 * 8 + 16], FPR2
-movaps    [STR1 + GPR1 * 8 + 32], FPR3
-movaps    [STR1 + GPR1 * 8 + 48], FPR4
+DESC Double-precision vector copy, only scalar operations
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
+LOOP 4
+movsd    FPR1, [STR0 + GPR1 * 8]
+movsd    FPR2, [STR0 + GPR1 * 8 + 8]
+movsd    FPR3, [STR0 + GPR1 * 8 + 16]
+movsd    FPR4, [STR0 + GPR1 * 8 + 24]
+movsd    [STR1 + GPR1 * 8]     , FPR1
+movsd    [STR1 + GPR1 * 8 + 8] , FPR2
+movsd    [STR1 + GPR1 * 8 + 16], FPR3
+movsd    [STR1 + GPR1 * 8 + 24], FPR4
 
 
diff --git a/bench/x86-64/copy_avx.ptt b/bench/x86-64/copy_avx.ptt
index 814bb78..53b02b3 100644
--- a/bench/x86-64/copy_avx.ptt
+++ b/bench/x86-64/copy_avx.ptt
@@ -2,6 +2,12 @@ STREAMS 2
 TYPE DOUBLE
 FLOPS 0
 BYTES 16
+DESC Double-precision vector copy, optimized for AVX
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
 LOOP 16
 vmovaps    ymm1, [STR0 + GPR1 * 8]
 vmovaps    ymm2, [STR0 + GPR1 * 8 + 32]
diff --git a/bench/x86-64/copy_mem.ptt b/bench/x86-64/copy_mem.ptt
index fab5a66..3fa0b57 100644
--- a/bench/x86-64/copy_mem.ptt
+++ b/bench/x86-64/copy_mem.ptt
@@ -2,14 +2,20 @@ STREAMS 2
 TYPE DOUBLE
 FLOPS 0
 BYTES 16
-LOOP 8
-movaps    FPR1, [STR0 + GPR1 * 8]
-movaps    FPR2, [STR0 + GPR1 * 8 + 16]
-movaps    FPR3, [STR0 + GPR1 * 8 + 32]
-movaps    FPR4, [STR0 + GPR1 * 8 + 48]
-movntpd   [STR1 + GPR1 * 8]     , FPR1
-movntpd   [STR1 + GPR1 * 8 + 16], FPR2
-movntpd   [STR1 + GPR1 * 8 + 32], FPR3
-movntpd   [STR1 + GPR1 * 8 + 48], FPR4
+DESC Double-precision vector copy, only scalar operations but with non-temporal stores
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
+LOOP 4
+movq    mm0, [STR0 + GPR1 * 8]
+movq    mm1, [STR0 + GPR1 * 8 + 8]
+movq    mm2, [STR0 + GPR1 * 8 + 16]
+movq    mm3, [STR0 + GPR1 * 8 + 24]
+movntq    [STR1 + GPR1 * 8]     , mm0
+movntq    [STR1 + GPR1 * 8 + 8] , mm1
+movntq    [STR1 + GPR1 * 8 + 16], mm2
+movntq    [STR1 + GPR1 * 8 + 24], mm3
 
 
diff --git a/bench/x86-64/copy_mem_avx.ptt b/bench/x86-64/copy_mem_avx.ptt
index 651a55e..3c393a4 100644
--- a/bench/x86-64/copy_mem_avx.ptt
+++ b/bench/x86-64/copy_mem_avx.ptt
@@ -2,7 +2,13 @@ STREAMS 2
 TYPE DOUBLE
 FLOPS 0
 BYTES 16
-LOOP 32
+DESC Double-precision vector copy, uses AVX and non-temporal stores
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
+LOOP 16
 vmovaps    ymm1, [STR0 + GPR1 * 8]
 vmovaps    ymm2, [STR0 + GPR1 * 8 + 32]
 vmovaps    ymm3, [STR0 + GPR1 * 8 + 64]
diff --git a/bench/x86-64/copy_mem_sse.ptt b/bench/x86-64/copy_mem_sse.ptt
index f803bce..5a8c5d6 100644
--- a/bench/x86-64/copy_mem_sse.ptt
+++ b/bench/x86-64/copy_mem_sse.ptt
@@ -2,6 +2,12 @@ STREAMS 2
 TYPE DOUBLE
 FLOPS 0
 BYTES 16
+DESC Double-precision vector copy, uses SSE and non-temporal stores
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
 LOOP 8
 movaps    FPR1, [STR0 + GPR1 * 8]
 movaps    FPR2, [STR0 + GPR1 * 8 + 16]
diff --git a/bench/x86-64/copy_plain.ptt b/bench/x86-64/copy_plain.ptt
deleted file mode 100644
index 4fcbbbc..0000000
--- a/bench/x86-64/copy_plain.ptt
+++ /dev/null
@@ -1,16 +0,0 @@
-STREAMS 2
-TYPE DOUBLE
-FLOPS 0
-BYTES 16
-LOOP 4
-movsd    FPR1, [STR0 + GPR1 * 8]
-movsd    FPR2, [STR0 + GPR1 * 8 + 8]
-movsd    FPR3, [STR0 + GPR1 * 8 + 16]
-movsd    FPR4, [STR0 + GPR1 * 8 + 24]
-movsd    [STR1 + GPR1 * 8]     , FPR1
-movsd    [STR1 + GPR1 * 8 + 8] , FPR2
-movsd    [STR1 + GPR1 * 8 + 16], FPR3
-movsd    [STR1 + GPR1 * 8 + 24], FPR4
-
-
-
diff --git a/bench/x86-64/copy_sse.ptt b/bench/x86-64/copy_sse.ptt
index ffca4f5..75aaee4 100644
--- a/bench/x86-64/copy_sse.ptt
+++ b/bench/x86-64/copy_sse.ptt
@@ -2,6 +2,12 @@ STREAMS 2
 TYPE DOUBLE
 FLOPS 0
 BYTES 16
+DESC Double-precision vector copy, optimized for SSE
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
 LOOP 8
 movaps    FPR1, [STR0 + GPR1 * 8]
 movaps    FPR2, [STR0 + GPR1 * 8 + 16]
diff --git a/bench/x86-64/daxpy.ptt b/bench/x86-64/daxpy.ptt
new file mode 100644
index 0000000..fae8bbf
--- /dev/null
+++ b/bench/x86-64/daxpy.ptt
@@ -0,0 +1,28 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision linear combination of two vectors, only scalar operations
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 19
+UOPS 26
+movsd FPR7, [rip+SCALAR]
+LOOP 4
+movsd    FPR1, [STR0 + GPR1*8]
+movsd    FPR2, [STR0 + GPR1*8 + 8]
+mulsd    FPR1, FPR7
+mulsd    FPR2, FPR7
+movsd    FPR3, [STR0 + GPR1*8 + 16]
+movsd    FPR4, [STR0 + GPR1*8 + 24]
+mulsd    FPR3, FPR7
+mulsd    FPR4, FPR7
+addsd    FPR1, [STR1 + GPR1*8]
+addsd    FPR2, [STR1 + GPR1*8 + 8]
+addsd    FPR3, [STR1 + GPR1*8 + 16]
+addsd    FPR4, [STR1 + GPR1*8 + 24]
+movsd    [STR1 + GPR1*8], FPR1
+movsd    [STR1 + GPR1*8 + 8], FPR2
+movsd    [STR1 + GPR1*8 + 16], FPR3
+movsd    [STR1 + GPR1*8 + 24], FPR4
diff --git a/bench/x86-64/daxpy_avx.ptt b/bench/x86-64/daxpy_avx.ptt
new file mode 100644
index 0000000..7b2ecd8
--- /dev/null
+++ b/bench/x86-64/daxpy_avx.ptt
@@ -0,0 +1,31 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision linear combination of two vectors, optimized for AVX
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 21
+UOPS 38
+vmovaps ymm7, [rip+SCALAR]
+LOOP 24
+vmulpd    ymm1, ymm7, [STR0 + GPR1*8]
+vaddpd    ymm1, ymm1, [STR1 + GPR1*8]
+vmulpd    ymm2, ymm7, [STR0 + GPR1*8+32]
+vaddpd    ymm2, ymm2, [STR1 + GPR1*8+32]
+vmovaps    [STR1 + GPR1*8], ymm1
+vmovaps    [STR1 + GPR1*8+32], ymm2
+vmulpd    ymm3, ymm7, [STR0 + GPR1*8+64]
+vaddpd    ymm3, ymm3, [STR1 + GPR1*8+64]
+vmulpd    ymm4, ymm7, [STR0 + GPR1*8+96]
+vaddpd    ymm4, ymm4, [STR1 + GPR1*8+96]
+vmovaps    [STR1 + GPR1*8+64], ymm3
+vmovaps    [STR1 + GPR1*8+96], ymm4
+vmulpd    ymm5, ymm7, [STR0 + GPR1*8+128]
+vaddpd    ymm5, ymm5, [STR1 + GPR1*8+128]
+vmulpd    ymm6, ymm7, [STR0 + GPR1*8+160]
+vaddpd    ymm6, ymm6, [STR1 + GPR1*8+160]
+vmovaps    [STR1 + GPR1*8+128], ymm5
+vmovaps    [STR1 + GPR1*8+160], ymm6
+
diff --git a/bench/x86-64/daxpy_avx_fma.ptt b/bench/x86-64/daxpy_avx_fma.ptt
new file mode 100644
index 0000000..8a77482
--- /dev/null
+++ b/bench/x86-64/daxpy_avx_fma.ptt
@@ -0,0 +1,25 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision linear combination of two vectors, optimized for AVX FMAs
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+vmovaps ymm7, [rip+SCALAR]
+LOOP 16
+vmovaps    ymm1, [STR0 + GPR1*8]
+vmovaps    ymm2, [STR0 + GPR1*8+32]
+vmovaps    ymm3, [STR0 + GPR1*8+64]
+vmovaps    ymm4, [STR0 + GPR1*8+96]
+vfmadd213pd ymm1, ymm7, [STR1 + GPR1*8]
+vfmadd213pd ymm2, ymm7, [STR1 + GPR1*8+32]
+vfmadd213pd ymm3, ymm7, [STR1 + GPR1*8+64]
+vfmadd213pd ymm4, ymm7, [STR1 + GPR1*8+96]
+vmovaps    [STR1 + GPR1*8], ymm1
+vmovaps    [STR1 + GPR1*8+32], ymm2
+vmovaps    [STR1 + GPR1*8+64], ymm3
+vmovaps    [STR1 + GPR1*8+96], ymm4
+
diff --git a/bench/x86-64/daxpy_mem_avx.ptt b/bench/x86-64/daxpy_mem_avx.ptt
new file mode 100644
index 0000000..fbbee94
--- /dev/null
+++ b/bench/x86-64/daxpy_mem_avx.ptt
@@ -0,0 +1,30 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision linear combination of two vectors, optimized for AVX and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 21
+UOPS 38
+vmovaps ymm7, [rip+SCALAR]
+LOOP 24
+vmulpd    ymm1, ymm7, [STR0 + GPR1*8]
+vaddpd    ymm1, ymm1, [STR1 + GPR1*8]
+vmulpd    ymm2, ymm7, [STR0 + GPR1*8+32]
+vaddpd    ymm2, ymm2, [STR1 + GPR1*8+32]
+vmovntps    [STR1 + GPR1*8], ymm1
+vmovntps    [STR1 + GPR1*8+32], ymm2
+vmulpd    ymm3, ymm7, [STR0 + GPR1*8+64]
+vaddpd    ymm3, ymm3, [STR1 + GPR1*8+64]
+vmulpd    ymm4, ymm7, [STR0 + GPR1*8+96]
+vaddpd    ymm4, ymm4, [STR1 + GPR1*8+96]
+vmovntps    [STR1 + GPR1*8+64], ymm3
+vmovntps    [STR1 + GPR1*8+96], ymm4
+vmulpd    ymm5, ymm7, [STR0 + GPR1*8+128]
+vaddpd    ymm5, ymm5, [STR1 + GPR1*8+128]
+vmulpd    ymm6, ymm7, [STR0 + GPR1*8+160]
+vaddpd    ymm6, ymm6, [STR1 + GPR1*8+160]
+vmovntps    [STR1 + GPR1*8+128], ymm5
+vmovntps    [STR1 + GPR1*8+160], ymm6
diff --git a/bench/x86-64/daxpy_mem_avx_fma.ptt b/bench/x86-64/daxpy_mem_avx_fma.ptt
new file mode 100644
index 0000000..1c7e434
--- /dev/null
+++ b/bench/x86-64/daxpy_mem_avx_fma.ptt
@@ -0,0 +1,24 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision linear combination of two vectors, optimized for AVX FMAs and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+vmovaps ymm7, [rip+SCALAR]
+LOOP 16
+vmovaps    ymm1, [STR0 + GPR1*8]
+vmovaps    ymm2, [STR0 + GPR1*8+32]
+vmovaps    ymm3, [STR0 + GPR1*8+64]
+vmovaps    ymm4, [STR0 + GPR1*8+96]
+vfmadd213pd ymm1, ymm7, [STR1 + GPR1*8]
+vfmadd213pd ymm2, ymm7, [STR1 + GPR1*8+32]
+vfmadd213pd ymm3, ymm7, [STR1 + GPR1*8+64]
+vfmadd213pd ymm4, ymm7, [STR1 + GPR1*8+96]
+vmovntps    [STR1 + GPR1*8], ymm1
+vmovntps    [STR1 + GPR1*8+32], ymm2
+vmovntps    [STR1 + GPR1*8+64], ymm3
+vmovntps    [STR1 + GPR1*8+96], ymm4
diff --git a/bench/x86-64/daxpy_mem_sse.ptt b/bench/x86-64/daxpy_mem_sse.ptt
new file mode 100644
index 0000000..37d538e
--- /dev/null
+++ b/bench/x86-64/daxpy_mem_sse.ptt
@@ -0,0 +1,28 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision linear combination of two vectors, optimized for SSE and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 26
+movaps FPR7, [rip+SCALAR]
+LOOP 8
+movaps   FPR1, [STR0 + GPR1*8]
+movaps   FPR2, [STR0 + GPR1*8+16]
+movaps   FPR3, [STR0 + GPR1*8+32]
+movaps   FPR4, [STR0 + GPR1*8+48]
+mulpd    FPR1, FPR7
+addpd    FPR1, [STR1 + GPR1*8]
+mulpd    FPR2, FPR7
+addpd    FPR2, [STR1 + GPR1*8+16]
+mulpd    FPR3, FPR7
+addpd    FPR3, [STR1 + GPR1*8+32]
+mulpd    FPR4, FPR7
+addpd    FPR4, [STR1 + GPR1*8+48]
+movntps    [STR1 + GPR1*8], FPR1
+movntps    [STR1 + GPR1*8+16], FPR2
+movntps    [STR1 + GPR1*8+32], FPR3
+movntps    [STR1 + GPR1*8+48], FPR4
diff --git a/bench/x86-64/daxpy_mem_sse_fma.ptt b/bench/x86-64/daxpy_mem_sse_fma.ptt
new file mode 100644
index 0000000..ba90537
--- /dev/null
+++ b/bench/x86-64/daxpy_mem_sse_fma.ptt
@@ -0,0 +1,24 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision linear combination of two vectors, optimized for SSE FMAs and non temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+movaps FPR7, [rip+SCALAR]
+LOOP 8
+movaps    FPR1, [STR0 + GPR1*8]
+movaps    FPR2, [STR0 + GPR1*8+16]
+movaps    FPR3, [STR0 + GPR1*8+32]
+movaps    FPR4, [STR0 + GPR1*8+48]
+vfmadd213pd FPR1, FPR7, [STR1 + GPR1*8]
+vfmadd213pd FPR2, FPR7, [STR1 + GPR1*8+16]
+vfmadd213pd FPR3, FPR7, [STR1 + GPR1*8+32]
+vfmadd213pd FPR4, FPR7, [STR1 + GPR1*8+48]
+movntps    [STR1 + GPR1*8], FPR1
+movntps    [STR1 + GPR1*8+16], FPR2
+movntps    [STR1 + GPR1*8+32], FPR3
+movntps    [STR1 + GPR1*8+48], FPR4
diff --git a/bench/x86-64/daxpy_sp.ptt b/bench/x86-64/daxpy_sp.ptt
new file mode 100644
index 0000000..3f4a326
--- /dev/null
+++ b/bench/x86-64/daxpy_sp.ptt
@@ -0,0 +1,44 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision linear combination of two vectors, only scalar operations
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 32
+UOPS 50
+movss FPR9, [rip+SCALAR]
+LOOP 8
+movss    FPR1, [STR0 + GPR1*4]
+movss    FPR2, [STR0 + GPR1*4 + 4]
+movss    FPR3, [STR0 + GPR1*4 + 8]
+movss    FPR4, [STR0 + GPR1*4 + 12]
+mulss    FPR1, FPR9
+addss    FPR1, [STR1 + GPR1*4]
+mulss    FPR2, FPR9
+addss    FPR2, [STR1 + GPR1*4 + 4]
+mulss    FPR3, FPR9
+addss    FPR3, [STR1 + GPR1*4 + 8]
+mulss    FPR4, FPR9
+addss    FPR4, [STR1 + GPR1*4 + 12]
+movss    FPR5, [STR0 + GPR1*4 + 16]
+movss    FPR6, [STR0 + GPR1*4 + 20]
+movss    FPR7, [STR0 + GPR1*4 + 24]
+movss    FPR8, [STR0 + GPR1*4 + 28]
+mulss    FPR5, FPR9
+addss    FPR5, [STR1 + GPR1*4 + 16]
+mulss    FPR6, FPR9
+addss    FPR6, [STR1 + GPR1*4 + 20]
+mulss    FPR7, FPR9
+addss    FPR7, [STR1 + GPR1*4 + 24]
+mulss    FPR8, FPR9
+addss    FPR8, [STR1 + GPR1*4 + 28]
+movss    [STR1 + GPR1*4], FPR1
+movss    [STR1 + GPR1*4 + 4], FPR2
+movss    [STR1 + GPR1*4 + 8], FPR3
+movss    [STR1 + GPR1*4 + 12], FPR4
+movss    [STR1 + GPR1*4 + 16], FPR5
+movss    [STR1 + GPR1*4 + 20], FPR6
+movss    [STR1 + GPR1*4 + 24], FPR7
+movss    [STR1 + GPR1*4 + 28], FPR8
diff --git a/bench/x86-64/daxpy_sp_avx.ptt b/bench/x86-64/daxpy_sp_avx.ptt
new file mode 100644
index 0000000..4602982
--- /dev/null
+++ b/bench/x86-64/daxpy_sp_avx.ptt
@@ -0,0 +1,19 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision linear combination of two vectors, optimized for AVX
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 9
+UOPS 14
+vmovaps ymm7, [rip+SCALAR]
+LOOP 16
+vmulps    ymm1, ymm7, [STR0 + GPR1*4]
+vaddps    ymm1, ymm1, [STR1 + GPR1*4]
+vmovaps    [STR1 + GPR1*4], ymm1
+vmulps    ymm2, ymm7, [STR0 + GPR1*4+32]
+vaddps    ymm2, ymm2, [STR1 + GPR1*4+32]
+vmovaps    [STR1 + GPR1*4+32], ymm2
+
diff --git a/bench/x86-64/daxpy_sp_avx_fma.ptt b/bench/x86-64/daxpy_sp_avx_fma.ptt
new file mode 100644
index 0000000..f5216a1
--- /dev/null
+++ b/bench/x86-64/daxpy_sp_avx_fma.ptt
@@ -0,0 +1,26 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision linear combination of two vectors, optimized for AVX FMAs
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+vmovaps ymm7, [rip+SCALAR]
+LOOP 32
+# four 32-byte registers of eight floats each are processed per pass, i.e. 32 elements
+vmovaps   ymm1, [STR0 + GPR1*4]
+vmovaps   ymm2, [STR0 + GPR1*4+32]
+vmovaps   ymm3, [STR0 + GPR1*4+64]
+vmovaps   ymm4, [STR0 + GPR1*4+96]
+vfmadd213ps ymm1, ymm7, [STR1 + GPR1*4]
+vfmadd213ps ymm2, ymm7, [STR1 + GPR1*4+32]
+vfmadd213ps ymm3, ymm7, [STR1 + GPR1*4+64]
+vfmadd213ps ymm4, ymm7, [STR1 + GPR1*4+96]
+vmovaps    [STR1 + GPR1*4], ymm1
+vmovaps    [STR1 + GPR1*4+32], ymm2
+vmovaps    [STR1 + GPR1*4+64], ymm3
+vmovaps    [STR1 + GPR1*4+96], ymm4
+
diff --git a/bench/x86-64/daxpy_sp_mem_avx.ptt b/bench/x86-64/daxpy_sp_mem_avx.ptt
new file mode 100644
index 0000000..0f26304
--- /dev/null
+++ b/bench/x86-64/daxpy_sp_mem_avx.ptt
@@ -0,0 +1,19 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision linear combination of two vectors, optimized for AVX and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 9
+UOPS 14
+vmovaps ymm7, [rip+SCALAR]
+LOOP 16
+vmulps    ymm1, ymm7, [STR0 + GPR1*4]
+vaddps    ymm1, ymm1, [STR1 + GPR1*4]
+vmovntps    [STR1 + GPR1*4], ymm1
+vmulps    ymm2, ymm7, [STR0 + GPR1*4+32]
+vaddps    ymm2, ymm2, [STR1 + GPR1*4+32]
+vmovntps    [STR1 + GPR1*4+32], ymm2
+
diff --git a/bench/x86-64/daxpy_sp_mem_avx_fma.ptt b/bench/x86-64/daxpy_sp_mem_avx_fma.ptt
new file mode 100644
index 0000000..4c316d4
--- /dev/null
+++ b/bench/x86-64/daxpy_sp_mem_avx_fma.ptt
@@ -0,0 +1,26 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision linear combination of two vectors, optimized for AVX FMAs and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+vmovaps ymm7, [rip+SCALAR]
+LOOP 32
+# four 32-byte registers of eight floats each are processed per pass, i.e. 32 elements
+vmovaps   ymm1, [STR0 + GPR1*4]
+vmovaps   ymm2, [STR0 + GPR1*4+32]
+vmovaps   ymm3, [STR0 + GPR1*4+64]
+vmovaps   ymm4, [STR0 + GPR1*4+96]
+vfmadd213ps ymm1, ymm7, [STR1 + GPR1*4]
+vfmadd213ps ymm2, ymm7, [STR1 + GPR1*4+32]
+vfmadd213ps ymm3, ymm7, [STR1 + GPR1*4+64]
+vfmadd213ps ymm4, ymm7, [STR1 + GPR1*4+96]
+vmovntps    [STR1 + GPR1*4], ymm1
+vmovntps    [STR1 + GPR1*4+32], ymm2
+vmovntps    [STR1 + GPR1*4+64], ymm3
+vmovntps    [STR1 + GPR1*4+96], ymm4
+
diff --git a/bench/x86-64/daxpy_sp_mem_sse.ptt b/bench/x86-64/daxpy_sp_mem_sse.ptt
new file mode 100644
index 0000000..a9e6ec5
--- /dev/null
+++ b/bench/x86-64/daxpy_sp_mem_sse.ptt
@@ -0,0 +1,20 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision linear combination of two vectors, optimized for SSE and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 11
+UOPS 14
+movaps FPR7, [rip+SCALAR]
+LOOP 8
+movaps   FPR1, [STR0 + GPR1*4]
+mulps    FPR1, FPR7
+addps    FPR1, [STR1 + GPR1*4]
+movntps   [STR1 + GPR1*4], FPR1
+movaps   FPR2, [STR0 + GPR1*4+16]
+mulps    FPR2, FPR7
+addps    FPR2, [STR1 + GPR1*4+16]
+movntps   [STR1 + GPR1*4+16], FPR2
diff --git a/bench/x86-64/daxpy_sp_mem_sse_fma.ptt b/bench/x86-64/daxpy_sp_mem_sse_fma.ptt
new file mode 100644
index 0000000..ee85f28
--- /dev/null
+++ b/bench/x86-64/daxpy_sp_mem_sse_fma.ptt
@@ -0,0 +1,25 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision linear combination of two vectors, optimized for SSE FMAs and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+movaps FPR7, [rip+SCALAR]
+LOOP 16
+# four 16-byte registers of four floats each, laid out back to back: 16 elements per pass
+movaps   FPR1, [STR0 + GPR1*4]
+movaps   FPR2, [STR0 + GPR1*4+16]
+movaps   FPR3, [STR0 + GPR1*4+32]
+movaps   FPR4, [STR0 + GPR1*4+48]
+vfmadd213ps FPR1, FPR7, [STR1 + GPR1*4]
+vfmadd213ps FPR2, FPR7, [STR1 + GPR1*4+16]
+vfmadd213ps FPR3, FPR7, [STR1 + GPR1*4+32]
+vfmadd213ps FPR4, FPR7, [STR1 + GPR1*4+48]
+movntps   [STR1 + GPR1*4], FPR1
+movntps   [STR1 + GPR1*4+16], FPR2
+movntps   [STR1 + GPR1*4+32], FPR3
+movntps   [STR1 + GPR1*4+48], FPR4
diff --git a/bench/x86-64/daxpy_sp_sse.ptt b/bench/x86-64/daxpy_sp_sse.ptt
new file mode 100644
index 0000000..fc20441
--- /dev/null
+++ b/bench/x86-64/daxpy_sp_sse.ptt
@@ -0,0 +1,29 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision linear combination of two vectors, optimized for SSE
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 19
+UOPS 26
+movaps FPR7, [rip+SCALAR]
+LOOP 16
+# four 16-byte registers of four floats each are processed per pass, i.e. 16 elements
+movaps   FPR1, [STR0 + GPR1*4]
+movaps   FPR2, [STR0 + GPR1*4+16]
+movaps   FPR3, [STR0 + GPR1*4+32]
+movaps   FPR4, [STR0 + GPR1*4+48]
+mulps    FPR1, FPR7
+addps    FPR1, [STR1 + GPR1*4]
+mulps    FPR2, FPR7
+addps    FPR2, [STR1 + GPR1*4+16]
+mulps    FPR3, FPR7
+addps    FPR3, [STR1 + GPR1*4+32]
+mulps    FPR4, FPR7
+addps    FPR4, [STR1 + GPR1*4+48]
+movaps   [STR1 + GPR1*4], FPR1
+movaps   [STR1 + GPR1*4+16], FPR2
+movaps   [STR1 + GPR1*4+32], FPR3
+movaps   [STR1 + GPR1*4+48], FPR4
diff --git a/bench/x86-64/daxpy_sp_sse_fma.ptt b/bench/x86-64/daxpy_sp_sse_fma.ptt
new file mode 100644
index 0000000..148d750
--- /dev/null
+++ b/bench/x86-64/daxpy_sp_sse_fma.ptt
@@ -0,0 +1,25 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision linear combination of two vectors, optimized for SSE FMAs
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+movaps FPR7, [rip+SCALAR]
+LOOP 16
+# four 16-byte registers of four floats each, laid out back to back: 16 elements per pass
+movaps   FPR1, [STR0 + GPR1*4]
+movaps   FPR2, [STR0 + GPR1*4+16]
+movaps   FPR3, [STR0 + GPR1*4+32]
+movaps   FPR4, [STR0 + GPR1*4+48]
+vfmadd213ps FPR1, FPR7, [STR1 + GPR1*4]
+vfmadd213ps FPR2, FPR7, [STR1 + GPR1*4+16]
+vfmadd213ps FPR3, FPR7, [STR1 + GPR1*4+32]
+vfmadd213ps FPR4, FPR7, [STR1 + GPR1*4+48]
+movaps   [STR1 + GPR1*4], FPR1
+movaps   [STR1 + GPR1*4+16], FPR2
+movaps   [STR1 + GPR1*4+32], FPR3
+movaps   [STR1 + GPR1*4+48], FPR4
diff --git a/bench/x86-64/daxpy_sse.ptt b/bench/x86-64/daxpy_sse.ptt
new file mode 100644
index 0000000..747aed6
--- /dev/null
+++ b/bench/x86-64/daxpy_sse.ptt
@@ -0,0 +1,28 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision linear combination of two vectors, optimized for SSE
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 19
+UOPS 26
+movaps FPR7, [rip+SCALAR]
+LOOP 8
+movaps   FPR1, [STR0 + GPR1*8]
+movaps   FPR2, [STR0 + GPR1*8+16]
+movaps   FPR3, [STR0 + GPR1*8+32]
+movaps   FPR4, [STR0 + GPR1*8+48]
+mulpd    FPR1, FPR7
+addpd    FPR1, [STR1 + GPR1*8]
+mulpd    FPR2, FPR7
+addpd    FPR2, [STR1 + GPR1*8+16]
+mulpd    FPR3, FPR7
+addpd    FPR3, [STR1 + GPR1*8+32]
+mulpd    FPR4, FPR7
+addpd    FPR4, [STR1 + GPR1*8+48]
+movaps   [STR1 + GPR1*8], FPR1
+movaps   [STR1 + GPR1*8+16], FPR2
+movaps   [STR1 + GPR1*8+32], FPR3
+movaps   [STR1 + GPR1*8+48], FPR4
diff --git a/bench/x86-64/daxpy_sse_fma.ptt b/bench/x86-64/daxpy_sse_fma.ptt
new file mode 100644
index 0000000..21c022f
--- /dev/null
+++ b/bench/x86-64/daxpy_sse_fma.ptt
@@ -0,0 +1,24 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision linear combination of two vectors, optimized for SSE FMAs
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+movaps FPR7, [rip+SCALAR]
+LOOP 8
+movaps    FPR1, [STR0 + GPR1*8]
+movaps    FPR2, [STR0 + GPR1*8+16]
+movaps    FPR3, [STR0 + GPR1*8+32]
+movaps    FPR4, [STR0 + GPR1*8+48]
+vfmadd213pd FPR1, FPR7, [STR1 + GPR1*8]
+vfmadd213pd FPR2, FPR7, [STR1 + GPR1*8+16]
+vfmadd213pd FPR3, FPR7, [STR1 + GPR1*8+32]
+vfmadd213pd FPR4, FPR7, [STR1 + GPR1*8+48]
+movaps    [STR1 + GPR1*8], FPR1
+movaps    [STR1 + GPR1*8+16], FPR2
+movaps    [STR1 + GPR1*8+32], FPR3
+movaps    [STR1 + GPR1*8+48], FPR4
diff --git a/bench/x86-64/ddot.ptt b/bench/x86-64/ddot.ptt
new file mode 100644
index 0000000..e438c49
--- /dev/null
+++ b/bench/x86-64/ddot.ptt
@@ -0,0 +1,27 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 16
+DESC Double-precision dot product of two vectors, only scalar operations
+LOADS 2
+STORES 0
+INSTR_CONST 20
+INSTR_LOOP 15
+UOPS 18
+xorpd FPR1, FPR1
+xorpd FPR6, FPR6
+xorpd FPR7, FPR7
+xorpd FPR8, FPR8
+LOOP 4
+movsd FPR2, [STR0 + GPR1 * 8]
+mulsd FPR2, [STR1 + GPR1 * 8]
+addsd FPR1, FPR2
+movsd FPR3, [STR0 + GPR1 * 8 + 8]
+mulsd FPR3, [STR1 + GPR1 * 8 + 8]
+addsd FPR6, FPR3
+movsd FPR4, [STR0 + GPR1 * 8 + 16]
+mulsd FPR4, [STR1 + GPR1 * 8 + 16]
+addsd FPR7, FPR4
+movsd FPR5, [STR0 + GPR1 * 8 + 24]
+mulsd FPR5, [STR1 + GPR1 * 8 + 24]
+addsd FPR8, FPR5
diff --git a/bench/x86-64/ddot_avx.ptt b/bench/x86-64/ddot_avx.ptt
new file mode 100644
index 0000000..24dc330
--- /dev/null
+++ b/bench/x86-64/ddot_avx.ptt
@@ -0,0 +1,27 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 16
+DESC Double-precision dot product of two vectors, optimized for AVX
+LOADS 2
+STORES 0
+INSTR_CONST 20
+INSTR_LOOP 15
+UOPS 18
+vxorpd ymm0, ymm0, ymm0
+vxorpd ymm5, ymm5, ymm5
+vxorpd ymm6, ymm6, ymm6
+vxorpd ymm7, ymm7, ymm7
+LOOP 16
+vmovaps ymm1,       [STR0 + GPR1 * 8]
+vmulpd  ymm1, ymm1, [STR1 + GPR1 * 8]
+vaddpd  ymm0, ymm0, ymm1
+vmovaps ymm2,       [STR0 + GPR1 * 8 + 32]
+vmulpd  ymm2, ymm2, [STR1 + GPR1 * 8 + 32]
+vaddpd  ymm5, ymm5, ymm2
+vmovaps ymm3,       [STR0 + GPR1 * 8 + 64]
+vmulpd  ymm3, ymm3, [STR1 + GPR1 * 8 + 64]
+vaddpd  ymm6, ymm6, ymm3
+vmovaps ymm4,       [STR0 + GPR1 * 8 + 96]
+vmulpd  ymm4, ymm4, [STR1 + GPR1 * 8 + 96]
+vaddpd  ymm7, ymm7, ymm4
diff --git a/bench/x86-64/ddot_sp.ptt b/bench/x86-64/ddot_sp.ptt
new file mode 100644
index 0000000..4a108b9
--- /dev/null
+++ b/bench/x86-64/ddot_sp.ptt
@@ -0,0 +1,27 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 8
+DESC Single-precision dot product of two vectors, only scalar operations
+LOADS 2
+STORES 0
+INSTR_CONST 20
+INSTR_LOOP 15
+UOPS 18
+xorps FPR1, FPR1
+xorps FPR6, FPR6
+xorps FPR7, FPR7
+xorps FPR8, FPR8
+LOOP 4
+movss FPR2, [STR0 + GPR1 * 4]
+mulss FPR2, [STR1 + GPR1 * 4]
+addss FPR1, FPR2
+movss FPR3, [STR0 + GPR1 * 4 + 4]
+mulss FPR3, [STR1 + GPR1 * 4 + 4]
+addss FPR6, FPR3
+movss FPR4, [STR0 + GPR1 * 4 + 8]
+mulss FPR4, [STR1 + GPR1 * 4 + 8]
+addss FPR7, FPR4
+movss FPR5, [STR0 + GPR1 * 4 + 12]
+mulss FPR5, [STR1 + GPR1 * 4 + 12]
+addss FPR8, FPR5
diff --git a/bench/x86-64/ddot_sp_avx.ptt b/bench/x86-64/ddot_sp_avx.ptt
new file mode 100644
index 0000000..45afad6
--- /dev/null
+++ b/bench/x86-64/ddot_sp_avx.ptt
@@ -0,0 +1,19 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 8
+DESC Single-precision dot product of two vectors, optimized for AVX
+LOADS 2
+STORES 0
+INSTR_CONST 18
+INSTR_LOOP 9
+UOPS 10
+vxorps ymm0, ymm0, ymm0
+vxorps ymm3, ymm3, ymm3
+LOOP 16
+vmovaps ymm1,       [STR0 + GPR1 * 4]
+vmulps  ymm1, ymm1, [STR1 + GPR1 * 4]
+vaddps  ymm0, ymm0, ymm1
+vmovaps ymm2,       [STR0 + GPR1 * 4 + 32]
+vmulps  ymm2, ymm2, [STR1 + GPR1 * 4 + 32]
+vaddps  ymm3, ymm3, ymm2
diff --git a/bench/x86-64/ddot_sp_sse.ptt b/bench/x86-64/ddot_sp_sse.ptt
new file mode 100644
index 0000000..b445cb3
--- /dev/null
+++ b/bench/x86-64/ddot_sp_sse.ptt
@@ -0,0 +1,19 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 8
+DESC Single-precision dot product of two vectors, optimized for SSE
+LOADS 2
+STORES 0
+INSTR_CONST 18
+INSTR_LOOP 9
+UOPS 10
+xorps FPR1, FPR1
+xorps FPR4, FPR4
+LOOP 8
+movaps FPR2, [STR0 + GPR1 * 4]
+mulps  FPR2, [STR1 + GPR1 * 4]
+addps  FPR1, FPR2
+movaps FPR3, [STR0 + GPR1 * 4 + 16]
+mulps  FPR3, [STR1 + GPR1 * 4 + 16]
+addps  FPR4, FPR3
diff --git a/bench/x86-64/ddot_sse.ptt b/bench/x86-64/ddot_sse.ptt
new file mode 100644
index 0000000..42ff526
--- /dev/null
+++ b/bench/x86-64/ddot_sse.ptt
@@ -0,0 +1,27 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 16
+DESC Double-precision dot product of two vectors, optimized for SSE
+LOADS 2
+STORES 0
+INSTR_CONST 20
+INSTR_LOOP 15
+UOPS 18
+xorpd FPR1, FPR1
+xorpd FPR6, FPR6
+xorpd FPR7, FPR7
+xorpd FPR8, FPR8
+LOOP 8
+movaps FPR2, [STR0 + GPR1 * 8]
+mulpd  FPR2, [STR1 + GPR1 * 8]
+addpd  FPR1, FPR2
+movaps FPR3, [STR0 + GPR1 * 8 + 16]
+mulpd  FPR3, [STR1 + GPR1 * 8 + 16]
+addpd  FPR6, FPR3
+movaps FPR4, [STR0 + GPR1 * 8 + 32]
+mulpd  FPR4, [STR1 + GPR1 * 8 + 32]
+addpd  FPR7, FPR4
+movaps FPR5, [STR0 + GPR1 * 8 + 48]
+mulpd  FPR5, [STR1 + GPR1 * 8 + 48]
+addpd  FPR8, FPR5
diff --git a/bench/x86-64/load.ptt b/bench/x86-64/load.ptt
index 36aaab1..eb1b954 100644
--- a/bench/x86-64/load.ptt
+++ b/bench/x86-64/load.ptt
@@ -2,11 +2,19 @@ STREAMS 1
 TYPE DOUBLE
 FLOPS 0
 BYTES 8
+DESC Double-precision load, only scalar operations
+LOADS 1
+STORES 0
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 10
 LOOP 8
-mov       GPR12, [STR0 + GPR1 * 8 + 256]
-movaps    FPR1, [STR0 + GPR1 * 8]
-movaps    FPR2, [STR0 + GPR1 * 8 + 16]
-movaps    FPR3, [STR0 + GPR1 * 8 + 32]
-movaps    FPR4, [STR0 + GPR1 * 8 + 48]
-
+movsd    FPR1, [STR0 + GPR1 * 8]
+movsd    FPR2, [STR0 + GPR1 * 8 + 8]
+movsd    FPR3, [STR0 + GPR1 * 8 + 16]
+movsd    FPR4, [STR0 + GPR1 * 8 + 24]
+movsd    FPR5, [STR0 + GPR1 * 8 + 32]
+movsd    FPR6, [STR0 + GPR1 * 8 + 40]
+movsd    FPR7, [STR0 + GPR1 * 8 + 48]
+movsd    FPR8, [STR0 + GPR1 * 8 + 56]
 
diff --git a/bench/x86-64/load_avx.ptt b/bench/x86-64/load_avx.ptt
index 93b45c7..7ce7989 100644
--- a/bench/x86-64/load_avx.ptt
+++ b/bench/x86-64/load_avx.ptt
@@ -2,8 +2,14 @@ STREAMS 1
 TYPE DOUBLE
 FLOPS 0
 BYTES 8
+DESC Double-precision load, optimized for AVX
+LOADS 1
+STORES 0
+INSTR_CONST 16
+INSTR_LOOP 7
+UOPS 6
 LOOP 16
-mov        GPR12, [STR0 + GPR1 * 8 + 256]
+#mov        GPR12, [STR0 + GPR1 * 8 + 256]
 vmovaps    ymm1, [STR0 + GPR1 * 8]
 vmovaps    ymm2, [STR0 + GPR1 * 8 + 32]
 vmovaps    ymm3, [STR0 + GPR1 * 8 + 64]
diff --git a/bench/x86-64/load_mem.ptt b/bench/x86-64/load_mem.ptt
new file mode 100644
index 0000000..06b0440
--- /dev/null
+++ b/bench/x86-64/load_mem.ptt
@@ -0,0 +1,15 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 0
+BYTES 8
+DESC Double-precision load, using non-temporal loads
+LOADS 1
+STORES 0
+INSTR_CONST 16
+INSTR_LOOP 7
+UOPS 6
+LOOP 8
+MOVNTDQA    FPR1, [STR0 + GPR1 * 8]
+MOVNTDQA    FPR2, [STR0 + GPR1 * 8 + 16]
+MOVNTDQA    FPR3, [STR0 + GPR1 * 8 + 32]
+MOVNTDQA    FPR4, [STR0 + GPR1 * 8 + 48]
diff --git a/bench/x86-64/load_plain.ptt b/bench/x86-64/load_plain.ptt
deleted file mode 100644
index be6d21c..0000000
--- a/bench/x86-64/load_plain.ptt
+++ /dev/null
@@ -1,12 +0,0 @@
-STREAMS 1
-TYPE DOUBLE
-FLOPS 0
-BYTES 8
-LOOP 4
-mov       GPR12, [STR0 + GPR1 * 8 + 256]
-movsd    FPR1, [STR0 + GPR1 * 8]
-movsd    FPR2, [STR0 + GPR1 * 8 + 8]
-movsd    FPR3, [STR0 + GPR1 * 8 + 16]
-movsd    FPR4, [STR0 + GPR1 * 8 + 24]
-
-
diff --git a/bench/x86-64/load_sse.ptt b/bench/x86-64/load_sse.ptt
index 36aaab1..fa95f51 100644
--- a/bench/x86-64/load_sse.ptt
+++ b/bench/x86-64/load_sse.ptt
@@ -2,8 +2,14 @@ STREAMS 1
 TYPE DOUBLE
 FLOPS 0
 BYTES 8
+DESC Double-precision load, optimized for SSE
+LOADS 1
+STORES 0
+INSTR_CONST 16
+INSTR_LOOP 7
+UOPS 6
 LOOP 8
-mov       GPR12, [STR0 + GPR1 * 8 + 256]
+#mov       GPR12, [STR0 + GPR1 * 8 + 256]
 movaps    FPR1, [STR0 + GPR1 * 8]
 movaps    FPR2, [STR0 + GPR1 * 8 + 16]
 movaps    FPR3, [STR0 + GPR1 * 8 + 32]
diff --git a/bench/x86-64/peak.ptt b/bench/x86-64/peak.ptt
deleted file mode 100644
index c03e2c8..0000000
--- a/bench/x86-64/peak.ptt
+++ /dev/null
@@ -1,49 +0,0 @@
-STREAMS 2
-TYPE DOUBLE
-FLOPS 2
-BYTES 16
-INC 8
-movaps FPR1, [SCALAR]
-sub  GPR2, 4
-sub  STR0, 32
-sub  STR1, 32
-mov   GPR1, GPR2
-neg   GPR1
-.align 16
-1:
-movaps    FPR2, [STR0 + GPR1 * 8 ]
-addpd     FPR2, FPR1
-mulpd     FPR2, FPR1
-movaps    FPR6, [STR0 + GPR1 * 8 ]
-addpd     FPR2, FPR1
-mulpd     FPR2, FPR1
-pshufd    FPR2, FPR1, 0x1
-#movaps    [STR1 + GPR1 * 8], FPR2
-movaps    FPR3, [STR0 + GPR1 * 8 + 16]
-addpd     FPR3, FPR1
-mulpd     FPR3, FPR1
-movaps    FPR7, [STR0 + GPR1 * 8 + 16 ]
-addpd     FPR3, FPR1
-mulpd     FPR3, FPR1
-pshufd    FPR3, FPR1, 0x1
-#movaps    [STR1 + GPR1 * 8 + 16], FPR3
-movaps    FPR4, [STR0 + GPR1 * 8 + 32]
-addpd     FPR4, FPR1
-mulpd     FPR4, FPR1
-movaps    FPR8, [STR0 + GPR1 * 8 + 32 ]
-addpd     FPR4, FPR1
-mulpd     FPR4, FPR1
-pshufd    FPR4, FPR1, 0x1
-#movaps    [STR1 + GPR1 * 8 + 32], FPR4
-movaps    FPR5, [STR0 + GPR1 * 8 + 48]
-addpd     FPR5, FPR1
-mulpd     FPR5, FPR1
-movaps    FPR9, [STR0 + GPR1 * 8 + 48 ]
-addpd     FPR5, FPR1
-mulpd     FPR5, FPR1
-pshufd    FPR5, FPR1, 0x1
-#movaps    [STR1 + GPR1 * 8 + 48], FPR5
-add GPR1, 8
-js 1b
-
-
diff --git a/bench/x86-64/peak_avx.ptt b/bench/x86-64/peak_avx.ptt
deleted file mode 100644
index 047178e..0000000
--- a/bench/x86-64/peak_avx.ptt
+++ /dev/null
@@ -1,49 +0,0 @@
-STREAMS 2
-TYPE DOUBLE
-FLOPS 2
-BYTES 16
-INC 16
-vmovaps ymm1, [SCALAR]
-sub  GPR2, 8
-sub  STR0, 64
-sub  STR1, 64
-mov   GPR1, GPR2
-neg   GPR1
-.align 32
-1:
-vmovaps    ymm2, [STR0 + GPR1 * 8 ]
-vaddpd     ymm2, ymm2, ymm1
-vmulpd     ymm2, ymm2, ymm1
-vmovaps    ymm6, [STR0 + GPR1 * 8 ]
-vaddpd     ymm2, ymm2, ymm1
-vmulpd     ymm2, ymm2, ymm1
-#vpshufd    ymm2, ymm1, 0x1
-vmovaps    [STR1 + GPR1 * 8], ymm2
-vmovaps    ymm3, [STR0 + GPR1 * 8 + 32]
-vaddpd     ymm3, ymm3, ymm1
-vmulpd     ymm3, ymm3, ymm1
-vmovaps    ymm7, [STR0 + GPR1 * 8 + 32 ]
-vaddpd     ymm3, ymm3, ymm1
-vmulpd     ymm3, ymm3, ymm1
-#vpshufd    ymm3, ymm1, 0x1
-vmovaps    [STR1 + GPR1 * 8 + 32], ymm3
-vmovaps    ymm4, [STR0 + GPR1 * 8 + 64]
-vaddpd     ymm4, ymm4, ymm1
-vmulpd     ymm4, ymm4, ymm1
-vmovaps    ymm8, [STR0 + GPR1 * 8 + 64 ]
-vaddpd     ymm4, ymm4, ymm1
-vmulpd     ymm4, ymm4, ymm1
-#vpshufd    ymm4, ymm1, 0x1
-vmovaps    [STR1 + GPR1 * 8 + 32], ymm4
-vmovaps    ymm5, [STR0 + GPR1 * 8 + 96]
-vaddpd     ymm5, ymm5, ymm1
-vmulpd     ymm5, ymm5, ymm1
-vmovaps    ymm9, [STR0 + GPR1 * 8 + 96]
-vaddpd     ymm5, ymm5, ymm1
-vmulpd     ymm5, ymm5, ymm1
-#vpshufd    ymm5, ymm1, 0x1
-vmovaps    [STR1 + GPR1 * 8 + 96], ymm5
-add GPR1, 16
-js 1b
-
-
diff --git a/bench/x86-64/peak_sse.ptt b/bench/x86-64/peak_sse.ptt
deleted file mode 100644
index c03e2c8..0000000
--- a/bench/x86-64/peak_sse.ptt
+++ /dev/null
@@ -1,49 +0,0 @@
-STREAMS 2
-TYPE DOUBLE
-FLOPS 2
-BYTES 16
-INC 8
-movaps FPR1, [SCALAR]
-sub  GPR2, 4
-sub  STR0, 32
-sub  STR1, 32
-mov   GPR1, GPR2
-neg   GPR1
-.align 16
-1:
-movaps    FPR2, [STR0 + GPR1 * 8 ]
-addpd     FPR2, FPR1
-mulpd     FPR2, FPR1
-movaps    FPR6, [STR0 + GPR1 * 8 ]
-addpd     FPR2, FPR1
-mulpd     FPR2, FPR1
-pshufd    FPR2, FPR1, 0x1
-#movaps    [STR1 + GPR1 * 8], FPR2
-movaps    FPR3, [STR0 + GPR1 * 8 + 16]
-addpd     FPR3, FPR1
-mulpd     FPR3, FPR1
-movaps    FPR7, [STR0 + GPR1 * 8 + 16 ]
-addpd     FPR3, FPR1
-mulpd     FPR3, FPR1
-pshufd    FPR3, FPR1, 0x1
-#movaps    [STR1 + GPR1 * 8 + 16], FPR3
-movaps    FPR4, [STR0 + GPR1 * 8 + 32]
-addpd     FPR4, FPR1
-mulpd     FPR4, FPR1
-movaps    FPR8, [STR0 + GPR1 * 8 + 32 ]
-addpd     FPR4, FPR1
-mulpd     FPR4, FPR1
-pshufd    FPR4, FPR1, 0x1
-#movaps    [STR1 + GPR1 * 8 + 32], FPR4
-movaps    FPR5, [STR0 + GPR1 * 8 + 48]
-addpd     FPR5, FPR1
-mulpd     FPR5, FPR1
-movaps    FPR9, [STR0 + GPR1 * 8 + 48 ]
-addpd     FPR5, FPR1
-mulpd     FPR5, FPR1
-pshufd    FPR5, FPR1, 0x1
-#movaps    [STR1 + GPR1 * 8 + 48], FPR5
-add GPR1, 8
-js 1b
-
-
diff --git a/bench/x86-64/peakflops.ptt b/bench/x86-64/peakflops.ptt
deleted file mode 100644
index 94c769a..0000000
--- a/bench/x86-64/peakflops.ptt
+++ /dev/null
@@ -1,37 +0,0 @@
-STREAMS 2
-TYPE DOUBLE
-FLOPS 2
-BYTES 16
-INC 8
-movaps FPR1, [SCALAR]
-sub  GPR2, 4
-sub  STR0, 32
-sub  STR1, 32
-mov   GPR1, GPR2
-neg   GPR1
-.align 32
-1:
-movaps    FPR2, [STR0 + GPR1 * 8 ]
-addpd     FPR2, FPR1
-mulpd     FPR2, FPR1
-addpd     FPR2, FPR1
-mulpd     FPR2, FPR1
-movaps    FPR3, [STR0 + GPR1 * 8 + 16]
-add GPR1, 8
-addpd     FPR3, FPR1
-mulpd     FPR3, FPR1
-addpd     FPR3, FPR1
-mulpd     FPR3, FPR1
-movaps    FPR4, [STR0 + GPR1 * 8 - 32]
-addpd     FPR4, FPR1
-mulpd     FPR4, FPR1
-addpd     FPR4, FPR1
-mulpd     FPR4, FPR1
-movaps    FPR5, [STR0 + GPR1 * 8 - 16]
-addpd     FPR5, FPR1
-mulpd     FPR5, FPR1
-addpd     FPR5, FPR1
-mulpd     FPR5, FPR1
-js 1b
-
-
diff --git a/bench/x86-64/peakflops_avx.ptt b/bench/x86-64/peakflops_avx.ptt
deleted file mode 100644
index d9f9885..0000000
--- a/bench/x86-64/peakflops_avx.ptt
+++ /dev/null
@@ -1,37 +0,0 @@
-STREAMS 2
-TYPE DOUBLE
-FLOPS 2
-BYTES 16
-INC 16
-vmovaps ymm1, [SCALAR]
-sub  GPR2, 8
-sub  STR0, 64
-sub  STR1, 64
-mov   GPR1, GPR2
-neg   GPR1
-.align 32
-1:
-vmovaps    ymm2, [STR0 + GPR1 * 8 ]
-vaddpd     ymm2, ymm2, ymm1
-vmulpd     ymm2, ymm2, ymm1
-vaddpd     ymm2, ymm2, ymm1
-vmulpd     ymm2, ymm2, ymm1
-vmovaps    ymm3, [STR0 + GPR1 * 8 + 32]
-add GPR1, 16
-vaddpd     ymm3, ymm3, ymm1
-vmulpd     ymm3, ymm3, ymm1
-vaddpd     ymm3, ymm3, ymm1
-vmulpd     ymm3, ymm3, ymm1
-vmovaps    ymm4, [STR0 + GPR1 * 8 - 64]
-vaddpd     ymm4, ymm4, ymm1
-vmulpd     ymm4, ymm4, ymm1
-vaddpd     ymm4, ymm4, ymm1
-vmulpd     ymm4, ymm4, ymm1
-vmovaps    ymm5, [STR0 + GPR1 * 8 - 32]
-vaddpd     ymm5, ymm5, ymm1
-vmulpd     ymm5, ymm5, ymm1
-vaddpd     ymm5, ymm5, ymm1
-vmulpd     ymm5, ymm5, ymm1
-js 1b
-
-
diff --git a/bench/x86-64/peakflops_sse.ptt b/bench/x86-64/peakflops_sse.ptt
deleted file mode 100644
index 94c769a..0000000
--- a/bench/x86-64/peakflops_sse.ptt
+++ /dev/null
@@ -1,37 +0,0 @@
-STREAMS 2
-TYPE DOUBLE
-FLOPS 2
-BYTES 16
-INC 8
-movaps FPR1, [SCALAR]
-sub  GPR2, 4
-sub  STR0, 32
-sub  STR1, 32
-mov   GPR1, GPR2
-neg   GPR1
-.align 32
-1:
-movaps    FPR2, [STR0 + GPR1 * 8 ]
-addpd     FPR2, FPR1
-mulpd     FPR2, FPR1
-addpd     FPR2, FPR1
-mulpd     FPR2, FPR1
-movaps    FPR3, [STR0 + GPR1 * 8 + 16]
-add GPR1, 8
-addpd     FPR3, FPR1
-mulpd     FPR3, FPR1
-addpd     FPR3, FPR1
-mulpd     FPR3, FPR1
-movaps    FPR4, [STR0 + GPR1 * 8 - 32]
-addpd     FPR4, FPR1
-mulpd     FPR4, FPR1
-addpd     FPR4, FPR1
-mulpd     FPR4, FPR1
-movaps    FPR5, [STR0 + GPR1 * 8 - 16]
-addpd     FPR5, FPR1
-mulpd     FPR5, FPR1
-addpd     FPR5, FPR1
-mulpd     FPR5, FPR1
-js 1b
-
-
diff --git a/bench/x86-64/store.ptt b/bench/x86-64/store.ptt
index 4ef9ab9..196f9dc 100644
--- a/bench/x86-64/store.ptt
+++ b/bench/x86-64/store.ptt
@@ -1,15 +1,20 @@
 STREAMS 1
 TYPE DOUBLE
 FLOPS 0
+DESC Double-precision store, only scalar operations
 BYTES 8
-movaps FPR1, [SCALAR]
-movaps FPR2, [SCALAR]
-movaps FPR3, [SCALAR]
-movaps FPR4, [SCALAR]
-LOOP 8
-#mov       GPR14, [STR0 + GPR1 * 8 + 256] 
-movaps    [STR0 + GPR1 * 8]     , FPR1
-movaps    [STR0 + GPR1 * 8 + 16], FPR2
-movaps    [STR0 + GPR1 * 8 + 32], FPR3
-movaps    [STR0 + GPR1 * 8 + 48], FPR4
+LOADS 0
+STORES 1
+INSTR_CONST 20
+INSTR_LOOP 7
+UOPS 10
+movsd FPR1, [rip+SCALAR]
+movsd FPR2, [rip+SCALAR]
+movsd FPR3, [rip+SCALAR]
+movsd FPR4, [rip+SCALAR]
+LOOP 4
+movsd    [STR0 + GPR1 * 8]     , FPR1
+movsd    [STR0 + GPR1 * 8 + 8] , FPR2
+movsd    [STR0 + GPR1 * 8 + 16], FPR3
+movsd    [STR0 + GPR1 * 8 + 24], FPR4
 
diff --git a/bench/x86-64/store_avx.ptt b/bench/x86-64/store_avx.ptt
index 7b589a8..71ba7e1 100644
--- a/bench/x86-64/store_avx.ptt
+++ b/bench/x86-64/store_avx.ptt
@@ -2,12 +2,17 @@ STREAMS 1
 TYPE DOUBLE
 FLOPS 0
 BYTES 8
-vmovaps ymm1, [SCALAR]
-vmovaps ymm2, [SCALAR]
-vmovaps ymm3, [SCALAR]
-vmovaps ymm4, [SCALAR]
+DESC Double-precision store, optimized for AVX
+LOADS 0
+STORES 1
+INSTR_CONST 20
+INSTR_LOOP 7
+UOPS 10
+vmovaps ymm1, [rip+SCALAR]
+vmovaps ymm2, [rip+SCALAR]
+vmovaps ymm3, [rip+SCALAR]
+vmovaps ymm4, [rip+SCALAR]
 LOOP 16
-#mov       GPR14, [STR0 + GPR1 * 8 + 256] 
 vmovaps    [STR0 + GPR1 * 8]     , ymm1
 vmovaps    [STR0 + GPR1 * 8 + 32], ymm2
 vmovaps    [STR0 + GPR1 * 8 + 64], ymm3
diff --git a/bench/x86-64/store_mem.ptt b/bench/x86-64/store_mem.ptt
index 0a0222d..4b511c0 100644
--- a/bench/x86-64/store_mem.ptt
+++ b/bench/x86-64/store_mem.ptt
@@ -1,11 +1,17 @@
 STREAMS 1
 TYPE DOUBLE
 FLOPS 0
+DESC Double-precision store, uses non-temporal stores
+LOADS 0
+STORES 1
+INSTR_CONST 20
+INSTR_LOOP 7
+UOPS 10
 BYTES 8
-movaps FPR1, [SCALAR]
-movaps FPR2, [SCALAR]
-movaps FPR3, [SCALAR]
-movaps FPR4, [SCALAR]
+movaps FPR1, [rip+SCALAR]
+movaps FPR2, [rip+SCALAR]
+movaps FPR3, [rip+SCALAR]
+movaps FPR4, [rip+SCALAR]
 LOOP 8
 movntpd    [STR0 + GPR1 * 8]     , FPR1
 movntpd    [STR0 + GPR1 * 8 + 16], FPR2
diff --git a/bench/x86-64/store_mem_avx.ptt b/bench/x86-64/store_mem_avx.ptt
index e023fd0..c4dd0a4 100644
--- a/bench/x86-64/store_mem_avx.ptt
+++ b/bench/x86-64/store_mem_avx.ptt
@@ -2,10 +2,16 @@ STREAMS 1
 TYPE DOUBLE
 FLOPS 0
 BYTES 8
-vmovaps ymm1, [SCALAR]
-vmovaps ymm2, [SCALAR]
-vmovaps ymm3, [SCALAR]
-vmovaps ymm4, [SCALAR]
+DESC Double-precision store, uses AVX and non-temporal stores
+LOADS 0
+STORES 1
+INSTR_CONST 20
+INSTR_LOOP 7
+UOPS 10
+vmovaps ymm1, [rip+SCALAR]
+vmovaps ymm2, [rip+SCALAR]
+vmovaps ymm3, [rip+SCALAR]
+vmovaps ymm4, [rip+SCALAR]
 LOOP 16
 vmovntpd    [STR0 + GPR1 * 8]     , ymm1
 vmovntpd    [STR0 + GPR1 * 8 + 32], ymm2
diff --git a/bench/x86-64/store_mem_sse.ptt b/bench/x86-64/store_mem_sse.ptt
index 0a0222d..54aeed3 100644
--- a/bench/x86-64/store_mem_sse.ptt
+++ b/bench/x86-64/store_mem_sse.ptt
@@ -2,10 +2,16 @@ STREAMS 1
 TYPE DOUBLE
 FLOPS 0
 BYTES 8
-movaps FPR1, [SCALAR]
-movaps FPR2, [SCALAR]
-movaps FPR3, [SCALAR]
-movaps FPR4, [SCALAR]
+DESC Double-precision store, uses SSE and non-temporal stores
+LOADS 0
+STORES 1
+INSTR_CONST 20
+INSTR_LOOP 7
+UOPS 10
+movaps FPR1, [rip+SCALAR]
+movaps FPR2, [rip+SCALAR]
+movaps FPR3, [rip+SCALAR]
+movaps FPR4, [rip+SCALAR]
 LOOP 8
 movntpd    [STR0 + GPR1 * 8]     , FPR1
 movntpd    [STR0 + GPR1 * 8 + 16], FPR2
diff --git a/bench/x86-64/store_plain.ptt b/bench/x86-64/store_plain.ptt
deleted file mode 100644
index 0f667cd..0000000
--- a/bench/x86-64/store_plain.ptt
+++ /dev/null
@@ -1,15 +0,0 @@
-STREAMS 1
-TYPE DOUBLE
-FLOPS 0
-BYTES 8
-movsd FPR1, [SCALAR]
-movsd FPR2, [SCALAR]
-movsd FPR3, [SCALAR]
-movsd FPR4, [SCALAR]
-LOOP 4
-#mov       GPR14, [STR0 + GPR1 * 8 + 256] 
-movsd    [STR0 + GPR1 * 8]     , FPR1
-movsd    [STR0 + GPR1 * 8 + 8], FPR2
-movsd    [STR0 + GPR1 * 8 + 16], FPR3
-movsd    [STR0 + GPR1 * 8 + 24], FPR4
-
diff --git a/bench/x86-64/store_sse.ptt b/bench/x86-64/store_sse.ptt
index 4ef9ab9..8e124b2 100644
--- a/bench/x86-64/store_sse.ptt
+++ b/bench/x86-64/store_sse.ptt
@@ -2,12 +2,17 @@ STREAMS 1
 TYPE DOUBLE
 FLOPS 0
 BYTES 8
-movaps FPR1, [SCALAR]
-movaps FPR2, [SCALAR]
-movaps FPR3, [SCALAR]
-movaps FPR4, [SCALAR]
+DESC Double-precision store, optimized for SSE
+LOADS 0
+STORES 1
+INSTR_CONST 20
+INSTR_LOOP 7
+UOPS 10
+movaps FPR1, [rip+SCALAR]
+movaps FPR2, [rip+SCALAR]
+movaps FPR3, [rip+SCALAR]
+movaps FPR4, [rip+SCALAR]
 LOOP 8
-#mov       GPR14, [STR0 + GPR1 * 8 + 256] 
 movaps    [STR0 + GPR1 * 8]     , FPR1
 movaps    [STR0 + GPR1 * 8 + 16], FPR2
 movaps    [STR0 + GPR1 * 8 + 32], FPR3
diff --git a/bench/x86-64/stream.ptt b/bench/x86-64/stream.ptt
index 7c84c3c..554243a 100644
--- a/bench/x86-64/stream.ptt
+++ b/bench/x86-64/stream.ptt
@@ -2,22 +2,29 @@ STREAMS 3
 TYPE DOUBLE
 FLOPS 2
 BYTES 24
-movaps FPR5, [SCALAR]
-LOOP 8
-movaps    FPR1, [STR1 + GPR1*8]
-movaps    FPR2, [STR1 + GPR1*8+16]
-movaps    FPR3, [STR1 + GPR1*8+32]
-movaps    FPR4, [STR1 + GPR1*8+48]
-mulpd     FPR1, FPR5
-addpd     FPR1, [STR2 + GPR1*8]
-mulpd     FPR2, FPR5
-addpd     FPR2, [STR2 + GPR1*8+16]
-mulpd     FPR3, FPR5
-addpd     FPR3, [STR2 + GPR1*8+32]
-mulpd     FPR4, FPR5
-addpd     FPR4, [STR2 + GPR1*8+48]
-movaps    [STR0 + GPR1*8]   , FPR1
-movaps    [STR0 + GPR1*8+16], FPR2
-movaps    [STR0 + GPR1*8+32], FPR3
-movaps    [STR0 + GPR1*8+48], FPR4
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), only scalar operations
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 19
+UOPS 26
+movsd FPR5, [rip+SCALAR]
+LOOP 4
+# the multiplier was read before the loop with a 64-bit scalar load, matching the double-precision arithmetic below
+movsd    FPR1, [STR1 + GPR1*8]
+movsd    FPR2, [STR1 + GPR1*8+8]
+movsd    FPR3, [STR1 + GPR1*8+16]
+movsd    FPR4, [STR1 + GPR1*8+24]
+mulsd    FPR1, FPR5
+addsd    FPR1, [STR2 + GPR1*8]
+mulsd    FPR2, FPR5
+addsd    FPR2, [STR2 + GPR1*8+8]
+mulsd    FPR3, FPR5
+addsd    FPR3, [STR2 + GPR1*8+16]
+mulsd    FPR4, FPR5
+addsd    FPR4, [STR2 + GPR1*8+24]
+movsd    [STR0 + GPR1*8]   , FPR1
+movsd    [STR0 + GPR1*8+8] , FPR2
+movsd    [STR0 + GPR1*8+16], FPR3
+movsd    [STR0 + GPR1*8+24], FPR4
 
diff --git a/bench/x86-64/stream_avx.ptt b/bench/x86-64/stream_avx.ptt
index 8fbaf7c..0ebbb74 100644
--- a/bench/x86-64/stream_avx.ptt
+++ b/bench/x86-64/stream_avx.ptt
@@ -1,22 +1,29 @@
 STREAMS 3
-TYPE SINGLE
-FLOPS 4
-BYTES 48
-vbroadcastss ymm1, [SCALAR]
-LOOP 8
-vmovaps   ymm2, [STR1 + GPR1*8]
-vmovaps   ymm3, [STR1 + GPR1*8+16]
-vmovaps   ymm4, [STR1 + GPR1*8+32]
-vmovaps   ymm5, [STR1 + GPR1*8+48]
-vmulps    ymm2, ymm2, ymm1
-vaddps    ymm2, ymm2, [STR2 + GPR1*8]
-vmulps    ymm3, ymm3, ymm1
-vaddps    ymm3, ymm3, [STR2 + GPR1*8]
-vmulps    ymm4, ymm4, ymm1
-vaddps    ymm4, ymm4, [STR2 + GPR1*8]
-vmulps    ymm5, ymm5, ymm1
-vaddps    ymm5, ymm5, [STR2 + GPR1*8]
-vmovaps   [STR0 + GPR1*8], ymm2
-vmovaps   [STR0 + GPR1*8+16], ymm3
-vmovaps   [STR0 + GPR1*8+32], ymm4
-vmovaps   [STR0 + GPR1*8+48], ymm5
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), optimized for AVX
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 19
+UOPS 26
+vmovaps ymm5, [rip+SCALAR]
+LOOP 16
+vmovaps    ymm1, [STR1 + GPR1*8]
+vmovaps    ymm2, [STR1 + GPR1*8+32]
+vmovaps    ymm3, [STR1 + GPR1*8+64]
+vmovaps    ymm4, [STR1 + GPR1*8+96]
+vmulpd     ymm1, ymm1, ymm5
+vaddpd     ymm1, ymm1, [STR2 + GPR1*8]
+vmulpd     ymm2, ymm2, ymm5
+vaddpd     ymm2, ymm2, [STR2 + GPR1*8+32]
+vmulpd     ymm3, ymm3, ymm5
+vaddpd     ymm3, ymm3, [STR2 + GPR1*8+64]
+vmulpd     ymm4, ymm4, ymm5
+vaddpd     ymm4, ymm4, [STR2 + GPR1*8+96]
+vmovaps    [STR0 + GPR1*8]   , ymm1
+vmovaps    [STR0 + GPR1*8+32], ymm2
+vmovaps    [STR0 + GPR1*8+64], ymm3
+vmovaps    [STR0 + GPR1*8+96], ymm4
+
diff --git a/bench/x86-64/stream_avx_fma.ptt b/bench/x86-64/stream_avx_fma.ptt
new file mode 100644
index 0000000..a868f61
--- /dev/null
+++ b/bench/x86-64/stream_avx_fma.ptt
@@ -0,0 +1,24 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), optimized for AVX FMAs
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+vmovaps ymm5, [rip+SCALAR]
+LOOP 16
+vmovaps    ymm1, [STR1 + GPR1*8]
+vmovaps    ymm2, [STR1 + GPR1*8+32]
+vmovaps    ymm3, [STR1 + GPR1*8+64]
+vmovaps    ymm4, [STR1 + GPR1*8+96]
+vfmadd213pd ymm1, ymm5, [STR2 + GPR1*8]
+vfmadd213pd ymm2, ymm5, [STR2 + GPR1*8+32]
+vfmadd213pd ymm3, ymm5, [STR2 + GPR1*8+64]
+vfmadd213pd ymm4, ymm5, [STR2 + GPR1*8+96]
+vmovaps    [STR0 + GPR1*8]   , ymm1
+vmovaps    [STR0 + GPR1*8+32], ymm2
+vmovaps    [STR0 + GPR1*8+64], ymm3
+vmovaps    [STR0 + GPR1*8+96], ymm4
diff --git a/bench/x86-64/stream_mem.ptt b/bench/x86-64/stream_mem.ptt
index b8364cc..fd0f8f1 100644
--- a/bench/x86-64/stream_mem.ptt
+++ b/bench/x86-64/stream_mem.ptt
@@ -2,10 +2,29 @@ STREAMS 3
 TYPE DOUBLE
 FLOPS 2
 BYTES 24
-movaps FPR5, [SCALAR]
-LOOP 2
-movaps    FPR1, [STR2 + GPR1*8]
-mulpd     FPR1, FPR5
-addpd     FPR1, [STR1 + GPR1*8]
-movntpd   [STR0 + GPR1*8], FPR1
-
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), uses SSE and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 19
+UOPS 24
+movsd FPR5, [rip+SCALAR]
+LOOP 4
+movsd    FPR1, [STR1 + GPR1*8]
+movsd    FPR2, [STR1 + GPR1*8+8]
+movsd    FPR3, [STR1 + GPR1*8+16]
+movsd    FPR4, [STR1 + GPR1*8+24]
+mulsd    FPR1, FPR5
+addsd    FPR1, [STR2 + GPR1*8]
+mulsd    FPR2, FPR5
+addsd    FPR2, [STR2 + GPR1*8+8]
+mulsd    FPR3, FPR5
+addsd    FPR3, [STR2 + GPR1*8+16]
+mulsd    FPR4, FPR5
+addsd    FPR4, [STR2 + GPR1*8+24]
+# pair the scalar results so each 16-byte non-temporal write covers two neighbouring doubles;
+# assumes the destination array is 16-byte aligned, as the packed kernels already require
+unpcklpd  FPR1, FPR2
+unpcklpd  FPR3, FPR4
+movntpd   [STR0 + GPR1*8], FPR1
+movntpd   [STR0 + GPR1*8+16], FPR3
diff --git a/bench/x86-64/stream_mem_avx.ptt b/bench/x86-64/stream_mem_avx.ptt
new file mode 100644
index 0000000..1a138f4
--- /dev/null
+++ b/bench/x86-64/stream_mem_avx.ptt
@@ -0,0 +1,17 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), uses AVX and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 7
+UOPS 8
+vmovaps ymm5, [rip+SCALAR]
+LOOP 4
+vmovaps    ymm1, [STR2 + GPR1*8]
+vmulpd     ymm1, ymm1, ymm5
+vaddpd     ymm1, ymm1, [STR1 + GPR1*8]
+vmovntpd   [STR0 + GPR1*8], ymm1
+
diff --git a/bench/x86-64/stream_mem_avx_fma.ptt b/bench/x86-64/stream_mem_avx_fma.ptt
new file mode 100644
index 0000000..05bbbc2
--- /dev/null
+++ b/bench/x86-64/stream_mem_avx_fma.ptt
@@ -0,0 +1,24 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), optimized for AVX FMAs and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+vmovaps ymm5, [rip+SCALAR]
+LOOP 16
+vmovaps    ymm1, [STR1 + GPR1*8]
+vmovaps    ymm2, [STR1 + GPR1*8+32]
+vmovaps    ymm3, [STR1 + GPR1*8+64]
+vmovaps    ymm4, [STR1 + GPR1*8+96]
+vfmadd213pd ymm1, ymm5, [STR2 + GPR1*8]
+vfmadd213pd ymm2, ymm5, [STR2 + GPR1*8+32]
+vfmadd213pd ymm3, ymm5, [STR2 + GPR1*8+64]
+vfmadd213pd ymm4, ymm5, [STR2 + GPR1*8+96]
+vmovntpd   [STR0 + GPR1*8], ymm1
+vmovntpd   [STR0 + GPR1*8+32], ymm2
+vmovntpd   [STR0 + GPR1*8+64], ymm3
+vmovntpd   [STR0 + GPR1*8+96], ymm4
diff --git a/bench/x86-64/stream_mem_sse.ptt b/bench/x86-64/stream_mem_sse.ptt
new file mode 100644
index 0000000..6b7106a
--- /dev/null
+++ b/bench/x86-64/stream_mem_sse.ptt
@@ -0,0 +1,17 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), uses SSE and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 7
+UOPS 8
+movaps FPR5, [rip+SCALAR]
+LOOP 2
+movaps    FPR1, [STR2 + GPR1*8]
+mulpd     FPR1, FPR5
+addpd     FPR1, [STR1 + GPR1*8]
+movntpd   [STR0 + GPR1*8], FPR1
+
diff --git a/bench/x86-64/stream_mem_sse_fma.ptt b/bench/x86-64/stream_mem_sse_fma.ptt
new file mode 100644
index 0000000..22b2758
--- /dev/null
+++ b/bench/x86-64/stream_mem_sse_fma.ptt
@@ -0,0 +1,15 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), uses SSE FMAs and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 6
+UOPS 7
+movaps FPR5, [rip+SCALAR]
+LOOP 2
+movapd    FPR1, [STR2 + GPR1*8]
+vfmadd213pd FPR1, FPR5, [STR1 + GPR1*8]
+movntpd   [STR0 + GPR1*8], FPR1
diff --git a/bench/x86-64/stream_sp.ptt b/bench/x86-64/stream_sp.ptt
new file mode 100644
index 0000000..cedba15
--- /dev/null
+++ b/bench/x86-64/stream_sp.ptt
@@ -0,0 +1,45 @@
+STREAMS 3
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision stream triad A(i) = B(i)*c + C(i), only scalar operations
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 35
+UOPS 50
+movss FPR9, [rip+SCALAR]
+LOOP 8
+movss    FPR1, [STR1 + GPR1*4]
+movss    FPR2, [STR1 + GPR1*4+4]
+movss    FPR3, [STR1 + GPR1*4+8]
+movss    FPR4, [STR1 + GPR1*4+12]
+movss    FPR5, [STR1 + GPR1*4+16]
+movss    FPR6, [STR1 + GPR1*4+20]
+movss    FPR7, [STR1 + GPR1*4+24]
+movss    FPR8, [STR1 + GPR1*4+28]
+mulss    FPR1, FPR9
+addss    FPR1, [STR2 + GPR1*4]
+mulss    FPR2, FPR9
+addss    FPR2, [STR2 + GPR1*4+4]
+mulss    FPR3, FPR9
+addss    FPR3, [STR2 + GPR1*4+8]
+mulss    FPR4, FPR9
+addss    FPR4, [STR2 + GPR1*4+12]
+mulss    FPR5, FPR9
+addss    FPR5, [STR2 + GPR1*4+16]
+mulss    FPR6, FPR9
+addss    FPR6, [STR2 + GPR1*4+20]
+mulss    FPR7, FPR9
+addss    FPR7, [STR2 + GPR1*4+24]
+mulss    FPR8, FPR9
+addss    FPR8, [STR2 + GPR1*4+28]
+movss    [STR0 + GPR1*4]   , FPR1
+movss    [STR0 + GPR1*4+4] , FPR2
+movss    [STR0 + GPR1*4+8] , FPR3
+movss    [STR0 + GPR1*4+12], FPR4
+movss    [STR0 + GPR1*4+16], FPR5
+movss    [STR0 + GPR1*4+20], FPR6
+movss    [STR0 + GPR1*4+24], FPR7
+movss    [STR0 + GPR1*4+28], FPR8
+
diff --git a/bench/x86-64/stream_sp_avx.ptt b/bench/x86-64/stream_sp_avx.ptt
new file mode 100644
index 0000000..f01a6ff
--- /dev/null
+++ b/bench/x86-64/stream_sp_avx.ptt
@@ -0,0 +1,28 @@
+STREAMS 3
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision stream triad A(i) = B(i)*c + C(i), optimized for AVX
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 19
+UOPS 26
+vmovaps ymm1, [rip+SCALAR]
+LOOP 32
+vmovaps   ymm2, [STR1 + GPR1*4]
+vmovaps   ymm3, [STR1 + GPR1*4+32]
+vmovaps   ymm4, [STR1 + GPR1*4+64]
+vmovaps   ymm5, [STR1 + GPR1*4+96]
+vmulps    ymm2, ymm2, ymm1
+vaddps    ymm2, ymm2, [STR2 + GPR1*4]
+vmulps    ymm3, ymm3, ymm1
+vaddps    ymm3, ymm3, [STR2 + GPR1*4+32]
+vmulps    ymm4, ymm4, ymm1
+vaddps    ymm4, ymm4, [STR2 + GPR1*4+64]
+vmulps    ymm5, ymm5, ymm1
+vaddps    ymm5, ymm5, [STR2 + GPR1*4+96]
+vmovaps   [STR0 + GPR1*4], ymm2
+vmovaps   [STR0 + GPR1*4+32], ymm3
+vmovaps   [STR0 + GPR1*4+64], ymm4
+vmovaps   [STR0 + GPR1*4+96], ymm5
diff --git a/bench/x86-64/stream_sp_avx_fma.ptt b/bench/x86-64/stream_sp_avx_fma.ptt
new file mode 100644
index 0000000..351b84f
--- /dev/null
+++ b/bench/x86-64/stream_sp_avx_fma.ptt
@@ -0,0 +1,24 @@
+STREAMS 3
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision stream triad A(i) = B(i)*c + C(i), optimized for AVX FMAs
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+vmovaps ymm1, [rip+SCALAR]
+LOOP 32
+vmovaps   ymm2, [STR1 + GPR1*4]
+vmovaps   ymm3, [STR1 + GPR1*4+32]
+vmovaps   ymm4, [STR1 + GPR1*4+64]
+vmovaps   ymm5, [STR1 + GPR1*4+96]
+vfmadd213ps ymm2, ymm1, [STR2 + GPR1*4]
+vfmadd213ps ymm3, ymm1, [STR2 + GPR1*4+32]
+vfmadd213ps ymm4, ymm1, [STR2 + GPR1*4+64]
+vfmadd213ps ymm5, ymm1, [STR2 + GPR1*4+96]
+vmovaps   [STR0 + GPR1*4], ymm2
+vmovaps   [STR0 + GPR1*4+32], ymm3
+vmovaps   [STR0 + GPR1*4+64], ymm4
+vmovaps   [STR0 + GPR1*4+96], ymm5
diff --git a/bench/x86-64/stream_sp_mem_avx.ptt b/bench/x86-64/stream_sp_mem_avx.ptt
new file mode 100644
index 0000000..5fee0ec
--- /dev/null
+++ b/bench/x86-64/stream_sp_mem_avx.ptt
@@ -0,0 +1,28 @@
+STREAMS 3
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision stream triad A(i) = B(i)*c + C(i), optimized for AVX and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 19
+UOPS 26
+vmovaps ymm1, [rip+SCALAR]
+LOOP 32
+vmovaps   ymm2, [STR1 + GPR1*4]
+vmovaps   ymm3, [STR1 + GPR1*4+32]
+vmovaps   ymm4, [STR1 + GPR1*4+64]
+vmovaps   ymm5, [STR1 + GPR1*4+96]
+vmulps    ymm2, ymm2, ymm1
+vaddps    ymm2, ymm2, [STR2 + GPR1*4]
+vmulps    ymm3, ymm3, ymm1
+vaddps    ymm3, ymm3, [STR2 + GPR1*4+32]
+vmulps    ymm4, ymm4, ymm1
+vaddps    ymm4, ymm4, [STR2 + GPR1*4+64]
+vmulps    ymm5, ymm5, ymm1
+vaddps    ymm5, ymm5, [STR2 + GPR1*4+96]
+vmovntps    [STR0 + GPR1*4], ymm2
+vmovntps    [STR0 + GPR1*4+32], ymm3
+vmovntps    [STR0 + GPR1*4+64], ymm4
+vmovntps    [STR0 + GPR1*4+96], ymm5
diff --git a/bench/x86-64/stream_sp_mem_avx_fma.ptt b/bench/x86-64/stream_sp_mem_avx_fma.ptt
new file mode 100644
index 0000000..22c6a4d
--- /dev/null
+++ b/bench/x86-64/stream_sp_mem_avx_fma.ptt
@@ -0,0 +1,24 @@
+STREAMS 3
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision stream triad A(i) = B(i)*c + C(i), optimized for AVX FMAs and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+vmovaps ymm1, [rip+SCALAR]
+LOOP 32
+vmovaps   ymm2, [STR1 + GPR1*4]
+vmovaps   ymm3, [STR1 + GPR1*4+32]
+vmovaps   ymm4, [STR1 + GPR1*4+64]
+vmovaps   ymm5, [STR1 + GPR1*4+96]
+vfmadd213ps ymm2, ymm1, [STR2 + GPR1*4]
+vfmadd213ps ymm3, ymm1, [STR2 + GPR1*4+32]
+vfmadd213ps ymm4, ymm1, [STR2 + GPR1*4+64]
+vfmadd213ps ymm5, ymm1, [STR2 + GPR1*4+96]
+vmovntps   [STR0 + GPR1*4], ymm2
+vmovntps   [STR0 + GPR1*4+32], ymm3
+vmovntps   [STR0 + GPR1*4+64], ymm4
+vmovntps   [STR0 + GPR1*4+96], ymm5
diff --git a/bench/x86-64/stream_sp_mem_sse.ptt b/bench/x86-64/stream_sp_mem_sse.ptt
new file mode 100644
index 0000000..b92c3ae
--- /dev/null
+++ b/bench/x86-64/stream_sp_mem_sse.ptt
@@ -0,0 +1,16 @@
+STREAMS 3
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision stream triad A(i) = B(i)*c + C(i), optimized for SSE and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 7
+UOPS 8
+movss FPR5, [rip+SCALAR]
+LOOP 4
+movaps    FPR1, [STR2 + GPR1*4]
+mulps     FPR1, FPR5
+addps     FPR1, [STR1 + GPR1*4]
+movntps    [STR0 + GPR1*4], FPR1
diff --git a/bench/x86-64/stream_sp_mem_sse_fma.ptt b/bench/x86-64/stream_sp_mem_sse_fma.ptt
new file mode 100644
index 0000000..9ca42ca
--- /dev/null
+++ b/bench/x86-64/stream_sp_mem_sse_fma.ptt
@@ -0,0 +1,15 @@
+STREAMS 3
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision stream triad A(i) = B(i)*c + C(i), optimized for SSE FMAs and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 6
+UOPS 7
+movss FPR5, [rip+SCALAR]
+LOOP 4
+movaps    FPR1, [STR2 + GPR1*4]
+vfmadd213ps FPR1, FPR5, [STR1 + GPR1*4]
+movntps    [STR0 + GPR1*4], FPR1
diff --git a/bench/x86-64/stream_sp_sse.ptt b/bench/x86-64/stream_sp_sse.ptt
new file mode 100644
index 0000000..f82e299
--- /dev/null
+++ b/bench/x86-64/stream_sp_sse.ptt
@@ -0,0 +1,16 @@
+STREAMS 3
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision stream triad A(i) = B(i)*c + C(i), optimized for SSE
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 7
+UOPS 8
+movss FPR5, [rip+SCALAR]
+LOOP 4
+movaps    FPR1, [STR2 + GPR1*4]
+mulps     FPR1, FPR5
+addps     FPR1, [STR1 + GPR1*4]
+movaps    [STR0 + GPR1*4], FPR1
diff --git a/bench/x86-64/stream_sp_sse_fma.ptt b/bench/x86-64/stream_sp_sse_fma.ptt
new file mode 100644
index 0000000..28a87d4
--- /dev/null
+++ b/bench/x86-64/stream_sp_sse_fma.ptt
@@ -0,0 +1,15 @@
+STREAMS 3
+TYPE SINGLE
+FLOPS 2
+BYTES 12
+DESC Single-precision stream triad A(i) = B(i)*c + C(i), optimized for SSE FMAs
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 6
+UOPS 7
+movss FPR5, [rip+SCALAR]
+LOOP 4
+movaps    FPR1, [STR2 + GPR1*4]
+vfmadd213ps FPR1, FPR5, [STR1 + GPR1*4]
+movaps    [STR0 + GPR1*4], FPR1
diff --git a/bench/x86-64/stream_sse.ptt b/bench/x86-64/stream_sse.ptt
new file mode 100644
index 0000000..c373336
--- /dev/null
+++ b/bench/x86-64/stream_sse.ptt
@@ -0,0 +1,29 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), optimized for SSE
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 19
+UOPS 26
+movaps FPR5, [rip+SCALAR]
+LOOP 8
+movaps    FPR1, [STR1 + GPR1*8]
+movaps    FPR2, [STR1 + GPR1*8+16]
+movaps    FPR3, [STR1 + GPR1*8+32]
+movaps    FPR4, [STR1 + GPR1*8+48]
+mulpd     FPR1, FPR5
+addpd     FPR1, [STR2 + GPR1*8]
+mulpd     FPR2, FPR5
+addpd     FPR2, [STR2 + GPR1*8+16]
+mulpd     FPR3, FPR5
+addpd     FPR3, [STR2 + GPR1*8+32]
+mulpd     FPR4, FPR5
+addpd     FPR4, [STR2 + GPR1*8+48]
+movaps    [STR0 + GPR1*8]   , FPR1
+movaps    [STR0 + GPR1*8+16], FPR2
+movaps    [STR0 + GPR1*8+32], FPR3
+movaps    [STR0 + GPR1*8+48], FPR4
+
diff --git a/bench/x86-64/stream_sse_fma.ptt b/bench/x86-64/stream_sse_fma.ptt
new file mode 100644
index 0000000..7b3a338
--- /dev/null
+++ b/bench/x86-64/stream_sse_fma.ptt
@@ -0,0 +1,24 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), optimized for SSE FMAs
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 15
+UOPS 22
+movaps FPR5, [rip+SCALAR]
+LOOP 8
+movaps    FPR1, [STR1 + GPR1*8]
+movaps    FPR2, [STR1 + GPR1*8+16]
+movaps    FPR3, [STR1 + GPR1*8+32]
+movaps    FPR4, [STR1 + GPR1*8+48]
+vfmadd213pd FPR1, FPR5, [STR2 + GPR1*8]
+vfmadd213pd FPR2, FPR5, [STR2 + GPR1*8+16]
+vfmadd213pd FPR3, FPR5, [STR2 + GPR1*8+32]
+vfmadd213pd FPR4, FPR5, [STR2 + GPR1*8+48]
+movaps    [STR0 + GPR1*8]   , FPR1
+movaps    [STR0 + GPR1*8+16], FPR2
+movaps    [STR0 + GPR1*8+32], FPR3
+movaps    [STR0 + GPR1*8+48], FPR4
diff --git a/bench/x86-64/striad_avx.ptt b/bench/x86-64/striad_avx.ptt
deleted file mode 100644
index b3c1317..0000000
--- a/bench/x86-64/striad_avx.ptt
+++ /dev/null
@@ -1,23 +0,0 @@
-STREAMS 3
-TYPE DOUBLE
-FLOPS 2
-BYTES 24
-vmovaps ymm5, [SCALAR]
-LOOP 16
-vmovaps    ymm1, [STR1 + GPR1*8]
-vmovaps    ymm2, [STR1 + GPR1*8+32]
-vmovaps    ymm3, [STR1 + GPR1*8+64]
-vmovaps    ymm4, [STR1 + GPR1*8+96]
-vmulpd     ymm1, ymm1, ymm5
-vaddpd     ymm1, ymm1, [STR2 + GPR1*8]
-vmulpd     ymm2, ymm2, ymm5
-vaddpd     ymm2, ymm2, [STR2 + GPR1*8+32]
-vmulpd     ymm3, ymm3, ymm5
-vaddpd     ymm3, ymm3, [STR2 + GPR1*8+64]
-vmulpd     ymm4, ymm4, ymm5
-vaddpd     ymm4, ymm4, [STR2 + GPR1*8+96]
-vmovaps    [STR0 + GPR1*8]   , ymm1
-vmovaps    [STR0 + GPR1*8+32], ymm2
-vmovaps    [STR0 + GPR1*8+64], ymm3
-vmovaps    [STR0 + GPR1*8+96], ymm4
-
diff --git a/bench/x86-64/striad_mem_avx.ptt b/bench/x86-64/striad_mem_avx.ptt
deleted file mode 100644
index cef2688..0000000
--- a/bench/x86-64/striad_mem_avx.ptt
+++ /dev/null
@@ -1,11 +0,0 @@
-STREAMS 3
-TYPE DOUBLE
-FLOPS 2
-BYTES 24
-vmovaps ymm5, [SCALAR]
-LOOP 4
-vmovaps    ymm1, [STR2 + GPR1*8]
-vmulpd     ymm1, ymm1, ymm5
-vaddpd     ymm1, ymm1, [STR1 + GPR1*8]
-vmovntpd   [STR0 + GPR1*8], ymm1
-
diff --git a/bench/x86-64/striad_mem_sse.ptt b/bench/x86-64/striad_mem_sse.ptt
deleted file mode 100644
index b8364cc..0000000
--- a/bench/x86-64/striad_mem_sse.ptt
+++ /dev/null
@@ -1,11 +0,0 @@
-STREAMS 3
-TYPE DOUBLE
-FLOPS 2
-BYTES 24
-movaps FPR5, [SCALAR]
-LOOP 2
-movaps    FPR1, [STR2 + GPR1*8]
-mulpd     FPR1, FPR5
-addpd     FPR1, [STR1 + GPR1*8]
-movntpd   [STR0 + GPR1*8], FPR1
-
diff --git a/bench/x86-64/striad_plain.ptt b/bench/x86-64/striad_plain.ptt
deleted file mode 100644
index 7b29664..0000000
--- a/bench/x86-64/striad_plain.ptt
+++ /dev/null
@@ -1,23 +0,0 @@
-STREAMS 3
-TYPE DOUBLE
-FLOPS 2
-BYTES 24
-movss FPR5, [SCALAR]
-LOOP 4
-movsd    FPR1, [STR1 + GPR1*8]
-movsd    FPR2, [STR1 + GPR1*8+8]
-movsd    FPR3, [STR1 + GPR1*8+16]
-movsd    FPR4, [STR1 + GPR1*8+24]
-mulsd    FPR1, FPR5
-addsd    FPR1, [STR2 + GPR1*8]
-mulsd    FPR2, FPR5
-addsd    FPR2, [STR2 + GPR1*8+8]
-mulsd    FPR3, FPR5
-addsd    FPR3, [STR2 + GPR1*8+16]
-mulsd    FPR4, FPR5
-addsd    FPR4, [STR2 + GPR1*8+24]
-movsd    [STR0 + GPR1*8]   , FPR1
-movsd    [STR0 + GPR1*8+8] , FPR2
-movsd    [STR0 + GPR1*8+16], FPR3
-movsd    [STR0 + GPR1*8+24], FPR4
-
diff --git a/bench/x86-64/striad_sse.ptt b/bench/x86-64/striad_sse.ptt
deleted file mode 100644
index 7c84c3c..0000000
--- a/bench/x86-64/striad_sse.ptt
+++ /dev/null
@@ -1,23 +0,0 @@
-STREAMS 3
-TYPE DOUBLE
-FLOPS 2
-BYTES 24
-movaps FPR5, [SCALAR]
-LOOP 8
-movaps    FPR1, [STR1 + GPR1*8]
-movaps    FPR2, [STR1 + GPR1*8+16]
-movaps    FPR3, [STR1 + GPR1*8+32]
-movaps    FPR4, [STR1 + GPR1*8+48]
-mulpd     FPR1, FPR5
-addpd     FPR1, [STR2 + GPR1*8]
-mulpd     FPR2, FPR5
-addpd     FPR2, [STR2 + GPR1*8+16]
-mulpd     FPR3, FPR5
-addpd     FPR3, [STR2 + GPR1*8+32]
-mulpd     FPR4, FPR5
-addpd     FPR4, [STR2 + GPR1*8+48]
-movaps    [STR0 + GPR1*8]   , FPR1
-movaps    [STR0 + GPR1*8+16], FPR2
-movaps    [STR0 + GPR1*8+32], FPR3
-movaps    [STR0 + GPR1*8+48], FPR4
-
diff --git a/bench/x86-64/sum.ptt b/bench/x86-64/sum.ptt
index 3374843..a75fa93 100644
--- a/bench/x86-64/sum.ptt
+++ b/bench/x86-64/sum.ptt
@@ -1,23 +1,29 @@
 STREAMS 1
-TYPE SINGLE
+TYPE DOUBLE
 FLOPS 1
-BYTES 4
-xorps FPR1, FPR1
-movaps FPR2, FPR1
-movaps FPR3, FPR1
-movaps FPR4, FPR1
-movaps FPR5, FPR1
-movaps FPR6, FPR1
-movaps FPR7, FPR1
-movaps FPR8, FPR1
-LOOP 32
-addps    FPR1, [STR0 + GPR1 * 4]
-addps    FPR2, [STR0 + GPR1 * 4 + 16]
-addps    FPR3, [STR0 + GPR1 * 4 + 32]
-addps    FPR4, [STR0 + GPR1 * 4 + 48]
-addps    FPR5, [STR0 + GPR1 * 4 + 64]
-addps    FPR6, [STR0 + GPR1 * 4 + 80]
-addps    FPR7, [STR0 + GPR1 * 4 + 96]
-addps    FPR8, [STR0 + GPR1 * 4 + 112]
+BYTES 8
+DESC Double-precision sum of a vector, only scalar operations
+LOADS 1
+STORES 0
+INSTR_CONST 24
+INSTR_LOOP 11
+UOPS 18
+xorpd FPR1, FPR1
+movapd FPR2, FPR1
+movapd FPR3, FPR1
+movapd FPR4, FPR1
+movapd FPR5, FPR1
+movapd FPR6, FPR1
+movapd FPR7, FPR1
+movapd FPR8, FPR1
+LOOP 8
+addsd    FPR1, [STR0 + GPR1 * 8]
+addsd    FPR2, [STR0 + GPR1 * 8 + 8]
+addsd    FPR3, [STR0 + GPR1 * 8 + 16]
+addsd    FPR4, [STR0 + GPR1 * 8 + 24]
+addsd    FPR5, [STR0 + GPR1 * 8 + 32]
+addsd    FPR6, [STR0 + GPR1 * 8 + 40]
+addsd    FPR7, [STR0 + GPR1 * 8 + 48]
+addsd    FPR8, [STR0 + GPR1 * 8 + 56]
 
 
diff --git a/bench/x86-64/sum_avx.ptt b/bench/x86-64/sum_avx.ptt
index e2e8e40..29d8ff0 100644
--- a/bench/x86-64/sum_avx.ptt
+++ b/bench/x86-64/sum_avx.ptt
@@ -1,14 +1,30 @@
 STREAMS 1
-TYPE SINGLE
+TYPE DOUBLE
 FLOPS 1
-BYTES 4
-vxorps  ymm1, ymm1, ymm1
-vmovaps ymm2, ymm1
-vmovaps ymm3, ymm1
-vmovaps ymm4, ymm1
+BYTES 8
+DESC Double-precision sum of a vector, optimized for AVX
+LOADS 1
+STORES 0
+INSTR_CONST 24
+INSTR_LOOP 11
+UOPS 18
+vxorpd FPR9, FPR9, FPR9
+vxorpd FPR1, FPR1, FPR1
+vmovapd FPR2, FPR1
+vmovapd FPR3, FPR1
+vmovapd FPR4, FPR1
+vmovapd FPR5, FPR1
+vmovapd FPR6, FPR1
+vmovapd FPR7, FPR1
+vmovapd FPR8, FPR1
 LOOP 32
-vaddps     ymm1, ymm1, [STR0 + GPR1*4]
-vaddps     ymm2, ymm2, [STR0 + GPR1*4+32]
-vaddps     ymm3, ymm3, [STR0 + GPR1*4+64]
-vaddps     ymm4, ymm4, [STR0 + GPR1*4+96]
+vaddpd    FPR1, FPR1, [STR0 + GPR1 * 8]
+vaddpd    FPR2, FPR2, [STR0 + GPR1 * 8 + 32]
+vaddpd    FPR3, FPR3, [STR0 + GPR1 * 8 + 64]
+vaddpd    FPR4, FPR4, [STR0 + GPR1 * 8 + 96]
+vaddpd    FPR5, FPR5, [STR0 + GPR1 * 8 + 128]
+vaddpd    FPR6, FPR6, [STR0 + GPR1 * 8 + 160]
+vaddpd    FPR7, FPR7, [STR0 + GPR1 * 8 + 192]
+vaddpd    FPR8, FPR8, [STR0 + GPR1 * 8 + 224]
+
 
diff --git a/bench/x86-64/sum_plain.ptt b/bench/x86-64/sum_plain.ptt
deleted file mode 100644
index 23fe237..0000000
--- a/bench/x86-64/sum_plain.ptt
+++ /dev/null
@@ -1,15 +0,0 @@
-STREAMS 1
-TYPE SINGLE
-FLOPS 1
-BYTES 4
-xorps FPR1, FPR1
-xorps FPR2, FPR2
-xorps FPR3, FPR3
-xorps FPR4, FPR4
-LOOP 4
-addss    FPR1, [STR0 + GPR1 * 4]
-addss    FPR2, [STR0 + GPR1 * 4 + 4]
-addss    FPR3, [STR0 + GPR1 * 4 + 8]
-addss    FPR4, [STR0 + GPR1 * 4 + 12]
-
-
diff --git a/bench/x86-64/sum_sp.ptt b/bench/x86-64/sum_sp.ptt
new file mode 100644
index 0000000..21a6702
--- /dev/null
+++ b/bench/x86-64/sum_sp.ptt
@@ -0,0 +1,21 @@
+STREAMS 1
+TYPE SINGLE
+FLOPS 1
+BYTES 4
+DESC Single-precision sum of a vector, only scalar operations
+LOADS 1
+STORES 0
+INSTR_CONST 20
+INSTR_LOOP 7
+UOPS 10
+xorps FPR1, FPR1
+xorps FPR2, FPR2
+xorps FPR3, FPR3
+xorps FPR4, FPR4
+LOOP 4
+addss    FPR1, [STR0 + GPR1 * 4]
+addss    FPR2, [STR0 + GPR1 * 4 + 4]
+addss    FPR3, [STR0 + GPR1 * 4 + 8]
+addss    FPR4, [STR0 + GPR1 * 4 + 12]
+
+
diff --git a/bench/x86-64/sum_sp_avx.ptt b/bench/x86-64/sum_sp_avx.ptt
new file mode 100644
index 0000000..4a3a6e9
--- /dev/null
+++ b/bench/x86-64/sum_sp_avx.ptt
@@ -0,0 +1,20 @@
+STREAMS 1
+TYPE SINGLE
+FLOPS 1
+BYTES 4
+DESC Single-precision sum of a vector, optimized for AVX
+LOADS 1
+STORES 0
+INSTR_CONST 20
+INSTR_LOOP 7
+UOPS 10
+vxorps  ymm1, ymm1, ymm1
+vmovaps ymm2, ymm1
+vmovaps ymm3, ymm1
+vmovaps ymm4, ymm1
+LOOP 32
+vaddps     ymm1, ymm1, [STR0 + GPR1*4]
+vaddps     ymm2, ymm2, [STR0 + GPR1*4+32]
+vaddps     ymm3, ymm3, [STR0 + GPR1*4+64]
+vaddps     ymm4, ymm4, [STR0 + GPR1*4+96]
+
diff --git a/bench/x86-64/sum_sp_sse.ptt b/bench/x86-64/sum_sp_sse.ptt
new file mode 100644
index 0000000..21cff6d
--- /dev/null
+++ b/bench/x86-64/sum_sp_sse.ptt
@@ -0,0 +1,29 @@
+STREAMS 1
+TYPE SINGLE
+FLOPS 1
+BYTES 4
+DESC Single-precision sum of a vector, optimized for SSE
+LOADS 1
+STORES 0
+INSTR_CONST 24
+INSTR_LOOP 11
+UOPS 18
+xorps FPR1, FPR1
+movaps FPR2, FPR1
+movaps FPR3, FPR1
+movaps FPR4, FPR1
+movaps FPR5, FPR1
+movaps FPR6, FPR1
+movaps FPR7, FPR1
+movaps FPR8, FPR1
+LOOP 32
+addps    FPR1, [STR0 + GPR1 * 4]
+addps    FPR2, [STR0 + GPR1 * 4 + 16]
+addps    FPR3, [STR0 + GPR1 * 4 + 32]
+addps    FPR4, [STR0 + GPR1 * 4 + 48]
+addps    FPR5, [STR0 + GPR1 * 4 + 64]
+addps    FPR6, [STR0 + GPR1 * 4 + 80]
+addps    FPR7, [STR0 + GPR1 * 4 + 96]
+addps    FPR8, [STR0 + GPR1 * 4 + 112]
+
+
diff --git a/bench/x86-64/sum_sse.ptt b/bench/x86-64/sum_sse.ptt
index 3e7a2bb..8aad8cf 100644
--- a/bench/x86-64/sum_sse.ptt
+++ b/bench/x86-64/sum_sse.ptt
@@ -2,6 +2,12 @@ STREAMS 1
 TYPE DOUBLE
 FLOPS 1
 BYTES 8
+DESC Double-precision sum of a vector, optimized for SSE
+LOADS 1
+STORES 0
+INSTR_CONST 24
+INSTR_LOOP 11
+UOPS 18
 xorpd FPR1, FPR1
 movapd FPR2, FPR1
 movapd FPR3, FPR1
diff --git a/bench/x86-64/triad.ptt b/bench/x86-64/triad.ptt
index d521aa0..4eec70f 100644
--- a/bench/x86-64/triad.ptt
+++ b/bench/x86-64/triad.ptt
@@ -2,21 +2,27 @@ STREAMS 4
 TYPE DOUBLE
 FLOPS 2
 BYTES 32
-LOOP 8
-movaps    FPR1, [STR1 + GPR1*8]
-movaps    FPR2, [STR1 + GPR1*8+16]
-movaps    FPR3, [STR1 + GPR1*8+32]
-movaps    FPR4, [STR1 + GPR1*8+48]
-mulpd     FPR1, [STR2 + GPR1*8]
-addpd     FPR1, [STR3 + GPR1*8]
-mulpd     FPR2, [STR2 + GPR1*8+16]
-addpd     FPR2, [STR3 + GPR1*8+16]
-mulpd     FPR3, [STR2 + GPR1*8+32]
-addpd     FPR3, [STR3 + GPR1*8+32]
-mulpd     FPR4, [STR2 + GPR1*8+48]
-addpd     FPR4, [STR3 + GPR1*8+48]
-movaps    [STR0 + GPR1*8], FPR1
-movaps    [STR0 + GPR1*8+16], FPR2
-movaps    [STR0 + GPR1*8+32], FPR3
-movaps    [STR0 + GPR1*8+48], FPR4
+DESC Double-precision triad A(i) = B(i) * C(i) + D(i), only scalar operations
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 30
+LOOP 4
+movsd    FPR1, [STR1 + GPR1*8]
+movsd    FPR2, [STR1 + GPR1*8+8]
+movsd    FPR3, [STR1 + GPR1*8+16]
+movsd    FPR4, [STR1 + GPR1*8+24]
+mulsd     FPR1, [STR2 + GPR1*8]
+addsd     FPR1, [STR3 + GPR1*8]
+mulsd     FPR2, [STR2 + GPR1*8+8]
+addsd     FPR2, [STR3 + GPR1*8+8]
+mulsd     FPR3, [STR2 + GPR1*8+16]
+addsd     FPR3, [STR3 + GPR1*8+16]
+mulsd     FPR4, [STR2 + GPR1*8+24]
+addsd     FPR4, [STR3 + GPR1*8+24]
+movsd    [STR0 + GPR1*8], FPR1
+movsd    [STR0 + GPR1*8+8], FPR2
+movsd    [STR0 + GPR1*8+16], FPR3
+movsd    [STR0 + GPR1*8+24], FPR4
 
diff --git a/bench/x86-64/triad_avx.ptt b/bench/x86-64/triad_avx.ptt
index 3514cfd..7e83b0b 100644
--- a/bench/x86-64/triad_avx.ptt
+++ b/bench/x86-64/triad_avx.ptt
@@ -1,12 +1,28 @@
 STREAMS 4
 TYPE DOUBLE
 FLOPS 2
-BYTES 16
-LOOP 32
-vmovapd ymm1, [STR1 + GPR1]
-vmovapd ymm2, [STR2 + GPR1]
-vmovapd ymm3, [STR3 + GPR1]
-vmulpd  ymm0, ymm1, ymm2
-vaddpd  ymm0, ymm0, ymm3
-vmovapd [STR0 + GPR1], ymm0
+BYTES 32
+DESC Double-precision triad A(i) = B(i) * C(i) + D(i), optimized for AVX
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 30
+LOOP 16
+vmovaps    ymm1, [STR1 + GPR1*8]
+vmovaps    ymm2, [STR1 + GPR1*8+32]
+vmovaps    ymm3, [STR1 + GPR1*8+64]
+vmovaps    ymm4, [STR1 + GPR1*8+96]
+vmulpd    ymm1, ymm1, [STR2 + GPR1*8]
+vaddpd    ymm1, ymm1, [STR3 + GPR1*8]
+vmulpd    ymm2, ymm2, [STR2 + GPR1*8+32]
+vaddpd    ymm2, ymm2, [STR3 + GPR1*8+32]
+vmulpd    ymm3, ymm3, [STR2 + GPR1*8+64]
+vaddpd    ymm3, ymm3, [STR3 + GPR1*8+64]
+vmulpd    ymm4, ymm4, [STR2 + GPR1*8+96]
+vaddpd    ymm4, ymm4, [STR3 + GPR1*8+96]
+vmovaps    [STR0 + GPR1*8], ymm1
+vmovaps    [STR0 + GPR1*8+32], ymm2
+vmovaps    [STR0 + GPR1*8+64], ymm3
+vmovaps    [STR0 + GPR1*8+96], ymm4
 
diff --git a/bench/x86-64/triad_avx_fma.ptt b/bench/x86-64/triad_avx_fma.ptt
new file mode 100644
index 0000000..535fe8a
--- /dev/null
+++ b/bench/x86-64/triad_avx_fma.ptt
@@ -0,0 +1,27 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+DESC Double-precision triad A(i) = B(i) * C(i) + D(i), optimized for AVX FMAs
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 26
+LOOP 16
+vmovaps    ymm1, [STR1 + GPR1*8]
+vmovaps    ymm2, [STR1 + GPR1*8+32]
+vmovaps    ymm3, [STR1 + GPR1*8+64]
+vmovaps    ymm4, [STR1 + GPR1*8+96]
+vmovaps    ymm5, [STR2 + GPR1*8]
+vmovaps    ymm6, [STR2 + GPR1*8+32]
+vmovaps    ymm7, [STR2 + GPR1*8+64]
+vmovaps    ymm8, [STR2 + GPR1*8+96]
+vfmadd213pd ymm1, ymm5, [STR3 + GPR1*8]
+vfmadd213pd ymm2, ymm6, [STR3 + GPR1*8+32]
+vfmadd213pd ymm3, ymm7, [STR3 + GPR1*8+64]
+vfmadd213pd ymm4, ymm8, [STR3 + GPR1*8+96]
+vmovaps [STR0 + GPR1*8], ymm1
+vmovaps [STR0 + GPR1*8+32], ymm2
+vmovaps [STR0 + GPR1*8+64], ymm3
+vmovaps [STR0 + GPR1*8+96], ymm4
diff --git a/bench/x86-64/triad_mem.ptt b/bench/x86-64/triad_mem.ptt
deleted file mode 100644
index 7c24748..0000000
--- a/bench/x86-64/triad_mem.ptt
+++ /dev/null
@@ -1,10 +0,0 @@
-STREAMS 4
-TYPE DOUBLE
-FLOPS 2
-BYTES 32
-LOOP 2
-movaps    FPR1, [STR1 + GPR1*8]
-mulpd     FPR1, [STR2 + GPR1*8]
-addpd     FPR1, [STR3 + GPR1*8]
-movntpd   [STR0 + GPR1*8], FPR1
-
diff --git a/bench/x86-64/triad_mem_avx.ptt b/bench/x86-64/triad_mem_avx.ptt
new file mode 100644
index 0000000..45af749
--- /dev/null
+++ b/bench/x86-64/triad_mem_avx.ptt
@@ -0,0 +1,18 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+DESC Double-precision triad A(i) = B(i) * C(i) + D(i), uses AVX and non-temporal stores
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 9
+UOPS 9
+LOOP 4
+vmovapd ymm1, [STR1 + GPR1*8]
+vmovapd ymm2, [STR2 + GPR1*8]
+vmovapd ymm3, [STR3 + GPR1*8]
+vmulpd  ymm0, ymm1, ymm2
+vaddpd  ymm0, ymm0, ymm3
+vmovntpd [STR0 + GPR1*8], ymm0
+
diff --git a/bench/x86-64/triad_mem_avx_fma.ptt b/bench/x86-64/triad_mem_avx_fma.ptt
new file mode 100644
index 0000000..fbc73b7
--- /dev/null
+++ b/bench/x86-64/triad_mem_avx_fma.ptt
@@ -0,0 +1,20 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+DESC Double-precision triad A(i) = B(i) * C(i) + D(i), optimized for AVX FMAs and non-temporal stores
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
+LOOP 8
+vmovaps ymm1, [STR1 + GPR1*8]
+vmovaps ymm2, [STR2 + GPR1*8]
+vmovaps ymm3, [STR1 + GPR1*8+32]
+vmovaps ymm4, [STR2 + GPR1*8+32]
+vfmadd213pd ymm1, ymm2, [STR3 + GPR1*8]
+vfmadd213pd ymm3, ymm4, [STR3 + GPR1*8+32]
+vmovntpd [STR0 + GPR1*8], ymm1
+vmovntpd [STR0 + GPR1*8+32], ymm3
+
diff --git a/bench/x86-64/triad_mem_sse.ptt b/bench/x86-64/triad_mem_sse.ptt
new file mode 100644
index 0000000..32e10a0
--- /dev/null
+++ b/bench/x86-64/triad_mem_sse.ptt
@@ -0,0 +1,27 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+DESC Double-precision triad A(i) = B(i) * C(i) + D(i), optimized for SSE and non-temporal stores
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 30
+LOOP 8
+movaps    FPR1, [STR1 + GPR1*8]
+movaps    FPR2, [STR1 + GPR1*8+16]
+movaps    FPR3, [STR1 + GPR1*8+32]
+movaps    FPR4, [STR1 + GPR1*8+48]
+mulpd     FPR1, [STR2 + GPR1*8]
+addpd     FPR1, [STR3 + GPR1*8]
+mulpd     FPR2, [STR2 + GPR1*8+16]
+addpd     FPR2, [STR3 + GPR1*8+16]
+mulpd     FPR3, [STR2 + GPR1*8+32]
+addpd     FPR3, [STR3 + GPR1*8+32]
+mulpd     FPR4, [STR2 + GPR1*8+48]
+addpd     FPR4, [STR3 + GPR1*8+48]
+movntpd   [STR0 + GPR1*8], FPR1
+movntpd    [STR0 + GPR1*8+16], FPR2
+movntpd    [STR0 + GPR1*8+32], FPR3
+movntpd    [STR0 + GPR1*8+48], FPR4
diff --git a/bench/x86-64/triad_mem_sse_fma.ptt b/bench/x86-64/triad_mem_sse_fma.ptt
new file mode 100644
index 0000000..f96f194
--- /dev/null
+++ b/bench/x86-64/triad_mem_sse_fma.ptt
@@ -0,0 +1,27 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+DESC Double-precision triad A(i) = B(i) * C(i) + D(i), optimized for SSE FMAs and non-temporal stores
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 26
+LOOP 8
+movaps    FPR1, [STR1 + GPR1*8]
+movaps    FPR2, [STR1 + GPR1*8+16]
+movaps    FPR3, [STR1 + GPR1*8+32]
+movaps    FPR4, [STR1 + GPR1*8+48]
+movaps    FPR5, [STR2 + GPR1*8]
+movaps    FPR6, [STR2 + GPR1*8+16]
+movaps    FPR7, [STR2 + GPR1*8+32]
+movaps    FPR8, [STR2 + GPR1*8+48]
+vfmadd213pd FPR1, FPR5, [STR3 + GPR1*8]
+vfmadd213pd FPR2, FPR6, [STR3 + GPR1*8+16]
+vfmadd213pd FPR3, FPR7, [STR3 + GPR1*8+32]
+vfmadd213pd FPR4, FPR8, [STR3 + GPR1*8+48]
+movntpd   [STR0 + GPR1*8], FPR1
+movntpd   [STR0 + GPR1*8+16], FPR2
+movntpd   [STR0 + GPR1*8+32], FPR3
+movntpd   [STR0 + GPR1*8+48], FPR4
diff --git a/bench/x86-64/triad_sp.ptt b/bench/x86-64/triad_sp.ptt
new file mode 100644
index 0000000..17ba5f4
--- /dev/null
+++ b/bench/x86-64/triad_sp.ptt
@@ -0,0 +1,43 @@
+STREAMS 4
+TYPE SINGLE
+FLOPS 2
+BYTES 16
+DESC Single-precision triad A(i) = B(i) * C(i) + D(i), only scalar operations
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 35
+UOPS 58
+LOOP 8
+movss    FPR1, [STR1 + GPR1*4]
+movss    FPR2, [STR1 + GPR1*4 + 4]
+movss    FPR3, [STR1 + GPR1*4 + 8]
+movss    FPR4, [STR1 + GPR1*4 + 12]
+mulss    FPR1, [STR2 + GPR1*4]
+addss    FPR1, [STR3 + GPR1*4]
+movss    FPR5, [STR1 + GPR1*4 + 16]
+mulss    FPR2, [STR2 + GPR1*4 + 4]
+addss    FPR2, [STR3 + GPR1*4 + 4]
+movss    FPR6, [STR1 + GPR1*4 + 20]
+mulss    FPR3, [STR2 + GPR1*4 + 8]
+addss    FPR3, [STR3 + GPR1*4 + 8]
+movss    FPR7, [STR1 + GPR1*4 + 24]
+mulss    FPR4, [STR2 + GPR1*4 + 12]
+addss    FPR4, [STR3 + GPR1*4 + 12]
+movss    FPR8, [STR1 + GPR1*4 + 28]
+mulss    FPR5, [STR2 + GPR1*4 + 16]
+addss    FPR5, [STR3 + GPR1*4 + 16]
+mulss    FPR6, [STR2 + GPR1*4 + 20]
+addss    FPR6, [STR3 + GPR1*4 + 20]
+mulss    FPR7, [STR2 + GPR1*4 + 24]
+addss    FPR7, [STR3 + GPR1*4 + 24]
+mulss    FPR8, [STR2 + GPR1*4 + 28]
+addss    FPR8, [STR3 + GPR1*4 + 28]
+movss    [STR0 + GPR1*4], FPR1
+movss    [STR0 + GPR1*4 + 4], FPR2
+movss    [STR0 + GPR1*4 + 8], FPR3
+movss    [STR0 + GPR1*4 + 12], FPR4
+movss    [STR0 + GPR1*4 + 16], FPR5
+movss    [STR0 + GPR1*4 + 20], FPR6
+movss    [STR0 + GPR1*4 + 24], FPR7
+movss    [STR0 + GPR1*4 + 28], FPR8
diff --git a/bench/x86-64/triad_sp_avx.ptt b/bench/x86-64/triad_sp_avx.ptt
new file mode 100644
index 0000000..a977776
--- /dev/null
+++ b/bench/x86-64/triad_sp_avx.ptt
@@ -0,0 +1,18 @@
+STREAMS 4
+TYPE SINGLE
+FLOPS 2
+BYTES 16
+DESC Single-precision triad A(i) = B(i) * C(i) + D(i), optimized for AVX
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 9
+UOPS 9
+LOOP 8
+vmovaps ymm1, [STR1 + GPR1*4]
+vmovaps ymm2, [STR2 + GPR1*4]
+vmovaps ymm3, [STR3 + GPR1*4]
+vmulps  ymm0, ymm1, ymm2
+vaddps  ymm0, ymm0, ymm3
+vmovaps [STR0 + GPR1*4], ymm0
+
diff --git a/bench/x86-64/triad_sp_avx_fma.ptt b/bench/x86-64/triad_sp_avx_fma.ptt
new file mode 100644
index 0000000..4d78a58
--- /dev/null
+++ b/bench/x86-64/triad_sp_avx_fma.ptt
@@ -0,0 +1,16 @@
+STREAMS 4
+TYPE SINGLE
+FLOPS 2
+BYTES 16
+DESC Single-precision triad A(i) = B(i) * C(i) + D(i), optimized for AVX FMAs
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 7
+UOPS 8
+LOOP 8
+vmovaps ymm1, [STR1 + GPR1*4]
+vmovaps ymm2, [STR2 + GPR1*4]
+vfmadd213ps ymm1, ymm2, [STR3 + GPR1*4]
+vmovaps [STR0 + GPR1*4], ymm1
+
diff --git a/bench/x86-64/triad_sp_mem_avx.ptt b/bench/x86-64/triad_sp_mem_avx.ptt
new file mode 100644
index 0000000..2a04586
--- /dev/null
+++ b/bench/x86-64/triad_sp_mem_avx.ptt
@@ -0,0 +1,16 @@
+STREAMS 4
+TYPE SINGLE
+FLOPS 2
+BYTES 16
+DESC Single-precision triad A(i) = B(i) * C(i) + D(i), optimized for AVX and non-temporal stores
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 9
+UOPS 9
+LOOP 8
+vmovaps ymm1, [STR1 + GPR1*4]
+vmovaps ymm2, [STR2 + GPR1*4]
+vmulps  ymm0, ymm1, ymm2
+vaddps  ymm0, ymm0, [STR3 + GPR1*4]
+vmovntps   [STR0 + GPR1*4], ymm0
diff --git a/bench/x86-64/triad_sp_mem_avx_fma.ptt b/bench/x86-64/triad_sp_mem_avx_fma.ptt
new file mode 100644
index 0000000..72b5a8e
--- /dev/null
+++ b/bench/x86-64/triad_sp_mem_avx_fma.ptt
@@ -0,0 +1,16 @@
+STREAMS 4
+TYPE SINGLE
+FLOPS 2
+BYTES 16
+DESC Single-precision triad A(i) = B(i) * C(i) + D(i), optimized for AVX FMAs and non-temporal stores
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 7
+UOPS 8
+LOOP 8
+vmovaps ymm1, [STR1 + GPR1*4]
+vmovaps ymm2, [STR2 + GPR1*4]
+vfmadd213ps ymm1, ymm2, [STR3 + GPR1*4]
+vmovntps   [STR0 + GPR1*4], ymm1
+
diff --git a/bench/x86-64/triad_sp_mem_sse.ptt b/bench/x86-64/triad_sp_mem_sse.ptt
new file mode 100644
index 0000000..38e8abc
--- /dev/null
+++ b/bench/x86-64/triad_sp_mem_sse.ptt
@@ -0,0 +1,27 @@
+STREAMS 4
+TYPE SINGLE
+FLOPS 2
+BYTES 16
+DESC Single-precision triad A(i) = B(i) * C(i) + D(i), optimized for SSE and non-temporal stores
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 30
+LOOP 16
+movaps    FPR1, [STR1 + GPR1*4]
+movaps    FPR2, [STR1 + GPR1*4+16]
+movaps    FPR3, [STR1 + GPR1*4+32]
+movaps    FPR4, [STR1 + GPR1*4+48]
+mulps     FPR1, [STR2 + GPR1*4]
+addps     FPR1, [STR3 + GPR1*4]
+mulps     FPR2, [STR2 + GPR1*4+16]
+addps     FPR2, [STR3 + GPR1*4+16]
+mulps     FPR3, [STR2 + GPR1*4+32]
+addps     FPR3, [STR3 + GPR1*4+32]
+mulps     FPR4, [STR2 + GPR1*4+48]
+addps     FPR4, [STR3 + GPR1*4+48]
+movntps    [STR0 + GPR1*4], FPR1
+movntps    [STR0 + GPR1*4+16], FPR2
+movntps    [STR0 + GPR1*4+32], FPR3
+movntps    [STR0 + GPR1*4+48], FPR4
diff --git a/bench/x86-64/triad_sp_mem_sse_fma.ptt b/bench/x86-64/triad_sp_mem_sse_fma.ptt
new file mode 100644
index 0000000..6b3ba66
--- /dev/null
+++ b/bench/x86-64/triad_sp_mem_sse_fma.ptt
@@ -0,0 +1,27 @@
+STREAMS 4
+TYPE SINGLE
+FLOPS 2
+BYTES 16
+DESC Single-precision triad A(i) = B(i) * C(i) + D(i), optimized for SSE FMAs and non-temporal stores
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 30
+LOOP 16
+movaps    FPR1, [STR1 + GPR1*4]
+movaps    FPR2, [STR1 + GPR1*4+16]
+movaps    FPR3, [STR1 + GPR1*4+32]
+movaps    FPR4, [STR1 + GPR1*4+48]
+movaps    FPR5, [STR2 + GPR1*4]
+movaps    FPR6, [STR2 + GPR1*4+16]
+movaps    FPR7, [STR2 + GPR1*4+32]
+movaps    FPR8, [STR2 + GPR1*4+48]
+vfmadd213ps FPR1, FPR5, [STR3 + GPR1*4]
+vfmadd213ps FPR2, FPR6, [STR3 + GPR1*4+16]
+vfmadd213ps FPR3, FPR7, [STR3 + GPR1*4+32]
+vfmadd213ps FPR4, FPR8, [STR3 + GPR1*4+48]
+movntps   [STR0 + GPR1*4], FPR1
+movntps    [STR0 + GPR1*4+16], FPR2
+movntps    [STR0 + GPR1*4+32], FPR3
+movntps    [STR0 + GPR1*4+48], FPR4
diff --git a/bench/x86-64/triad_sp_sse.ptt b/bench/x86-64/triad_sp_sse.ptt
new file mode 100644
index 0000000..deba3c5
--- /dev/null
+++ b/bench/x86-64/triad_sp_sse.ptt
@@ -0,0 +1,27 @@
+STREAMS 4
+TYPE SINGLE
+FLOPS 2
+BYTES 16
+DESC Single-precision triad A(i) = B(i) * C(i) + D(i), optimized for SSE
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 30
+LOOP 16
+movaps    FPR1, [STR1 + GPR1*4]
+movaps    FPR2, [STR1 + GPR1*4+16]
+movaps    FPR3, [STR1 + GPR1*4+32]
+movaps    FPR4, [STR1 + GPR1*4+48]
+mulps     FPR1, [STR2 + GPR1*4]
+addps     FPR1, [STR3 + GPR1*4]
+mulps     FPR2, [STR2 + GPR1*4+16]
+addps     FPR2, [STR3 + GPR1*4+16]
+mulps     FPR3, [STR2 + GPR1*4+32]
+addps     FPR3, [STR3 + GPR1*4+32]
+mulps     FPR4, [STR2 + GPR1*4+48]
+addps     FPR4, [STR3 + GPR1*4+48]
+movaps    [STR0 + GPR1*4], FPR1
+movaps    [STR0 + GPR1*4+16], FPR2
+movaps    [STR0 + GPR1*4+32], FPR3
+movaps    [STR0 + GPR1*4+48], FPR4
diff --git a/bench/x86-64/triad_sp_sse_fma.ptt b/bench/x86-64/triad_sp_sse_fma.ptt
new file mode 100644
index 0000000..f2147da
--- /dev/null
+++ b/bench/x86-64/triad_sp_sse_fma.ptt
@@ -0,0 +1,27 @@
+STREAMS 4
+TYPE SINGLE
+FLOPS 2
+BYTES 16
+DESC Single-precision triad A(i) = B(i) * C(i) + D(i), optimized for SSE FMAs
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 30
+LOOP 16
+movaps    FPR1, [STR1 + GPR1*4]
+movaps    FPR2, [STR1 + GPR1*4+16]
+movaps    FPR3, [STR1 + GPR1*4+32]
+movaps    FPR4, [STR1 + GPR1*4+48]
+movaps    FPR5, [STR2 + GPR1*4]
+movaps    FPR6, [STR2 + GPR1*4+16]
+movaps    FPR7, [STR2 + GPR1*4+32]
+movaps    FPR8, [STR2 + GPR1*4+48]
+vfmadd213ps FPR1, FPR5, [STR3 + GPR1*4]
+vfmadd213ps FPR2, FPR6, [STR3 + GPR1*4+16]
+vfmadd213ps FPR3, FPR7, [STR3 + GPR1*4+32]
+vfmadd213ps FPR4, FPR8, [STR3 + GPR1*4+48]
+movaps    [STR0 + GPR1*4], FPR1
+movaps    [STR0 + GPR1*4+16], FPR2
+movaps    [STR0 + GPR1*4+32], FPR3
+movaps    [STR0 + GPR1*4+48], FPR4
diff --git a/bench/x86-64/triad_split.ptt b/bench/x86-64/triad_split.ptt
deleted file mode 100644
index 7b30e47..0000000
--- a/bench/x86-64/triad_split.ptt
+++ /dev/null
@@ -1,30 +0,0 @@
-STREAMS 4
-TYPE DOUBLE
-FLOPS 2
-BYTES 32
-LOOP 8
-movapd    FPR1, [STR1 + GPR1*8]
-movapd    FPR2, [STR1 + GPR1*8+16]
-movapd    FPR3, [STR1 + GPR1*8+32]
-movapd    FPR4, [STR1 + GPR1*8+48]
-movapd    FPR5, [STR2 + GPR1*8]
-movapd    FPR6, [STR3 + GPR1*8]
-movapd    FPR7, [STR2 + GPR1*8+16]
-movapd    FPR8, [STR3 + GPR1*8+16]
-movapd    FPR9, [STR2 + GPR1*8+32]
-movapd    FPR10, [STR3 + GPR1*8+32]
-movapd    FPR11, [STR2 + GPR1*8+48]
-movapd    FPR12, [STR3 + GPR1*8+48]
-mulpd     FPR1, FPR5
-addpd     FPR1, FPR6
-mulpd     FPR2, FPR7
-addpd     FPR2, FPR8
-mulpd     FPR3, FPR9
-addpd     FPR3, FPR10
-mulpd     FPR4, FPR11
-addpd     FPR4, FPR12
-movapd    [STR0 + GPR1*8], FPR1
-movapd    [STR0 + GPR1*8+16], FPR2
-movapd    [STR0 + GPR1*8+32], FPR3
-movapd    [STR0 + GPR1*8+48], FPR4
-
diff --git a/bench/x86-64/triad_sse.ptt b/bench/x86-64/triad_sse.ptt
new file mode 100644
index 0000000..11aabe3
--- /dev/null
+++ b/bench/x86-64/triad_sse.ptt
@@ -0,0 +1,28 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+DESC Double-precision triad A(i) = B(i) * C(i) + D(i), optimized for SSE
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 30
+LOOP 8
+movaps    FPR1, [STR1 + GPR1*8]
+movaps    FPR2, [STR1 + GPR1*8+16]
+movaps    FPR3, [STR1 + GPR1*8+32]
+movaps    FPR4, [STR1 + GPR1*8+48]
+mulpd     FPR1, [STR2 + GPR1*8]
+addpd     FPR1, [STR3 + GPR1*8]
+mulpd     FPR2, [STR2 + GPR1*8+16]
+addpd     FPR2, [STR3 + GPR1*8+16]
+mulpd     FPR3, [STR2 + GPR1*8+32]
+addpd     FPR3, [STR3 + GPR1*8+32]
+mulpd     FPR4, [STR2 + GPR1*8+48]
+addpd     FPR4, [STR3 + GPR1*8+48]
+movaps    [STR0 + GPR1*8], FPR1
+movaps    [STR0 + GPR1*8+16], FPR2
+movaps    [STR0 + GPR1*8+32], FPR3
+movaps    [STR0 + GPR1*8+48], FPR4
+
diff --git a/bench/x86-64/triad_sse_fma.ptt b/bench/x86-64/triad_sse_fma.ptt
new file mode 100644
index 0000000..d6822fa
--- /dev/null
+++ b/bench/x86-64/triad_sse_fma.ptt
@@ -0,0 +1,27 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+DESC Double-precision triad A(i) = B(i) * C(i) + D(i), optimized for SSE FMAs
+LOADS 3
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 19
+UOPS 26
+LOOP 8
+movaps    FPR1, [STR1 + GPR1*8]
+movaps    FPR2, [STR1 + GPR1*8+16]
+movaps    FPR3, [STR1 + GPR1*8+32]
+movaps    FPR4, [STR1 + GPR1*8+48]
+movaps    FPR5, [STR2 + GPR1*8]
+movaps    FPR6, [STR2 + GPR1*8+16]
+movaps    FPR7, [STR2 + GPR1*8+32]
+movaps    FPR8, [STR2 + GPR1*8+48]
+vfmadd213pd FPR1, FPR5, [STR3 + GPR1*8]
+vfmadd213pd FPR2, FPR6, [STR3 + GPR1*8+16]
+vfmadd213pd FPR3, FPR7, [STR3 + GPR1*8+32]
+vfmadd213pd FPR4, FPR8, [STR3 + GPR1*8+48]
+movaps    [STR0 + GPR1*8], FPR1
+movaps    [STR0 + GPR1*8+16], FPR2
+movaps    [STR0 + GPR1*8+32], FPR3
+movaps    [STR0 + GPR1*8+48], FPR4
diff --git a/bench/x86-64/update.ptt b/bench/x86-64/update.ptt
index ac1129b..422981e 100644
--- a/bench/x86-64/update.ptt
+++ b/bench/x86-64/update.ptt
@@ -2,14 +2,20 @@ STREAMS 1
 TYPE DOUBLE
 FLOPS 0
 BYTES 16
-LOOP 8
-movaps    FPR1, [STR0 + GPR1 * 8]
-movaps    [STR0 + GPR1 * 8]     , FPR1
-movaps    FPR2, [STR0 + GPR1 * 8 + 16]
-movaps    FPR3, [STR0 + GPR1 * 8 + 32]
-movaps    FPR4, [STR0 + GPR1 * 8 + 48]
-movaps    [STR0 + GPR1 * 8 + 16], FPR2
-movaps    [STR0 + GPR1 * 8 + 32], FPR3
-movaps    [STR0 + GPR1 * 8 + 48], FPR4
+DESC Double-precision vector update, only scalar operations
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
+LOOP 4
+movsd    FPR1, [STR0 + GPR1 * 8]
+movsd    [STR0 + GPR1 * 8]     , FPR1
+movsd    FPR2, [STR0 + GPR1 * 8 + 8]
+movsd    FPR3, [STR0 + GPR1 * 8 + 16]
+movsd    FPR4, [STR0 + GPR1 * 8 + 24]
+movsd    [STR0 + GPR1 * 8 + 8], FPR2
+movsd    [STR0 + GPR1 * 8 + 16], FPR3
+movsd    [STR0 + GPR1 * 8 + 24], FPR4
 
 
diff --git a/bench/x86-64/update_avx.ptt b/bench/x86-64/update_avx.ptt
index 2e9178e..eeca2fb 100644
--- a/bench/x86-64/update_avx.ptt
+++ b/bench/x86-64/update_avx.ptt
@@ -2,6 +2,12 @@ STREAMS 1
 TYPE DOUBLE
 FLOPS 0
 BYTES 16
+DESC Double-precision vector update, optimized for AVX
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
 LOOP 16
 vmovaps    ymm1, [STR0 + GPR1 * 8]
 vmovaps    [STR0 + GPR1 * 8]     , ymm1
diff --git a/bench/x86-64/update_plain.ptt b/bench/x86-64/update_plain.ptt
deleted file mode 100644
index b5a3e4a..0000000
--- a/bench/x86-64/update_plain.ptt
+++ /dev/null
@@ -1,15 +0,0 @@
-STREAMS 1
-TYPE DOUBLE
-FLOPS 0
-BYTES 16
-LOOP 4
-movss    FPR1, [STR0 + GPR1 * 8]
-movss    [STR0 + GPR1 * 8]     , FPR1
-movss    FPR2, [STR0 + GPR1 * 8 + 8]
-movss    FPR3, [STR0 + GPR1 * 8 + 16]
-movss    FPR4, [STR0 + GPR1 * 8 + 24]
-movss    [STR0 + GPR1 * 8 + 8], FPR2
-movss    [STR0 + GPR1 * 8 + 16], FPR3
-movss    [STR0 + GPR1 * 8 + 24], FPR4
-
-
diff --git a/bench/x86-64/update_sse.ptt b/bench/x86-64/update_sse.ptt
index ac1129b..fe1be1d 100644
--- a/bench/x86-64/update_sse.ptt
+++ b/bench/x86-64/update_sse.ptt
@@ -2,6 +2,12 @@ STREAMS 1
 TYPE DOUBLE
 FLOPS 0
 BYTES 16
+DESC Double-precision vector update, optimized for SSE
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
 LOOP 8
 movaps    FPR1, [STR0 + GPR1 * 8]
 movaps    [STR0 + GPR1 * 8]     , FPR1
diff --git a/bench/x86-64/vtriad_avx.ptt b/bench/x86-64/vtriad_avx.ptt
deleted file mode 100644
index 4a542d2..0000000
--- a/bench/x86-64/vtriad_avx.ptt
+++ /dev/null
@@ -1,22 +0,0 @@
-STREAMS 4
-TYPE DOUBLE
-FLOPS 2
-BYTES 32
-LOOP 16
-vmovaps    ymm1, [STR1 + GPR1*8]
-vmovaps    ymm2, [STR1 + GPR1*8+32]
-vmovaps    ymm3, [STR1 + GPR1*8+64]
-vmovaps    ymm4, [STR1 + GPR1*8+96]
-vmulpd    ymm1, ymm1, [STR2 + GPR1*8]
-vaddpd    ymm1, ymm1, [STR3 + GPR1*8]
-vmulpd    ymm2, ymm2, [STR2 + GPR1*8+32]
-vaddpd    ymm2, ymm2, [STR3 + GPR1*8+32]
-vmulpd    ymm3, ymm3, [STR2 + GPR1*8+64]
-vaddpd    ymm3, ymm3, [STR3 + GPR1*8+64]
-vmulpd    ymm4, ymm4, [STR2 + GPR1*8+96]
-vaddpd    ymm4, ymm4, [STR3 + GPR1*8+96]
-vmovaps    [STR0 + GPR1*8], ymm1
-vmovaps    [STR0 + GPR1*8+32], ymm2
-vmovaps    [STR0 + GPR1*8+64], ymm3
-vmovaps    [STR0 + GPR1*8+96], ymm4
-
diff --git a/bench/x86-64/vtriad_mem_avx.ptt b/bench/x86-64/vtriad_mem_avx.ptt
deleted file mode 100644
index 315ef14..0000000
--- a/bench/x86-64/vtriad_mem_avx.ptt
+++ /dev/null
@@ -1,10 +0,0 @@
-STREAMS 4
-TYPE DOUBLE
-FLOPS 2
-BYTES 32
-LOOP 4
-vmovaps    ymm1, [STR1 + GPR1*8]
-vmulpd     ymm1, ymm1, [STR2 + GPR1*8]
-vaddpd     ymm1, ymm1, [STR3 + GPR1*8]
-vmovntpd   [STR0 + GPR1*8], ymm1
-
diff --git a/bench/x86-64/vtriad_mem_sse.ptt b/bench/x86-64/vtriad_mem_sse.ptt
deleted file mode 100644
index 7c24748..0000000
--- a/bench/x86-64/vtriad_mem_sse.ptt
+++ /dev/null
@@ -1,10 +0,0 @@
-STREAMS 4
-TYPE DOUBLE
-FLOPS 2
-BYTES 32
-LOOP 2
-movaps    FPR1, [STR1 + GPR1*8]
-mulpd     FPR1, [STR2 + GPR1*8]
-addpd     FPR1, [STR3 + GPR1*8]
-movntpd   [STR0 + GPR1*8], FPR1
-
diff --git a/bench/x86-64/vtriad_plain.ptt b/bench/x86-64/vtriad_plain.ptt
deleted file mode 100644
index 120331c..0000000
--- a/bench/x86-64/vtriad_plain.ptt
+++ /dev/null
@@ -1,22 +0,0 @@
-STREAMS 4
-TYPE DOUBLE
-FLOPS 2
-BYTES 32
-LOOP 4
-movsd    FPR1, [STR1 + GPR1*8]
-movsd    FPR2, [STR1 + GPR1*8+8]
-movsd    FPR3, [STR1 + GPR1*8+16]
-movss    FPR4, [STR1 + GPR1*8+24]
-mulsd     FPR1, [STR2 + GPR1*8]
-addsd     FPR1, [STR3 + GPR1*8]
-mulsd     FPR2, [STR2 + GPR1*8+8]
-addsd     FPR2, [STR3 + GPR1*8+8]
-mulsd     FPR3, [STR2 + GPR1*8+16]
-addsd     FPR3, [STR3 + GPR1*8+16]
-mulsd     FPR4, [STR2 + GPR1*8+24]
-addsd     FPR4, [STR3 + GPR1*8+24]
-movsd    [STR0 + GPR1*8], FPR1
-movsd    [STR0 + GPR1*8+8], FPR2
-movsd    [STR0 + GPR1*8+16], FPR3
-movsd    [STR0 + GPR1*8+24], FPR4
-
diff --git a/bench/x86-64/vtriad_sse.ptt b/bench/x86-64/vtriad_sse.ptt
deleted file mode 100644
index d521aa0..0000000
--- a/bench/x86-64/vtriad_sse.ptt
+++ /dev/null
@@ -1,22 +0,0 @@
-STREAMS 4
-TYPE DOUBLE
-FLOPS 2
-BYTES 32
-LOOP 8
-movaps    FPR1, [STR1 + GPR1*8]
-movaps    FPR2, [STR1 + GPR1*8+16]
-movaps    FPR3, [STR1 + GPR1*8+32]
-movaps    FPR4, [STR1 + GPR1*8+48]
-mulpd     FPR1, [STR2 + GPR1*8]
-addpd     FPR1, [STR3 + GPR1*8]
-mulpd     FPR2, [STR2 + GPR1*8+16]
-addpd     FPR2, [STR3 + GPR1*8+16]
-mulpd     FPR3, [STR2 + GPR1*8+32]
-addpd     FPR3, [STR3 + GPR1*8+32]
-mulpd     FPR4, [STR2 + GPR1*8+48]
-addpd     FPR4, [STR3 + GPR1*8+48]
-movaps    [STR0 + GPR1*8], FPR1
-movaps    [STR0 + GPR1*8+16], FPR2
-movaps    [STR0 + GPR1*8+32], FPR3
-movaps    [STR0 + GPR1*8+48], FPR4
-
diff --git a/bench/x86/copy.ptt b/bench/x86/copy.ptt
index 111d38b..737b087 100644
--- a/bench/x86/copy.ptt
+++ b/bench/x86/copy.ptt
@@ -2,17 +2,23 @@ STREAMS 2
 TYPE DOUBLE
 FLOPS 0
 BYTES 16
+DESC Double-precision vector copy, only scalar operations
+LOADS 1
+STORES 1
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 14
 mov    GPR6, ARG1
 mov    GPR2, STR0
 mov    GPR3, STR1
-LOOP 8
-movaps    FPR1, [GPR2 + GPR1 * 8]
-movaps    FPR2, [GPR2 + GPR1 * 8 + 16]
-movaps    FPR3, [GPR2 + GPR1 * 8 + 32]
-movaps    FPR4, [GPR2 + GPR1 * 8 + 48]
-movaps    [GPR3 + GPR1 * 8]     , FPR1
-movaps    [GPR3 + GPR1 * 8 + 16], FPR2
-movaps    [GPR3 + GPR1 * 8 + 32], FPR3
-movaps    [GPR3 + GPR1 * 8 + 48], FPR4
+LOOP 4
+movsd     FPR1, [GPR2 + GPR1 * 8]
+movsd     FPR2, [GPR2 + GPR1 * 8 + 8]
+movsd     FPR3, [GPR2 + GPR1 * 8 + 16]
+movsd     FPR4, [GPR2 + GPR1 * 8 + 24]
+movsd     [GPR3 + GPR1 * 8]     , FPR1
+movsd     [GPR3 + GPR1 * 8 + 8], FPR2
+movsd     [GPR3 + GPR1 * 8 + 16], FPR3
+movsd     [GPR3 + GPR1 * 8 + 24], FPR4
 
 
diff --git a/bench/x86/load.ptt b/bench/x86/load.ptt
index cf001a4..473d8aa 100644
--- a/bench/x86/load.ptt
+++ b/bench/x86/load.ptt
@@ -2,12 +2,21 @@ STREAMS 1
 TYPE DOUBLE
 FLOPS 0
 BYTES 8
+DESC Double-precision load, only scalar operations
+LOADS 1
+STORES 0
+INSTR_CONST 16
+INSTR_LOOP 11
+UOPS 10
 mov GPR6, ARG1
 mov GPR2, STR0
 LOOP 8
-movaps    FPR1, [GPR2 + GPR1 * 8]
-movaps    FPR2, [GPR2 + GPR1 * 8 + 16]
-movaps    FPR3, [GPR2 + GPR1 * 8 + 32]
-movaps    FPR4, [GPR2 + GPR1 * 8 + 48]
-
+movsd    FPR1, [GPR2 + GPR1 * 8]
+movsd    FPR2, [GPR2 + GPR1 * 8 + 8]
+movsd    FPR3, [GPR2 + GPR1 * 8 + 16]
+movsd    FPR4, [GPR2 + GPR1 * 8 + 24]
+movsd    FPR5, [GPR2 + GPR1 * 8 + 32]
+movsd    FPR6, [GPR2 + GPR1 * 8 + 40]
+movsd    FPR7, [GPR2 + GPR1 * 8 + 48]
+movsd    FPR8, [GPR2 + GPR1 * 8 + 56]
 
diff --git a/bench/x86/store.ptt b/bench/x86/store.ptt
index 1cf15da..07ed59a 100644
--- a/bench/x86/store.ptt
+++ b/bench/x86/store.ptt
@@ -1,16 +1,22 @@
 STREAMS 1
 TYPE DOUBLE
 FLOPS 0
+DESC Double-precision store, only scalar operations
 BYTES 8
-movaps FPR1, [SCALAR]
-movaps FPR2, [SCALAR]
-movaps FPR3, [SCALAR]
-movaps FPR4, [SCALAR]
+LOADS 0
+STORES 1
+INSTR_CONST 22
+INSTR_LOOP 7
+UOPS 10
+movsd FPR1, [SCALAR]
+movsd FPR2, [SCALAR]
+movsd FPR3, [SCALAR]
+movsd FPR4, [SCALAR]
 mov    GPR6, ARG1
 mov  GPR2, STR0
-LOOP 8
-movaps    [GPR2 + GPR1 * 8]     , FPR1
-movaps    [GPR2 + GPR1 * 8 + 16], FPR2
-movaps    [GPR2 + GPR1 * 8 + 32], FPR3
-movaps    [GPR2 + GPR1 * 8 + 48], FPR4
+LOOP 4
+movsd    [GPR2 + GPR1 * 8]     , FPR1
+movsd    [GPR2 + GPR1 * 8 + 8], FPR2
+movsd    [GPR2 + GPR1 * 8 + 16], FPR3
+movsd    [GPR2 + GPR1 * 8 + 24], FPR4
 
diff --git a/bench/x86/stream.ptt b/bench/x86/stream.ptt
index bab4ecb..067a06c 100644
--- a/bench/x86/stream.ptt
+++ b/bench/x86/stream.ptt
@@ -2,26 +2,32 @@ STREAMS 3
 TYPE DOUBLE
 FLOPS 2
 BYTES 24
-movaps FPR5, [SCALAR]
-mov    GPR6, ARG1
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), only scalar operations
+LOADS 2
+STORES 1
+INSTR_CONST 21
+INSTR_LOOP 19
+UOPS 26
+movsd FPR5, [SCALAR]
+mov  GPR6, ARG1
 mov  GPR2, STR0
 mov  GPR3, STR1
 mov  GPR4, STR2
 LOOP 8
-movaps    FPR1, [GPR3 + GPR1*8]
-movaps    FPR2, [GPR3 + GPR1*8+16]
-movaps    FPR3, [GPR3 + GPR1*8+32]
-movaps    FPR4, [GPR3 + GPR1*8+48]
-mulpd     FPR1, FPR5
-addpd     FPR1, [GPR4 + GPR1*8]
-mulpd     FPR2, FPR5
-addpd     FPR2, [GPR4 + GPR1*8+16]
-mulpd     FPR3, FPR5
-addpd     FPR3, [GPR4 + GPR1*8+32]
-mulpd     FPR4, FPR5
-addpd     FPR4, [GPR4 + GPR1*8+48]
-movaps    [GPR2 + GPR1*8]   , FPR1
-movaps    [GPR2 + GPR1*8+16], FPR2
-movaps    [GPR2 + GPR1*8+32], FPR3
-movaps    [GPR2 + GPR1*8+48], FPR4
+movsd    FPR1, [GPR3 + GPR1*8]
+movsd    FPR2, [GPR3 + GPR1*8+8]
+movsd    FPR3, [GPR3 + GPR1*8+16]
+movsd    FPR4, [GPR3 + GPR1*8+24]
+mulsd    FPR1, FPR5
+addsd    FPR1, [GPR4 + GPR1*8]
+mulsd    FPR2, FPR5
+addsd    FPR2, [GPR4 + GPR1*8+8]
+mulsd    FPR3, FPR5
+addsd    FPR3, [GPR4 + GPR1*8+16]
+mulsd    FPR4, FPR5
+addsd    FPR4, [GPR4 + GPR1*8+24]
+movsd    [GPR2 + GPR1*8],    FPR1
+movsd    [GPR2 + GPR1*8+8],  FPR2
+movsd    [GPR2 + GPR1*8+16], FPR3
+movsd    [GPR2 + GPR1*8+24], FPR4
 
diff --git a/config.mk b/config.mk
index 2c3f3be..46fbe78 100644
--- a/config.mk
+++ b/config.mk
@@ -1,6 +1,6 @@
 # Please have a look in INSTALL and the WIKI for details on
 # configuration options setup steps.
-# supported: GCC, GCCX86, MIC (ICC)
+# supported: GCC, CLANG, ICC, MIC (ICC), GCCX86 (for 32bit systems)
 COMPILER = GCC#NO SPACE
 
 # Define the color of the likwid-pin output
@@ -10,12 +10,41 @@ COLOR = BLUE#NO SPACE
 
 # Path were to install likwid
 PREFIX = /usr/local#NO SPACE
+
+#################################################################
+# Common users do not need to change values below this comment! #
+#################################################################
+
 MANPREFIX = $(PREFIX)/man#NO SPACE
+BINPREFIX = $(PREFIX)/bin#NO SPACE
+LIBPREFIX = $(PREFIX)/lib#NO SPACE
+
+# These paths are hardcoded into executables and libraries. Usually
+# they'll be the same as above, but package maintainers may want to
+# distinguish between the image directories and the final install
+# target.
+# Keep in mind that the access and setFreq daemons need elevated
+# privileges, which may be lost when copying the files to
+# the INSTALLED_PREFIX
+INSTALLED_PREFIX = $(PREFIX)#NO SPACE
+INSTALLED_BINPREFIX = $(INSTALLED_PREFIX)/bin#NO SPACE
+INSTALLED_LIBPREFIX = $(INSTALLED_PREFIX)/lib#NO SPACE
+
+# chown installed tools to this user/group
+# if you change anything here, make sure that the user/group can access
+# the MSR devices and (on Intel) the PCI devices.
+INSTALL_CHOWN = -g root -o root
 
 # For the daemon based secure msr/pci access configure
 # the absolute path to the msr daemon executable.
-# Usually you can leave this to the default.
+# Usually $(INSTALLED_PREFIX)/sbin/likwid-accessD
 ACCESSDAEMON = $(PREFIX)/sbin/likwid-accessD#NO SPACE
+INSTALLED_ACCESSDAEMON = $(INSTALLED_PREFIX)/sbin/likwid-accessD#NO SPACE
+
+# Build the accessDaemon. Have a look in the WIKI for details.
+BUILDDAEMON = true#NO SPACE
+# Build the setFrequencies tool.
+BUILDFREQ = true#NO SPACE
 
 # Set the default mode for MSR access.
 # This can usually be overriden on the commandline.
@@ -23,7 +52,7 @@ ACCESSDAEMON = $(PREFIX)/sbin/likwid-accessD#NO SPACE
 ACCESSMODE = accessdaemon#NO SPACE
 
 # Change to true to a build shared library instead of a static one
-SHARED_LIBRARY = false#NO SPACE
+SHARED_LIBRARY = true#NO SPACE
 
 # Build Fortran90 module interface for marker API. Adopt Fortran compiler
 # in ./make/include_<COMPILER>.mk if necessary. Default: ifort .
@@ -32,17 +61,25 @@ FORTRAN_INTERFACE = false#NO SPACE
 # Instrument likwid-bench for use with likwid-perfctr
 INSTRUMENT_BENCH = false#NO SPACE
 
-# Usually you do not need to edit below
+# Use recommended Portable Hardware Locality (hwloc) instead of CPUID
+USE_HWLOC = true#NO SPACE
+
+# Build LIKWID with debug flags
+DEBUG = false#NO SPACE
+
+# Basic configuration (compiled into library, can be changed by creating
+# a proper config file at CFG_FILE_PATH)
 MAX_NUM_THREADS = 263
-MAX_NUM_NODES = 4
-HASH_TABLE_SIZE = 20
+MAX_NUM_NODES = 64
 CFG_FILE_PATH = /etc/likwid.cfg
+TOPO_FILE_PATH = /etc/likwid_topo.cfg
 
 # Versioning Information
-VERSION = 3
+VERSION = 4
 RELEASE = 1
-DATE    = 12.2.2014
-
-LIBLIKWIDPIN = $(abspath $(PREFIX)/lib/liblikwidpin.so)
-LIKWIDFILTERPATH = $(abspath $(PREFIX)/share/likwid)
+DATE    = 19.05.2016
 
+RPATHS = -Wl,-rpath=$(INSTALLED_LIBPREFIX)
+LIBLIKWIDPIN = $(abspath $(INSTALLED_PREFIX)/lib/liblikwidpin.so.$(VERSION).$(RELEASE))
+LIKWIDFILTERPATH = $(abspath $(INSTALLED_PREFIX)/share/likwid/filter)
+LIKWIDGROUPPATH = $(abspath $(INSTALLED_PREFIX)/share/likwid/perfgroups)
diff --git a/doc/Doxyfile b/doc/Doxyfile
new file mode 100644
index 0000000..dbfba97
--- /dev/null
+++ b/doc/Doxyfile
@@ -0,0 +1,1781 @@
+# Doxyfile 1.7.6.1
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a hash (#) is considered a comment and will be ignored.
+# The format is:
+#       TAG = value [value, ...]
+# For lists items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or sequence of words) that should
+# identify the project. Note that if you do not use Doxywizard you need
+# to put quotes around the project name if it contains spaces.
+
+PROJECT_NAME           = "LIKWID"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER         =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer
+# a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          =
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO           = doc/logo.png
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       =
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 8
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                =
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding
+# "class=itcl::class" will allow you to use the command class in the
+# itcl::class meaning.
+
+TCL_SUBST              =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given extension.
+# Doxygen has a built-in mapping, but you can override or extend it using this
+# tag. The format is ext=language, where ext is a file extension, and language
+# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C,
+# C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make
+# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
+# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions
+# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); v.s.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and
+# unions are shown inside the group in which they are included (e.g. using
+# @ingroup) instead of on a separate page (for HTML and Man pages) or
+# section (for LaTeX and RTF).
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and
+# unions with only public data fields will be shown inline in the documentation
+# of the scope in which they are defined (i.e. file, namespace, or group
+# documentation), provided this scope is documented. If set to NO (the default),
+# structs, classes, and unions are shown on a separate page (for HTML and Man
+# pages) or section (for LaTeX and RTF).
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+SYMBOL_CACHE_SIZE      = 0
+
+# Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be
+# set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given
+# their name and scope. Since this can be an expensive process and often the
+# same symbol appears multiple times in the code, doxygen keeps a cache of
+# pre-resolved symbols. If the cache is too small doxygen will become slower.
+# If the cache is too large, memory is wasted. The cache size is given by this
+# formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+LOOKUP_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC         = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = NO
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
+# will list include files with double quotes in the documentation
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS        = YES
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to
+# do proper type resolution of all parameters of a function it will reject a
+# match between the prototype and the implementation of a member function even
+# if there is only one candidate or it is obvious which candidate to choose
+# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen
+# will still accept a match between prototype and implementation in such cases.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES       = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE            =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files
+# containing the references data. This must be a list of .bib files. The
+# .bib extension is automatically appended if omitted. Using this command
+# requires the bibtex tool to be installed. See also
+# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style
+# of the bibliography can be controlled using LATEX_BIB_STYLE. To use this
+# feature you need bibtex and perl available in the search path.
+
+CITE_BIB_FILES         =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS               = NO
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT                  = ./src/includes/likwid.h ./doc/likwid-doxygen.md ./src/includes/perfmon_types.h ./src/includes/topology_types.h  ./src/includes/power_types.h ./src/includes/tree_types.h ./doc/archs/ ./doc/lua-doxygen.md ./doc/applications/ ./doc/likwid.cfg.md ./src/likwid.f90
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS          = *.md
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE              = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE                =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        = AccessDataRecord
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH           = ./examples
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS       =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any)
+# and it is also possible to disable source filtering for a specific pattern
+# using *.ext= (so without naming a filter). This option only has effect when
+# FILTER_SOURCE_FILES is enabled.
+
+FILTER_SOURCE_PATTERNS =
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.
+# Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen's
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = YES
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT            = doc/html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header. Note that when using a custom header you are responsible
+# for the proper inclusion of any scripts and style sheets that doxygen
+# needs, which is dependent on the configuration options used.
+# It is advised to generate a default header using "doxygen -w html
+# header.html footer.html stylesheet.css YourConfigFile" and then modify
+# that header. Note that the header is subject to change so you typically
+# have to redo this when upgrading to a newer version of doxygen or when
+# changing the value of configuration settings such as GENERATE_TREEVIEW!
+
+HTML_HEADER            =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER            =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# style sheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET        =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that
+# the files will be copied as-is; there are no commands or markers available.
+
+HTML_EXTRA_FILES       =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
+# Doxygen will adjust the colors in the style sheet and background images
+# according to this color. Hue is specified as an angle on a colorwheel,
+# see http://en.wikipedia.org/wiki/Hue for more information.
+# For instance the value 0 represents red, 60 is yellow, 120 is green,
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
+# The allowed range is 0 to 359.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
+# the colors in the HTML output. For a value of 0 the output will use
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
+# the luminance component of the colors in the HTML output. Values below
+# 100 gradually make the output lighter, whereas values above 100 make
+# the output darker. The value divided by 100 is the actual gamma applied,
+# so 80 represents a gamma of 0.8, the value 220 represents a gamma of 2.2,
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP         = YES
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS     = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET        = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP      = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE               =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION           =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI           = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING     =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
+# that can be used as input for Qt's qhelpgenerator to generate a
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the virtual folder to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">
+# Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's
+# filter section matches.
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">
+# Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs)
+# at top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it. Since the tabs have the same information as the
+# navigation tree you can set this option to YES if you already set
+# GENERATE_TREEVIEW to YES.
+
+DISABLE_INDEX          = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+# Since the tree basically has the same information as the tab index you
+# could consider setting DISABLE_INDEX to YES when enabling this option.
+
+GENERATE_TREEVIEW      = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values
+# (range [0,1..20]) that doxygen will group on one line in the generated HTML
+# documentation. Note that a value of 0 will completely suppress the enum
+# values from appearing in the overview section.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list.
+
+USE_INLINE_TREES       = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH         = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want formulas to look prettier in the HTML
+# output. When enabled you also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX            = NO
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the
+# mathjax.org site, so you can quickly see the result without installing
+# MathJax, but it is strongly recommended to install a local copy of MathJax
+# before deployment.
+
+MATHJAX_RELPATH        = http://www.mathjax.org/mathjax
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax extension
+# names that should be enabled during MathJax rendering.
+
+MATHJAX_EXTENSIONS     =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantages are that it is more difficult to setup
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX         = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE             = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER           =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for
+# the generated latex document. The footer should contain everything after
+# the last chapter. If it is left blank doxygen will generate a
+# standard footer. Notice: only use this tag if you know what you are doing!
+
+LATEX_FOOTER           =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE        = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES     = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE      = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See
+# http://en.wikipedia.org/wiki/BibTeX for more info.
+
+LATEX_BIB_STYLE        = plain
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS         = NO
+
+# Load style sheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE    =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA             =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD                =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader.
+# This is useful
+# if you want to understand what is going on.
+# On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION        = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# pointed to by INCLUDE_PATH will be searched when a #include is found.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED             =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition that
+# overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS        = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT               = YES
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is
+# allowed to run in parallel. When set to 0 (the default) doxygen will
+# base this on the number of processors available in the system. You can set it
+# explicitly to a value larger than 0 to get control over the balance
+# between CPU load and processing speed.
+
+DOT_NUM_THREADS        = 0
+
+# By default doxygen will use the Helvetica font for all dot files that
+# doxygen generates. When you want a differently looking font you can specify
+# the font name using DOT_FONTNAME. You need to make sure dot is able to find
+# the font, which can be done by putting it in a standard location or by setting
+# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the
+# directory containing the font.
+
+DOT_FONTNAME           = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the Helvetica font.
+# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to
+# set the path where dot can find it.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct group dependencies.
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK               = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH          = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are svg, png, jpg, or gif.
+# If left blank png will be used. If you choose svg you need to set
+# HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible in IE 9+ (other browsers do not have this requirement).
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+# Note that this requires a modern browser other than Internet Explorer.
+# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you
+# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible. Older versions of IE do not have SVG support.
+
+INTERACTIVE_SVG        = NO
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS           =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lie further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS      = YES
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP            = YES
diff --git a/doc/applications/likwid-accessD.md b/doc/applications/likwid-accessD.md
new file mode 100644
index 0000000..c80481e
--- /dev/null
+++ b/doc/applications/likwid-accessD.md
@@ -0,0 +1,55 @@
+/*! \page likwid-accessD <CODE>likwid-accessD</CODE>
+
+<H1>Information</H1>
+
+<CODE>likwid-accessD</CODE> is a command line application that opens a UNIX file socket and waits for access
+operations from LIKWID tools that require access to the MSR and PCI device
+files. The MSR and PCI device files are commonly only accessible for users with root
+privileges, therefore <CODE>likwid-accessD</CODE> requires the suid-bit set or a suitable libcap setting.
+Depending on the current system architecture, <CODE>likwid-accessD</CODE> permits only access to registers defined for the architecture.
+
+<!--<H1>Security concerns</H1>
+The <CODE>likwid-accessD</CODE> is a critical part of LIKWID. The accesses to the MSR and often also PCI devices are restricted to users with root privileges. In order to allow users the access to the MSR/PCI devices, the users have to get temporarily elevated privileges. There are currently two ways of achieving this in the Linux operating system. The convenient method are the suid/guid bits that allow an application to execute with the privileges of the owner (suid) or group (guid). Th [...]
+Both methods should be safe but there are exploits for the MSR devices, general suid applications and the <CODE>cap_sys_rawio</CODE>. We checked all exploits we found and built the access daemon so that it is not vulnerable for the exploits. By restricting the accessible registers and closing all file handles -->
+
+<H1>Build</H1>
+The building of <CODE>likwid-accessD</CODE> can be controlled through the <CODE>config.mk</CODE> file. Depending on the variable <CODE>BUILDDAEMON</CODE> the daemon code is built or not. The path to <CODE>likwid-accessD</CODE> is compiled into the LIKWID library, so if you want to use the access daemon from an uncommon path, you have to set the <CODE>ACCESSDAEMON</CODE> variable.
+
+<H1>Setup</H1>
+In order to allow <CODE>likwid-accessD</CODE> to run with elevated privileges, there are three ways:
+<UL>
+<LI>SUID Method:<BR>
+<CODE>
+root: # chown root:root likwid-accessD<BR>
+root: # chmod u+s likwid-accessD<BR>
+</CODE>
+</LI>
+<LI>GUID Method: (PCI devices cannot be accessed with this method but we are working on it)<BR>
+<CODE>
+root: # groupadd likwid<BR>
+root: # chown root:likwid likwid-accessD<BR>
+root: # chmod g+s likwid-accessD<BR>
+</CODE>
+</LI>
+<LI>Libcap Method:<BR>
+<CODE>
+root: # setcap cap_sys_rawio+ep likwid-accessD
+</CODE>
+</LI>
+</UL>
+There are Linux distributions where setting the suid permission on <CODE>likwid-accessD</CODE> is not enough. Try also to set the capabilities for <CODE>likwid-accessD</CODE>.
+
+<H1>Protocol</H1>
+Every likwid instance will start its own daemon. This client-server pair will communicate with a socket file in <CODE>/tmp</CODE>  named <CODE>likwid-$PID</CODE>. The daemon only accepts one connection. As soon as the connect is successful the socket file will be deleted.
+
+From there the communication consists of write read pairs issued from the client. The daemon will ensure allowed register ranges relevant for the likwid applications. Other register access will be silently dropped and logged to <CODE>syslog</CODE>.
+
+On shutdown the client will terminate the daemon with an exit message.
+
+The daemon has the following error handling:
+<UL>
+<LI>To prevent daemons not stopped correctly the daemon has a timeout on startup.</LI>
+<LI>If the client prematurely disconnects the daemon terminates.</LI>
+<LI>If the client disconnects between a read and write the daemon catches <CODE>SIGPIPE</CODE>  and disconnects.</LI>
+</UL>
+*/
diff --git a/doc/applications/likwid-agent.md b/doc/applications/likwid-agent.md
new file mode 100644
index 0000000..44cbb65
--- /dev/null
+++ b/doc/applications/likwid-agent.md
@@ -0,0 +1,94 @@
+/*! \page likwid-agent <CODE>likwid-agent</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-agent</CODE> is a daemon application that uses \ref likwid-perfctr to measure hardware performance counters and write them to various output back-ends. The basic configuration is in a global configuration file that must be given on command line. The configuration of the hardware event sets is done with extra files suitable for each architecture. Besides the hardware event configuration, the raw data can be transformed using formulas to interested metrics. In order to output  [...]
+
+<H1>Config file</H1>
+The global configuration file has the following options:
+<TABLE>
+<TR>
+  <TH>Option
+                                             
+</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>GROUPPATH <path></TD>
+  <TD>Path to the group files containing event set and output definitions. See section <B>Group files</B> for information.</TD>
+</TR>
+<TR>
+  <TD>EVENTSET <group1> <group2> ...</TD>
+  <TD>Space separated list of groups (without .txt) that should be monitored.</TD>
+</TR>
+<TR>
+  <TD>DURATION <time></TD>
+  <TD>Measurement duration in seconds for each group.</TD>
+</TR>
+<TR>
+  <TD>LOGPATH <path></TD>
+  <TD>Sets the output logfile path for the measured data. Each monitoring group logs to its own file likwid.<group>.log</TD>
+</TR>
+<TR>
+  <TD>LOGSTYLE <update/log></TD>
+  <TD>Specifies whether new data should be appended to the files (log) or the file should be emptied first (update).<BR> Update is a common option if you read in the data afterwards by some monitoring tool like cacti, nagios, ... Default is log</TD>
+</TR>
+<TR>
+  <TD>GMETRIC <True/False></TD>
+  <TD>Activates the output to gmetric.</TD>
+</TR>
+<TR>
+  <TD>GMETRICPATH <path></TD>
+  <TD>Set path to the gmetric executable.</TD>
+</TR>
+<TR>
+  <TD>GMETRICCONFIG <path></TD>
+  <TD>Set path to a custom gmetric config file.</TD>
+</TR>
+<TR>
+  <TD>RRD <True/False></TD>
+  <TD>Activates the output to RRD files (Round Robin Database).</TD>
+</TR>
+<TR>
+  <TD>RRDPATH <path></TD>
+  <TD>Output path for the RRD files. The files are named according to the group and each output metric is saved as DS with function GAUGE. The RRD is configured with RRA entries to store average, minimum and maximum of 10 minutes for one hour, of 60 min for one day and daily data for one month.</TD>
+</TR>
+<TR>
+  <TD>SYSLOG <True/False></TD>
+  <TD>Activates the output to system log using logger.</TD>
+</TR>
+<TR>
+  <TD>SYSLOGPRIO <prio></TD>
+  <TD>Set the priority for the system log. The default priority is 'local0.notice'.</TD>
+</TR>
+</TABLE>
+
+<H1>Group files</H1>
+The group files are adapted performance group files as used by <CODE>likwid-perfctr</CODE>.
+This makes it easy to use the predefined and often used performance groups as basis for the monitoring. The folder structure for the groups is <CODE><GROUPPATH>/<SHORT_ARCH_NAME>/</CODE> with <SHORT_ARCH_NAME> similar to the ones for the performance groups, like 'sandybridge' or 'haswellEP'.
+
+
+<TABLE>
+<TR>
+  <TH>Option
+                                            
+</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>SHORT <string></TD>
+  <TD>A short descriptive information about the group.</TD>
+</TR>
+<TR>
+  <TD>EVENTSET<BR><counter1> <event1><BR><counter2>:<option1>:<option2> <event2></TD>
+  <TD>Definition of the eventset similar to the performance groups. See performance_groups for details.</TD>
+</TR>
+<TR>
+  <TD>METRICS<BR><metricname> <formula><BR><filter> <metricname> <formula></TD>
+  <TD>Definition of the output metrics. The syntax follows the METRICS definition of the performance groups as used by \ref likwid-perfctr . If no function is set at the beginning of the line, <formula> is evaluated for every CPU and send to the output back-ends. The <metricname> gets the prefix "T<cpuid> ". To avoid writing to much data to the back-ends, the data can be reduced by <filter>. The possible filter options are MIN, MAX, AVG, SUM, ONCE. The ONCE filter [...]
+</TR>
+
+</TABLE>
+
+<H1>Notice</H1>
+There is currently no predefined init script for <CODE>likwid-agent</CODE>, you have to create it yourself for your distribution.
+*/
diff --git a/doc/applications/likwid-bench.md b/doc/applications/likwid-bench.md
new file mode 100644
index 0000000..fc642e1
--- /dev/null
+++ b/doc/applications/likwid-bench.md
@@ -0,0 +1,93 @@
+/*! \page likwid-bench <CODE>likwid-bench</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-bench</CODE> is a benchmark suite for low-level (assembly) benchmarks to measure bandwidths and instruction throughput for specific instruction code on x86 systems. The currently included benchmark codes include common data access patterns like load and store but also calculations like vector triad and sum.
+<CODE>likwid-bench</CODE> includes architecture specific benchmarks for x86, x86_64 and x86 for Intel Xeon Phi coprocessors. The performance values can either be calculated by <CODE>likwid-bench</CODE> or measured using hardware performance counters by using \ref likwid-perfctr as a wrapper to <CODE>likwid-bench</CODE>. This requires to build <CODE>likwid-bench</CODE> with instrumentation enabled in config.mk (<CODE>INSTRUMENT_BENCH</CODE>).
+
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h</TD>
+  <TD>Print help message</TD>
+</TR>
+<TR>
+  <TD>-a</TD>
+  <TD>List all available benchmarks</TD>
+</TR>
+<TR>
+  <TD>-p</TD>
+  <TD>List all available thread affinity domains</TD>
+</TR>
+<TR>
+  <TD>-d <delim></TD>
+  <TD>Use <delim> instead of ',' for the output of -p</TD>
+</TR>
+<TR>
+  <TD>-l <test></TD>
+  <TD>List characteristics of <test> like number of streams, data used per loop iteration, ...</TD>
+</TR>
+<TR>
+  <TD>-t <test></TD>
+  <TD>Perform assembly benchmark <test></TD>
+</TR>
+<TR>
+  <TD>-s <min_time></TD>
+  <TD>Minimal time in seconds to run the benchmark.<BR>Using this time, the iteration count is determined automatically to provide reliable results. Default is 1. If the determined iteration count is below 10, it is normalized to 10.</TD>
+</TR>
+<TR>
+  <TD>-w <workgroup></TD>
+  <TD>Set a workgroup for the benchmark. A workgroup can have different formats:<BR>
+  <TABLE>
+    <TR>
+      <TH>Format</TH>
+      <TH>Description</TH>
+    </TR>
+    <TR>
+      <TD><affinity_domain>:<size></TD>
+      <TD>Allocate in total <size> in affinity domain <affinity_domain>.<BR><CODE>likwid-bench</CODE> starts as many threads as available in affinity domain <affinity_domain></TD>
+    </TR>
+    <TR>
+      <TD><affinity_domain>:<size>:<num_threads></TD>
+      <TD>Allocate in total <size> in affinity domain <affinity_domain>.<BR><CODE>likwid-bench</CODE> starts <num_threads> in affinity domain <affinity_domain></TD>
+    </TR>
+    <TR>
+      <TD><affinity_domain>:<size>:<num_threads>:<chunk_size>:<stride></TD>
+      <TD>Allocate in total <size> in affinity domain <affinity_domain>.<BR><CODE>likwid-bench</CODE> starts <num_threads> in affinity domain <affinity_domain> with <chunk_size> selected in row and a distance of <stride>.<BR>See \ref CPU_expressions on the \ref likwid-pin page for further information.</TD>
+    </TR>
+    <TR>
+      <TD><above_formats>-<streamID>:<stream_domain></TD>
+      <TD>In combination with every above mentioned format, the test streams (arrays, vectors) can be place in different affinity domains than the threads.<BR>This can be achieved by adding a stream placement option -<streamID>:<stream_domain> for all streams of the test to the workgroup definition.<BR>The stream with <streamID> is placed in affinity domain <stream_domain>.<BR>The amount of streams of a test can be determined with the -l <test> commandline o [...]
+    </TR>
+  </TABLE>
+  </TD>
+</TR>
+</TABLE>
+
+
+<H1>Examples</H1>
+<UL>
+<LI><CODE>likwid-bench -t copy -w S0:100kB</CODE><BR>
+Run test <CODE>copy</CODE> using all threads in affinity domain <CODE>S0</CODE>. The input and output stream of the <CODE>copy</CODE> benchmark sum up to <CODE>100kB</CODE> placed in affinity domain <CODE>S0</CODE>. The iteration count is calculated automatically.
+</LI>
+<LI><CODE>likwid-bench -t triad -i 100 -w S0:1GB:2:1:2</CODE><BR>
+Run test <CODE>triad</CODE> using <CODE>2</CODE> threads in affinity domain <CODE>S0</CODE>. Assuming <CODE>S0 = 0,4,1,5</CODE> the threads are pinned to CPUs 0 and 1, hence skipping one thread during selection. The streams of the <CODE>triad</CODE> benchmark sum up to <CODE>1GB</CODE> placed in affinity domain <CODE>S0</CODE>. The number of iterations is explicitly set to <CODE>100</CODE>.
+</LI>
+<LI><CODE>likwid-bench -t update -w S0:100kB -w S1:100kB</CODE><BR>
+Run test <CODE>update</CODE> using all threads in affinity domain <CODE>S0</CODE> and <CODE>S1</CODE>. The threads scheduled on <CODE>S0</CODE> use streams that sum up to <CODE>100kB</CODE>. Similar to <CODE>S1</CODE> the threads are placed there working only on their socket-local streams. The results of both workgroups are combined.
+</LI>
+<LI><CODE>likwid-perfctr -c E:S0:4 -g MEM -m likwid-bench -t update -w S0:100kB:4</CODE><BR>
+Run test <CODE>update</CODE> using <CODE>4</CODE> threads in affinity domain <CODE>S0</CODE>. The input and output stream of the <CODE>copy</CODE> benchmark sum up to <CODE>100kB</CODE> placed in affinity domain <CODE>S0</CODE>. The benchmark execution is measured using the \ref Marker_API. It measures the <CODE>MEM</CODE> performance group on the first four CPUs of the <CODE>S0</CODE> affinity domain. For further information about hardware performance counters see \ref likwid-perfctr<BR [...]
+</LI>
+<LI><CODE>likwid-bench -t copy -w S0:1GB:2:1:2-0:S1,1:S1</CODE><BR>
+Run test <CODE>copy</CODE> using <CODE>2</CODE> threads in affinity domain <CODE>S0</CODE> skipping one thread during selection. The two streams used in the <CODE>copy</CODE> benchmark have the IDs 0 and 1 and a summed up size of <CODE>1GB</CODE>. Both streams are placed in affinity domain <CODE>S1</CODE>.
+</LI>
+</UL>
+
+
+
+*/
diff --git a/doc/applications/likwid-genTopoCfg.md b/doc/applications/likwid-genTopoCfg.md
new file mode 100644
index 0000000..ae758c8
--- /dev/null
+++ b/doc/applications/likwid-genTopoCfg.md
@@ -0,0 +1,29 @@
+/*! \page likwid-genTopoCfg <CODE>likwid-genTopoCfg</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-genTopoCfg</CODE> is a command line application that stores the system's CPU and NUMA topology to
+file. LIKWID applications use this file to read in the topology fast instead of re-gathering all values. The path to the topology configuration can be set in the global LIKWID configuration file, see \ref likwid.cfg.
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h, --help</TD>
+  <TD>Print help message.</TD>
+</TR>
+<TR>
+  <TD>-v, --version</TD>
+  <TD>Print version information.</TD>
+</TR>
+<TR>
+  <TD>-o <file></TD>
+  <TD>Use <file> instead of the default output /etc/likwid-topo.cfg.</TD>
+</TR>
+</TABLE>
+
+
+*/
+
diff --git a/doc/applications/likwid-memsweeper.md b/doc/applications/likwid-memsweeper.md
new file mode 100644
index 0000000..570c7cb
--- /dev/null
+++ b/doc/applications/likwid-memsweeper.md
@@ -0,0 +1,34 @@
+/*! \page likwid-memsweeper <CODE>likwid-memsweeper</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-memsweeper</CODE> is a command line application to shrink the file buffer cache by filling the NUMA domain with random pages. Moreover, the tool invalidates all cachelines in the LLC.
+
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h, --help</TD>
+  <TD>Print help message.</TD>
+</TR>
+<TR>
+  <TD>-v, --version</TD>
+  <TD>Print version information.</TD>
+</TR>
+<TR>
+  <TD>-c <list></TD>
+  <TD>Sweeps the memory and LLC cache for NUMA domains listed in <list>.</TD>
+</TR>
+</TABLE>
+
+<H1>Examples</H1>
+<UL>
+<LI><CODE>likwid-memsweeper -c 0,1</CODE><BR>
+Cleans the memory and LLC on NUMA nodes identified by the node IDs 0 and 1.
+</LI>
+</UL>
+
+*/
diff --git a/doc/applications/likwid-mpirun.md b/doc/applications/likwid-mpirun.md
new file mode 100644
index 0000000..aee12d6
--- /dev/null
+++ b/doc/applications/likwid-mpirun.md
@@ -0,0 +1,83 @@
+/*! \page likwid-mpirun <CODE>likwid-mpirun</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-mpirun</CODE>
+A tool to start and monitor MPI applications with LIKWID. It can be used as supplement of the MPI implementations' startup programm like <CODE>mpirun</CODE> or <CODE>mpiexec</CODE> with some enhancements for pinning of OpenMP thread in hybrid jobs. Moreover, <CODE>likwid-mpirun</CODE> can insert calls to \ref likwid-perfctr to measure hardware performance counters for each MPI process and its threads, including Marker API. Since the <A HREF="http://modules.sourceforge.net/">modules</A> s [...]
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h, --help</TD>
+  <TD>Print help message</TD>
+</TR>
+<TR>
+  <TD>-v, --version</TD>
+  <TD>Print version information</TD>
+</TR>
+<TR>
+  <TD>-d, --debug</TD>
+  <TD>Print debug information</TD>
+</TR>
+<TR>
+  <TD>-n, -np, --n, --np <arg></TD>
+  <TD>Specify the number of processes for MPI</TD>
+</TR>
+<TR>
+  <TD>--nperdomain <domain>:<arg></TD>
+  <TD>Schedule <arg> MPI processes for each affinity domain starting with <domain>, e.g. S:2 translates into two MPI processes per socket.<BR><CODE>likwid-mpirun</CODE> assumes that all participating hosts have the same topology.</TD>
+</TR>
+<TR>
+  <TD>--hostfile <file></TD>
+  <TD>Specify the file that should be used as hostfile.<BR>If not set, <CODE>likwid-mpirun</CODE> checks the <CODE>PBS_NODEFILE</CODE>, <CODE>LOADL_HOSTFILE</CODE> and <CODE>SLURM_HOSTFILE</CODE> environment variables</TD>
+</TR>
+<TR>
+  <TD>--pin <expr></TD>
+  <TD>For hybrid pinning specify the thread pinning expression for each MPI process.<BR>The format is similar to \ref CPU_expressions separated by '_' for multiple processes.<BR>If -np is not set, the number of MPI processes is calculated using the pinning expressions.</TD>
+</TR>
+<TR>
+  <TD>-s, --skip <arg></TD>
+  <TD>'arg' must be a bitmask in hex. Threads with the ID equal to a set bit in bitmask will be skipped during pinning<BR>Example: 0x1 = Thread 0 is skipped.</TD>
+</TR>
+<TR>
+  <TD>--mpi <mpitype></TD>
+  <TD>Specify the type of the MPI implementation.<BR><CODE>likwid-mpirun</CODE> tries to read the MPI implementation from the <A HREF="http://modules.sourceforge.net/">modules</A> system.<BR>If not recognized automatically, possible values are <B>intelmpi</B>, <B>openmpi</B> and <B>mvapich2</B>.</TD>
+</TR>
+<TR>
+  <TD>--omp <omptype></TD>
+  <TD>Specify the type of OpenMP implementation.<BR><CODE>likwid-mpirun</CODE> tries to read the OpenMP implementation using <I>ldd</I> and the <A HREF="http://modules.sourceforge.net/">modules</A> system.<BR>If not recognized automatically, possible values are <B>intel</B> and <B>gnu</B></TD>
+</TR>
+<TR>
+  <TD>-g, --group <eventset></TD>
+  <TD>Use \ref likwid-perfctr to measure performance data for the MPI processes and OpenMP threads.<BR><eventset> can be either a performance group or a custom event string.<BR>For details see \ref performance_groups.</TD>
+</TR>
+<TR>
+  <TD>-m, --marker</TD>
+  <TD>Activate the \ref Marker_API for the measurements with \ref likwid-perfctr.</TD>
+</TR>
+<TR>
+  <TD>-O</TD>
+  <TD>Print results in CSV format (conform to <A HREF="https://tools.ietf.org/html/rfc4180">RFC 4180</A>)</TD>
+</TR>
+</TABLE>
+
+<H1>Examples</H1>
+<UL>
+<LI><CODE>likwid-mpirun -np 32 ./a.out</CODE><BR>
+Runs <CODE>./a.out</CODE> with 32 MPI processes distributed over the hosts in <CODE>PBS_NODEFILE</CODE>
+</LI>
+<LI><CODE>likwid-mpirun -nperdomain S:1 ./a.out</CODE><BR>
+Runs <CODE>./a.out</CODE> using one MPI process per socket over the hosts in <CODE>PBS_NODEFILE</CODE>, <CODE>LOADL_HOSTFILE</CODE> or <CODE>SLURM_HOSTFILE</CODE>.<BR>The total amount of processes is calculated by <numberOfSocketDomains> * <processCountPerDomain> * <hostsInHostfile>
+</LI>
+<LI><CODE>likwid-mpirun --hostfile host.list -pin S0:2_S1:2 ./a.out</CODE><BR>
+Runs <CODE>./a.out</CODE> using two MPI processes per host in <CODE>host.list</CODE>.<BR>The first MPI process on each host and its 2 threads are pinned to the first two CPUs on socket <CODE>S0</CODE>,<BR>the second MPI process on each host and its 2 threads are pinned to the first two CPUs on socket <CODE>S1</CODE>
+</LI>
+<LI><CODE>likwid-mpirun -nperdomain S:2 -g MEM ./a.out</CODE><BR>
+Runs <CODE>./a.out</CODE> with 2 MPI processes per socket on each host in <CODE>PBS_NODEFILE</CODE>, <CODE>LOADL_HOSTFILE</CODE> or <CODE>SLURM_HOSTFILE</CODE> and measure the <CODE>MEM</CODE> performance group<BR>
+Only one process per socket measures the Uncore/RAPL counters, the other one(s) only core-local counters.
+</LI>
+</UL>
+*/
diff --git a/doc/applications/likwid-perfctr.md b/doc/applications/likwid-perfctr.md
new file mode 100644
index 0000000..9efc789
--- /dev/null
+++ b/doc/applications/likwid-perfctr.md
@@ -0,0 +1,260 @@
+/*! \page likwid-perfctr <CODE>likwid-perfctr</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-perfctr</CODE> is a lightweight command line application to configure and read out hardware performance monitoring data
+on supported x86 processors. It can measure either as wrapper without changing the measured application
+or with \ref Marker_API functions inside the code, which will turn on and off the counters. Moreover, there are the timeline and stethoscope mode.
+There are preconfigured performance groups with useful event sets and derived metrics. Additionally, arbitrary events can be measured with
+custom event sets. The \ref Marker_API can measure multiple named regions and the results are accumulated over multiple region calls.
+<P>
+<B>Note</B> that <CODE>likwid-perfctr</CODE> measures all events on the specified CPUs and not only the context of the executable. On a highly loaded system it will be hard to determine which part of the given application caused the counter increment. Moreover, it is necessary to ensure that processes and threads are pinned to dedicated resources. You can either pin the application yourself or use the builtin pin functionality.
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h, --help</TD>
+  <TD>Print help message.</TD>
+</TR>
+<TR>
+  <TD>-v, --version</TD>
+  <TD>Print version information.</TD>
+</TR>
+<TR>
+  <TD>-V, --verbose <level></TD>
+  <TD>Verbose output during execution for debugging. Possible values for <level>:
+  <TABLE>
+    <TR>
+      <TD>0</TD>
+      <TD>Output only errors</TD>
+    </TR>
+    <TR>
+      <TD>1</TD>
+      <TD>Output some information</TD>
+    </TR>
+    <TR>
+      <TD>2</TD>
+      <TD>Output detailed information</TD>
+    </TR>
+    <TR>
+      <TD>3</TD>
+      <TD>Output developer information</TD>
+    </TR>
+  </TABLE>
+  </TD>
+</TR>
+<TR>
+  <TD>-i, --info</TD>
+  <TD>Print \a CPUID information about processor and about Intel Performance Monitoring features.</TD>
+</TR>
+<TR>
+  <TD>-g, --group <arg></TD>
+  <TD>Specify which event string or performance group should be measured.</TD>
+</TR>
+<TR>
+  <TD>-c <arg></TD>
+  <TD>Defines the CPUs that should be measured<BR>See \ref CPU_expressions on the \ref likwid-pin page for information about the syntax.</TD>
+</TR>
+<TR>
+  <TD>-C <arg></TD>
+  <TD>Defines the CPUs that should be measured and pin the executable to the CPUs<BR>See \ref CPU_expressions on the \ref likwid-pin page for information about the syntax.</TD>
+</TR>
+<TR>
+  <TD>-H</TD>
+  <TD>Print information about a performance group given with -g, --group option.</TD>
+</TR>
+<TR>
+  <TD>-m</TD>
+  <TD>Run in marker API mode</TD>
+</TR>
+<TR>
+  <TD>-a</TD>
+  <TD>Print available performance groups for current processor.</TD>
+</TR>
+<TR>
+  <TD>-e</TD>
+  <TD>Print available counters and performance events and suitable options of current processor.</TD>
+</TR>
+<TR>
+  <TD>-E <pattern></TD>
+  <TD>Print available performance events matching <pattern> and print the usable counters for the found events.<BR>The matching is done with *<pattern>*, so all events matching the substring are returned.</TD>
+</TR>
+<TR>
+  <TD>-o, --output <file></TD>
+  <TD>Store all ouput to file instead of stdout. LIKWID enables the reformatting of output files according to their suffix.<BR>You can place additional output formatters in folder <CODE><PREFIX>/share/likwid/filter</CODE>. LIKWID ships with one filter script <CODE>xml</CODE> written in Perl and a Perl template for developing own output scripts. If the suffix is <CODE>.csv</CODE>, the internal CSV printer is used for file output.<BR>Moreover, there are substitutions possible in the  [...]
+</TR>
+<TR>
+  <TD>-S <time></TD>
+  <TD>Specify the time between starting and stopping of counters. Can be used to monitor applications. Option does not require an executable<BR>Examples for <time> are 1s, 250ms, 500us.</TD>
+</TR>
+<TR>
+  <TD>-t <time></TD>
+  <TD>Activates the timeline mode that reads the counters in the given frequency <time> during the whole run of the executable<BR>Examples for <time> are 1s, 250ms, 500us.</TD>
+</TR>
+<TR>
+  <TD>-T <time></TD>
+  <TD>If multiple event sets are given on commandline, switch every <time> to next group. Default is 2s.<BR>Examples for <time> are 1s, 250ms, 500us.<BR>If only a single event set is given, the default read frequency is 30s to catch overflows.</TD>
+</TR>
+<TR>
+  <TD>-O</TD>
+  <TD>Print output in CSV format (conform to <A HREF="https://tools.ietf.org/html/rfc4180">RFC 4180</A>). The output contains some markers that help to parse the output.</TD>
+</TR>
+<TR>
+  <TD>-s, --skip <arg></TD>
+  <TD>'arg' must be a bitmask in hex. Threads with the ID equal to a set bit in bitmask will be skipped during pinning<BR>Example: 0x1 = Thread 0 is skipped.</TD>
+</TR>
+</TABLE>
+
+<H1>Examples</H1>
+<UL>
+<LI><CODE>likwid-perfctr -C 0-2 -g TLB ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPUs 0,1,2 and measure on the specified CPUs the performance group <CODE>TLB</CODE>. If not set, the environment variable <CODE>OMP_NUM_THREADS</CODE> is set to 3.
+</LI>
+<LI><CODE>likwid-perfctr  -C 0-4  -g INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3 ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPUs 0,1,2,3,4 and measure on the specified CPUs the event set <CODE>INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3</CODE>.<BR>The event set consists of two event definitions:
+    <UL>
+    <LI><CODE>INSTRUCTIONS_RETIRED_SSE:PMC0</CODE> measures event <CODE>INSTRUCTIONS_RETIRED_SSE</CODE> using counter register named <CODE>PMC0</CODE></LI>
+    <LI><CODE>CPU_CLOCKS_UNHALTED:PMC3</CODE> measures event <CODE>CPU_CLOCKS_UNHALTED</CODE> using counter register named <CODE>PMC3</CODE>. This event can be used to calculate the run time of the application.</LI>
+    </UL>
+</LI>
+
+<LI><CODE>likwid-perfctr -C 0 -g INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1,UNC_L3_LINES_IN_ANY:UPMC0 ./a.out</CODE><BR>
+Run and pin executable <CODE>./a.out</CODE> on CPU 0 with a custom event set containing three events.<BR>The event set consists of three event definitions:
+    <UL>
+    <LI><CODE>INSTR_RETIRED_ANY:FIXC0</CODE> measures event <CODE>INSTR_RETIRED_ANY</CODE> using Intel's fixed-purpose counter register named <CODE>FIXC0</CODE>.</LI>
+    <LI><CODE>CPU_CLK_UNHALTED_CORE:FIXC1</CODE> measures event <CODE>CPU_CLOCKS_UNHALTED</CODE> using Intel's fixed-purpose counter register named <CODE>FIXC1</CODE>. This event can be used to calculate the run time of the application.</LI>
+    <LI><CODE>UNC_L3_LINES_IN_ANY:UPMC0</CODE> measures event <CODE>UNC_L3_LINES_IN_ANY</CODE> using Uncore counter register named <CODE>UPMC0</CODE>. Uncore counters are socket-specific, hence LIKWID reads the counter registers only on one CPU per socket.</LI>
+    </UL>
+</LI>
+
+<LI><CODE>likwid-perfctr -m -C 0-4  -g INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3 ./a.out</CODE><BR>
+Run and pin the executable to CPUs 0,1,2,3,4 and activate the Marker API. The code in <CODE>a.out</CODE> is assumed to be instrumented with LIKWID's Marker API. Only the marked code regions are measured.
+    <UL>
+    <LI><CODE>INSTRUCTIONS_RETIRED_SSE:PMC0</CODE> measures event <CODE>INSTRUCTIONS_RETIRED_SSE</CODE> using counter register named <CODE>PMC0</CODE>.</LI>
+    <LI><CODE>CPU_CLOCKS_UNHALTED:PMC3</CODE> measures event <CODE>CPU_CLOCKS_UNHALTED</CODE> using counter register named <CODE>PMC3</CODE>. This event can be used to calculate the run time of the application.</LI>
+    </UL>
+The Marker API for C/C++ offers 6 functions to measure named regions. You can use instrumented code with and without LIKWID. In order to activate the Marker API, <CODE>-DLIKWID_PERFMON</CODE> needs to be added to the compiler call. The following listing describes each function shortly (complete list see \ref Marker_API):
+    <UL>
+    <LI><CODE>LIKWID_MARKER_INIT</CODE>: Initialize LIKWID globally. Must be called in serial region and only once.</LI>
+    <LI><CODE>LIKWID_MARKER_THREADINIT</CODE>: Initialize LIKWID for each thread. Must be called in parallel region and executed by every thread.</LI>
+    <LI><CODE>LIKWID_MARKER_START('compute')</CODE>: Start a code region and associate it with the name 'compute'. The names are freely selectable and are used for grouping and outputting regions.</LI>
+    <LI><CODE>LIKWID_MARKER_STOP('compute')</CODE>: Stop the code region associated with the name 'compute'.</LI>
+    <LI><CODE>LIKWID_MARKER_SWITCH</CODE>: Switches to the next performance group or event set in a round-robin fashion. Can be used to measure the same region with multiple events. If called inside a code region, the results for all groups will be faulty. Be aware that each programming of the config registers causes overhead.</LI>
+    <LI><CODE>LIKWID_MARKER_CLOSE</CODE>: Finalize LIKWID globally. Should be called in the end of your application. This writes out all region results to a file that is picked up by <CODE>likwid-perfctr</CODE> for evaluation.</LI>
+    </UL>
+</LI>
+
+<LI><CODE>likwid-perfctr -c 0-3  -g FLOPS_DP -t 300ms ./a.out 2> out.txt</CODE><BR>
+Runs the executable <CODE>a.out</CODE> and measures the performance group <CODE>FLOPS_DP</CODE> on CPUs 0,1,2,3 every 300 ms. Since <CODE>-c</CODE> is used, the application is not pinned to the CPUs and <CODE>OMP_NUM_THREADS</CODE> is not set. The performance group <CODE>FLOPS_DP</CODE> is not available on every architecture, use <CODE>likwid-perfctr -a</CODE> for a complete list. Please note, that <CODE>likwid-perfctr</CODE> writes the measurements to stderr while the application's outp [...]
+The syntax of the timeline mode output lines is:<BR>
+<CODE><groupID> <numberOfEvents> <numberOfThreads> <Timestamp> <Event1_Thread1> <Event1_Thread2> ... <EventN_ThreadN></CODE><BR>
+You can also use the tool \ref likwid-perfscope to print the measured values live with <CODE>gnuplot</CODE>.
+</LI>
+
+<LI><CODE>likwid-perfctr -c 0-3  -g FLOPS_DP -S 2s</CODE><BR>
+Measures the performance group <CODE>FLOPS_DP</CODE> on CPUs 0,1,2,3 for 2 seconds. This option can be used to measure an application externally or to perform low-level system monitoring.
+</LI>
+
+<LI><CODE>likwid-perfctr -c S0:0\@S1:0  -g LLC_LOOKUPS_DATA_READ:CBOX0C0:STATE=0x9 -S 2s</CODE><BR>
+Measures the event <CODE> LLC_LOOKUPS_DATA_READ</CODE> on the first CPU of socket 0 and the first CPU on socket 1 for 2 seconds using the counter 0 in CBOX 0 (LLC cache coherency engine). The counting is filtered to only lookups in the 'invalid' and 'modified' state. Look at the microarchitecture Uncore documentation for possible bitmasks. Which option is available for which counter class can be found in section \ref Architectures.
+</LI>
+</UL>
+
+\anchor performance_groups
+<H1>Performance groups</H1>
+One of the outstanding features of LIKWID are the performance groups. Each microarchitecture has its own set of events and related counters and finding the suitable events in the documentation is tedious. Moreover, the raw results of the events are often not meaningful, they need to be combined with other events like run time or clock speed. LIKWID addresses those problems by providing performance groups that specify a set of events and counter combinations as well as a set of derived me [...]
+<B>Please note that performance groups is a feature of the Lua API and not available for the C/C++ API.</B>
+<H3>Directory structure</H3>
+While installation of LIKWID, the performance groups are copied to the path <CODE>${INSTALL_PREFIX}/share/likwid</CODE>. In this folder there is one subfolder per microarchitecture that contains all performance groups for that microarchitecture. The folder names are not freely selectable, they are defined in <CODE>src/topology.c</CODE>. For every microarchitecture at the time of release, there is already a folder that can be extended with your own performance groups. You can change the p [...]
+<H3>Syntax of performance group files</H3>
+<CODE>SHORT <string></CODE> // Short description of the performance group<BR>
+<BR>
+<CODE>EVENTSET</CODE> // Starts the event set definition<BR>
+<CODE><counter>(:<options>) <event></CODE> // Each line defines one event/counter combination with optional options.<BR>
+<CODE>FIXC0 INSTR_RETIRED_ANY</CODE> // Example<BR>
+<BR>
+<CODE>METRICS</CODE> // Starts the derived metric definitions<BR>
+<CODE><metricname> <formula></CODE> // Each line defines one derived metric. <CODE><metricname></CODE> can contain spaces, <CODE><formula></CODE> must be free of spaces. The counter names (with options) and the variables <CODE>time</CODE> and <CODE>inverseClock</CODE> can be used as variables in <CODE><formula></CODE>.<BR>
+<CODE>CPI  FIXC1/FIXC0</CODE> // Example<BR>
+<BR>
+<CODE>LONG</CODE> // Starts the detailed description of the performance group<BR>
+<CODE><TEXT></CODE> // <CODE><TEXT></CODE> is displayed with <CODE>-H</CODE> commandline option
+
+\anchor Marker_API
+<H1>Marker API</H1>
+The Marker API enables measurement of user-defined code regions in order to get deeper insight what is happening at a specific point in the application. The Marker API itself has 8 commands. In order to activate the Marker API, the code must be compiled with <CODE>-DLIKWID_PERFMON</CODE>. If the code is compiled without this define, the Marker API functions perform no operation and cause no overhead. You can also run code compiled with LIKWID_PERFMON defined without measurements but a me [...]
+Even pure serial applications have to call LIKWID_MARKER_THREADINIT to initialize the accessDaemon or the direct accesses.<BR>
+The names for the regions can be freely chosen but <I>whitespaces are not allowed</I>.
+<H2>C/C++ Code</H2>
+<H3>Original code</H3>
+<CODE>
+\#include <stdlib.h><BR>
+\#include <stdio.h><BR>
+\#include <omp.h><BR>
+<BR>
+int main(int argc, char* argv[])<BR>
+{<BR>
+  int i=0;<BR>
+  double sum = 0;<BR>
+\#pragma omp parallel for reduction(+:sum)<BR>
+  for(i=0;i<100000;i++)<BR>
+  {<BR>
+    sum += 1.0/(omp_get_thread_num()+1);<BR>
+  }<BR>
+  printf("Sum is %f\n", sum);<BR>
+  return 0;<BR>
+}<BR>
+</CODE>
+<H3>Instrumented code</H3>
+<CODE>
+\#include <stdlib.h><BR>
+\#include <stdio.h><BR>
+\#include <omp.h><BR>
+\#include <likwid.h><BR>
+<BR>
+int main(int argc, char* argv[])<BR>
+{<BR>
+  int i=0;<BR>
+  double sum = 0;<BR>
+  LIKWID_MARKER_INIT;<BR>
+\#pragma omp parallel<BR>
+{<BR>
+  LIKWID_MARKER_THREADINIT;<BR>
+}<BR>
+\#pragma omp parallel<BR>
+{<BR>
+  LIKWID_MARKER_START("sum");<BR>
+\#pragma omp for reduction(+:sum)<BR>
+  for(i=0;i<100000;i++)<BR>
+  {<BR>
+    sum += 1.0/(omp_get_thread_num()+1);<BR>
+  }<BR>
+  LIKWID_MARKER_STOP("sum");<BR>
+}<BR>
+  printf("Sum is %f\n", sum);<BR>
+  LIKWID_MARKER_CLOSE;<BR>
+  return 0;<BR>
+}<BR>
+</CODE>
+The LIKWID package contains an example code: see \ref C-markerAPI-code or \ref F-markerAPI-code.
+<H3>Running code</H3>
+With the help of <CODE>likwid-perfctr</CODE> the counters are configured to the selected events. The counters are also started and stopped by <CODE>likwid-perfctr</CODE>, the Marker API only reads the counters to minimize the overhead of the instrumented application. Only if you use <CODE>LIKWID_MARKER_SWITCH</CODE> the Marker API itself configures a new event set to the registers. Basically, <CODE>likwid-perfctr</CODE> exports the whole configuration needed by the Marker API through env [...]
+In order to build your instrumented application:<BR>
+<CODE>$CC -openmp -L<PATH_TO_LIKWID_LIBRARY> -I<PATH_TO_LIKWID_INCLUDES> <SRC_CODE> -o <EXECUTABLE> -llikwid</CODE><BR>
+With standard installation, the paths are <CODE><PATH_TO_LIKWID_LIBRARY>=/usr/local/lib</CODE> and <CODE><PATH_TO_LIKWID_INCLUDES>=/usr/local/include</CODE><BR>
+Example Marker API call:<BR>
+<CODE>likwid-perfctr -C 0-4 -g L3 <B>-m</B> ./a.out</CODE>
+<BR>
+<BR>
+
+<H2>Fortran Code</H2>
+Besides the Marker API for C/C++ programs, LIKWID offers to build a Fortran module to access the Marker API functions from Fortran. Only the Marker API calls are exported, not the whole API. In <CODE>config.mk</CODE> the variable <CODE>FORTRAN_INTERFACE</CODE> must be set to true. LIKWID's default is to use the Intel Fortran compiler to build the interface but it can be modified to use GCC's Fortran compiler in <CODE>make/include_<COMPILER></CODE>.<BR>
+The LIKWID package contains an example code: see \ref F-markerAPI-code.
+
+<H2>Hints for the usage of the Marker API</H2>
+Since the calls to the LIKWID library are executed by your application, the runtime will raise and in specific circumstances, there are some other problems like the time measurement. You can execute <CODE>LIKWID_MARKER_THREADINIT</CODE> and <CODE>LIKWID_MARKER_START</CODE> inside the same parallel region but put a barrier between the calls to ensure that there is no big timing difference between the threads. The common way is to init LIKWID and the participating threads inside of an init [...]
+
+*/
diff --git a/doc/applications/likwid-perfscope.md b/doc/applications/likwid-perfscope.md
new file mode 100644
index 0000000..71c8984
--- /dev/null
+++ b/doc/applications/likwid-perfscope.md
@@ -0,0 +1,107 @@
+/*! \page likwid-perfscope <CODE>likwid-perfscope</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-perfscope</CODE> is a command line application written in Lua that uses the timeline daemon mode of \ref likwid-perfctr
+to create on-the-fly pictures with the current measurements. It uses the <A HREF="https://github.com/dkogan/feedgnuplot">feedGnuplot</A> Perl script to send the current data to  <A HREF="http://www.gnuplot.info/">gnuplot</A>. In order to make it more convenient for users, preconfigured plots of interesting metrics are embedded into <CODE>likwid-perfscope</CODE>. Since the plot windows are normally closed directly after the execution of the monitored applications, <CODE>likwid-perfscope</ [...]
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h, --help</TD>
+  <TD>Print help message.</TD>
+</TR>
+<TR>
+  <TD>-v, --version</TD>
+  <TD>Print version information.</TD>
+</TR>
+<TR>
+  <TD>-a, --all</TD>
+  <TD>Print available predefined plot configurations for current processor.</TD>
+</TR>
+<TR>
+  <TD>-d, --dump</TD>
+  <TD>Print measurements to stdout.</TD>
+</TR>
+<TR>
+  <TD>-p, --plotdump</TD>
+  <TD>Use feedGnuplot's feature to dump plot configuration and its data to stdout.</TD>
+</TR>
+<TR>
+  <TD>-c <arg></TD>
+  <TD>Defines the CPUs that should be measured<BR>See \ref CPU_expressions on the \ref likwid-pin page for information about the syntax.</TD>
+</TR>
+<TR>
+  <TD>-C <arg></TD>
+  <TD>Defines the CPUs that should be measured and pin the executable to the CPUs<BR>See \ref CPU_expressions on the \ref likwid-pin page for information about the syntax.</TD>
+</TR>
+<TR>
+  <TD>-t, --time <time></TD>
+  <TD>Specify the measurement time for each plot. <time> is handed over to \ref likwid-perfctr with the -t option. <BR>Examples for <time> are 1s, 250ms, 500us.</TD>
+</TR>
+<TR>
+  <TD>-g, --group <arg></TD>
+  <TD>Specify a predefined plot with optional changes or an eventset with plot configuration. See \ref plot_configuration for details.</TD>
+</TR>
+<TR>
+  <TD>-r, --range <arg></TD>
+  <TD>Specify the number of data points that should be visible in the plots. Often referred to as sliding window.</TD>
+</TR>
+<TR>
+  <TD>--host <arg></TD>
+  <TD>Connect to <arg> via ssh and execute likwid-perfctr and the application there. The plots are created on the local machine. Often used if measured on hosts without X11 or GnuPlot.</TD>
+</TR>
+</TABLE>
+
+\anchor plot_configuration
+<H1>Plot configurations</H1>
+<CODE>likwid-perfscope</CODE> extends the format of the eventset option of \ref likwid-perfctr to make it more conveniet for the users. It accepts either a plot configuration of interesting metrics which are embedded into <CODE>likwid-perfscope</CODE> or a custom eventset suitable for \ref likwid-perfctr extended by the plot configuration. A plot configuration can be set with key=value pairs separated by ':' and has to contain at least a definition of a formula for plotting. If specifyed [...]
+<TABLE>
+<TR>
+  <TH>Option
+                          
+  </TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>title=<string><BR>TITLE=<string></TD>
+  <TD>Use <string> as title for the plot. The string has to be quoted if it contains spaces. No ':' are allowed in the string</TD>
+</TR>
+<TR>
+  <TD>xtitle=<string><BR>XTITLE=<string></TD>
+  <TD>Use <string> as label for the x-axis. The default label is 'Time'. The string has to be quoted if it contains spaces. No ':' are allowed in the string</TD>
+</TR>
+<TR>
+  <TD>ytitle=<string><BR>YTITLE=<string></TD>
+  <TD>Use <string> as label for the left y-axis. The string has to be quoted if it contains spaces. No ':' are allowed in the string</TD>
+</TR>
+<TR>
+  <TD><string>=<string></TD>
+  <TD>Use the first <string> as legend entry and the second <string> as input formula for the plot. The result is printed over the run time. The names of the specified counters can be used as variables in the formula. Additional variables are 'time' for the measurement time and 'inverseClock' for the inverted clock frequency. No spaces are allowed in the formula.</TD>
+</TR>
+<TR>
+  <TD>y2title=<string><BR>Y2TITLE=<string><BR>y2title=<id-string><BR>Y2TITLE=<id-string></TD>
+  <TD>Use <string> as label for the right y-axis. If <id-string> is given, the formula with id is associated with the y2-axis. If used with predefined plot configurations, be aware that the formula 1 is part of the plot configuration. If no id is given, the y2-axis is associated with the last given formula. The string has to be quoted if it contains spaces. No ':' are allowed in the string</TD>
+</TR>
+</TABLE>
+
+<H1>Examples</H1>
+<UL>
+<LI><CODE>likwid-perfscope -g L3_BAND -C 0-2 -t 1s ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPUs 0,1,2 and use the predefined plot configuration <CODE>L3_BAND</CODE>. The plot is updated every second.
+</LI>
+<LI><CODE>likwid-perfscope -g L3_BAND:TITLE="My Title" -C S0:1 -t 500ms ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPU 1 on Socket 0 and use the predefined plot configuration <CODE>L3_BAND</CODE> but change the title for the plot to "My Title".
+</LI>
+<LI><CODE>likwid-perfscope -g INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1,CPI=FIXC0/FIXC1:YTITLE="CPI" -C 0 --time 2s ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPU 0 and use the custom event set <CODE>INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1</CODE>. The last event set entry specifies custom plot options. The plotted formula is <CODE>FIXC0/FIXC1</CODE>; both the legend entry and the label of the left y-axis are set to 'CPI'.
+</LI>
+<LI><CODE>likwid-perfscope -g L3_BAND,CPI=FIXC0/FIXC1:Y2TITLE="2-Cycles per Instruction" -C 0 --time 2s ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPU 0 and use the predefined plot configuration  <CODE>L3_BAND</CODE> to measure every 2 seconds. Additionally, a formula <CODE>FIXC0/FIXC1</CODE> with the name <CODE>CPI</CODE> is given. The right y-axis is associated to the given function and labeled with <CODE>Cycles per Instruction</CODE>. The formula ID 2 is not needed in this case as the default behavior is to associate the right y-axis to the last formula given.
+</LI>
+</UL>
+
+*/
diff --git a/doc/applications/likwid-pin.md b/doc/applications/likwid-pin.md
new file mode 100644
index 0000000..b8c8a1e
--- /dev/null
+++ b/doc/applications/likwid-pin.md
@@ -0,0 +1,170 @@
+/*! \page likwid-pin <CODE>likwid-pin</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-pin</CODE> is a command line application to pin a sequential or multithreaded application to dedicated processors. It can be used as replacement for taskset.
+Opposite to taskset no affinity mask but single processors are specified. For multithreaded applications based on the <A HREF="https://computing.llnl.gov/tutorials/pthreads/"><CODE>pthreads</CODE></A> library the <CODE>pthread_create</CODE> library call is overloaded through <CODE>LD_PRELOAD</CODE> and each created thread is pinned to a dedicated processor as specified in the pinning list. Per default every generated thread is pinned to the core in the order of calls to <CODE>pthread_cre [...]
+<BR>
+For OpenMP implementations, GCC and ICC compilers are explicitly supported. Clang's OpenMP backend should also work as it is built on top of Intel's OpenMP runtime library. Others may also work.<BR>
+<BR>
+<CODE>likwid-pin</CODE> sets the environment variable <CODE>OMP_NUM_THREADS</CODE> for you if not already present. It will set as many threads as present in the pin expression.  Be aware that with <A HREF="https://computing.llnl.gov/tutorials/pthreads/"><CODE>pthreads</CODE></A> the parent thread is always pinned. If you create for example 4 threads with <CODE>pthread_create</CODE> and do not use the parent process as worker you still have to provide <CODE>num_threads + 1</CODE> processo [...]
+<BR>
+<CODE>likwid-pin</CODE> supports different numberings for pinning. Per default physical numbering of the cores is used. This is the numbering also \ref likwid-topology reports. But also logical numbering inside the node or the sockets can be used. For details look at \ref CPU_expressions. <!--If using with a N (e.g. -c N:0-6) the cores are logical numbered over the whole node. Physical cores come first. If a system e.g. has 8 cores with 16 SMT threads with -c N:0-7 you get all physical c [...]
+
+For applications where first touch policy on NUMA systems cannot be employed <CODE>likwid-pin</CODE> can be used to turn on interleave memory placement. This can significantly speed up the performance of memory bound multi threaded codes. All NUMA nodes the user pinned threads to are used for interleaving.
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h, --help</TD>
+  <TD>Print help message</TD>
+</TR>
+<TR>
+  <TD>-v, --version</TD>
+  <TD>Print version information</TD>
+</TR>
+<TR>
+  <TD>-V, --verbose <level></TD>
+  <TD>Verbose output during execution for debugging. Possible values for <level>:
+  <TABLE>
+    <TR>
+      <TD>0</TD>
+      <TD>Output only errors</TD>
+    </TR>
+    <TR>
+      <TD>1</TD>
+      <TD>Output some information</TD>
+    </TR>
+    <TR>
+      <TD>2</TD>
+      <TD>Output detailed information</TD>
+    </TR>
+    <TR>
+      <TD>3</TD>
+      <TD>Output developer information</TD>
+    </TR>
+  </TABLE>
+  </TD>
+</TR>
+<TR>
+  <TD>-c <arg></TD>
+  <TD>Define the CPUs that the application should be pinned on. LIKWID provides an intuitive and feature-rich syntax for CPU expressions.<BR>See section \ref CPU_expressions for details.</TD>
+</TR>
+<TR>
+  <TD>-S, --sweep</TD>
+  <TD>Sweep memory and clean LLC of NUMA domains used by the given CPU expression</TD>
+</TR>
+<TR>
+  <TD>-i</TD>
+  <TD>Activate interleaved memory policy for NUMA domains used by the given CPU expression</TD>
+</TR>
+<TR>
+  <TD>-p</TD>
+  <TD>Print the thread affinity domains. If -c is set on the commandline, the affinity domains filled only with the given CPUs are printed.</TD>
+</TR>
+<TR>
+  <TD>-q, --quiet</TD>
+  <TD>Don't print infos of the pinning process</TD>
+</TR>
+<TR>
+  <TD>-s, --skip <arg></TD>
+  <TD>'arg' must be a bitmask in hex. Threads with the ID equal to a set bit in bitmask will be skipped during pinning<BR>Example: 0x1 = Thread 0 is skipped.</TD>
+</TR>
+<TR>
+  <TD>-d</TD>
+  <TD>Set the delimiter for the output of -p. Default is ','</TD>
+</TR>
+</TABLE>
+
+\anchor thread_affinity_domains
+<H1>Affinity Domains</H1>
+While gathering the system topology, LIKWID groups the CPUs into so-called thread affinity domains. A thread affinity domain is a group of CPU IDs that are related to some kind of central entity of the system. The most common domain is the node domain (<CODE>N</CODE>) that contains all CPUs available in the system. Other domains group the CPUs according to socket, LLC or NUMA node relation. <CODE>likwid-pin</CODE> prints out all available affinity domains with the commandline option <COD [...]
+<TABLE>
+<TR>
+  <TH>Domain name</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD><CODE>N</CODE></TD>
+  <TD>Includes all CPUs in the system</TD>
+</TR>
+<TR>
+  <TD><CODE>S<number></CODE></TD>
+  <TD>Includes all CPUs that reside on the CPU socket with ID <CODE><number></CODE></TD>
+</TR>
+<TR>
+  <TD><CODE>C<number></CODE></TD>
+  <TD>Includes all CPUs that share the same LLC with ID <CODE><number></CODE>.<BR>This domain often contains the same CPUs as the <CODE>S<number></CODE> domain because many CPU sockets have an LLC shared by all CPUs of the socket</TD>
+</TR>
+<TR>
+  <TD><CODE>M<number></CODE></TD>
+  <TD>Includes all CPUs that are attached to the same NUMA memory domain</TD>
+</TR>
+</TABLE>
+
+\anchor CPU_expressions
+<H1>CPU expressions</H1>
+One outstanding feature of LIKWID is its CPU expressions, which are resolved to the CPUs in the actual system. There are multiple formats that can be chosen where each offers a convenient way to select the desired CPUs for execution or measurement. The CPU expressions are used for <CODE>likwid-pin</CODE> as well as \ref likwid-perfctr. This section introduces the 4 formats and gives examples.
+
+<H3>Physical numbering:</H3>
+The first and probably most natural way of defining a list of CPUs is the usage of the physical numbering, similar to the numbering of the operating system and the IDs printed by \ref likwid-topology. The desired CPU IDs can be set as comma-separated list, as range or a combination of both.
+<UL>
+<LI><CODE>-c 1</CODE><BR>
+Run only on CPU with ID 1
+</LI>
+<LI><CODE>-c 1,4</CODE><BR>
+Run on CPUs with ID 1 and 4
+</LI>
+<LI><CODE>-c 1-3</CODE><BR>
+Run on CPUs ranging from ID 1 to ID 3, hence CPUs 1,2,3
+</LI>
+<LI><CODE>-c 0,1-3</CODE><BR>
+Run on CPU with ID 0 and the CPU range starting from ID 1 to ID 3, hence 0,1,2,3
+</LI>
+</UL>
+<H3>Logical numbering:</H3>
+Besides the enumeration of physical CPU IDs, LIKWID supports the logical numbering inside of an affinity domain. For logical selection, the indicies inside of the desired affinity domain has to be given on the commandline. The logical numbering can be selected by prefixing the cpu expression with <CODE>L:</CODE>. The format is <CODE>L:<indices></CODE> assuming affinity domain <CODE>N</CODE> or <CODE>L:<affinity domain>:<indices></CODE>. Moreover, it is automatically act [...]
+<UL>
+<LI><CODE>-c L:0</CODE><BR>
+Run only on CPU 0, the first entry in the <B>sorted</B> affinity domain <CODE>N</CODE>
+</LI>
+<LI><CODE>-c L:0,4</CODE><BR>
+Run on the first and fifth entry in the <B>sorted</B> affinity domain <CODE>N</CODE>
+</LI>
+<LI><CODE>-c L:1-3</CODE><BR>
+Run on CPUs ranging from index 1 to index 3 in the <B>sorted</B> affinity domain <CODE>N</CODE>, hence CPUs 1,2,3.
+</LI>
+<LI><CODE>-c L:N:1,4-6</CODE><BR>
+Run on CPUs with index 1 and the range of indices from 4 to 6 in given <B>sorted</B> affinity domain <CODE>N</CODE>, hence CPUs 1,4,5,6.
+</LI>
+</UL>
+<H3>Numbering by expression:</H3>
+The most powerful format is probably the expression format. The format combines the input values for a selection function in a convenient way. In order to activate the expression format, the CPU string must be prefixed with <CODE>E:</CODE>. The basic format is <CODE>E:<affinity domain>:<numberOfThreads></CODE> which selects simply the given <CODE><numberOfThreads></CODE> in the supplied <CODE><affinity domain></CODE>. The extended format is <CODE>E:<affinity do [...]
+<UL>
+<LI><CODE>-c E:N:1</CODE><BR>
+Selects the first entry in the node affinity domain, thus CPU 0
+</LI>
+<LI><CODE>-c E:N:2</CODE><BR>
+Selects the first two entries in the node affinity domain, thus CPUs 0 and 4
+</LI>
+<LI><CODE>-c E:N:2:1:2</CODE><BR>
+Selects 1 CPU in a row and skips 1 entry, thus we get CPUs 0 and 1
+</LI>
+<LI><CODE>-c E:N:4:2:4</CODE><BR>
+Selects in total 4 CPUs, 2 in a row with a stride of 4, thus CPUs 0,4,2,6
+</LI>
+</UL>
+<H3>Scatter expression:</H3>
+The scatter expression distributes the threads evenly over the desired affinity domains. In contrast to the previous selection methods, the scatter expression schedules threads over multiple affinity domains. Although you can also select <CODE>N</CODE> as scatter domain, the intended domains are <CODE>S</CODE>, <CODE>C</CODE> and <CODE>M</CODE>. The scattering selects physical cores first. For the examples we assume that the socket affinity domain looks like this: <CODE>S0 = 0,4,1,5</COD [...]
+<UL>
+<LI><CODE>-c S:scatter</CODE><BR>
+The resulting CPU list is 0,2,1,3,4,6,5,7
+</LI>
+<LI><CODE>-c M:scatter</CODE><BR>
+Scatter the threads evenly over all NUMA memory domains. A kind of interleaved thread policy.
+</LI>
+</UL>
+*/
diff --git a/doc/applications/likwid-powermeter.md b/doc/applications/likwid-powermeter.md
new file mode 100644
index 0000000..489689d
--- /dev/null
+++ b/doc/applications/likwid-powermeter.md
@@ -0,0 +1,75 @@
+/*! \page likwid-powermeter <CODE>likwid-powermeter</CODE>
+
+<H1>Information</H1>
+likwid-powermeter is a command line application to get the energy consumption on Intel RAPL capable processors. Currently
+all Intel CPUs starting with Intel SandyBridge are supported. It also prints information about TDP and Turbo Mode steps supported.
+The Turbo Mode information works on all Turbo mode enabled Intel processors. The tool can be either used in stethoscope mode for a specified duration or as a wrapper to your application measuring your complete run. RAPL works on a per package (socket) basis.
+Please note that the RAPL counters are also accessible as normal events within \ref likwid-perfctr.
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h, --help</TD>
+  <TD>Print help message</TD>
+</TR>
+<TR>
+  <TD>-v, --version</TD>
+  <TD>Print version information</TD>
+</TR>
+<TR>
+  <TD>-V, --verbose <level></TD>
+  <TD>Verbose output during execution for debugging. Possible values for <level>:
+  <TABLE>
+    <TR>
+      <TD>0</TD>
+      <TD>Output only errors</TD>
+    </TR>
+    <TR>
+      <TD>1</TD>
+      <TD>Output some information</TD>
+    </TR>
+    <TR>
+      <TD>2</TD>
+      <TD>Output detailed information</TD>
+    </TR>
+    <TR>
+      <TD>3</TD>
+      <TD>Output developer information</TD>
+    </TR>
+  </TABLE>
+  </TD>
+</TR>
+<TR>
+  <TD>-c <arg></TD>
+  <TD>Specify sockets to measure</TD>
+</TR>
+<TR>
+  <TD>-M <0|1></TD>
+  <TD>Set access mode to access MSRs. 0=direct, 1=accessDaemon</TD>
+</TR>
+<TR>
+  <TD>-s <time></TD>
+  <TD>Set measure duration in us, ms or s. (default 2s)</TD>
+</TR>
+<TR>
+  <TD>-i, --info</TD>
+  <TD>Print information from <CODE>MSR_*_POWER_INFO</CODE> register and Turbo mode</TD>
+</TR>
+<TR>
+  <TD>-t</TD>
+  <TD>Print current temperatures of all CPU cores</TD>
+</TR>
+<TR>
+  <TD>-f</TD>
+  <TD>Print current temperatures of all CPU cores in Fahrenheit</TD>
+</TR>
+<TR>
+  <TD>-p</TD>
+  <TD>Print dynamic clocking and CPI values, uses \ref likwid-perfctr</TD>
+</TR>
+</TABLE>
+*/
diff --git a/doc/applications/likwid-setFreq.md b/doc/applications/likwid-setFreq.md
new file mode 100644
index 0000000..0db59e6
--- /dev/null
+++ b/doc/applications/likwid-setFreq.md
@@ -0,0 +1,13 @@
+/*! \page likwid-setFreq <CODE>likwid-setFreq</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-setFreq</CODE> is a command line application that mediates the actual setting of CPU cores' frequency and governor for \ref likwid-setFrequencies. Since only users with root privileges are allowed to change the frequency of CPU cores, <CODE>likwid-setFreq</CODE> needs to be suid-root.
+
+<H1>Setup</H1>
+Setting the suid-root bit:<BR>
+<CODE>
+root: # chown root:root likwid-setFreq<BR>
+root: # chmod u+s likwid-setFreq
+</CODE>
+
+*/
diff --git a/doc/applications/likwid-setFrequencies.md b/doc/applications/likwid-setFrequencies.md
new file mode 100644
index 0000000..e753a9e
--- /dev/null
+++ b/doc/applications/likwid-setFrequencies.md
@@ -0,0 +1,50 @@
+/*! \page likwid-setFrequencies <CODE>likwid-setFrequencies</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-setFrequencies</CODE> is a command line application to set the clock frequency of CPU cores. Since only privileged users are allowed to change the frequency of CPU cores, the application works in combination with a daemon
+\ref likwid-setFreq . The daemon needs the suid permission bit to be set in order to manipulate the sysfs entries. With <CODE>likwid-setFrequencies</CODE> the clock of all cores inside the cpu_list or affinity domain can be set to a specific frequency or governor at once.
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h, --help</TD>
+  <TD>Print help message.</TD>
+</TR>
+<TR>
+  <TD>-v, --version</TD>
+  <TD>Print version information.</TD>
+</TR>
+<TR>
+  <TD>-l</TD>
+  <TD>Print all configurable frequencies.</TD>
+</TR>
+<TR>
+  <TD>-p</TD>
+  <TD>Print the current frequencies for all CPU cores.</TD>
+</TR>
+<TR>
+  <TD>-m</TD>
+  <TD>Print all configurable governors.</TD>
+</TR>
+<TR>
+  <TD>-c <arg></TD>
+  <TD>Define the CPUs that should be modified. For information about the syntax see \ref CPU_expressions on the \ref likwid-pin page.</TD>
+</TR>
+<TR>
+  <TD>-f, --freq <arg></TD>
+  <TD>Specify the frequency for the selected CPUs.</TD>
+</TR>
+<TR>
+  <TD>-g <arg></TD>
+  <TD>Specify the governor for the selected CPUs.</TD>
+</TR>
+</TABLE>
+
+<H1>Notice</H1>
+Shortly before releasing the first version of LIKWID 4, the CPU frequency module and its behavior have changed compared to the previous <B>cpufreq</B> module. It is not possible anymore to set the CPU clock to a fixed frequency, you can only define a performance level called P-State. Inside that level, the CPU can vary its clock frequency. <CODE>likwid-setFrequencies</CODE> and its daemon \ref likwid-setFreq do not have support for the new kernel module <B>intel_pstate</B>. Therefore, th [...]
+
+*/
diff --git a/doc/applications/likwid-topology.md b/doc/applications/likwid-topology.md
new file mode 100644
index 0000000..f57a045
--- /dev/null
+++ b/doc/applications/likwid-topology.md
@@ -0,0 +1,68 @@
+/*! \page likwid-topology <CODE>likwid-topology</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-topology</CODE> is a command line application to print the thread and cache topology on multicore x86 processors. Used with mono spaced fonts it can
+draw the processor topology of a machine in ASCII art. Beyond topology <CODE>likwid-topology</CODE> determines the nominal clock of a processor and prints detailed information about the cache hierarchy.<BR>
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h, --help</TD>
+  <TD>Print help message</TD>
+</TR>
+<TR>
+  <TD>-v, --version</TD>
+  <TD>Print version information</TD>
+</TR>
+<TR>
+  <TD>-V, --verbose <level></TD>
+  <TD>Verbose output during execution for debugging. Possible values for <level>:
+  <TABLE>
+    <TR>
+      <TD>0</TD>
+      <TD>Output only errors</TD>
+    </TR>
+    <TR>
+      <TD>1</TD>
+      <TD>Output some information</TD>
+    </TR>
+    <TR>
+      <TD>2</TD>
+      <TD>Output detailed information</TD>
+    </TR>
+    <TR>
+      <TD>3</TD>
+      <TD>Output developer information</TD>
+    </TR>
+  </TABLE>
+  </TD>
+</TR>
+<TR>
+  <TD>-c, --caches</TD>
+  <TD>Print detailed information about all cache levels</TD>
+</TR>
+<TR>
+  <TD>-C, --clock</TD>
+  <TD>Measure the nominal clock frequency and print it</TD>
+</TR>
+<TR>
+  <TD>-g</TD>
+  <TD>ASCII art output of the system's topology</TD>
+</TR>
+<TR>
+  <TD>-O</TD>
+  <TD>Print output in CSV format (conforming to <A HREF="https://tools.ietf.org/html/rfc4180">RFC 4180</A>).</TD>
+</TR>
+<TR>
+  <TD>-o, --output <file></TD>
+  <TD>Write the output to file <file> instead of stdout. According to the used filename suffix, LIKWID tries to reformat the output to the specified format.<BR>By now, LIKWID ships with one filter script <CODE>xml</CODE> written in Perl and a Perl template for developing own output scripts. If the suffix is <CODE>.csv</CODE>, the internal CSV printer is used for file output.<BR>If <CODE>\%h</CODE> is in the filename, it is replaced by the host name.</TD>
+</TR>
+</TABLE>
+
+
+
+*/
diff --git a/doc/archs/atom.md b/doc/archs/atom.md
new file mode 100644
index 0000000..58a506c
--- /dev/null
+++ b/doc/archs/atom.md
@@ -0,0 +1,104 @@
+/*! \page atom Intel® Atom
+
+<P>The Intel® Atom performance monitoring counters are equal to the ones of the Intel® Core 2 microarchitecture but the event set is different.</P>
+<H1>Available performance monitors for the Intel® Atom microarchitecture</H1>
+<UL>
+<LI>\ref ATOM_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref ATOM_PMC "General-purpose counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor ATOM_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>The Core2/Atom microarchitecture is the first architecture offering a set of fixed-purpose counters. Each counter can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor ATOM_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Atom microarchitecture provides 2 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+
+
+
+*/
diff --git a/doc/archs/broadwell.md b/doc/archs/broadwell.md
new file mode 100644
index 0000000..ff207af
--- /dev/null
+++ b/doc/archs/broadwell.md
@@ -0,0 +1,203 @@
+/*! \page broadwell Intel® Broadwell
+
+<P>This page is valid for Broadwell, Broadwell single socket server (Xeon D) and Broadwell EP/EN/EX. There is no Uncore support yet because no documentation is available for the Uncore counters of Broadwell.</P>
+
+<H1>Available performance monitors for the Intel® Broadwell microarchitecture</H1>
+<UL>
+<LI>\ref BRD_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref BRD_PMC "General-purpose counters"</LI>
+<LI>\ref BRD_THERMAL "Thermal counters"</LI>
+<LI>\ref BRD_POWER "Power measurement counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor BRD_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor BRD_PMC
+<H2>General-purpose counters</H2>
+<P>Commonly the Intel® Broadwell microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>in_transaction</TD>
+  <TD>N</TD>
+  <TD>Set bit 32 in config register</TD>
+  <TD>Only available if Intel® Transactional Synchronization Extensions are available</TD>
+</TR>
+<TR>
+  <TD>in_transaction_aborted</TD>
+  <TD>N</TD>
+  <TD>Set bit 33 in config register</TD>
+  <TD>Only counter PMC2 and only if Intel® Transactional Synchronization Extensions are available</TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Broadwell microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Broadwell microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied with the [...]
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/BDW">https://download.01.org/perfmon/BDW</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>22 bit hex value</TD>
+  <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/BDW">https://download.01.org/perfmon/BDW</A>.</TD>
+</TR>
+</TABLE>
+<P>The event MEM_TRANS_RETIRED_LOAD_LATENCY is not available because it needs programming of PEBS registers. PEBS is a kernel-level measurement facility for performance monitoring. Although we can program it from user-space, the results are always 0.</P>
+
+\anchor BRD_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® Broadwell microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>TMP0</TD>
+  <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor BRD_POWER
+<H2>Power counter</H2>
+<P>The Intel® Broadwell microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PWR0</TD>
+  <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR1</TD>
+  <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR2</TD>
+  <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR3</TD>
+  <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+
+*/
diff --git a/doc/archs/core2.md b/doc/archs/core2.md
new file mode 100644
index 0000000..679da04
--- /dev/null
+++ b/doc/archs/core2.md
@@ -0,0 +1,103 @@
+/*! \page core2 Intel® Core2
+
+<H1>Available performance monitors for the Intel® Core2 microarchitecture</H1>
+<UL>
+<LI>\ref FIXED "Fixed-purpose counters"</LI>
+<LI>\ref PMC "General-purpose counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor FIXED
+<H2>Fixed-purpose counters</H2>
+<P>The Intel Core2 microarchitecture is the first architecture offering a set of fixed-purpose counters. Each counter can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Core2 microarchitecture provides 2 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+
+
+
+*/
diff --git a/doc/archs/haswell.md b/doc/archs/haswell.md
new file mode 100644
index 0000000..65836bd
--- /dev/null
+++ b/doc/archs/haswell.md
@@ -0,0 +1,203 @@
+/*! \page haswell Intel® Haswell
+
+<H1>Available performance monitors for the Intel® Haswell microarchitecture</H1>
+<UL>
+<LI>\ref HAS_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref HAS_PMC "General-purpose counters"</LI>
+<LI>\ref HAS_THERMAL "Thermal counters"</LI>
+<LI>\ref HAS_POWER "Power measurement counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor HAS_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HAS_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Haswell microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>in_transaction</TD>
+  <TD>N</TD>
+  <TD>Set bit 32 in config register</TD>
+  <TD>Only available if Intel® Transactional Synchronization Extensions are available</TD>
+</TR>
+<TR>
+  <TD>in_transaction_aborted</TD>
+  <TD>N</TD>
+  <TD>Set bit 33 in config register</TD>
+  <TD>Only counter PMC2 and only if Intel® Transactional Synchronization Extensions are available</TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Haswell microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Haswell microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied with the OFF [...]
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Input value masked with 0x8077 and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/HSW">https://download.01.org/perfmon/HSW</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>22 bit hex value</TD>
+  <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/HSW">https://download.01.org/perfmon/HSW</A>.</TD>
+</TR>
+</TABLE>
+<P>The event MEM_TRANS_RETIRED_LOAD_LATENCY is not available because it needs programming of PEBS registers. PEBS is a kernel-level measurement facility. Although we can program it from user-space, the results are always 0.</P>
+
+\anchor HAS_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® Haswell microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>TMP0</TD>
+  <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor HAS_POWER
+<H2>Power counter</H2>
+<P>The Intel® Haswell microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PWR0</TD>
+  <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR1</TD>
+  <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR2</TD>
+  <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR3</TD>
+  <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+
+*/
+
+
diff --git a/doc/archs/haswellep.md b/doc/archs/haswellep.md
new file mode 100644
index 0000000..9368c54
--- /dev/null
+++ b/doc/archs/haswellep.md
@@ -0,0 +1,896 @@
+/*! \page haswellep Intel® Haswell EP/EN/EX
+
+
+<H1>Available performance monitors for the Intel® Haswell EP/EN/EX microarchitecture</H1>
+<UL>
+<LI>\ref HASEP_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref HASEP_PMC "General-purpose counters"</LI>
+<LI>\ref HASEP_THERMAL "Thermal counters"</LI>
+<LI>\ref HASEP_POWER "Power measurement counters"</LI>
+<LI>\ref HASEP_BBOX "Home Agent counters"</LI>
+<LI>\ref HASEP_SBOX "Ring transfer counters"</LI>
+<LI>\ref HASEP_QBOX "Intel® QPI Link Layer counters"</LI>
+<LI>\ref HASEP_CBOX "Last Level cache counters"</LI>
+<LI>\ref HASEP_UBOX "Uncore management counters"</LI>
+<LI>\ref HASEP_WBOX "Power control unit counters"</LI>
+<LI>\ref HASEP_IBOX "Coherency for IIO traffic counters"</LI>
+<LI>\ref HASEP_MBOX "Integrated memory controller counters"</LI>
+<LI>\ref HASEP_RBOX "Ring-to-QPI interface counters"</LI>
+<LI>\ref HASEP_PBOX "Ring-to-PCIe interface counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor HASEP_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>in_transaction</TD>
+  <TD>N</TD>
+  <TD>Set bit 32 in config register</TD>
+  <TD>Only available if Intel® Transactional Synchronization Extensions are available</TD>
+</TR>
+<TR>
+  <TD>in_transaction_aborted</TD>
+  <TD>N</TD>
+  <TD>Set bit 33 in config register</TD>
+  <TD>Only counter PMC2 and only if Intel® Transactional Synchronization Extensions are available</TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Haswell microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied wit [...]
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/HSX">https://download.01.org/perfmon/HSX</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>22 bit hex value</TD>
+  <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/HSX">https://download.01.org/perfmon/HSX</A>.</TD>
+</TR>
+</TABLE>
+<P>The event MEM_TRANS_RETIRED_LOAD_LATENCY is not available because it needs programming of PEBS registers. PEBS is a kernel-level measurement facility. Although we can program it from user-space, the results are always 0.</P>
+
+\anchor HASEP_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>TMP0</TD>
+  <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor HASEP_POWER
+<H2>Power counter</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PWR0</TD>
+  <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR1</TD>
+  <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR2</TD>
+  <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR3</TD>
+  <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+
+
+\anchor HASEP_BBOX
+<H2>Home Agent counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the Home Agent (HA) in the Uncore. The description from Intel®:<BR>
+<I>Each HA is responsible for the protocol side of memory interactions, including coherent and non-coherent home agent protocols (as defined in the Intel® QuickPath Interconnect Specification). Additionally, the HA is responsible for ordering memory reads/writes, coming in from the modular Ring, to a given address such that the IMC (memory controller).
+</I><BR>
+The Home Agent performance counters are exposed to the operating system through PCI interfaces. There are two of those interfaces for the HA. For systems where each socket has 12 or more cores, there are both HAs available. The name BBOX originates from the Nehalem EX Uncore monitoring where this functional unit is called BBOX.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>6 bit hex value</TD>
+  <TD>Set bits 0-5 in PCI_UNC_HA_PMON_OPCODEMATCH register of PCI device</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>46 bit hex address</TD>
+  <TD>Extract bits 6-31 and set bits 6-31 in PCI_UNC_HA_PMON_ADDRMATCH0 register of PCI device<BR>Extract bits 32-45 and set bits 0-13 in PCI_UNC_HA_PMON_ADDRMATCH1 register of PCI device</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_SBOX
+<H2>Ring-to-Ring interface counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture manages the socket-internal traffic through ring-based networks. Depending on the system's configuration there are multiple rings in one socket. The SBOXes organize the traffic between the rings. The description from Intel®:<BR>
+<I>The SBox manages the interface between the two Rings.<BR>
+The processor is composed of two independent rings connected via two sets of bi-directional buffered switches. Each set of bi-directional buffered switches is partitioned into two ingress/egress pairs. Further, each ingress/egress pair is associated with a ring stop on adjacent rings. This ring stop is termed an Sbo. The processor has up to 4 SBos depending on SKU. The Sbo can be simply thought of as a conduit for the ring, but must also help maintain ordering of traffic to ensure functi [...]
+</I><BR>
+The SBOX hardware performance counters are exposed to the operating system through the MSR interface. There are at most four of those interfaces, but not all of them need to be present. The name SBOX originates from the Nehalem EX Uncore monitoring, where the functional unit to the QPI network is called SBOX, but it had a different duty.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>SBOX<0-3>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0-3>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0-3>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0-3>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>tid</TD>
+  <TD>N</TD>
+  <TD>Set bit 19 in config register</TD>
+  <TD>This option has no real effect because TID filtering can be activated but there is no possibility to specify the TID somewhere.</TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_QBOX
+<H2>QPI interface counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the QPI Link layer (QPI) in the Uncore. The description from Intel®:<BR>
+<I>The Intel® QPI Link Layer is responsible for packetizing requests from the caching agent on the way out to the system interface. As such, it shares responsibility with the CBo(s) as the Intel® QPI caching agent(s). It is responsible for converting CBo requests to Intel® QPI messages (i.e. snoop generation and data response messages from the snoop response) as well as converting/forwarding ring messages to Intel® QPI packets and vice versa. On Intel® Xeon processor  [...]
+</I><BR>
+The QPI hardware performance counters are exposed to the operating system through PCI interfaces. There are two of those interfaces for the QPI. The actual number of QBOX counters depends on the CPU core count of one socket. If your system does not have all interfaces and interface 0 does not work, try the other ones. The QBOX was introduced for the Haswell EP microarchitecture; for older Uncore-aware architectures the QBOX and the SBOX are the same.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>QBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>QBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>QBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>QBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>QBOX<0,1>FIX0</TD>
+  <TD>QPI_RATE</TD>
+</TR>
+<TR>
+  <TD>QBOX<0,1>FIX1</TD>
+  <TD>QPI_RX_IDLE</TD>
+</TR>
+<TR>
+  <TD>QBOX<0,1>FIX2</TD>
+  <TD>QPI_RX_LLR</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for QBOX<0,1>C<0,1,2,3> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>32 bit hex address</TD>
+  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_V3_QPI_PMON_RX_MATCH_0 register of PCI device</TD>
+  <TD>This option matches the receive side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>20 bit hex address</TD>
+  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_V3_QPI_PMON_RX_MATCH_1 register of PCI device</TD>
+  <TD>This option matches the receive side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>match2</TD>
+  <TD>32 bit hex address</TD>
+  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_V3_QPI_PMON_TX_MATCH_0 register of PCI device</TD>
+  <TD>This option matches the transmit side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>match3</TD>
+  <TD>20 bit hex address</TD>
+  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_V3_QPI_PMON_TX_MATCH_1 register of PCI device</TD>
+  <TD>This option matches the transmit side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>mask0</TD>
+  <TD>32 bit hex address</TD>
+  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_V3_QPI_PMON_RX_MASK_0 register of PCI device</TD>
+  <TD>This option masks the receive side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>mask1</TD>
+  <TD>20 bit hex address</TD>
+  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_V3_QPI_PMON_RX_MASK_1 register of PCI device</TD>
+  <TD>This option masks the receive side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>mask2</TD>
+  <TD>32 bit hex address</TD>
+  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_V3_QPI_PMON_TX_MASK_0 register of PCI device</TD>
+  <TD>This option masks the transmit side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>mask3</TD>
+  <TD>20 bit hex address</TD>
+  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_V3_QPI_PMON_TX_MASK_1 register of PCI device</TD>
+  <TD>This option masks the transmit side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_CBOX
+<H2>Last Level cache counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the LLC coherency engine in the Uncore. The description from Intel®:<BR>
+<I>The LLC coherence engine (CBo) manages the interface between the core and the last level cache (LLC). All core transactions that access the LLC are directed from the core to a CBo via the ring interconnect. The CBo is responsible for managing data delivery
+from the LLC to the requesting core. It is also responsible for maintaining coherence between the cores within the socket that share the LLC; generating snoops and collecting snoop responses from the local cores when the MESIF protocol requires it.
+</I><BR>
+The LLC hardware performance counters are exposed to the operating system through the MSR interface. The maximum number of supported coherency engines for the Intel® Haswell EP/EN/EX microarchitecture is 17. E7-8800 v2 systems have all 17 engines, the E5-2600 v2 only 10 of them and the E5-1600 v2 only 6. It may be possible that your system does not have all CBOXes, LIKWID will skip the unavailable ones in the setup phase. The name CBOX originates from the Nehalem EX Uncore monitorin [...]
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>CBOX<0-17>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-17>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-17>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-17>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>tid</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 0-4 in MSR_UNC_C<0-17>_PMON_BOX_FILTER register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>state</TD>
+  <TD>6 bit hex value</TD>
+  <TD>Set bits 17-22 in MSR_UNC_C<0-17>_PMON_BOX_FILTER register</TD>
+  <TD>M: 0x28, F: 0x10, M: 0x08, E: 0x04, S: 0x02, I: 0x01</TD>
+</TR>
+<TR>
+  <TD>nid</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Set bits 0-15 in MSR_UNC_C<0-17>_PMON_BOX_FILTER1 register</TD>
+  <TD>Note: Node 0 has value 0x0001</TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>9 bit hex value</TD>
+  <TD>Set bits 20-28 in MSR_UNC_C<0-17>_PMON_BOX_FILTER1 register</TD>
+  <TD>A list of valid opcodes can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>2 bit hex address</TD>
+  <TD>Set bits 30-31 in MSR_UNC_C<0-17>_PMON_BOX_FILTER1 register</TD>
+  <TD>See the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for more information.</TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides an event LLC_LOOKUP which can be filtered with the 'state' option. If no 'state' is set, LIKWID sets the state to 0x1F, the default value to measure all lookups.</P>
+
+\anchor HASEP_UBOX
+<H2>Uncore management counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the management box in the Uncore. The description from Intel®:<BR>
+<I>The UBox serves as the system configuration controller within the physical processor. In this capacity, the UBox acts as the central unit for a variety of functions:
+<UL>
+<LI>The master for reading and writing physically distributed registers across Intel® Xeon processor E5 v3 family using the Message Channel.</LI>
+<LI>The UBox is the intermediary for interrupt traffic, receiving interrupts from the system and dispatching interrupts to the appropriate core.</LI>
+<LI>The UBox serves as the system lock master used when quiescing the platform (e.g., Intel® QPI bus lock).</LI>
+</UL>
+</I><BR>
+The Uncore management performance counters are exposed to the operating system through the MSR interface. The name UBOX originates from the Nehalem EX Uncore monitoring where those functional units are called UBOX.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOXFIX</TD>
+  <TD>UBOX_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for UBOX<0,1> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_WBOX
+<H2>Power control unit counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the power control unit (PCU) in the Uncore. The description from Intel®:<BR>
+<I>The PCU is the primary Power Controller for the Intel® Xeon processor E5 v3 family. Intel® Xeon processor E5 v3 family uncore implements a power control unit acting as a core/uncore power and thermal manager. It runs its firmware on an internal micro-controller and coordinates the socket’s power states.
+</I><BR>
+The PCU performance counters are exposed to the operating system through the MSR interface. The name WBOX originates from the Nehalem EX Uncore monitoring where those functional units are called WBOX.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>WBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX0FIX</TD>
+  <TD>CORES_IN_C3</TD>
+</TR>
+<TR>
+  <TD>WBOX1FIX</TD>
+  <TD>CORES_IN_C6</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for WBOX<0-3> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>32 bit hex value</TD>
+  <TD>Set bits 0-31 in MSR_UNC_PCU_PMON_BOX_FILTER register</TD>
+  <TD>Band0: bits 0-7, Band1: bits 8-15, Band2: bits 16-23, Band3: bits 24-31</TD>
+</TR>
+<TR>
+  <TD>occupancy</TD>
+  <TD>2 bit hex value</TD>
+  <TD>Set bits 14-15 in config register</TD>
+  <TD>Cores in C0: 0x1, in C3: 0x2, in C6: 0x3</TD>
+</TR>
+<TR>
+  <TD>occupancy_edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>occupancy_invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 30 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_IBOX
+<H2>IRP box counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the IRP box in the Uncore. The description from Intel®:<BR>
+<I>IRP is responsible for maintaining coherency for IIO traffic that needs to be coherent (e.g. cross-socket P2P).
+</I>
+
+The IRP box counters are exposed to the operating system through the PCI interface. The IBOX was introduced with the Intel® IvyBridge EP/EN/EX microarchitecture.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>IBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>IBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_MBOX
+<H2>Memory controller counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the integrated Memory Controllers (iMC) in the Uncore. The description from Intel®:<BR>
+<I>The Intel® Xeon processor E5 v3 family integrated Memory Controller provides the interface to DRAM and communicates to the rest of the uncore through the Home Agent (i.e. the IMC does not connect to the Ring).<BR>
+In conjunction with the HA, the memory controller also provides a variety of RAS features, such as ECC, lockstep, memory access retry, memory scrubbing, thermal throttling, mirroring, and rank sparing.
+</I><BR>
+The integrated Memory Controllers performance counters are exposed to the operating system through PCI interfaces. There may be two memory controllers in the system (E7-8800 v2). There are 4 different PCI devices per memory controller, each covering one memory channel. Each channel has 4 different general-purpose counters and one fixed counter for the DRAM clock. The four channels of the first memory controller are MBOX0-3, the four channels of the second memory controller (if available) [...]
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>MBOX<0-7>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-7>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-7>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-7>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-7>FIX</TD>
+  <TD>DRAM_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for counter MBOX<0-7>C<0-3>)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_RBOX
+<H2>Ring-to-QPI counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the Ring-to-QPI (R3QPI) interface in the Uncore. The description from Intel®:<BR>
+<I>R3QPI is the interface between the Intel® QPI Link Layer, which packetizes requests, and the Ring.<BR>
+R3QPI is the interface between the ring and the Intel® QPI Link Layer. It is responsible for translating between ring protocol packets and flits that are used for transmitting data across the Intel® QPI interface. It performs credit checking between the local Intel® QPI LL, the remote Intel® QPI LL and other agents on the local ring.
+</I><BR>
+The Ring-to-QPI performance counters are exposed to the operating system through PCI interfaces. Since the RBOXes manage the traffic from the LLC-connecting ring interface on the socket with the QPI interfaces (SBOXes), the amount is similar to the amount of SBOXes. See at SBOXes how many are available for which system configuration. The name RBOX originates from the Nehalem EX Uncore monitoring where those functional units are called RBOX.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>RBOX<0,1,2>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1,2>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1,2>C2</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_PBOX
+<H2>Ring-to-PCIe counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the Ring-to-PCIe (R2PCIe) interface in the Uncore. The description from Intel®:<BR>
+<I>R2PCIe represents the interface between the Ring and IIO traffic to/from PCIe.
+</I><BR>
+The Ring-to-PCIe performance counters are exposed to the operating system through a PCI interface. Independent of the system's configuration, there is only one Ring-to-PCIe interface per CPU socket.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/archs/interlagos.md b/doc/archs/interlagos.md
new file mode 100644
index 0000000..cec7734
--- /dev/null
+++ b/doc/archs/interlagos.md
@@ -0,0 +1,107 @@
+/*! \page interlagos AMD® Interlagos
+
+<H1>Available performance monitors for the AMD® Interlagos microarchitecture</H1>
+<UL>
+<LI>\ref ILG_PMC "General-purpose counters"</LI>
+<LI>\ref ILG_UPMC "Northbridge general-purpose counters"</LI>
+</UL>
+
+
+\anchor ILG_PMC
+<H2>General-purpose counters</H2>
+<P>The AMD® Interlagos microarchitecture provides 6 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC4</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC5</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD>The value for threshold can range between 0x0 and 0x1F</TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor ILG_UPMC
+<H2>Northbridge general-purpose counters</H2>
+<P>The AMD® Interlagos microarchitecture provides 4 general-purpose counters for the Northbridge consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UPMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/archs/ivybridge.md b/doc/archs/ivybridge.md
new file mode 100644
index 0000000..3008475
--- /dev/null
+++ b/doc/archs/ivybridge.md
@@ -0,0 +1,190 @@
+/*! \page ivybridge Intel® IvyBridge
+
+<H1>Available performance monitors for the Intel® IvyBridge microarchitecture</H1>
+<UL>
+<LI>\ref IVB_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref IVB_PMC "General-purpose counters"</LI>
+<LI>\ref IVB_THERMAL "Thermal counters"</LI>
+<LI>\ref IVB_POWER "Power measurement counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor IVB_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVB_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® IvyBridge microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® IvyBridge microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® IvyBridge microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied with the  [...]
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/IVB">https://download.01.org/perfmon/IVB</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>22 bit hex value</TD>
+  <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/IVB">https://download.01.org/perfmon/IVB</A>.</TD>
+</TR>
+</TABLE>
+
+\anchor IVB_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® IvyBridge microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>TMP0</TD>
+  <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor IVB_POWER
+<H2>Power counter</H2>
+<P>The Intel® IvyBridge microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PWR0</TD>
+  <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR1</TD>
+  <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR2*</TD>
+  <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR3</TD>
+  <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+<P>*) The PWR2 counter is often not implemented by Intel® IvyBridge systems</P>
+*/
+
+
diff --git a/doc/archs/ivybridgeep.md b/doc/archs/ivybridgeep.md
new file mode 100644
index 0000000..09f0bcd
--- /dev/null
+++ b/doc/archs/ivybridgeep.md
@@ -0,0 +1,790 @@
+/*! \page ivybridgeep Intel® IvyBridge EP/EN/EX
+
+<H1>Available performance monitors for the Intel® IvyBridge EP/EN/EX microarchitecture</H1>
+<UL>
+<LI>\ref IVBEP_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref IVBEP_PMC "General-purpose counters"</LI>
+<LI>\ref IVBEP_THERMAL "Thermal counters"</LI>
+<LI>\ref IVBEP_POWER "Power measurement counters"</LI>
+<LI>\ref IVBEP_BBOX "Home Agent counters"</LI>
+<LI>\ref IVBEP_SBOX "Intel® QPI Link Layer counters"</LI>
+<LI>\ref IVBEP_CBOX "Last Level cache counters"</LI>
+<LI>\ref IVBEP_UBOX "Uncore management counters"</LI>
+<LI>\ref IVBEP_WBOX "Power control unit counters"</LI>
+<LI>\ref IVBEP_IBOX "Coherency for IIO traffic counters"</LI>
+<LI>\ref IVBEP_MBOX "Integrated memory controller counters"</LI>
+<LI>\ref IVBEP_RBOX "Ring-to-QPI interface counters"</LI>
+<LI>\ref IVBEP_PBOX "Ring-to-PCIe interface counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor IVBEP_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® IvyBridge EP/EN/EX microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can b [...]
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/IVT">https://download.01.org/perfmon/IVT</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>22 bit hex value</TD>
+  <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/IVT">https://download.01.org/perfmon/IVT</A>.</TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>TMP0</TD>
+  <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor IVBEP_POWER
+<H2>Power counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PWR0</TD>
+  <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR1</TD>
+  <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR2*</TD>
+  <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR3</TD>
+  <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+<P>*) The PWR2 counter is often not implemented by Intel® IvyBridge EP/EN/EX systems</P>
+
+\anchor IVBEP_BBOX
+<H2>Home Agent counters</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the Home Agent (HA) in the Uncore. The description from Intel®:<BR>
+<I>The HA is responsible for the protocol side of memory interactions, including coherent and non-coherent home agent protocols (as defined in the Intel® QuickPath Interconnect Specification). Additionally, the HA is responsible for ordering memory reads/writes, coming in from the modular Ring, to a given address such that the iMC (memory controller).</I><BR>
+The HA hardware performance counters are exposed to the operating system through PCI interfaces. There are two of those interfaces for the HA but only for the E7-8800 v2 both are available. The name BBOX originates from the Nehalem EX Uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>6 bit hex value</TD>
+  <TD>Set bits 0-5 in PCI_UNC_HA_PMON_OPCODEMATCH register of PCI device</TD>
+  <TD>A table of all valid opcodes can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>46 bit hex address</TD>
+  <TD>Extract bits 6-31 and set bits 6-31 in PCI_UNC_HA_PMON_ADDRMATCH0 register of PCI device<BR>Extract bits 32-45 and set bits 0-13 in PCI_UNC_HA_PMON_ADDRMATCH1 register of PCI device</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_SBOX
+<H2>LLC-to-QPI interface counters</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the QPI Link layer (QPI) in the Uncore. The description from Intel®:<BR>
+<I>The Intel® QPI Link Layer is responsible for packetizing requests from the caching agent on the way out to the system interface. As such, it shares responsibility with the CBo(s) as the Intel® QPI caching agent(s). It is responsible for converting CBo requests to Intel® QPI messages (i.e. snoop generation and data response messages from the snoop response) as well as converting/forwarding ring messages to Intel® QPI packets and vice versa. On Ivy Bridge, Intel® QPI [...]
+</I><BR>
+The QPI hardware performance counters are exposed to the operating system through PCI interfaces. There are two of those interfaces for the QPI. If your system does not have all interfaces and interface 0 does not work, try the other one. The name SBOX originates from the Nehalem EX Uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>SBOX<0,1,2>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1,2>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1,2>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1,2>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1,2>FIX</TD>
+  <TD>QPI_RATE</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for SBOX<0-2>C<0-3> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>6 bit hex value</TD>
+  <TD>Set bits 0-5 in PCI_UNC_HA_PMON_OPCODEMATCH register of PCI device</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>32 bit hex address</TD>
+  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_QPI_PMON_MATCH_0 register of PCI device</TD>
+  <TD>A description of matching capabilities can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>20 bit hex address</TD>
+  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_QPI_PMON_MATCH_1 register of PCI device</TD>
+  <TD>A description of matching capabilities can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+  <TD>mask0</TD>
+  <TD>32 bit hex address</TD>
+  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_QPI_PMON_MASK_0 register of PCI device</TD>
+  <TD>A description of masking capabilities can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+  <TD>mask1</TD>
+  <TD>20 bit hex address</TD>
+  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_QPI_PMON_MASK_1 register of PCI device</TD>
+  <TD>A description of masking capabilities can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_CBOX
+<H2>CBOX counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the LLC coherency engine in the Uncore. The description from Intel®:<BR>
+<I>The LLC coherence engine (CBo) manages the interface between the core and the last level cache (LLC). All core transactions that access the LLC are directed from the core to a CBo via the ring interconnect. The CBo is responsible for managing data delivery from the LLC to the requesting core. It is also responsible for maintaining coherence between the cores within the socket that share the LLC;
+generating snoops and collecting snoop responses from the local cores when the MESIF protocol requires it.
+</I><BR>
+The LLC hardware performance counters are exposed to the operating system through the MSR interface. The maximum number of supported coherency engines for the Intel® IvyBridge EP/EN/EX microarchitecture is 15. E7-8800 v2 systems have all 15 engines, the E5-2600 v2 only 10 of them and the E5-1600 v2 only 6. It is possible that your system does not have all CBOXes; LIKWID will skip the unavailable ones in the setup phase. The name CBOX originates from the Nehalem EX Uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>CBOX<0-15>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-15>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-15>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-15>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>tid</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 0-4 in MSR_UNC_C<0-15>_PMON_BOX_FILTER register</TD>
+  <TD>A description of filter capabilities can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+  <TD>state</TD>
+  <TD>6 bit hex value</TD>
+  <TD>Set bits 17-22 in MSR_UNC_C<0-15>_PMON_BOX_FILTER register</TD>
+  <TD>M: 0x28, F: 0x10, M: 0x08, E: 0x04, S: 0x02, I: 0x01</TD>
+</TR>
+<TR>
+  <TD>nid</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Set bits 0-15 in MSR_UNC_C<0-15>_PMON_BOX_FILTER1 register</TD>
+  <TD>Note: Node 0 has value 0x0001</TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>9 bit hex value</TD>
+  <TD>Set bits 20-28 in MSR_UNC_C<0-15>_PMON_BOX_FILTER1 register</TD>
+  <TD>A table of all valid opcodes can be found in the <A HREF="http://www.intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>2 bit hex address</TD>
+  <TD>Set bits 30-31 in MSR_UNC_C<0-15>_PMON_BOX_FILTER1 register</TD>
+  <TD>A description of matching capabilities can be found in the <A HREF="http://www.intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides an event LLC_LOOKUP which can be filtered with the 'state' option. If no 'state' is set, LIKWID sets the state to 0x1F, the default value to measure all lookups.</P>
+
+
+\anchor IVBEP_UBOX
+<H2>Uncore management counters</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the management box in the Uncore. The description from Intel®:<BR>
+<I>
+The UBox serves as the system configuration controller within the physical processor. In this capacity, the UBox acts as the central unit for a variety of functions:
+<UL>
+<LI>The master for reading and writing physically distributed registers across physical processor using the Message Channel.</LI>
+<LI>The UBox is the intermediary for interrupt traffic, receiving interrupts from the system and dispatching interrupts to the appropriate core.</LI>
+<LI>The UBox serves as the system lock master used when quiescing the platform (e.g., Intel® QPI bus lock).</LI>
+</UL>
+</I><BR>
+The Uncore management performance counters are exposed to the operating system through the MSR interface. The name UBOX originates from the Nehalem EX Uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOXFIX</TD>
+  <TD>UBOX_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for UBOX<0,1> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_WBOX
+<H2>Power control unit counters</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the power control unit (PCU) in the Uncore. The description from Intel®:<BR>
+<I>The PCU is the primary Power Controller for the physical processor package. The uncore implements a power control unit acting as a core/uncore power and thermal manager. It runs its firmware on an internal micro-controller and coordinates the socket’s power states.
+</I><BR>
+The PCU performance counters are exposed to the operating system through the MSR interface. The name WBOX originates from the Nehalem EX Uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>WBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX0FIX</TD>
+  <TD>CORES_IN_C3</TD>
+</TR>
+<TR>
+  <TD>WBOX1FIX</TD>
+  <TD>CORES_IN_C6</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for WBOX<0-3> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>32 bit hex value</TD>
+  <TD>Set bits 0-31 in MSR_UNC_PCU_PMON_BOX_FILTER register</TD>
+  <TD>Band0: bits 0-7, Band1: bits 8-15, Band2: bits 16-23, Band3: bits 24-31</TD>
+</TR>
+<TR>
+  <TD>occupancy</TD>
+  <TD>2 bit hex value</TD>
+  <TD>Set bits 14-15 in config register</TD>
+  <TD>Cores in C0: 0x1, in C3: 0x2, in C6: 0x3</TD>
+</TR>
+<TR>
+  <TD>occupancy_edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>occupancy_invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 30 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_IBOX
+<H2>IBOX counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the IRP box in the Uncore. The description from Intel®:<BR>
+<I>IRP is responsible for maintaining coherency for IIO traffic that needs to be coherent (e.g. cross-socket P2P).
+</I><BR>
+The IRP box counters are exposed to the operating system through the PCI interface. The IBOX was introduced with the Intel® IvyBridge EP/EN/EX microarchitecture.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>IBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>IBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_MBOX
+<H2>MBOX counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the integrated Memory Controllers (iMC) in the Uncore. The description from Intel®:<BR>
+<I>The integrated Memory Controller provides the interface to DRAM and communicates to the rest of the uncore through the Home Agent (i.e. the iMC does not connect to the Ring).<BR>
+In conjunction with the HA, the memory controller also provides a variety of RAS features, such as ECC, lockstep, memory access retry, memory scrubbing, thermal throttling, mirroring, and rank sparing.
+</I><BR>
+The uncore management performance counters are exposed to the operating system through PCI interfaces. There may be two memory controllers in the system (E7-8800 v2). There are 4 different PCI devices per memory controller, each covering one memory channel. Each channel has 4 different general-purpose counters and one fixed counter for the DRAM clock. The four channels of the first memory controller are MBOX0-3, the four channels of the second memory controller (if available) are named M [...]
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>MBOX<0-7>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-7>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-7>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-7>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-7>FIX</TD>
+  <TD>DRAM_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for counter MBOX<0-7>C<0-3>)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_RBOX
+<H2>RBOX counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the Ring-to-QPI (R3QPI) interface in the Uncore. The description from Intel®:<BR>
+<I>R3QPI is the interface between the Intel® QPI Link Layer, which packetizes requests, and the Ring.<BR>
+R3QPI is the interface between the ring and the Intel® QPI Link Layer. It is responsible for translating between ring protocol packets and flits that are used for transmitting data across the Intel® QPI interface. It performs credit checking between the local Intel® QPI LL, the remote Intel® QPI LL and other agents on the local ring.
+</I><BR>
+The R3QPI performance counters are exposed to the operating system through PCI interfaces. Since the RBOXes manage the traffic between the LLC-connecting ring interface on the socket and the QPI interfaces (SBOXes), the number of RBOXes is similar to the number of SBOXes. See the SBOX section for how many are available for which system configuration. The name RBOX originates from the Nehalem EX Uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>RBOX<0,1,2>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1,2>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1,2>C2</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_PBOX
+<H2>PBOX counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the Ring-to-PCIe (R2PCIe) interface in the Uncore. The description from Intel®:<BR>
+<I>R2PCIe represents the interface between the Ring and IIO traffic to/from PCIe.
+</I><BR>
+The R2PCIe performance counters are exposed to the operating system through a PCI interface. Independent of the system's configuration, there is only one Ring-to-PCIe interface.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/archs/k10.md b/doc/archs/k10.md
new file mode 100644
index 0000000..a5ab582
--- /dev/null
+++ b/doc/archs/k10.md
@@ -0,0 +1,68 @@
+/*! \page k10 AMD® K10
+
+<H1>Available performance monitors for the AMD® K10 microarchitecture</H1>
+<UL>
+<LI>\ref K10_PMC "General-purpose counters"</LI>
+</UL>
+
+\anchor K10_PMC
+<H2>General-purpose counters</H2>
+<P>The AMD® K10 microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD>The value for threshold can range between 0x0 and 0x3</TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+*/
diff --git a/doc/archs/k8.md b/doc/archs/k8.md
new file mode 100644
index 0000000..5bcdcce
--- /dev/null
+++ b/doc/archs/k8.md
@@ -0,0 +1,68 @@
+/*! \page k8 AMD® K8
+
+<H1>Available performance monitors for the AMD® K8 microarchitecture</H1>
+<UL>
+<LI>\ref K8_PMC "General-purpose counters"</LI>
+</UL>
+
+\anchor K8_PMC
+<H2>General-purpose counters</H2>
+<P>The AMD® K8 microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD>The value for threshold can range between 0x0 and 0x3</TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+*/
diff --git a/doc/archs/kabini.md b/doc/archs/kabini.md
new file mode 100644
index 0000000..41824cc
--- /dev/null
+++ b/doc/archs/kabini.md
@@ -0,0 +1,162 @@
+/*! \page kabini AMD® Kabini
+
+<H1>Available performance monitors for the AMD® Kabini microarchitecture</H1>
+<UL>
+<LI>\ref KAB_PMC "General-purpose counters"</LI>
+<LI>\ref KAB_CPMC "L2 cache general-purpose counters"</LI>
+<LI>\ref KAB_UPMC "Northbridge general-purpose counters"</LI>
+</UL>
+
+
+\anchor KAB_PMC
+<H2>General-purpose counters</H2>
+<P>The AMD® Kabini microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD>The value for threshold can range between 0x0 and 0x3</TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+
+<H1>Counters available for one hardware thread per shared L2 cache</H1>
+\anchor KAB_CPMC
+<H2>L2 general-purpose counters</H2>
+<P>The AMD® Kabini microarchitecture provides 4 general-purpose counters for measuring L2 cache events. They consist of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>CPMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CPMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CPMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CPMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD>The value for threshold can range between 0x0 and 0x3</TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>tid</TD>
+  <TD>4 bit hex value</TD>
+  <TD>Set bits 56-59 in config register</TD>
+  <TD>If bit equals 0, the events of the thread are counted. See <A HREF="http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2012/10/48751_16h_bkdg.pdf">BIOS and Kernel Developer’s Guide (BKDG) for AMD Family 16h Processors</A> for details.</TD>
+</TR>
+<TR>
+  <TD>nid</TD>
+  <TD>4 bit hex value</TD>
+  <TD>Set bits 48-51 in config register</TD>
+  <TD>If bit equals 0, the events of the thread are counted. See <A HREF="http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2012/10/48751_16h_bkdg.pdf">BIOS and Kernel Developer’s Guide (BKDG) for AMD Family 16h Processors</A> for details.</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor KAB_UPMC
+<H2>Northbridge general-purpose counters</H2>
+<P>The AMD® Kabini microarchitecture provides 4 general-purpose counters for the Northbridge consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UPMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/archs/nehalem.md b/doc/archs/nehalem.md
new file mode 100644
index 0000000..b2d45b8
--- /dev/null
+++ b/doc/archs/nehalem.md
@@ -0,0 +1,237 @@
+/*! \page nehalem Intel® Nehalem
+
+<H1>Available performance monitors for the Intel® Nehalem microarchitecture</H1>
+<UL>
+<LI>\ref NEH_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref NEH_PMC "General-purpose counters"</LI>
+<LI>\ref NEH_UNCORE "General-purpose counters for the Uncore"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor NEH_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor NEH_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Nehalem microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Nehalem microarchitecture supports measuring offcore events with the PMC counters. For this, the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Nehalem microarchitecture has one of those registers. Custom filtering can be applied with the OFFCORE_RESPONSE_0_OPTIONS event. For such events only, two more counter options are available:</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Input value masked with 0xFF and written to bits 0-7 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/NHM-EP">https://download.01.org/perfmon/NHM-EP</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Input value masked with 0xF7 and written to bits 8-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/NHM-EP">https://download.01.org/perfmon/NHM-EP</A>.</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor NEH_UNCORE
+<H2>Uncore counters</H2>
+<P>The Intel® Nehalem microarchitecture provides 8 general-purpose counters consisting of a config and a counter register. Moreover, there is a fixed-purpose counter to measure the clock of the Uncore.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UPMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC4</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC5</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC6</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC7</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMCFIX</TD>
+  <TD>UNCORE_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+<H3>Available Options (Only for UPMC<0-7> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 40-47 in MSR_UNCORE_ADDR_OPCODE_MATCH register</TD>
+  <TD>Documented but register only available in Westmere architecture. A list of valid opcodes can be found in the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A>.</TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>40 bit physical memory address</TD>
+  <TD>Extract bits 3-39 from address and write them to bits 3-39 in MSR_UNCORE_ADDR_OPCODE_MATCH register</TD>
+  <TD>Documented but register only available in Westmere architecture. </TD>
+</TR>
+</TABLE>
+
+*/
diff --git a/doc/archs/nehalemex.md b/doc/archs/nehalemex.md
new file mode 100644
index 0000000..8bbb735
--- /dev/null
+++ b/doc/archs/nehalemex.md
@@ -0,0 +1,554 @@
+/*! \page nehalemex Intel® Nehalem EX
+
+<H1>Available performance monitors for the Intel® Nehalem EX microarchitecture</H1>
+<UL>
+<LI>\ref NEHEX_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref NEHEX_PMC "General-purpose counters"</LI>
+<LI>\ref NEHEX_MBOX "Memory controller counters"</LI>
+<LI>\ref NEHEX_BBOX "Home Agent counters"</LI>
+<LI>\ref NEHEX_RBOX "Crossbar router counters"</LI>
+<LI>\ref NEHEX_CBOX "Last Level cache counters"</LI>
+<LI>\ref NEHEX_SBOX "LLC-to-QPI interface counters"</LI>
+<LI>\ref NEHEX_WBOX "Power control unit counters"</LI>
+<LI>\ref NEHEX_UBOX "Uncore management counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor NEHEX_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor NEHEX_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Nehalem EX microarchitecture supports measuring offcore events with the PMC counters. For this, the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Nehalem EX microarchitecture has two of those registers. Custom filtering can be applied with the OFFCORE_RESPONSE_0_OPTIONS event. For such events only, two more counter options are available:</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Input value masked with 0xFF and written to bits 0-7 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/NHM-EX">https://download.01.org/perfmon/NHM-EX</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Input value masked with 0xF7 and written to bits 8-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/NHM-EX">https://download.01.org/perfmon/NHM-EX</A>.</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor NEHEX_MBOX
+<H2>MBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the memory controllers in the Uncore. The description from Intel®:<BR>
+<I>The memory controller interfaces to the Intel® 7500 Scalable Memory Buffers and translates read and write commands into specific Intel® Scalable Memory Interconnect (Intel® SMI) operations. Intel SMI is based on the FB-DIMM architecture, but the Intel 7500 Scalable Memory Buffer is not an AMB2 device and has significant exceptions to the FB-DIMM2 architecture. The memory controller also provides a variety of RAS features, such as ECC, memory scrubbing, thermal throttling,  [...]
+</I><BR>
+The Intel® Nehalem EX microarchitecture has 2 memory controllers, each with 6 general-purpose counters. They are exposed through the MSR interface to the operating system kernel. The MBOX and RBOX setup routines are taken from LIKWID 3. They are not as flexible as the newer setup routines, but programming of the MBOXes and RBOXes is tedious for Nehalem EX. It is not possible to specify an FVID (Fill Victim Index) for the MBOX or the IPERF option for RBOXes.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C4</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C5</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Special handling for events</H3>
+<P>For the events DRAM_CMD_ALL and DRAM_CMD_ILLEGAL two counter options are available:</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>34 bit address</TD>
+  <TD>Set bits 0-33 in MSR_M<0,1>_PMON_ADDR_MATCH register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>mask0</TD>
+  <TD>60 bit hex value</TD>
+  <TD>Extract bits 6-33 from address and set bits 0-27 in MSR_M<0,1>_PMON_ADDR_MASK register</TD>
+  <TD></TD>
+</TR>
+</TABLE><BR>
+<P>For the events THERM_TRP_DN and THERM_TRP_UP you cannot measure events for all DIMMs and for one specific DIMM simultaneously because both variants program the same filter register MSR_M<0,1>_PMON_MSC_THR with conflicting configurations.</P>
+<P>Although the events FVC_EV<0-3> are available to measure multiple memory events, some of them overlap and cannot be measured simultaneously. The reason is that they program the same filter register MSR_M<0,1>_PMON_ZDP with conflicting configurations. One example is the pair of events FVC_EV<0-3>_BBOX_CMDS_READS and FVC_EV<0-3>_BBOX_CMDS_WRITES, which measure memory reads or writes but cannot be measured at the same time.</P>
+
+
+
+\anchor NEHEX_BBOX
+<H2>BBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the Home Agent in the Uncore. The description from Intel®:<BR>
+<I>The B-Box is responsible for the protocol side of memory interactions, including coherent and non-coherent home agent protocols (as defined in the Intel® QuickPath Interconnect Specification). Additionally, the B-Box is responsible for ordering memory reads/writes to a given address such that the M-Box does not have to perform this conflict checking. All requests for memory attached to the coupled M-Box must first be ordered through the B-Box.
+</I><BR>
+The memory traffic in an Intel® Nehalem EX system is controlled by the Home Agents. Each MBOX has a corresponding BBOX. Each BBOX offers 4 general-purpose counters. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Special handling for events</H3>
+<P>For the matching events MSG_IN_MATCH, MSG_ADDR_IN_MATCH, MSG_OPCODE_ADDR_IN_MATCH, MSG_OPCODE_IN_MATCH, MSG_OPCODE_OUT_MATCH, MSG_OUT_MATCH, OPCODE_ADDR_IN_MATCH, OPCODE_IN_MATCH, OPCODE_OUT_MATCH and ADDR_IN_MATCH two counter options are available:</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>60 bit hex value</TD>
+  <TD>Set bits 0-59 in MSR_B<0,1>_PMON_MATCH register</TD>
+  <TD>For register layout and valid settings see <A HREF="http://www.intel.com/Assets/en_US/PDF/designguide/323535.pdf">Intel® Xeon® Processor 7500 Series Uncore Programming Guide</A></TD>
+</TR>
+<TR>
+  <TD>mask0</TD>
+  <TD>60 bit hex value</TD>
+  <TD>Set bits 0-59 in MSR_B<0,1>_PMON_MASK register</TD>
+  <TD>For register layout and valid settings see <A HREF="http://www.intel.com/Assets/en_US/PDF/designguide/323535.pdf">Intel® Xeon® Processor 7500 Series Uncore Programming Guide</A></TD>
+</TR>
+</TABLE>
+
+\anchor NEHEX_RBOX
+<H2>RBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the crossbar router in the Uncore. The description from Intel®:<BR>
+<I>The Crossbar Router (R-Box) is a 8 port switch/router implementing the Intel® QuickPath Interconnect Link and Routing layers. The R-Box is responsible for routing and transmitting all intra- and inter-processor communication.
+</I><BR>
+The Intel® Nehalem EX microarchitecture has two interfaces to the RBOX although each socket contains only one crossbar router. Each RBOX offers 8 general-purpose counters. They are exposed through the MSR interface to the operating system kernel. The RBOX setup routine is taken from Likwid 3.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C4</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C5</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C6</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C7</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+\anchor NEHEX_CBOX
+<H2>CBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the LLC coherency engine in the Uncore. The description from Intel®:<BR>
+<I>For the Intel Xeon Processor 7500 Series, the LLC coherence engine (C-Box) manages the interface between the core and the last level cache (LLC). All core transactions that access the LLC are directed from the core to a C-Box via the ring interconnect. The C-Box is responsible for managing data delivery from the LLC to the requesting core. It is also responsible for maintaining coherence between the cores within the socket that share the LLC; generating snoops and collecting snoop res [...]
+The C-Box is also the gate keeper for all Intel® QuickPath Interconnect (Intel® QPI) messages that originate in the core and is responsible for ensuring that all Intel QuickPath Interconnect messages that pass through the socket’s LLC remain coherent.
+</I><BR>
+The Intel® Nehalem EX microarchitecture has 8 CBOX instances. Each CBOX offers 6 general-purpose counters. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>CBOX<0-7>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-7>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-7>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-7>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-7>C4</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-7>C5</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor NEHEX_SBOX
+<H2>SBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the LLC-to-QPI interface in the Uncore. The description from Intel®:<BR>
+<I>The S-Box represents the interface between the last level cache and the system interface. It manages flow control between the C and R & B-Boxes. The S-Box is broken into system bound (ring to Intel® QPI) and ring bound (Intel® QPI to ring) connections.<BR>
+As such, it shares responsibility with the C-Box(es) as the Intel® QPI caching agent(s). It is responsible for converting C-box requests to Intel® QPI messages (i.e. snoop generation and data response messages from the snoop response) as well as converting/forwarding ring messages to Intel® QPI packets and vice versa.
+</I><BR>
+The Intel® Nehalem EX microarchitecture has 2 SBOX instances. Each SBOX offers 4 general-purpose counters. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>Only for the TO_R_PROG_EV events two counter options are available:</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>64 bit hex value</TD>
+  <TD>Set bits 0-63 in MSR_S<0,1>_PMON_MATCH register</TD>
+  <TD>For register layout and valid settings see <A HREF="http://www.intel.com/Assets/en_US/PDF/designguide/323535.pdf">Intel® Xeon® Processor 7500 Series Uncore Programming Guide</A></TD>
+</TR>
+<TR>
+  <TD>mask0</TD>
+  <TD>39 bit hex value</TD>
+  <TD>Set bits 0-38 in MSR_S<0,1>_PMON_MASK register</TD>
+  <TD>For register layout and valid settings see <A HREF="http://www.intel.com/Assets/en_US/PDF/designguide/323535.pdf">Intel® Xeon® Processor 7500 Series Uncore Programming Guide</A></TD>
+</TR>
+</TABLE>
+
+\anchor NEHEX_WBOX
+<H2>WBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the power controller in the Uncore. The description from Intel®:<BR>
+<I>The W-Box is the primary Power Controller for the Intel® Xeon® Processor 7500 Series.
+</I><BR>
+The Intel® Nehalem EX microarchitecture has one WBOX and it offers 4 general-purpose counters and one fixed counter. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>WBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOXFIX</TD>
+  <TD>UNCORE_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+<H3>Available Options (Only for WBOX<0-3> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor NEHEX_UBOX
+<H2>UBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the system configuration controller in the Uncore. The description from Intel®:<BR>
+<I>The U-Box serves as the system configuration controller for the Intel® Xeon® Processor E7 Family.
+</I><BR>
+The Intel® Nehalem EX microarchitecture has one UBOX and it offers a single general-purpose counter. It is exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UBOX0</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+*/
diff --git a/doc/archs/pentiumm.md b/doc/archs/pentiumm.md
new file mode 100644
index 0000000..8ebc46d
--- /dev/null
+++ b/doc/archs/pentiumm.md
@@ -0,0 +1,63 @@
+/*! \page pentiumm Intel® Pentium M
+
+<H1>Available performance monitors for the Intel® Pentium M microarchitecture</H1>
+<UL>
+<LI>\ref PM_PMC "General-purpose counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor PM_PMC
+<H2>PMC counters</H2>
+<P>The Intel® Pentium M microarchitecture provides 2 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/archs/phi.md b/doc/archs/phi.md
new file mode 100644
index 0000000..ac256c8
--- /dev/null
+++ b/doc/archs/phi.md
@@ -0,0 +1,78 @@
+/*! \page phi Intel® Xeon Phi
+
+<P>To use LIKWID you have to turn off power management on the MIC. LIKWID relies on
+RDTSC being used for wallclock time. On the MIC this is only given if power
+management is turned off. This can be configured in
+<CODE>/etc/sysconfig/mic/default.conf</CODE>.<BR>
+
+At the end of this file the power management is configured. The following
+configuration worked:<BR>
+<CODE>PowerManagement "cpufreq_off;corec6_off;pc3_off;pc6_off"</CODE>
+</P>
+
+<H1>Available performance monitors for the Intel® Xeon Phi microarchitecture</H1>
+<UL>
+<LI>\ref PHI_PMC "General-purpose counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor PHI_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Xeon Phi microarchitecture provides 2 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/archs/sandybridge.md b/doc/archs/sandybridge.md
new file mode 100644
index 0000000..385a724
--- /dev/null
+++ b/doc/archs/sandybridge.md
@@ -0,0 +1,189 @@
+/*! \page sandybridge Intel® SandyBridge
+
+<H1>Available performance monitors for the Intel® SandyBridge microarchitecture</H1>
+<UL>
+<LI>\ref SNB_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref SNB_PMC "General-purpose counters"</LI>
+<LI>\ref SNB_THERMAL "Thermal counters"</LI>
+<LI>\ref SNB_POWER "Power measurement counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor SNB_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SNB_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® SandyBridge microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® SandyBridge microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® SandyBridge microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied with [...]
+</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/SNB">https://download.01.org/perfmon/SNB</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>22 bit hex value</TD>
+  <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/SNB">https://download.01.org/perfmon/SNB</A>.</TD>
+</TR>
+</TABLE>
+
+\anchor SNB_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® SandyBridge microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>TMP0</TD>
+  <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor SNB_POWER
+<H2>Power counter</H2>
+<P>The Intel® SandyBridge microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PWR0</TD>
+  <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR1</TD>
+  <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR2*</TD>
+  <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR3</TD>
+  <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+<P>*) The PWR2 counter is often not implemented by Intel® SandyBridge systems</P>
+*/
diff --git a/doc/archs/sandybridgeep.md b/doc/archs/sandybridgeep.md
new file mode 100644
index 0000000..ce98c8a
--- /dev/null
+++ b/doc/archs/sandybridgeep.md
@@ -0,0 +1,775 @@
+/*! \page sandybridgeep Intel® SandyBridge EP/EN
+
+<H1>Available performance monitors for the Intel® SandyBridge EP/EN microarchitecture</H1>
+<UL>
+<LI>\ref SNBEP_FIXED Fixed-purpose counters</LI>
+<LI>\ref SNBEP_PMC General-purpose counters</LI>
+<LI>\ref SNBEP_THERMAL Thermal counters</LI>
+<LI>\ref SNBEP_POWER Power measurement counters</LI>
+<LI>\ref SNBEP_MBOX Integrated memory controller counters</LI>
+<LI>\ref SNBEP_CBOX Last Level cache counters</LI>
+<LI>\ref SNBEP_UBOX Uncore management counters</LI>
+<LI>\ref SNBEP_SBOX Intel® QPI Link Layer counters</LI>
+<LI>\ref SNBEP_BBOX Home Agent counters</LI>
+<LI>\ref SNBEP_WBOX Power control unit counters</LI>
+<LI>\ref SNBEP_RBOX Ring-to-QPI interface counters</LI>
+<LI>\ref SNBEP_PBOX Ring-to-PCIe interface counters</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor SNBEP_FIXED
+<H2>Fixed counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_PMC
+<H2>PMC counters</H2>
+<P>The Intel® SandyBridge microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® SandyBridge microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® SandyBridge microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied with [...]
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/JKT">https://download.01.org/perfmon/JKT</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>22 bit hex value</TD>
+  <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/JKT">https://download.01.org/perfmon/JKT</A>.</TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® SandyBridge microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>TMP0</TD>
+  <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor SNBEP_POWER
+<H2>Power counter</H2>
+<P>The Intel® SandyBridge microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PWR0</TD>
+  <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR1</TD>
+  <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR2</TD>
+  <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR3</TD>
+  <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_MBOX
+<H2>Memory controller counters</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the integrated Memory Controllers (iMC) in the uncore. The description from Intel®:<BR>
+<I>The integrated Memory Controller provides the interface to DRAM and communicates to the rest of the uncore through the Home Agent (i.e. the iMC does not connect to the Ring).<BR>
+In conjunction with the HA, the memory controller also provides a variety of RAS features, such as ECC, lockstep, memory access retry, memory scrubbing, thermal throttling, mirroring, and rank sparing.
+</I><BR>
+The uncore management performance counters are exposed to the operating system through PCI interfaces. All SandyBridge based systems have one memory controller. There are 4 different PCI devices per memory controller, each covering one memory channel. Each channel has 4 different general-purpose counters and one fixed counter for the DRAM clock. The name MBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>MBOX<0-3>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-3>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-3>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-3>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-3>FIX</TD>
+  <TD>DRAM_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for counter MBOX<0-3>C<0-3>)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_CBOX
+<H2>Last Level cache counters</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the LLC coherency engine in the uncore. The description from Intel®:<BR>
+<I>The LLC coherence engine (CBo) manages the interface between the core and the last level cache (LLC). All core transactions that access the LLC are directed from the core to a CBo via the ring interconnect. The CBo is responsible for managing data delivery from the LLC to the requesting core. It is also responsible for maintaining coherence between the cores within the socket that share the
+LLC; generating snoops and collecting snoop responses from the local cores when the MESIF protocol requires it.
+</I><BR>
+The Last Level cache performance counters are exposed to the operating system through the MSR interface. SandyBridge EN/EP systems have at most 8 CBOXes, each with 4 general-purpose counters. The name CBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>CBOX<0-7>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-7>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-7>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-7>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>9 bit opcode identifier, see uncore performance monitoring guide for SandyBridge</TD>
+  <TD>Set bits 23-31 in CBOX filter register MSR_UNC_C<0-7>_PMON_BOX_FILTER</TD>
+  <TD>LIKWID checks whether the given value is a valid opcode. A list of all valid opcodes can be found in the <A HREF="http://www.intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-uncore-guide.html">Intel® E5-2600 uncore monitoring guide</A></TD>
+</TR>
+<TR>
+  <TD>state</TD>
+  <TD>5 bit state representation</TD>
+  <TD>Set bits 18-22 in CBOX filter register MSR_UNC_C<0-7>_PMON_BOX_FILTER</TD>
+  <TD>F: 0x10,<BR>M: 0x08,<BR>E: 0x04,<BR>S: 0x02,<BR>I: 0x01</TD>
+</TR>
+<TR>
+  <TD>nid</TD>
+  <TD>8 bit node ID</TD>
+  <TD>Set bits 10-17 in CBOX filter register MSR_UNC_C<0-7>_PMON_BOX_FILTER</TD>
+  <TD>Note that for Node ID 0 the hex value should be 0x01.</TD>
+</TR>
+<TR>
+  <TD>tid</TD>
+  <TD>5 bit thread ID value</TD>
+  <TD>Set bits 0-4 in CBOX filter register MSR_UNC_C<0-7>_PMON_BOX_FILTER</TD>
+  <TD>Bit 0 means physical or logical thread, bits 1-3 the core ID</TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides an event LLC_LOOKUP which can be filtered with the 'state' option. If no 'state' is set, LIKWID sets the state to 0x1F, the default value to measure all lookups.</P>
+
+\anchor SNBEP_UBOX
+<H2>Uncore management counters</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the management box in the uncore. The description from Intel®:<BR>
+<I>The UBox serves as the system configuration controller for the Intel® Xeon Processor E5-2600 family uncore.<BR>
+In this capacity, the UBox acts as the central unit for a variety of functions:<BR>
+<UL>
+<LI>The master for reading and writing physically distributed registers across the uncore using the Message Channel.</LI>
+<LI>The UBox is the intermediary for interrupt traffic, receiving interrupts from the system and dispatching interrupts to the appropriate core.</LI>
+<LI>The UBox serves as the system lock master used when quiescing the platform (e.g., Intel® QPI bus lock).</LI>
+</UL>
+</I><BR>
+The uncore management performance counters are exposed to the operating system through the MSR interface. The name UBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOXFIX</TD>
+  <TD>UBOX_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for counter UBOX<0,1>)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_SBOX
+<H2>Intel® QPI Link Layer counters</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the QPI Link layer (QPI) in the uncore. The description from Intel®:<BR>
+<I>The Intel® QPI Link Layer is responsible for packetizing requests from the caching agent on the way out to the system interface. As such, it shares responsibility with the CBo(s) as the Intel® QPI caching agent(s). It is responsible for converting CBo requests to Intel® QPI messages (i.e. snoop generation and data response messages from the snoop response) as well as converting/forwarding ring
+messages to Intel® QPI packets and vice versa.<BR>
+The Intel® QPI is split into two separate layers. The Intel® QPI LL (link layer) is responsible for generating, transmitting, and receiving packets with the Intel® QPI link.
+</I><BR>
+The QPI hardware performance counters are exposed to the operating system through PCI interfaces. There are two of those interfaces for the QPI. If your system does not provide all interfaces and interface 0 does not work, try the other one. The name SBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1>FIX</TD>
+  <TD>QPI_RATE, QPI_SLOW_MODE</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for counter SBOX<0,1>C<0-3>)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>32 bit hex address</TD>
+  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_QPI_PMON_MATCH_0 register of PCI device</TD>
+  <TD>Only if corresponding device available. See <A HREF="http://www.intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-uncore-guide.html">Intel® E5-2600 uncore monitoring guide</A> for fields in PCI_UNC_QPI_PMON_MATCH_0</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>20 bit hex address</TD>
+  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_QPI_PMON_MATCH_1 register of PCI device</TD>
+  <TD>Only if corresponding device available. See <A HREF="http://www.intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-uncore-guide.html">Intel® E5-2600 uncore monitoring guide</A> for fields in PCI_UNC_QPI_PMON_MATCH_1</TD>
+</TR>
+<TR>
+  <TD>mask0</TD>
+  <TD>32 bit hex address</TD>
+  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_QPI_PMON_MASK_0 register of PCI device</TD>
+  <TD>Only if corresponding device available. See <A HREF="http://www.intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-uncore-guide.html">Intel® E5-2600 uncore monitoring guide</A> for fields in PCI_UNC_QPI_PMON_MASK_0</TD>
+</TR>
+<TR>
+  <TD>mask1</TD>
+  <TD>20 bit hex address</TD>
+  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_QPI_PMON_MASK_1 register of PCI device</TD>
+  <TD>Only if corresponding device available. See <A HREF="http://www.intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-uncore-guide.html">Intel® E5-2600 uncore monitoring guide</A> for fields in PCI_UNC_QPI_PMON_MASK_1</TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_BBOX
+<H2>BBOX counter</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the Home Agent (HA) in the uncore. The description from Intel®:<BR>
+<I>The HA is responsible for the protocol side of memory interactions, including coherent and non-coherent home agent protocols (as defined in the Intel® QuickPath Interconnect Specification). Additionally, the HA is responsible for ordering memory reads/writes, coming in from the modular Ring, to a given address such that the iMC (memory controller).<BR>
+In other words, it is the coherency agent responsible for guarding the memory controller. All requests for memory attached to the coupled iMC must first be ordered through the HA.
+</I><BR>
+The HA hardware performance counters are exposed to the operating system through PCI interfaces. The name BBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>BBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>6 bit hex value</TD>
+  <TD>Set bits 0-5 in PCI_UNC_HA_PMON_OPCODEMATCH register of PCI device</TD>
+  <TD>A table of all valid opcodes can be found in the <A HREF="http://www.intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-uncore-guide.html">Intel® E5-2600 uncore monitoring guide</A>.</TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>46 bit hex address</TD>
+  <TD>Extract bits 6-31 and set bits 6-31 in PCI_UNC_HA_PMON_ADDRMATCH0 register of PCI device<BR>Extract bits 32-45 and set bits 0-13 in PCI_UNC_HA_PMON_ADDRMATCH1 register of PCI device</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_WBOX
+<H2>WBOX counter</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the power control unit (PCU) in the uncore. The description from Intel®:<BR>
+<I>The PCU is the primary Power Controller for the physical processor package.<BR>
+The uncore implements a power control unit acting as a core/uncore power and thermal manager. It runs its firmware on an internal micro-controller and coordinates the socket’s power states.
+</I><BR>
+The PCU performance counters are exposed to the operating system through the MSR interface. The name WBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>WBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX0FIX</TD>
+  <TD>CORES_IN_C3</TD>
+</TR>
+<TR>
+  <TD>WBOX1FIX</TD>
+  <TD>CORES_IN_C6</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for WBOX<0-3> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD><TD></TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>32 bit hex value</TD>
+  <TD>Set bits 0-31 in MSR_UNC_PCU_PMON_BOX_FILTER register</TD>
+  <TD>Band0: bits 0-7,<BR>Band1: bits 8-15,<BR>Band2: bits 16-23,<BR>Band3: bits 24-31</TD>
+</TR>
+<TR>
+  <TD>occupancy</TD>
+  <TD>2 bit hex value</TD>
+  <TD>Set bits 14-15 in config register</TD>
+  <TD>Cores<BR>in C0: 0x1,<BR>in C3: 0x2,<BR>in C6: 0x3</TD>
+</TR>
+<TR>
+  <TD>occupancy_edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>occupancy_invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 30 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_RBOX
+<H2>RBOX counter</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the Ring-to-QPI (R3QPI) interface in the uncore. The description from Intel®:<BR>
+<I>R3QPI is the interface between the Intel® QPI Link Layer, which packetizes requests, and the Ring.<BR>
+R3QPI is the interface between the ring and the Intel® QPI Link Layer. It is responsible for translating between ring protocol packets and flits that are used for transmitting data across the Intel® QPI interface. It performs credit checking between the local Intel® QPI LL, the remote Intel® QPI LL and other agents on the local ring.
+</I><BR>
+The R3QPI performance counters are exposed to the operating system through PCI interfaces. Since the RBOXes manage the traffic from the LLC-connecting ring interface on the socket with the QPI interfaces (SBOXes), the number of RBOXes is similar to the number of SBOXes. See the SBOX section for how many are available for which system configuration. The name RBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_PBOX
+<H2>PBOX counter</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the Ring-to-PCIe (R2PCIe) interface in the uncore. The description from Intel®:<BR>
+<I>R2PCIe represents the interface between the Ring and IIO traffic to/from PCIe.
+</I><BR>
+The R2PCIe performance counters are exposed to the operating system through a PCI interface. Independent of the system's configuration, there is only one Ring-to-PCIe interface. The name PBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+*/
diff --git a/doc/archs/silvermont.md b/doc/archs/silvermont.md
new file mode 100644
index 0000000..af22e32
--- /dev/null
+++ b/doc/archs/silvermont.md
@@ -0,0 +1,175 @@
+/*! \page silvermont Intel® Silvermont/Airmont
+
+<H1>Available performance monitors for the Intel® Silvermont microarchitecture</H1>
+<UL>
+<LI>\ref SVM_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref SVM_PMC "General-purpose counters"</LI>
+<LI>\ref SVM_THERMAL "Thermal counters"</LI>
+<LI>\ref SVM_POWER "Power measurement counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor SVM_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SVM_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Silvermont microarchitecture provides 2 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Silvermont microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Silvermont microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied with th [...]
+</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Input value masked with 0xFFFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/SLM">https://download.01.org/perfmon/SLM</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>22 bit hex value</TD>
+  <TD>Input value is written to bits 16-38 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/SLM">https://download.01.org/perfmon/SLM</A>.</TD>
+</TR>
+</TABLE>
+
+\anchor SVM_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® Silvermont microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>TMP0</TD>
+  <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor SVM_POWER
+<H2>Power counters</H2>
+<P>The Intel® Silvermont microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PWR0</TD>
+  <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR1</TD>
+  <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR2*</TD>
+  <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR3*</TD>
+  <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+<P>*) The PWR2 and PWR3 counters are commonly not implemented by Intel® Silvermont systems.</P>
+*/
diff --git a/doc/archs/westmere.md b/doc/archs/westmere.md
new file mode 100644
index 0000000..3371c20
--- /dev/null
+++ b/doc/archs/westmere.md
@@ -0,0 +1,239 @@
+/*! \page westmere Intel® Westmere
+
+<P>The Intel® Westmere microarchitecture has the same features as the Intel® Nehalem architecture. There are some additional features like a second OFFCORE_RESPONSE register and an addr/opcode matching unit for general-purpose counters in the Uncore.</P>
+
+<H1>Available performance monitors for the Intel® Westmere microarchitecture</H1>
+<UL>
+<LI>\ref WES_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref WES_PMC "General-purpose counters"</LI>
+<LI>\ref WES_UNCORE "Uncore counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor WES_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor WES_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Westmere microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Westmere microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Westmere microarchitecture has two of those registers. Own filtering can be applied with the OFFCORE_RESPONSE_0_OPTIONS and OFFCORE_RESPONSE_1_OPTIONS events. Only for those events two more counter options are available:</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Input value masked with 0xFF and written to bits 0-7 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/WSM-EP-SP">https://download.01.org/perfmon/WSM-EP-SP</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Input value masked with 0xF7 and written to bits 8-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/WSM-EP-SP">https://download.01.org/perfmon/WSM-EP-SP</A>.</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor WES_UNCORE
+<H2>Uncore counters</H2>
+<P>The Intel® Westmere microarchitecture provides 8 general-purpose counters for the uncore consisting of a config and a counter register. Moreover, there is a fixed-purpose counter to measure the clock of the uncore.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UPMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC4</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC5</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC6</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC7</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMCFIX</TD>
+  <TD>UNCORE_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+<H3>Available Options (Only for UPMC<0-7> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 40-47 in MSR_UNCORE_ADDR_OPCODE_MATCH register</TD>
+  <TD>A list of valid opcodes can be found in the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A>.</TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>40 bit physical memory address</TD>
+  <TD>Extract bits 3-39 from address and write them to bits 3-39 in MSR_UNCORE_ADDR_OPCODE_MATCH register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+*/
diff --git a/doc/archs/westmereex.md b/doc/archs/westmereex.md
new file mode 100644
index 0000000..ce37674
--- /dev/null
+++ b/doc/archs/westmereex.md
@@ -0,0 +1,555 @@
+/*! \page westmereex Intel® Westmere EX
+
+<P>The Intel® Westmere EX microarchitecture has the same features as the Intel® Westmere architecture. There are some additional features like a second OFFCORE_RESPONSE register and an addr/opcode matching unit for general-purpose counters in the uncore.</P>
+
+<H1>Available performance monitors for the Intel® Westmere EX microarchitecture</H1>
+<UL>
+<LI>\ref WESEX_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref WESEX_PMC "General-purpose counters"</LI>
+<LI>\ref WESEX_MBOX "Memory controller counters"</LI>
+<LI>\ref WESEX_BBOX "Home Agent counters"</LI>
+<LI>\ref WESEX_RBOX "Crossbar router counters"</LI>
+<LI>\ref WESEX_CBOX "Last Level cache counters"</LI>
+<LI>\ref WESEX_SBOX "LLC-to-QPI interface counters"</LI>
+<LI>\ref WESEX_WBOX "Power control unit counters"</LI>
+<LI>\ref WESEX_UBOX "Uncore management counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor WESEX_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor WESEX_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides 4 general-purpose counters consisting of a config and a counter register. They are core-local, hence each hardware thread has its own set of general-purpose counters.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR><TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Westmere EX microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Westmere EX microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied with  [...]
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Input value masked with 0xFF and written to bits 0-7 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and the event files at <A HREF="https://download.01.org/perfmon/WSM-EX">https://download.01.org/perfmon/WSM-EX</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Input value masked with 0xF7 and written to bits 8-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and the event files at <A HREF="https://download.01.org/perfmon/WSM-EX">https://download.01.org/perfmon/WSM-EX</A>.</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor WESEX_MBOX
+<H2>MBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the memory controllers in the uncore. The description from Intel®:<BR>
+<I>The memory controller interfaces to the Intel® 7500 Scalable Memory Buffers and translates read and write commands into specific Intel® Scalable Memory Interconnect (Intel® SMI) operations. Intel SMI is based on the FB-DIMM architecture, but the Intel 7500 Scalable Memory Buffer is not an AMB2 device and has significant exceptions to the FB-DIMM2 architecture. The memory controller also provides a variety of RAS features, such as ECC, memory scrubbing, thermal throttling,  [...]
+</I><BR>
+The Intel® Westmere EX microarchitecture has 2 memory controllers, each with 6 general-purpose counters. They are exposed through the MSR interface to the operating system kernel. The MBOX and RBOX setup routines are taken from Likwid 3, they are not as flexible as the newer setup routines but programming of the MBOXes and RBOXes is tedious for Westmere EX. It is not possible to specify a FVID (Fill Victim Index) for the MBOX or IPERF option for RBOXes.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C4</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C5</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Special handling for events</H3>
+<P>For the events DRAM_CMD_ALL and DRAM_CMD_ILLEGAL two counter options are available:</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>34 bit address</TD>
+  <TD>Set bits 0-33 in MSR_M<0,1>_PMON_ADDR_MATCH register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>mask0</TD>
+  <TD>34 bit hex value</TD>
+  <TD>Extract bits 6-33 from address and set bits 0-27 in MSR_M<0,1>_PMON_ADDR_MASK register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+<P>For the events THERM_TRP_DN and THERM_TRP_UP you cannot measure events for all DIMMs and for one specific DIMM simultaneously because they program the same filter register MSR_M<0,1>_PMON_MSC_THR and have conflicting configurations.</P>
+<P>Although the events FVC_EV<0-3> are available to measure multiple memory events, some overlap and do not allow simultaneous measuring. That's because they program the same filter register MSR_M<0,1>_PMON_ZDP and have conflicting configurations. One example is the FVC_EV<0-3>_BBOX_CMDS_READS and FVC_EV<0-3>_BBOX_CMDS_WRITES events, which measure memory reads or writes but cannot be measured at the same time.</P>
+
+
+
+\anchor WESEX_BBOX
+<H2>BBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the Home Agent in the uncore. The description from Intel®:<BR>
+<I>The B-Box is responsible for the protocol side of memory interactions, including coherent and non-coherent home agent protocols (as defined in the Intel® QuickPath Interconnect Specification). Additionally, the B-Box is responsible for ordering memory reads/writes to a given address such that the M-Box does not have to perform this conflict checking. All requests for memory attached to the coupled M-Box must first be ordered through the B-Box.
+</I><BR>
+The memory traffic in an Intel® Westmere EX system is controlled by the Home Agents. Each MBOX has a corresponding BBOX. Each BBOX offers 4 general-purpose counters. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Special handling for events</H3>
+<P>For the matching events MSG_IN_MATCH, MSG_ADDR_IN_MATCH, MSG_OPCODE_ADDR_IN_MATCH, MSG_OPCODE_IN_MATCH, MSG_OPCODE_OUT_MATCH, MSG_OUT_MATCH, OPCODE_ADDR_IN_MATCH, OPCODE_IN_MATCH, OPCODE_OUT_MATCH and ADDR_IN_MATCH two counter options are available:</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR><TD>match0</TD>
+  <TD>60 bit hex value</TD>
+  <TD>Set bits 0-59 in MSR_B<0,1>_PMON_MATCH register</TD>
+  <TD>For register layout and valid settings see <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-e7-family-uncore-performance-programming-guide.html">Intel® Xeon® Processor E7 Family uncore Performance Monitoring Guide</A></TD>
+</TR>
+<TR>
+  <TD>mask0</TD>
+  <TD>60 bit hex value</TD>
+  <TD>Set bits 0-59 in MSR_B<0,1>_PMON_MASK register</TD>
+  <TD>For register layout and valid settings see <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-e7-family-uncore-performance-programming-guide.html">Intel® Xeon® Processor E7 Family uncore Performance Monitoring Guide</A></TD>
+</TR>
+</TABLE>
+
+\anchor WESEX_RBOX
+<H2>RBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the crossbar router in the uncore. The description from Intel®:<BR>
+<I>The Crossbar Router (R-Box) is a 8 port switch/router implementing the Intel® QuickPath Interconnect Link and Routing layers. The R-Box is responsible for routing and transmitting all intra- and inter-processor communication.
+</I><BR>
+The Intel® Westmere EX microarchitecture has two interfaces to the RBOX although each socket contains only one crossbar router, RBOX0 is the left part and RBOX1 is the right part of the single RBOX. Each RBOX side offers 8 general-purpose counters. They are exposed through the MSR interface to the operating system kernel. The MBOX and RBOX setup routines are taken from Likwid 3, they are not as flexible as the newer setup routines but programming of the MBOXes and RBOXes is tedious f [...]
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C4</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C5</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C6</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C7</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+\anchor WESEX_CBOX
+<H2>CBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the LLC coherency engine in the uncore. The description from Intel®:<BR>
+<I>For the Intel Xeon Processor 7500 Series, the LLC coherence engine (C-Box) manages the interface between the core and the last level cache (LLC). All core transactions that access the LLC are directed from the core to a C-Box via the ring interconnect. The C-Box is responsible for managing data delivery from the LLC to the requesting core. It is also responsible for maintaining coherence between the cores within the socket that share the LLC; generating snoops and collecting snoop res [...]
+The C-Box is also the gate keeper for all Intel® QuickPath Interconnect (Intel® QPI) messages that originate in the core and is responsible for ensuring that all Intel QuickPath Interconnect messages that pass through the socket’s LLC remain coherent.
+</I><BR>
+The Intel® Westmere EX microarchitecture has 10 CBOX instances. Each CBOX offers 6 general-purpose counters. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>CBOX<0-9>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-9>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-9>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-9>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-9>C4</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-9>C5</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor WESEX_SBOX
+<H2>SBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the LLC-to-QPI interface in the uncore. The description from Intel®:<BR>
+<I>The S-Box represents the interface between the last level cache and the system interface. It manages flow control between the C and R & B-Boxes. The S-Box is broken into system bound (ring to Intel® QPI) and ring bound (Intel® QPI to ring) connections.<BR>
+As such, it shares responsibility with the C-Box(es) as the Intel® QPI caching agent(s). It is responsible for converting C-box requests to Intel® QPI messages (i.e. snoop generation and data response messages from the snoop response) as well as converting/forwarding ring messages to Intel® QPI packets and vice versa.
+</I><BR>
+The Intel® Westmere EX microarchitecture has 2 SBOX instances. Each SBOX offers 4 general-purpose counters. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>Only for the TO_R_PROG_EV events two counter options are available:</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>64 bit hex value</TD>
+  <TD>Set bit 0-63 in MSR_S<0,1>_PMON_MATCH register</TD>
+  <TD>For register layout and valid settings see <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-e7-family-uncore-performance-programming-guide.html">Intel® Xeon® Processor E7 Family uncore Performance Monitoring Guide</A></TD>
+</TR>
+<TR>
+  <TD>mask0</TD>
+  <TD>39 bit hex value</TD>
+  <TD>Set bit 0-38 in MSR_S<0,1>_PMON_MASK register</TD>
+  <TD>For register layout and valid settings see <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-e7-family-uncore-performance-programming-guide.html">Intel® Xeon® Processor E7 Family uncore Performance Monitoring Guide</A></TD>
+</TR>
+</TABLE>
+
+\anchor WESEX_WBOX
+<H2>WBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the power controller in the uncore. The description from Intel®:<BR>
+<I>The W-Box is the primary Power Controller for the Intel® Xeon® Processor 7500 Series.
+</I><BR>
+The Intel® Westmere EX microarchitecture has one WBOX and it offers 4 general-purpose counters and one fixed counter. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>WBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOXFIX</TD>
+  <TD>UNCORE_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+<H3>Available Options (Only for WBOX<0-3> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor WESEX_UBOX
+<H2>UBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the system configuration controller in the uncore. The description from Intel®:<BR>
+<I>The U-Box serves as the system configuration controller for the Intel® Xeon® Processor E7 Family.
+</I><BR>
+The Intel® Westmere EX microarchitecture has one UBOX and it offers a single general-purpose counter. It is exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UBOX0</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+*/
diff --git a/doc/bstrlib.txt b/doc/bstrlib.txt
new file mode 100644
index 0000000..d0f02f7
--- /dev/null
+++ b/doc/bstrlib.txt
@@ -0,0 +1,3201 @@
+Better String library
+---------------------
+
+by Paul Hsieh
+
+The bstring library is an attempt to provide improved string processing
+functionality to the C and C++ language.  At the heart of the bstring library
+(Bstrlib for short) is the management of "bstring"s which are a significant
+improvement over '\0' terminated char buffers.
+
+===============================================================================
+
+Motivation
+----------
+
+The standard C string library has serious problems:
+
+    1) Its use of '\0' to denote the end of the string means knowing a
+       string's length is O(n) when it could be O(1).
+    2) It imposes an interpretation for the character value '\0'.
+    3) gets() always exposes the application to a buffer overflow.
+    4) strtok() modifies the string it's parsing and thus may not be usable in
+       programs which are re-entrant or multithreaded.
+    5) fgets has the unusual semantic of ignoring '\0's that occur before
+       '\n's are consumed.
+    6) There is no memory management, and actions performed such as strcpy,
+       strcat and sprintf are common places for buffer overflows.
+    7) strncpy() doesn't '\0' terminate the destination in some cases.
+    8) Passing NULL to C library string functions causes an undefined NULL
+       pointer access.
+    9) Parameter aliasing (overlapping, or self-referencing parameters)
+       within most C library functions has undefined behavior.
+   10) Many C library string function calls take integer parameters with
+       restricted legal ranges.  Parameters passed outside these ranges are
+       not typically detected and cause undefined behavior.
+
+So the desire is to create an alternative string library that does not suffer
+from the above problems and adds in the following functionality:
+
+    1) Incorporate string functionality seen from other languages.
+        a) MID$() - from BASIC
+        b) split()/join() - from Python
+        c) string/char x n - from Perl
+    2) Implement analogs to functions that combine stream IO and char buffers
+       without creating a dependency on stream IO functionality.
+    3) Implement the basic text editor-style functions insert, delete, find,
+       and replace.
+    4) Implement reference based sub-string access (as a generalization of
+       pointer arithmetic.)
+    5) Implement runtime write protection for strings.
+
+There is also a desire to avoid "API-bloat".  So functionality that can be
+implemented trivially in other functionality is omitted.  So there is no
+left$() or right$() or reverse() or anything like that as part of the core
+functionality.
+
+Explaining Bstrings
+-------------------
+
+A bstring is basically a header which wraps a pointer to a char buffer.  Let's
+start with the declaration of a struct tagbstring:
+
+    struct tagbstring {
+        int mlen;
+        int slen;
+        unsigned char * data;
+    };
+
+This definition is considered exposed, not opaque (though it is neither
+necessary nor recommended that low level maintenance of bstrings be performed
+whenever the abstract interfaces are sufficient).  The mlen field (usually)
+describes a lower bound for the memory allocated for the data field.  The
+slen field describes the exact length for the bstring.  The data field is a
+single contiguous buffer of unsigned chars.  Note that the existence of a '\0'
+character in the unsigned char buffer pointed to by the data field does not
+necessarily denote the end of the bstring.
+
+To be a well formed modifiable bstring the mlen field must be at least the
+length of the slen field, and slen must be non-negative.  Furthermore, the
+data field must point to a valid buffer in which access to the first mlen
+characters has been acquired.  So the minimal check for correctness is:
+
+    (slen >= 0 && mlen >= slen && data != NULL)
+
+bstrings returned by bstring functions can be assumed to be either NULL or
+satisfy the above property.  (When bstrings are only readable, the mlen >=
+slen restriction is not required; this is discussed later in this section.)
+A bstring itself is just a pointer to a struct tagbstring:
+
+    typedef struct tagbstring * bstring;
+
+Note that use of the prefix "tag" in struct tagbstring is required to work
+around the inconsistency between C and C++'s struct namespace usage.  This
+definition is also considered exposed.
+
+Bstrlib basically manages bstrings allocated as a header and an associated
+data-buffer.  Since the implementation is exposed, they can also be
+constructed manually.  Functions which mutate bstrings assume that the header
+and data buffer have been malloced; the bstring library may perform free() or
+realloc() on both the header and data buffer of any bstring parameter.
+Functions which return bstring's create new bstrings.  The string memory is
+freed by a bdestroy() call (or using the bstrFree macro).
+
+The following related typedef is also provided:
+
+    typedef const struct tagbstring * const_bstring;
+
+which is also considered exposed.  These are directly bstring compatible (no
+casting required) but are just used for parameters which are meant to be
+non-mutable.  So in general, bstring parameters which are read as input but
+not meant to be modified will be declared as const_bstring, and bstring
+parameters which may be modified will be declared as bstring.  This convention
+is recommended for user written functions as well.
+
+Since bstrings maintain interoperability with C library char-buffer style
+strings, all functions which modify, update or create bstrings also append a
+'\0' character into the position slen + 1.  This trailing '\0' character is
+not required for bstrings input to the bstring functions; this is provided
+solely as a convenience for interoperability with standard C char-buffer
+functionality.
+
+Analogs for the ANSI C string library functions have been created when they
+are necessary, but have also been left out when they are not.  In particular
+there are no functions analogous to fwrite, or puts just for the purposes of
+bstring.  The ->data member of any string is exposed, and therefore can be
+used just as easily as char buffers for C functions which read strings.
+
+For those that wish to hand construct bstrings, the following should be kept
+in mind:
+
+    1) While bstrlib can accept constructed bstrings without terminating
+       '\0' characters, the rest of the C language string library will not
+       function properly on such non-terminated strings.  This is obvious
+       but must be kept in mind.
+    2) If it is intended that a constructed bstring be written to by the
+       bstring library functions then the data portion should be allocated
+       by the malloc function and the slen and mlen fields should be entered
+       properly.  The struct tagbstring header is not reallocated, and only
+       freed by bdestroy.
+    3) Writing arbitrary '\0' characters at various places in the string
+       will not modify its length as perceived by the bstring library
+       functions.  In fact, '\0' is a legitimate non-terminating character
+       for a bstring to contain.
+    4) For read only parameters, bstring functions do not check the mlen.
+       I.e., the minimal correctness requirements are reduced to:
+
+            (slen >= 0 && data != NULL)
+
+Better pointer arithmetic
+-------------------------
+
+One built-in feature of '\0' terminated char * strings, is that it's very easy
+and fast to obtain a reference to the tail of any string using pointer
+arithmetic.  Bstrlib does one better by providing a way to get a reference to
+any substring of a bstring (or any other length delimited block of memory.)
+So rather than just having pointer arithmetic, with bstrlib one essentially
+has segment arithmetic.  This is achieved using the macro blk2tbstr() which
+builds a reference to a block of memory and the macro bmid2tbstr() which
+builds a reference to a segment of a bstring.  Bstrlib also includes
+functions for direct consumption of memory blocks into bstrings, namely
+bcatblk () and blk2bstr ().
+
+One scenario where this can be extremely useful is when a string contains many
+substrings which one would like to pass as read-only reference parameters to
+some string consuming function without the need to allocate entire new
+containers for the string data.  More concretely, imagine parsing a command
+line string whose parameters are space delimited.  This can only be done for
+tails of the string with '\0' terminated char * strings.
+
+Improved NULL semantics and error handling
+------------------------------------------
+
+Unless otherwise noted, if a NULL pointer is passed as a bstring or any other
+detectably illegal parameter, the called function will return with an error
+indicator (either NULL or BSTR_ERR) rather than simply performing a NULL
+pointer access, or having undefined behavior.
+
+To illustrate the value of this, consider the following example:
+
+        strcpy (p = malloc (13 * sizeof (char)), "Hello,");
+        strcat (p, " World");
+
+This is not correct because malloc may return NULL (due to an out of memory
+condition), and the behaviour of strcpy is undefined if either of its
+parameters are NULL.  However:
+
+        bstrcat (p = bfromcstr ("Hello,"), q = bfromcstr (" World"));
+        bdestroy (q);
+
+is well defined, because if either p or q are assigned NULL (indicating a
+failure to allocate memory) both bstrcat and bdestroy will recognize it and
+perform no detrimental action.
+
+Note that it is not necessary to check any of the members of a returned
+bstring for internal correctness (in particular the data member does not need
+to be checked against NULL when the header is non-NULL), since this is
+assured by the bstring library itself.
+
+bStreams
+--------
+
+In addition to the bgets and bread functions, bstrlib can abstract streams
+with a high performance read only stream called a bStream.  In general, the
+idea is to open a core stream (with something like fopen) then pass its
+handle as well as a bNread function pointer (like fread) to the bsopen
+function which will return a handle to an open bStream.  Then the functions
+bsread, bsreadln or bsreadlns can be called to read portions of the stream.
+Finally, the bsclose function is called to close the bStream -- it will
+return a handle to the original (core) stream.  So bStreams, essentially,
+wrap other streams.
+
+The bStreams have two main advantages over the bgets and bread (as well as
+fgets/ungetc) paradigms:
+
+1) Improved functionality via the bunread function which allows a stream to
+   unread characters, giving the bStream stack-like functionality if so
+   desired.
+2) A very high performance bsreadln function.  The C library function fgets()
+   (and the bgets function) can typically be written as a loop on top of
+   fgetc(), thus paying all of the overhead costs of calling fgetc on a per
+   character basis.  bsreadln will read blocks at a time, thus amortizing the
+   overhead of fread calls over many characters at once.
+
+However, clearly bStreams are suboptimal or unusable for certain kinds of
+streams (stdin) or certain usage patterns (a few spotty, or non-sequential
+reads from a slow stream.)  For those situations, using bgets will be more
+appropriate.
+
+The semantics of bStreams allows practical construction of layerable data
+streams.  What this means is that by writing a bNread compatible function on
+top of a bStream, one can construct a new bStream on top of it.  This can be
+useful for writing multi-pass parsers that don't actually read the entire
+input more than once and don't require the use of intermediate storage.
+
+Aliasing
+--------
+
+Aliasing occurs when a function is given two parameters which point to data
+structures which overlap in the memory they occupy.  While this does not
+disturb read only functions, for many libraries this can make functions that
+write to these memory locations malfunction.  This is a common problem of the
+C standard library and especially the string functions in the C standard
+library.
+
+The C standard string library is entirely char by char oriented (as is
+bstring) which makes conforming implementations alias safe for some
+scenarios.  However no actual detection of aliasing is typically performed,
+so it is easy to find cases where the aliasing will cause anomalous or
+undesirable behaviour (consider: strcat (p, p).)  The C99 standard includes
+the "restrict" pointer modifier which allows the compiler to document and
+assume a no-alias condition on usage.  However, only the most trivial cases
+can be caught (if at all) by the compiler at compile time, and thus there is
+no actual enforcement of non-aliasing.
+
+Bstrlib, by contrast, permits aliasing and is completely aliasing safe, in
+the C99 sense of aliasing.  That is to say, under the assumption that
+pointers of incompatible types from distinct objects can never alias, bstrlib
+is completely aliasing safe.  (In practice this means that the data buffer
+portion of any bstring and header of any bstring are assumed to never alias.)
+With the exception of the reference building macros, the library behaves as
+if all read-only parameters are first copied and replaced by temporary
+non-aliased parameters before any writing to any output bstring is performed
+(though actual copying is extremely rarely ever done.)
+
+Besides being a useful safety feature, bstring searching/comparison
+functions can improve to O(1) execution when aliasing is detected.
+
+Note that aliasing detection and handling code in Bstrlib is generally
+extremely cheap.  There is almost never any appreciable performance penalty
+for using aliased parameters.
+
+Reentrancy
+----------
+
+Nearly every function in Bstrlib is a leaf function, and is completely
+reentrant with the exception of writing to common bstrings.  The split
+functions which use a callback mechanism require only that the source string
+not be destroyed by the callback function unless the callback function returns
+with an error status (note that Bstrlib functions which return an error do
+not modify the string in any way.)  The string can in fact be modified by the
+callback and the behaviour is deterministic.  See the documentation of the
+various split functions for more details.
+
+Undefined scenarios
+-------------------
+
+One of the basic important premises for Bstrlib is not to increase the
+propagation of undefined situations from parameters that are otherwise legal
+in and of themselves.  In particular, except for extremely marginal cases, usages
+of bstrings that use the bstring library functions alone cannot lead to any
+undefined action.  But due to C/C++ language and library limitations, there
+is no way to define a non-trivial library that is completely without
+undefined operations.  All such possible undefined operations are described
+below:
+
+1) bstrings or struct tagbstrings that are not explicitly initialized cannot
+   be passed as a parameter to any bstring function.
+2) The members of the NULL bstring cannot be accessed directly.  (Though all
+   APIs and macros detect the NULL bstring.)
+3) A bstring whose data member has not been obtained from a malloc or
+   compatible call and which is write accessible passed as a writable
+   parameter will lead to undefined results.  (i.e., do not writeAllow any
+   constructed bstrings unless the data portion has been obtained from the
+   heap.)
+4) If the headers of two strings alias but are not identical (which can only
+   happen via a defective manual construction), then passing them to a
+   bstring function in which one is writable is not defined.
+5) If the mlen member is larger than the actual accessible length of the data
+   member for a writable bstring, or if the slen member is larger than the
+   readable length of the data member for a readable bstring, then the
+   corresponding bstring operations are undefined.
+6) Any bstring definition whose header or accessible data portion has been
+   assigned to inaccessible or otherwise illegal memory clearly cannot be
+   acted upon by the bstring library in any way.
+7) Destroying the source of an incremental split from within the callback
+   and not returning with a negative value (indicating that it should abort)
+   will lead to undefined behaviour.  (Though *modifying* or adjusting the
+   state of the source data, even if those modifications fail within the
+   bstrlib API, has well defined behavior.)
+8) Modifying a bstring which is write protected by direct access has
+   undefined behavior.
+
+While this may seem like a long list, with the exception of invalid uses of
+the writeAllow macro, and source destruction during an iterative split
+without an accompanying abort, no usage of the bstring API alone can cause
+any undefined scenario to occur.  I.e., the policy of restricting usage of
+bstrings to the bstring API can significantly reduce the risk of runtime
+errors (in practice it should eliminate them) related to string manipulation
+due to undefined action.
+
+C++ wrapper
+-----------
+
+A C++ wrapper has been created to enable bstring functionality for C++ in the
+most natural (for C++ programmers) way possible.  The mandate for the C++
+wrapper is different from the base C bstring library.  Since the C++ language
+has far more abstracting capabilities, the CBString structure is considered
+fully abstracted -- i.e., hand generated CBStrings are not supported (though
+conversion from a struct tagbstring is allowed) and all detectable errors are
+manifest as thrown exceptions.
+
+- The C++ class definitions are all under the namespace Bstrlib.  bstrwrap.h
+  enables this namespace (with a using namespace Bstrlib; directive at the
+  end) unless the macro BSTRLIB_DONT_ASSUME_NAMESPACE has been defined before
+  it is included.
+
+- Erroneous accesses result in an exception being thrown.  The exception
+  parameter is of type "struct CBStringException" which is derived from
+  std::exception if STL is used.  A verbose description of the error message
+  can be obtained from the what() method.
+
+- CBString is a C++ structure derived from a struct tagbstring.  An address
+  of a CBString cast to a bstring must not be passed to bdestroy.  The bstring
+  C API has been made C++ safe and can be used directly in a C++ project.
+
+- It includes constructors which can take a char, '\0' terminated char
+  buffer, tagbstring, (char, repeat-value), a length delimited buffer or a
+  CBStringList to initialize it.
+
+- Concatenation is performed with the + and += operators.  Comparisons are
+  done with the ==, !=, <, >, <= and >= operators.  Note that == and != use
+  the biseq call, while <, >, <= and >= use bstrcmp.
+
+- CBString's can be directly cast to const character buffers.
+
+- CBString's can be directly cast to double, float, int or unsigned int so
+  long as the CBString are decimal representations of those types (otherwise
+  an exception will be thrown).  Converting the other way should be done with
+  the format(a) method(s).
+
+- CBString contains the length, character and [] accessor methods.  The
+  character and [] accessors are aliases of each other.  If the bounds for
+  the string are exceeded, an exception is thrown.  To avoid the overhead for
+  this check, first cast the CBString to a (const char *) and use [] to
+  dereference the array as normal.  Note that the character and [] accessor
+  methods allow both reading and writing of individual characters.
+
+- The methods: format, formata, find, reversefind, findcaseless,
+  reversefindcaseless, midstr, insert, insertchrs, replace, findreplace,
+  findreplacecaseless, remove, findchr, nfindchr, alloc, toupper, tolower,
+  gets, read are analogous to the functions that can be found in the C API.
+
+- The caselessEqual and caselessCmp methods are analogous to biseqcaseless
+  and bstricmp functions respectively.
+
+- Note that just like the bformat function, the format and formata methods do
+  not automatically cast CBStrings into char * strings for "%s"-type
+  substitutions:
+
+    CBString w("world");
+    CBString h("Hello");
+    CBString hw;
+
+    /* The casts are necessary */
+    hw.format ("%s, %s", (const char *)h, (const char *)w);
+
+- The methods trunc and repeat have been added instead of using pattern.
+
+- ltrim, rtrim and trim methods have been added.  These remove characters
+  from a given character string set (defaulting to the whitespace characters)
+  from either the left, right or both ends of the CBString, respectively.
+
+- The method setsubstr is also analogous in functionality to bsetstr, except
+  that it cannot be passed NULL.  Instead the method fill and the fill-style
+  constructor have been supplied to enable this functionality.
+
+- The writeprotect(), writeallow() and iswriteprotected() methods are
+  analogous to the bwriteprotect(), bwriteallow() and biswriteprotected()
+  macros in the C API.  Write protection semantics in CBString are stronger
+  than with the C API in that indexed character assignment is checked for
+  write protection.  However, unlike with the C API, a write protected
+  CBString can be destroyed by the destructor.
+
+- CBStream is a C++ structure which wraps a struct bStream (it's not derived
+  from it, since destruction is slightly different).  It is constructed by
+  passing in a bNread function pointer and a stream parameter cast to void *.
+  This structure includes methods for detecting eof, setting the buffer
+  length, reading the whole stream or reading entries line by line or block
+  by block, an unread function, and a peek function.
+
+- If STL is available, the CBStringList structure is derived from a vector of
+  CBString with various split methods.  The split method has been overloaded
+  to accept either a character or CBString as the second parameter (when the
+  split parameter is a CBString any character in that CBString is used as a
+  separator).  The splitstr method takes a CBString as a substring separator.
+  Joins can be performed via a CBString constructor which takes a
+  CBStringList as a parameter, or just using the CBString::join() method.
+
+- If there is proper support for std::iostreams, then the >> and << operators
+  and the getline() function have been added (with semantics the same as
+  those for std::string).
+
+Multithreading
+--------------
+
+A mutable bstring is kind of analogous to a small (two entry) linked list
+allocated by malloc, with all aliasing completely under programmer control.
+I.e., manipulation of one bstring will never affect any other distinct
+bstring unless explicitly constructed to do so by the programmer via hand
+construction or via building a reference.  Bstrlib also does not use any
+static or global storage, so there are no hidden unremovable race conditions.
+Bstrings are also clearly not inherently thread local.  So just like
+char *'s, bstrings can be passed around from thread to thread and shared and
+so on, so long as modifications to a bstring correspond to some kind of
+exclusive access lock as should be expected (or if the bstring is read-only,
+which can be enforced by bstring write protection) for any sort of shared
+object in a multithreaded environment.
+
+Bsafe module
+------------
+
+For convenience, a bsafe module has been included.  The idea is that if this
+module is included, inadvertent usage of the most dangerous C functions will
+be overridden and lead to an immediate run time abort.  Of course, it should
+be emphasized that usage of this module is completely optional.  The
+intention is essentially to provide an option for creating project safety
+rules which can be enforced mechanically rather than socially.  This is
+useful for larger, or open development projects where it's more difficult to
+enforce social rules or "coding conventions".
+
+Problems not solved
+-------------------
+
+Bstrlib is written for the C and C++ languages, which have inherent weaknesses
+that cannot be easily solved:
+
+1. Memory leaks:  Forgetting to call bdestroy on a bstring that is about to be
+   unreferenced, just as forgetting to call free on a heap buffer that is
+   about to be unreferenced.  Though bstrlib itself is leak free.
+2. Read before write usage:  In C, declaring an auto bstring does not
+   automatically fill it with legal/valid contents.  This problem has been
+   somewhat mitigated in C++.  (The bstrDeclare and bstrFree macros from
+   bstraux can be used to help mitigate this problem.)
+
+Other problems not addressed:
+
+3. Built-in mutex usage to automatically avoid all bstring internal race
+   conditions in multitasking environments: The problem with trying to
+   implement such things at this low a level is that it is typically more
+   efficient to use locks in higher level primitives. There is also no
+   platform independent way to implement locks or mutexes.
+4. Unicode/widecharacter support.
+
+Note that except for spotty support of wide characters, the default C
+standard library does not address any of these problems either.
+
+Configurable compilation options
+--------------------------------
+
+All configuration options are meant solely for the purpose of compiler
+compatibility.  Configuration options are not meant to change the semantics
+or capabilities of the library, except where it is unavoidable.
+
+Since some C++ compilers don't include the Standard Template Library and some
+have the options of disabling exception handling, a number of macros can be
+used to conditionally compile support for each of these:
+
+BSTRLIB_CAN_USE_STL
+
+  - defining this will enable the use of the Standard Template Library.
+    Defining BSTRLIB_CAN_USE_STL overrides the BSTRLIB_CANNOT_USE_STL macro.
+
+BSTRLIB_CANNOT_USE_STL
+
+  - defining this will disable the use of the Standard Template Library.
+    Defining BSTRLIB_CAN_USE_STL overrides the BSTRLIB_CANNOT_USE_STL macro.
+
+BSTRLIB_CAN_USE_IOSTREAM
+
+  - defining this will enable the use of streams from class std.  Defining
+    BSTRLIB_CAN_USE_IOSTREAM overrides the BSTRLIB_CANNOT_USE_IOSTREAM macro.
+
+BSTRLIB_CANNOT_USE_IOSTREAM
+
+  - defining this will disable the use of streams from class std.  Defining
+    BSTRLIB_CAN_USE_IOSTREAM overrides the BSTRLIB_CANNOT_USE_IOSTREAM macro.
+
+BSTRLIB_THROWS_EXCEPTIONS
+
+  - defining this will enable the exception handling within bstring.
+    Defining BSTRLIB_THROWS_EXCEPTIONS overrides the
+    BSTRLIB_DOESNT_THROW_EXCEPTIONS macro.
+
+BSTRLIB_DOESNT_THROW_EXCEPTIONS
+
+  - defining this will disable the exception handling within bstring.
+    Defining BSTRLIB_THROWS_EXCEPTIONS overrides the
+    BSTRLIB_DOESNT_THROW_EXCEPTIONS macro.
+
+Note that these macros must be defined consistently throughout all modules
+that use CBStrings including bstrwrap.cpp.
+
+Some older C compilers do not support functions such as vsnprintf.  This is
+handled by the following macro variables:
+
+BSTRLIB_NOVSNP
+
+  - defining this indicates that the compiler does not support vsnprintf.
+    This will cause bformat and bformata to not be declared.  Note that
+    for some compilers, such as Turbo C, this is set automatically.
+    Defining BSTRLIB_NOVSNP overrides the BSTRLIB_VSNP_OK macro.
+
+BSTRLIB_VSNP_OK
+
+  - defining this will disable the autodetection of compilers that do not
+    support vsnprintf.
+    Defining BSTRLIB_NOVSNP overrides the BSTRLIB_VSNP_OK macro.
+
+Semantic compilation options
+----------------------------
+
+Bstrlib comes with very few compilation options for changing the semantics of
+the library.  These are described below.
+
+BSTRLIB_DONT_ASSUME_NAMESPACE
+
+  - Defining this before including bstrwrap.h will disable the automatic
+    enabling of the Bstrlib namespace for the C++ declarations.
+
+BSTRLIB_DONT_USE_VIRTUAL_DESTRUCTOR
+
+  - Defining this will make the CBString destructor non-virtual.
+
+BSTRLIB_MEMORY_DEBUG
+
+  - Defining this will cause the bstrlib modules bstrlib.c and bstrwrap.cpp
+    to invoke a #include "memdbg.h".  memdbg.h has to be supplied by the user.
+
+Note that these macros must be defined consistently throughout all modules
+that use bstrings or CBStrings including bstrlib.c, bstraux.c and
+bstrwrap.cpp.
+
+===============================================================================
+
+Files
+-----
+
+bstrlib.c       - C implementation of bstring functions.
+bstrlib.h       - C header file for bstring functions.
+bstraux.c       - C example that implements trivial additional functions.
+bstraux.h       - C header for bstraux.c
+bstest.c        - C unit/regression test for bstrlib.c
+
+bstrwrap.cpp    - C++ implementation of CBString.
+bstrwrap.h      - C++ header file for CBString.
+test.cpp        - C++ unit/regression test for bstrwrap.cpp
+
+bsafe.c         - C runtime stubs to abort usage of unsafe C functions.
+bsafe.h         - C header file for bsafe.c functions.
+
+C projects need only include bstrlib.h and compile/link bstrlib.c to use the
+bstring library.  C++ projects need to additionally include bstrwrap.h and
+compile/link bstrwrap.cpp.  For both, there may be a need to make choices
+about feature configuration as described in the "Configurable compilation
+options" in the section above.
+
+Other files that are included in this archive are:
+
+license.txt     - The BSD license for Bstrlib
+gpl.txt         - The GPL version 2
+security.txt    - A security statement useful for auditing Bstrlib
+porting.txt     - A guide to porting Bstrlib
+bstrlib.txt     - This file
+
+===============================================================================
+
+The functions
+-------------
+
+    extern bstring bfromcstr (const char * str);
+
+    Take a standard C library style '\0' terminated char buffer and generate
+    a bstring with the same contents as the char buffer.  If an error occurs
+    NULL is returned.
+
+    So for example:
+
+    bstring b = bfromcstr ("Hello");
+    if (!b) {
+        fprintf (stderr, "Out of memory");
+    } else {
+        puts ((char *) b->data);
+    }
+
+    ..........................................................................
+
+    extern bstring bfromcstralloc (int mlen, const char * str);
+
+    Create a bstring which contains the contents of the '\0' terminated
+    char * buffer str.  The memory buffer backing the bstring is at least
+    mlen characters in length.  If an error occurs NULL is returned.
+
+    So for example:
+
+    bstring b = bfromcstralloc (64, someCstr);
+    if (b) b->data[63] = 'x';
+
+    The idea is that this will set the 64th character of b to 'x' if it is at
+    least 64 characters long otherwise do nothing.  And we know this is well
+    defined so long as b was successfully created, since it will have been
+    allocated with at least 64 characters.
+
+    ..........................................................................
+
+    extern bstring blk2bstr (const void * blk, int len);
+
+    Create a bstring whose contents are described by the contiguous buffer
+    pointed to by blk with a length of len bytes.  Note that this function
+    creates a copy of the data in blk, rather than simply referencing it.
+    Compare with the blk2tbstr macro.  If an error occurs NULL is returned.
+
+    ..........................................................................
+
+    extern char * bstr2cstr (const_bstring s, char z);
+
+    Create a '\0' terminated char buffer which contains the contents of the
+    bstring s, except that any contained '\0' characters are converted to the
+    character in z.  This returned value should be freed with bcstrfree(), by
+    the caller.  If an error occurs NULL is returned.
+
+    ..........................................................................
+
+    extern int bcstrfree (char * s);
+
+    Frees a C-string generated by bstr2cstr ().  This is normally unnecessary
+    since it just wraps a call to free (), however, if malloc () and free ()
+    have been redefined as macros within the bstrlib module (via macros in
+    the memdbg.h backdoor) with some difference in behaviour from the std
+    library functions, then this allows a correct way of freeing the memory
+    that allows higher level code to be independent from these macro
+    redefinitions.
+
+    ..........................................................................
+
+    extern bstring bstrcpy (const_bstring b1);
+
+    Make a copy of the passed in bstring.  The copied bstring is returned if
+    there is no error, otherwise NULL is returned.
+
+    ..........................................................................
+
+    extern int bassign (bstring a, const_bstring b);
+
+    Overwrite the bstring a with the contents of bstring b.  Note that the
+    bstring a must be a well defined and writable bstring.  If an error
+    occurs BSTR_ERR is returned and a is not overwritten.
+
+    ..........................................................................
+
+    int bassigncstr (bstring a, const char * str);
+
+    Overwrite the string a with the contents of char * string str.  Note that
+    the bstring a must be a well defined and writable bstring.  If an error
+    occurs BSTR_ERR is returned and a may be partially overwritten.
+
+    ..........................................................................
+
+    int bassignblk (bstring a, const void * s, int len);
+
+    Overwrite the string a with the contents of the block (s, len).  Note that
+    the bstring a must be a well defined and writable bstring.  If an error
+    occurs BSTR_ERR is returned and a is not overwritten.
+
+    ..........................................................................
+
+    extern int bassignmidstr (bstring a, const_bstring b, int left, int len);
+
+    Overwrite the bstring a with the middle of contents of bstring b
+    starting from position left and running for a length len.  left and
+    len are clamped to the ends of b as with the function bmidstr.  Note that
+    the bstring a must be a well defined and writable bstring.  If an error
+    occurs BSTR_ERR is returned and a is not overwritten.
+
+    ..........................................................................
+
+    extern bstring bmidstr (const_bstring b, int left, int len);
+
+    Create a bstring which is the substring of b starting from position left
+    and running for a length len (clamped by the end of the bstring b.)  If
+    there was no error, the value of this constructed bstring is returned
+    otherwise NULL is returned.
+
+    ..........................................................................
+
+    extern int bdelete (bstring s1, int pos, int len);
+
+    Removes characters from pos to pos+len-1 and shifts the tail of the
+    bstring starting from pos+len to pos.  len must be positive for this call
+    to have any effect.  The section of the bstring described by (pos, len)
+    is clamped to boundaries of the bstring s1.  The value BSTR_OK is returned
+    if the operation is successful, otherwise BSTR_ERR is returned.
+
+    ..........................................................................
+
+    extern int bconcat (bstring b0, const_bstring b1);
+
+    Concatenate the bstring b1 to the end of bstring b0.  The value BSTR_OK
+    is returned if the operation is successful, otherwise BSTR_ERR is
+    returned.
+
+    ..........................................................................
+
+    extern int bconchar (bstring b, char c);
+
+    Concatenate the character c to the end of bstring b.  The value BSTR_OK
+    is returned if the operation is successful, otherwise BSTR_ERR is
+    returned.
+
+    ..........................................................................
+
+    extern int bcatcstr (bstring b, const char * s);
+
+    Concatenate the char * string s to the end of bstring b.  The value
+    BSTR_OK is returned if the operation is successful, otherwise BSTR_ERR is
+    returned.
+
+    ..........................................................................
+
+    extern int bcatblk (bstring b, const void * s, int len);
+
+    Concatenate a fixed length buffer (s, len) to the end of bstring b.  The
+    value BSTR_OK is returned if the operation is successful, otherwise
+    BSTR_ERR is returned.
+
+    ..........................................................................
+
+    extern int biseq (const_bstring b0, const_bstring b1);
+
+    Compare the bstring b0 and b1 for equality.  If the bstrings differ, 0
+    is returned, if the bstrings are the same, 1 is returned, if there is an
+    error, -1 is returned.  If the length of the bstrings are different, this
+    function has O(1) complexity.  Contained '\0' characters are not treated
+    as a termination character.
+
+    Note that the semantics of biseq are not completely compatible with
+    bstrcmp because of its different treatment of the '\0' character.
+
+    ..........................................................................
+
+    extern int bisstemeqblk (const_bstring b, const void * blk, int len);
+
+    Compare beginning of bstring b with a block of memory of length len for
+    equality.  If the beginning of b differs from the memory block (or if b
+    is too short), 0 is returned, if they are the same, 1 is returned,
+    if there is an error, -1 is returned.
+
+    ..........................................................................
+
+    extern int biseqcaseless (const_bstring b0, const_bstring b1);
+
+    Compare two bstrings for equality without differentiating between case.
+    If the bstrings differ other than in case, 0 is returned, if the bstrings
+    are the same, 1 is returned, if there is an error, -1 is returned.  If
+    the length of the bstrings are different, this function is O(1).  '\0'
+    termination characters are not treated in any special way.
+
+    ..........................................................................
+
+    extern int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len);
+
+    Compare beginning of bstring b0 with a block of memory of length len
+    without differentiating between case for equality.  If the beginning of b0
+    differs from the memory block other than in case (or if b0 is too short),
+    0 is returned, if the bstrings are the same, 1 is returned, if there is an
+    error, -1 is returned.
+
+    ..........................................................................
+
+    extern int biseqcstr (const_bstring b, const char *s);
+
+    Compare the bstring b and char * string s.  The C string s must be '\0'
+    terminated at exactly the length of the bstring b, and the contents
+    between the two must be identical with the bstring b with no '\0'
+    characters for the two contents to be considered equal.  This is
+    equivalent to the condition that their current contents will always be
+    equal when comparing them in the same format after converting one or the
+    other.  If they are equal 1 is returned, if they are unequal 0 is
+    returned and if there is a detectable error BSTR_ERR is returned.
+
+    ..........................................................................
+
+    extern int biseqcstrcaseless (const_bstring b, const char *s);
+
+    Compare the bstring b and char * string s.  The C string s must be '\0'
+    terminated at exactly the length of the bstring b, and the contents
+    between the two must be identical except for case with the bstring b with
+    no '\0' characters for the two contents to be considered equal.  This is
+    equivalent to the condition that their current contents will always be
+    equal ignoring case when comparing them in the same format after
+    converting one or the other.  If they are equal, except for case, 1 is
+    returned, if they are unequal regardless of case 0 is returned and if
+    there is a detectable error BSTR_ERR is returned.
+
+    ..........................................................................
+
+    extern int bstrcmp (const_bstring b0, const_bstring b1);
+
+    Compare the bstrings b0 and b1 for ordering.  If there is an error,
+    SHRT_MIN is returned, otherwise a value less than or greater than zero,
+    indicating that the bstring pointed to by b0 is lexicographically less
+    than or greater than the bstring pointed to by b1 is returned.  If the
+    bstring lengths are unequal but the characters up until the length of the
+    shorter are equal then a value less than, or greater than zero,
+    indicating that the bstring pointed to by b0 is shorter or longer than the
+    bstring pointed to by b1 is returned.  0 is returned if and only if the
+    two bstrings are the same.  If the length of the bstrings are different,
+    this function is O(n).  Like its standard C library counterpart, the
+    comparison does not proceed past any '\0' termination characters
+    encountered.
+
+    The seemingly odd error return value, merely provides slightly more
+    granularity than the undefined situation given in the C library function
+    strcmp.  The function otherwise behaves very much like strcmp().
+
+    Note that the semantics of bstrcmp are not completely compatible with
+    biseq because of its different treatment of the '\0' termination
+    character.
+
+    ..........................................................................
+
+    extern int bstrncmp (const_bstring b0, const_bstring b1, int n);
+
+    Compare the bstrings b0 and b1 for ordering for at most n characters.  If
+    there is an error, SHRT_MIN is returned, otherwise a value is returned as
+    if b0 and b1 were first truncated to at most n characters then bstrcmp
+    was called with these new bstrings as parameters.  If the length of the
+    bstrings are different, this function is O(n).  Like its standard C
+    library counterpart, the comparison does not proceed past any '\0'
+    termination characters encountered.
+
+    The seemingly odd error return value, merely provides slightly more
+    granularity than the undefined situation given in the C library function
+    strncmp.  The function otherwise behaves very much like strncmp().
+
+    ..........................................................................
+
+    extern int bstricmp (const_bstring b0, const_bstring b1);
+
+    Compare two bstrings without differentiating between case.  The return
+    value is the difference of the values of the characters where the two
+    bstrings first differ, otherwise 0 is returned indicating that the
+    bstrings are equal.  If the lengths are different, then a difference from
+    0 is given, but if the first extra character is '\0', then it is taken to
+    be the value UCHAR_MAX+1.
+
+    ..........................................................................
+
+    extern int bstrnicmp (const_bstring b0, const_bstring b1, int n);
+
+    Compare two bstrings without differentiating between case for at most n
+    characters.  If the position where the two bstrings first differ is
+    before the nth position, the return value is the difference of the values
+    of the characters, otherwise 0 is returned.  If the lengths are different
+    and less than n characters, then a difference from 0 is given, but if the
+    first extra character is '\0', then it is taken to be the value
+    UCHAR_MAX+1.
+
+    ..........................................................................
+
+    extern int bdestroy (bstring b);
+
+    Deallocate the bstring passed.  Passing NULL in as a parameter will have
+    no effect.  Note that both the header and the data portion of the bstring
+    will be freed.  No other bstring function which modifies one of its
+    parameters will free or reallocate the header.  Because of this, in
+    general, bdestroy cannot be called on any declared struct tagbstring even
+    if it is not write protected.  A bstring which is write protected cannot
+    be destroyed via the bdestroy call.  Any attempt to do so will result in
+    no action taken, and BSTR_ERR will be returned.
+
+    Note to C++ users: Passing in a CBString cast to a bstring will lead to
+    undefined behavior (free will be called on the header, rather than the
+    CBString destructor.)  Instead just use the ordinary C++ language
+    facilities to dealloc a CBString.
+
+    ..........................................................................
+
+    extern int binstr (const_bstring s1, int pos, const_bstring s2);
+
+    Search for the bstring s2 in s1 starting at position pos and looking in a
+    forward (increasing) direction.  If it is found then it returns with the
+    first position after pos where it is found, otherwise it returns BSTR_ERR.
+    The algorithm used is brute force; O(m*n).
+
+    ..........................................................................
+
+    extern int binstrr (const_bstring s1, int pos, const_bstring s2);
+
+    Search for the bstring s2 in s1 starting at position pos and looking in a
+    backward (decreasing) direction.  If it is found then it returns with the
+    first position after pos where it is found, otherwise return BSTR_ERR.
+    Note that the current position at pos is tested as well -- so to be
+    disjoint from a previous forward search it is recommended that the
+    position be backed up (decremented) by one position.  The algorithm used
+    is brute force; O(m*n).
+
+    ..........................................................................
+
+    extern int binstrcaseless (const_bstring s1, int pos, const_bstring s2);
+
+    Search for the bstring s2 in s1 starting at position pos and looking in a
+    forward (increasing) direction but without regard to case.  If it is
+    found then it returns with the first position after pos where it is
+    found, otherwise it returns BSTR_ERR. The algorithm used is brute force;
+    O(m*n).
+
+    ..........................................................................
+
+    extern int binstrrcaseless (const_bstring s1, int pos, const_bstring s2);
+
+    Search for the bstring s2 in s1 starting at position pos and looking in a
+    backward (decreasing) direction but without regard to case.  If it is
+    found then it returns with the first position after pos where it is
+    found, otherwise return BSTR_ERR. Note that the current position at pos
+    is tested as well -- so to be disjoint from a previous forward search it
+    is recommended that the position be backed up (decremented) by one
+    position.  The algorithm used is brute force; O(m*n).
+
+    ..........................................................................
+
+    extern int binchr (const_bstring b0, int pos, const_bstring b1);
+
+    Search for the first position in b0 starting from pos or after, in which
+    one of the characters in b1 is found.  This function has an execution
+    time of O(b0->slen + b1->slen).  If such a position does not exist in b0,
+    then BSTR_ERR is returned.
+
+    ..........................................................................
+
+    extern int binchrr (const_bstring b0, int pos, const_bstring b1);
+
+    Search for the last position in b0 no greater than pos, in which one of
+    the characters in b1 is found.  This function has an execution time
+    of O(b0->slen + b1->slen).  If such a position does not exist in b0,
+    then BSTR_ERR is returned.
+
+    ..........................................................................
+
+    extern int bninchr (const_bstring b0, int pos, const_bstring b1);
+
+    Search for the first position in b0 starting from pos or after, in which
+    none of the characters in b1 is found and return it.  This function has
+    an execution time of O(b0->slen + b1->slen).  If such a position does
+    not exist in b0, then BSTR_ERR is returned.
+
+    ..........................................................................
+
+    extern int bninchrr (const_bstring b0, int pos, const_bstring b1);
+
+    Search for the last position in b0 no greater than pos, in which none of
+    the characters in b1 is found and return it.  This function has an
+    execution time of O(b0->slen + b1->slen).  If such a position does not
+    exist in b0, then BSTR_ERR is returned.
+
+    ..........................................................................
+
+    extern int bstrchr (const_bstring b, int c);
+
+    Search for the character c in the bstring b forwards from the start of
+    the bstring.  Returns the position of the found character or BSTR_ERR if
+    it is not found.
+
+    NOTE: This has been implemented as a macro on top of bstrchrp ().
+
+    ..........................................................................
+
+    extern int bstrrchr (const_bstring b, int c);
+
+    Search for the character c in the bstring b backwards from the end of the
+    bstring.  Returns the position of the found character or BSTR_ERR if it is
+    not found.
+
+    NOTE: This has been implemented as a macro on top of bstrrchrp ().
+
+    ..........................................................................
+
+    extern int bstrchrp (const_bstring b, int c, int pos);
+
+    Search for the character c in b forwards from the position pos
+    (inclusive).  Returns the position of the found character or BSTR_ERR if
+    it is not found.
+
+    ..........................................................................
+
+    extern int bstrrchrp (const_bstring b, int c, int pos);
+
+    Search for the character c in b backwards from the position pos in bstring
+    (inclusive).  Returns the position of the found character or BSTR_ERR if
+    it is not found.
+
+    ..........................................................................
+
+    extern int bsetstr (bstring b0, int pos, const_bstring b1, unsigned char fill);
+
+    Overwrite the bstring b0 starting at position pos with the bstring b1. If
+    the position pos is past the end of b0, then the character "fill" is
+    appended as necessary to make up the gap between the end of b0 and pos.
+    If b1 is NULL, it behaves as if it were a 0-length bstring. The value
+    BSTR_OK is returned if the operation is successful, otherwise BSTR_ERR is
+    returned.
+
+    ..........................................................................
+
+    extern int binsert (bstring s1, int pos, const_bstring s2, unsigned char fill);
+
+    Inserts the bstring s2 into s1 at position pos.  If the position pos is
+    past the end of s1, then the character "fill" is appended as necessary to
+    make up the gap between the end of s1 and pos.  The value BSTR_OK is
+    returned if the operation is successful, otherwise BSTR_ERR is returned.
+
+    ..........................................................................
+
+    extern int binsertch (bstring s1, int pos, int len, unsigned char fill);
+
+    Inserts the character fill repeatedly into s1 at position pos for a
+    length len.  If the position pos is past the end of s1, then the
+    character "fill" is appended as necessary to make up the gap between the
+    end of s1 and the position pos + len (exclusive).  The value BSTR_OK is
+    returned if the operation is successful, otherwise BSTR_ERR is returned.
+
+    ..........................................................................
+
+    extern int breplace (bstring b1, int pos, int len, const_bstring b2,
+                         unsigned char fill);
+
+    Replace a section of a bstring from pos for a length len with the bstring
+    b2. If the position pos is past the end of b1 then the character "fill"
+    is appended as necessary to make up the gap between the end of b1 and
+    pos.
+
+    ..........................................................................
+
+    extern int bfindreplace (bstring b, const_bstring find,
+                             const_bstring replace, int position);
+
+    Replace all occurrences of the find substring with a replace bstring
+    after a given position in the bstring b.  The find bstring must have a
+    length > 0 otherwise BSTR_ERR is returned.  This function does not
+    perform recursive per character replacement; that is to say successive
+    searches resume at the position after the last replace.
+
+    So for example:
+
+        bfindreplace (a0 = bfromcstr("aabaAb"), a1 = bfromcstr("a"),
+                      a2 = bfromcstr("aa"), 0);
+
+    Should result in changing a0 to "aaaabaaAb".
+
+    This function performs exactly (b->slen - position) bstring comparisons,
+    and data movement is bounded above by character volume equivalent to size
+    of the output bstring.
+
+    ..........................................................................
+
+    extern int bfindreplacecaseless (bstring b, const_bstring find,
+                             const_bstring replace, int position);
+
+    Replace all occurrences of the find substring, ignoring case, with a
+    replace bstring after a given position in the bstring b.  The find bstring
+    must have a length > 0 otherwise BSTR_ERR is returned.  This function
+    does not perform recursive per character replacement; that is to say
+    successive searches resume at the position after the last replace.
+
+    So for example:
+
+        bfindreplacecaseless (a0 = bfromcstr("AAbaAb"), a1 = bfromcstr("a"),
+                              a2 = bfromcstr("aa"), 0);
+
+    Should result in changing a0 to "aaaabaaaab".
+
+    This function performs exactly (b->slen - position) bstring comparisons,
+    and data movement is bounded above by character volume equivalent to size
+    of the output bstring.
+
+    ..........................................................................
+
+    extern int balloc (bstring b, int length);
+
+    Increase the allocated memory backing the data buffer for the bstring b
+    to a length of at least length.  If the memory backing the bstring b is
+    already large enough, no action is performed.  This has no effect on the
+    bstring b that is visible to the bstring API.  Usually this function will
+    only be used when a minimum buffer size is required coupled with a direct
+    access to the ->data member of the bstring structure.
+
+    Be warned that like any other bstring function, the bstring must be well
+    defined upon entry to this function.  I.e., doing something like:
+
+        b->slen *= 2; /* ?? Most likely incorrect */
+        balloc (b, b->slen);
+
+    is invalid, and should be implemented as:
+
+        int t;
+        if (BSTR_OK == balloc (b, t = (b->slen * 2))) b->slen = t;
+
+    This function will return with BSTR_ERR if b is not detected as a valid
+    bstring or length is not greater than 0, otherwise BSTR_OK is returned.
+
+    ..........................................................................
+
+    extern int ballocmin (bstring b, int length);
+
+    Change the amount of memory backing the bstring b to at least length.
+    This operation will never truncate the bstring data including the
+    extra terminating '\0' and thus will not decrease the length to less than
+    b->slen + 1.  Note that repeated use of this function may cause
+    performance problems (realloc may be called on the bstring more than
+    the O(log(INT_MAX)) times).  This function will return with BSTR_ERR if b
+    is not detected as a valid bstring or length is not greater than 0,
+    otherwise BSTR_OK is returned.
+
+    So for example:
+
+    if (BSTR_OK == ballocmin (b, 64)) b->data[63] = 'x';
+
+    The idea is that this will set the 64th character of b to 'x' if it is at
+    least 64 characters long otherwise do nothing.  And we know this is well
+    defined so long as the ballocmin call was successful, since it will
+    ensure that b has been allocated with at least 64 characters.
+
+    ..........................................................................
+
+    int btrunc (bstring b, int n);
+
+    Truncate the bstring to at most n characters.  This function will return
+    with BSTR_ERR if b is not detected as a valid bstring or n is less than
+    0, otherwise BSTR_OK is returned.
+
+    ..........................................................................
+
+    extern int bpattern (bstring b, int len);
+
+    Replicate the starting bstring, b, end to end repeatedly until it
+    surpasses len characters, then chop the result to exactly len characters.
+    This function operates in-place.  This function will return with BSTR_ERR
+    if b is NULL or of length 0, otherwise BSTR_OK is returned.
+
+    ..........................................................................
+
+    extern int btoupper (bstring b);
+
+    Convert contents of bstring to upper case.  This function will return with
+    BSTR_ERR if b is NULL or of length 0, otherwise BSTR_OK is returned.
+
+    ..........................................................................
+
+    extern int btolower (bstring b);
+
+    Convert contents of bstring to lower case.  This function will return with
+    BSTR_ERR if b is NULL or of length 0, otherwise BSTR_OK is returned.
+
+    ..........................................................................
+
+    extern int bltrimws (bstring b);
+
+    Delete whitespace contiguous from the left end of the bstring.  This
+    function will return with BSTR_ERR if b is NULL or of length 0, otherwise
+    BSTR_OK is returned.
+
+    ..........................................................................
+
+    extern int brtrimws (bstring b);
+
+    Delete whitespace contiguous from the right end of the bstring.  This
+    function will return with BSTR_ERR if b is NULL or of length 0, otherwise
+    BSTR_OK is returned.
+
+    ..........................................................................
+
+    extern int btrimws (bstring b);
+
+    Delete whitespace contiguous from both ends of the bstring.  This function
+    will return with BSTR_ERR if b is NULL or of length 0, otherwise BSTR_OK
+    is returned.
+
+    ..........................................................................
+
+    extern struct bstrList* bstrListCreate (void);
+
+    Create an empty struct bstrList. The struct bstrList output structure is
+    declared as follows:
+
+    struct bstrList {
+        int qty, mlen;
+        bstring * entry;
+    };
+
+    The entry field actually is an array with qty number entries.  The mlen
+    record counts the maximum number of bstring's for which there is memory
+    in the entry record.
+
+    The Bstrlib API does *NOT* include a comprehensive set of functions for
+    full management of struct bstrList in an abstracted way.  The reason for
+    this is because aliasing semantics of the list are best left to the user
+    of this function, and performance varies wildly depending on the
+    assumptions made.  For a complete list of bstring data type it is
+    recommended that the C++ public std::vector<CBString> be used, since its
+    semantics and usage are more standard.
+
+    ..........................................................................
+
+    extern int bstrListDestroy (struct bstrList * sl);
+
+    Destroy a struct bstrList structure that was returned by the bsplit
+    function.  Note that this will destroy each bstring in the ->entry array
+    as well.  See bstrListCreate() above for structure of struct bstrList.
+
+    ..........................................................................
+
+    extern int bstrListAlloc (struct bstrList * sl, int msz);
+
+    Ensure that there is memory for at least msz number of entries for the
+    list.
+
+    ..........................................................................
+
+    extern int bstrListAllocMin (struct bstrList * sl, int msz);
+
+    Try to allocate the minimum amount of memory for the list to include at
+    least msz entries or sl->qty whichever is greater.
+
+    ..........................................................................
+
+    extern struct bstrList * bsplit (bstring str, unsigned char splitChar);
+
+    Create an array of sequential substrings from str divided by the
+    character splitChar.  Successive occurrences of the splitChar will be
+    divided by empty bstring entries, following the semantics from the Python
+    programming language.  To reclaim the memory from this output structure,
+    bstrListDestroy () should be called.  See bstrListCreate() above for
+    structure of struct bstrList.
+
+    ..........................................................................
+
+    extern struct bstrList * bsplits (bstring str, const_bstring splitStr);
+
+    Create an array of sequential substrings from str divided by any
+    character contained in splitStr.  An empty splitStr causes a single entry
+    bstrList containing a copy of str to be returned.  See bstrListCreate()
+    above for structure of struct bstrList.
+
+    ..........................................................................
+
+    extern struct bstrList * bsplitstr (bstring str, const_bstring splitStr);
+
+    Create an array of sequential substrings from str divided by the entire
+    substring splitStr.  An empty splitStr causes a single entry bstrList
+    containing a copy of str to be returned.  See bstrListCreate() above for
+    structure of struct bstrList.
+
+    ..........................................................................
+
+    extern bstring bjoin (const struct bstrList * bl, const_bstring sep);
+
+    Join the entries of a bstrList into one bstring by sequentially
+    concatenating them with the sep bstring in between.  If sep is NULL, it
+    is treated as if it were the empty bstring.  Note that:
+
+        bjoin (l = bsplit (b, s->data[0]), s);
+
+    should result in a copy of b, if s->slen is 1.  If there is an error NULL
+    is returned, otherwise a bstring with the correct result is returned.
+    See bstrListCreate() above for structure of struct bstrList.
+
+    ..........................................................................
+
+    extern int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
+    int (* cb) (void * parm, int ofs, int len), void * parm);
+
+    Iterate the set of disjoint sequential substrings over str starting at
+    position pos divided by the character splitChar.  The parm passed to
+    bsplitcb is passed on to cb.  If the function cb returns a value < 0,
+    then further iterating is halted and this value is returned by bsplitcb.
+
+    Note: Non-destructive modification of str from within the cb function
+    while performing this split is well defined.  bsplitcb behaves in
+    sequential lock step with calls to cb.  I.e., after returning from a cb
+    that returns a non-negative integer, bsplitcb continues from the position
+    1 character after the last detected split character and it will halt
+    immediately if the length of str falls below this point.  However, if the
+    cb function destroys str, then it *must* return with a negative value,
+    otherwise bsplitcb will continue in an undefined manner.
+
+    This function is provided as an incremental alternative to bsplit that is
+    abortable and which does not impose additional memory allocation.
+
+    ..........................................................................
+
+    extern int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
+    int (* cb) (void * parm, int ofs, int len), void * parm);
+
+    Iterate the set of disjoint sequential substrings over str starting at
+    position pos divided by any of the characters in splitStr.  An empty
+    splitStr causes the whole str to be iterated once.  The parm passed to
+    bsplitscb is passed on to cb.  If the function cb returns a value < 0,
+    then further iterating is halted and this value is returned by bsplitscb.
+
+    Note: Non-destructive modification of str from within the cb function
+    while performing this split is well defined.  bsplitscb behaves in
+    sequential lock step with calls to cb.  I.e., after returning from a cb
+    that returns a non-negative integer, bsplitscb continues from the position
+    1 character after the last detected split character and it will halt
+    immediately if the length of str falls below this point.  However, if the
+    cb function destroys str, then it *must* return with a negative value,
+    otherwise bsplitscb will continue in an undefined manner.
+
+    This function is provided as an incremental alternative to bsplits that
+    is abortable and which does not impose additional memory allocation.
+
+    ..........................................................................
+
+    extern int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
+    int (* cb) (void * parm, int ofs, int len), void * parm);
+
+    Iterate the set of disjoint sequential substrings over str starting at
+    position pos divided by the entire substring splitStr.  An empty splitStr
+    causes each character of str to be iterated.  The parm passed to
+    bsplitstrcb is passed on to cb.  If the function cb returns a value < 0,
+    then further iterating is halted and this value is returned by
+    bsplitstrcb.
+
+    Note: Non-destructive modification of str from within the cb function
+    while performing this split is well defined.  bsplitstrcb behaves in
+    sequential lock step with calls to cb.  I.e., after returning from a cb
+    that returns a non-negative integer, bsplitstrcb continues from the position
+    1 character after the last detected split character and it will halt
+    immediately if the length of str falls below this point.  However, if the
+    cb function destroys str, then it *must* return with a negative value,
+    otherwise bsplitstrcb will continue in an undefined manner.
+
+    This function is provided as an incremental alternative to bsplitstr that
+    is abortable and which does not impose additional memory allocation.
+
+    ..........................................................................
+
+    extern bstring bformat (const char * fmt, ...);
+
+    Takes the same parameters as printf (), but rather than outputting
+    results to stdio, it forms a bstring which contains what would have been
+    output. Note that if there is an early generation of a '\0' character,
+    the bstring will be truncated to this end point.
+
+    Note that %s format tokens correspond to '\0' terminated char * buffers,
+    not bstrings.  To print a bstring, first dereference the data element of
+    the bstring:
+
+        /* b1->data needs to be '\0' terminated, so tagbstrings generated
+           by blk2tbstr () might not be suitable. */
+        b0 = bformat ("Hello, %s", b1->data);
+
+    Note that if the BSTRLIB_NOVSNP macro has been set when bstrlib has been
+    compiled the bformat function is not present.
+
+    ..........................................................................
+
+    extern int bformata (bstring b, const char * fmt, ...);
+
+    In addition to the initial output buffer b, bformata takes the same
+    parameters as printf (), but rather than outputting results to stdio, it
+    appends the results to the initial bstring parameter. Note that if
+    there is an early generation of a '\0' character, the bstring will be
+    truncated to this end point.
+
+    Note that %s format tokens correspond to '\0' terminated char * buffers,
+    not bstrings.  To print a bstring, first dereference the data element of
+    the bstring:
+
+        /* b1->data needs to be '\0' terminated, so tagbstrings generated
+           by blk2tbstr () might not be suitable. */
+        bformata (b0 = bfromcstr ("Hello"), ", %s", b1->data);
+
+    Note that if the BSTRLIB_NOVSNP macro has been set when bstrlib has been
+    compiled the bformata function is not present.
+
+    ..........................................................................
+
+    extern int bassignformat (bstring b, const char * fmt, ...);
+
+    After the first parameter, it takes the same parameters as printf (), but
+    rather than outputting results to stdio, it outputs the results to
+    the bstring parameter b. Note that if there is an early generation of a
+    '\0' character, the bstring will be truncated to this end point.
+
+    Note that %s format tokens correspond to '\0' terminated char * buffers,
+    not bstrings.  To print a bstring, first dereference the data element of
+    the bstring:
+
+        /* b1->data needs to be '\0' terminated, so tagbstrings generated
+           by blk2tbstr () might not be suitable. */
+        bassignformat (b0 = bfromcstr ("Hello"), ", %s", b1->data);
+
+    Note that if the BSTRLIB_NOVSNP macro has been set when bstrlib has been
+    compiled the bassignformat function is not present.
+
+    ..........................................................................
+
+    extern int bvcformata (bstring b, int count, const char * fmt, va_list arglist);
+
+    The bvcformata function formats data under control of the format control
+    string fmt and attempts to append the result to b.  The fmt parameter is
+    the same as that of the printf function.  The variable argument list is
+    replaced with arglist, which has been initialized by the va_start macro.
+    The size of the output is upper bounded by count.  If the required output
+    exceeds count, the string b is not augmented with any contents and a value
+    below BSTR_ERR is returned.  If a value below -count is returned then it
+    is recommended that the negative of this value be used as an update to the
+    count in a subsequent pass.  On other errors, such as running out of
+    memory, parameter errors or numeric wrap around BSTR_ERR is returned.
+    BSTR_OK is returned when the output is successfully generated and
+    appended to b.
+
+    Note: There is no sanity checking of arglist, and this function is
+    destructive of the contents of b from the b->slen point onward.  If there
+    is an early generation of a '\0' character, the bstring will be truncated
+    to this end point.
+
+    Although this function is part of the external API for Bstrlib, the
+    interface and semantics (length limitations, and unusual return codes)
+    are fairly atypical.  The real purpose for this function is to provide an
+    engine for the bvformata macro.
+
+    Note that if the BSTRLIB_NOVSNP macro has been set when bstrlib has been
+    compiled the bvcformata function is not present.
+
+    ..........................................................................
+
+    extern bstring bread (bNread readPtr, void * parm);
+    typedef size_t (* bNread) (void *buff, size_t elsize, size_t nelem,
+                               void *parm);
+
+    Read an entire stream into a bstring, verbatim.  The readPtr function
+    pointer is compatible with fread semantics, except that it need not obtain
+    the stream data from a file.  The intention is that parm would contain
+    the stream data context/state required (similar to the role of the FILE*
+    I/O stream parameter of fread.)
+
+    Abstracting the block read function allows for block devices other than
+    file streams to be read if desired.  Note that there is an ANSI
+    compatibility issue if "fread" is used directly; see the ANSI issues
+    section below.
+
+    ..........................................................................
+
+    extern int breada (bstring b, bNread readPtr, void * parm);
+
+    Read an entire stream and append it to a bstring, verbatim.  Behaves
+    like bread, except that it appends its results to the bstring b.
+    BSTR_ERR is returned on error, otherwise 0 is returned.
+
+    ..........................................................................
+
+    extern bstring bgets (bNgetc getcPtr, void * parm, char terminator);
+    typedef int (* bNgetc) (void * parm);
+
+    Read a bstring from a stream.  As many bytes as is necessary are read
+    until the terminator is consumed or no more characters are available from
+    the stream.  If read from the stream, the terminator character will be
+    appended to the end of the returned bstring.  The getcPtr function must
+    have the same semantics as the fgetc C library function (i.e., returning
+    an integer whose value is negative when there are no more characters
+    available, otherwise the value of the next available unsigned character
+    from the stream.)  The intention is that parm would contain the stream
+    data context/state required (similar to the role of the FILE* I/O stream
+    parameter of fgets.)  If no characters are read, or there is some other
+    detectable error, NULL is returned.
+
+    bgets will never call the getcPtr function more often than necessary to
+    construct its output (including a single call, if required, to determine
+    that the stream contains no more characters.)
+
+    Abstracting the character stream function and terminator character allows
+    for different stream devices and string formats other than '\n'
+    terminated lines in a file if desired (consider \032 terminated email
+    messages, in a UNIX mailbox for example.)
+
+    For files, this function can be used analogously as fgets as follows:
+
+        fp = fopen ( ... );
+        if (fp) b = bgets ((bNgetc) fgetc, fp, '\n');
+
+    (Note that only one terminator character can be used, and that '\0' is
+    not assumed to terminate the stream in addition to the terminator
+    character. This is consistent with the semantics of fgets.)
+
+    ..........................................................................
+
+    extern int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator);
+
+    Read from a stream and concatenate to a bstring.  Behaves like bgets,
+    except that it appends its results to the bstring b.  The value 1 is
+    returned if no characters are read before a negative result is returned
+    from getcPtr.  Otherwise BSTR_ERR is returned on error, and 0 is returned
+    in other normal cases.
+
+    ..........................................................................
+
+    extern int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator);
+
+    Read from a stream and assign to a bstring.  Behaves like bgets,
+    except that it assigns the results to the bstring b.  The value 1 is
+    returned if no characters are read before a negative result is returned
+    from getcPtr.  Otherwise BSTR_ERR is returned on error, and 0 is returned
+    in other normal cases.
+
+    ..........................................................................
+
+    extern struct bStream * bsopen (bNread readPtr, void * parm);
+
+    Wrap a given open stream (described by a fread compatible function
+    pointer and stream handle) into an open bStream suitable for the bstring
+    library streaming functions.
+
+    ..........................................................................
+
+    extern void * bsclose (struct bStream * s);
+
+    Close the bStream, and return the handle to the stream that was
+    originally used to open the given stream.  If s is NULL or detectably
+    invalid, NULL will be returned.
+
+    ..........................................................................
+
+    extern int bsbufflength (struct bStream * s, int sz);
+
+    Set the length of the buffer used by the bStream.  If sz is the macro
+    BSTR_BS_BUFF_LENGTH_GET (which is 0), the length is not set.  If s is
+    NULL or sz is negative, the function will return with BSTR_ERR, otherwise
+    this function returns with the previous length.
+
+    ..........................................................................
+
+    extern int bsreadln (bstring r, struct bStream * s, char terminator);
+
+    Read a bstring terminated by the terminator character or the end of the
+    stream from the bStream (s) and return it into the parameter r.  The
+    matched terminator, if found, appears at the end of the line read.  If
+    the stream has been exhausted of all available data, before any can be
+    read, BSTR_ERR is returned.  This function may read additional characters
+    into the stream buffer from the core stream that are not returned, but
+    will be retained for subsequent read operations.  When reading from high
+    speed streams, this function can perform significantly faster than bgets.
+
+    ..........................................................................
+
+    extern int bsreadlna (bstring r, struct bStream * s, char terminator);
+
+    Read a bstring terminated by the terminator character or the end of the
+    stream from the bStream (s) and concatenate it to the parameter r.  The
+    matched terminator, if found, appears at the end of the line read.  If
+    the stream has been exhausted of all available data, before any can be
+    read, BSTR_ERR is returned.  This function may read additional characters
+    into the stream buffer from the core stream that are not returned, but
+    will be retained for subsequent read operations.  When reading from high
+    speed streams, this function can perform significantly faster than bgets.
+
+    ..........................................................................
+
+    extern int bsreadlns (bstring r, struct bStream * s, bstring terminators);
+
+    Read a bstring terminated by any character in the terminators bstring or
+    the end of the stream from the bStream (s) and return it into the
+    parameter r. This function may read additional characters from the core
+    stream that are not returned, but will be retained for subsequent read
+    operations.
+
+    ..........................................................................
+
+    extern int bsreadlnsa (bstring r, struct bStream * s, bstring terminators);
+
+    Read a bstring terminated by any character in the terminators bstring or
+    the end of the stream from the bStream (s) and concatenate it to the
+    parameter r.  If the stream has been exhausted of all available data,
+    before any can be read, BSTR_ERR is returned.  This function may read
+    additional characters from the core stream that are not returned, but
+    will be retained for subsequent read operations.
+
+    ..........................................................................
+
+    extern int bsread (bstring r, struct bStream * s, int n);
+
+    Read a bstring of length n (or, if it is fewer, as many bytes as is
+    remaining) from the bStream.  This function will read the minimum
+    required number of additional characters from the core stream.  When the
+    stream is at the end of the file BSTR_ERR is returned, otherwise BSTR_OK
+    is returned.
+
+    ..........................................................................
+
+    extern int bsreada (bstring r, struct bStream * s, int n);
+
+    Read a bstring of length n (or, if it is fewer, as many bytes as is
+    remaining) from the bStream and concatenate it to the parameter r.  This
+    function will read the minimum required number of additional characters
+    from the core stream.  When the stream is at the end of the file BSTR_ERR
+    is returned, otherwise BSTR_OK is returned.
+
+    ..........................................................................
+
+    extern int bsunread (struct bStream * s, const_bstring b);
+
+    Insert a bstring into the bStream at the current position.  These
+    characters will be read prior to those that actually come from the core
+    stream.
+
+    ..........................................................................
+
+    extern int bspeek (bstring r, const struct bStream * s);
+
+    Return the number of currently buffered characters from the bStream that
+    will be read prior to reads from the core stream, and append it to the
+    parameter r.
+
+    ..........................................................................
+
+    extern int bssplitscb (struct bStream * s, const_bstring splitStr,
+    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
+
+    Iterate the set of disjoint sequential substrings over the stream s
+    divided by any character from the bstring splitStr.  The parm passed to
+    bssplitscb is passed on to cb.  If the function cb returns a value < 0,
+    then further iterating is halted and this return value is returned by
+    bssplitscb.
+
+    Note: At the point of calling the cb function, the bStream pointer is
+    pointed exactly at the position right after having read the split
+    character.  The cb function can act on the stream by causing the bStream
+    pointer to move, and bssplitscb will continue by starting the next split
+    at the position of the pointer after the return from cb.
+
+    However, if the cb causes the bStream s to be destroyed then the cb must
+    return with a negative value, otherwise bssplitscb will continue in an
+    undefined manner.
+
+    This function is provided as a way to incrementally parse through a file
+    or other generic stream that in total size may otherwise exceed the
+    practical or desired memory available.  As with the other split callback
+    based functions this is abortable and does not impose additional memory
+    allocation.
+
+    ..........................................................................
+
+    extern int bssplitstrcb (struct bStream * s, const_bstring splitStr,
+    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
+
+    Iterate the set of disjoint sequential substrings over the stream s
+    divided by the entire substring splitStr.  The parm passed to
+    bssplitstrcb is passed on to cb.  If the function cb returns a
+    value < 0, then further iterating is halted and this return value is
+    returned by bssplitstrcb.
+
+    Note: At the point of calling the cb function, the bStream pointer is
+    pointed exactly at the position right after having read the split
+    character.  The cb function can act on the stream by causing the bStream
+    pointer to move, and bssplitstrcb will continue by starting the next
+    split at the position of the pointer after the return from cb.
+
+    However, if the cb causes the bStream s to be destroyed then the cb must
+    return with a negative value, otherwise bssplitstrcb will continue in an
+    undefined manner.
+
+    This function is provided as a way to incrementally parse through a file
+    or other generic stream that in total size may otherwise exceed the
+    practical or desired memory available.  As with the other split callback
+    based functions this is abortable and does not impose additional memory
+    allocation.
+
+    ..........................................................................
+
+    extern int bseof (const struct bStream * s);
+
+    Return the defacto "EOF" (end of file) state of a stream (1 if the
+    bStream is in an EOF state, 0 if not, and BSTR_ERR if stream is closed or
+    detectably erroneous.)  When the readPtr callback returns a value <= 0
+    the stream reaches its "EOF" state. Note that bsunread with non-empty
+    content will essentially turn off this state, and the stream will not be
+    in its "EOF" state so long as it is possible to read more data out of it.
+
+    Also note that the semantics of bseof() are slightly different from
+    something like feof().  I.e., reaching the end of the stream does not
+    necessarily guarantee that bseof() will return with a value indicating
+    that this has happened.  bseof() will only return indicating that it has
+    reached the "EOF" and an attempt has been made to read past the end of
+    the bStream.
+
+The macros
+----------
+
+    The macros described below are shown in a prototype form indicating their
+    intended usage.  Note that the parameters passed to these macros will be
+    referenced multiple times.  As with all macros, programmer care is
+    required to guard against unintended side effects.
+
+    int blengthe (const_bstring b, int err);
+
+    Returns the length of the bstring.  If the bstring is NULL err is
+    returned.
+
+    ..........................................................................
+
+    int blength (const_bstring b);
+
+    Returns the length of the bstring.  If the bstring is NULL, the length
+    returned is 0.
+
+    ..........................................................................
+
+    int bchare (const_bstring b, int p, int c);
+
+    Returns the p'th character of the bstring b.  If the position p refers to
+    a position that does not exist in the bstring or the bstring is NULL,
+    then c is returned.
+
+    ..........................................................................
+
+    char bchar (const_bstring b, int p);
+
+    Returns the p'th character of the bstring b.  If the position p refers to
+    a position that does not exist in the bstring or the bstring is NULL,
+    then '\0' is returned.
+
+    ..........................................................................
+
+    char * bdatae (bstring b, char * err);
+
+    Returns the char * data portion of the bstring b.  If b is NULL, err is
+    returned.
+
+    ..........................................................................
+
+    char * bdata (bstring b);
+
+    Returns the char * data portion of the bstring b.  If b is NULL, NULL is
+    returned.
+
+    ..........................................................................
+
+    char * bdataofse (bstring b, int ofs, char * err);
+
+    Returns the char * data portion of the bstring b offset by ofs.  If b is
+    NULL, err is returned.
+
+    ..........................................................................
+
+    char * bdataofs (bstring b, int ofs);
+
+    Returns the char * data portion of the bstring b offset by ofs.  If b is
+    NULL, NULL is returned.
+
+    ..........................................................................
+
+    struct tagbstring var = bsStatic ("...");
+
+    The bsStatic macro allows for static declarations of literal string
+    constants as struct tagbstring structures.  The resulting tagbstring does
+    not need to be freed or destroyed.  Note that this macro is only well
+    defined for string literal arguments.  For more general string pointers,
+    use the btfromcstr macro.
+
+    The resulting struct tagbstring is permanently write protected.  Attempts
+    to write to this struct tagbstring from any bstrlib function will lead to
+    BSTR_ERR being returned.  Invoking the bwriteallow macro onto this struct
+    tagbstring has no effect.
+
+    ..........................................................................
+
+    <void * blk, int len> <- bsStaticBlkParms ("...")
+
+    The bsStaticBlkParms macro emits a pair of comma separated parameters
+    corresponding to the block parameters for the block functions in Bstrlib
+    (i.e., blk2bstr, bcatblk, blk2tbstr, bisstemeqblk, bisstemeqcaselessblk.)
+    Note that this macro is only well defined for string literal arguments.
+
+    Examples:
+
+    bstring b = blk2bstr (bsStaticBlkParms ("Fast init. "));
+    bcatblk (b, bsStaticBlkParms ("No frills fast concatenation."));
+
+    These are faster than using bfromcstr() and bcatcstr() respectively
+    because the length of the inline string is known as a compile time
+    constant.  Also note that separate struct tagbstring declarations for
+    holding the output of a bsStatic() macro are not required.
+
+    ..........................................................................
+
+    void btfromcstr (struct tagbstring& t, const char * s);
+
+    Fill in the tagbstring t with the '\0' terminated char buffer s.  This
+    action is purely reference oriented; no memory management is done.  The
+    data member is just assigned s, and slen is assigned the strlen of s.
+    The s parameter is accessed exactly once in this macro.
+
+    The resulting struct tagbstring is initially write protected.  Attempts
+    to write to this struct tagbstring in a write protected state from any
+    bstrlib function will lead to BSTR_ERR being returned.  Invoke the
+    bwriteallow on this struct tagbstring to make it writeable (though this
+    requires that s be obtained from a function compatible with malloc.)
+
+    ..........................................................................
+
+    void btfromblk (struct tagbstring& t, void * s, int len);
+
+    Fill in the tagbstring t with the data buffer s with length len.  This
+    action is purely reference oriented; no memory management is done.  The
+    data member of t is just assigned s, and slen is assigned len.  Note that
+    the buffer is not appended with a '\0' character.  The s and len
+    parameters are accessed exactly once each in this macro.
+
+    The resulting struct tagbstring is initially write protected.  Attempts
+    to write to this struct tagbstring in a write protected state from any
+    bstrlib function will lead to BSTR_ERR being returned.  Invoke the
+    bwriteallow on this struct tagbstring to make it writeable (though this
+    requires that s be obtained from a function compatible with malloc.)
+
+    ..........................................................................
+
+    void btfromblkltrimws (struct tagbstring& t, void * s, int len);
+
+    Fill in the tagbstring t with the data buffer s with length len after it
+    has been left trimmed.  This action is purely reference oriented; no
+    memory management is done.  The data member of t is just assigned to a
+    pointer inside the buffer s.  Note that the buffer is not appended with a
+    '\0' character.  The s and len parameters are accessed exactly once each
+    in this macro.
+
+    The resulting struct tagbstring is permanently write protected.  Attempts
+    to write to this struct tagbstring from any bstrlib function will lead to
+    BSTR_ERR being returned.  Invoking the bwriteallow macro onto this struct
+    tagbstring has no effect.
+
+    ..........................................................................
+
+    void btfromblkrtrimws (struct tagbstring& t, void * s, int len);
+
+    Fill in the tagbstring t with the data buffer s with length len after it
+    has been right trimmed.  This action is purely reference oriented; no
+    memory management is done.  The data member of t is just assigned to a
+    pointer inside the buffer s.  Note that the buffer is not appended with a
+    '\0' character.  The s and len parameters are accessed exactly once each
+    in this macro.
+
+    The resulting struct tagbstring is permanently write protected.  Attempts
+    to write to this struct tagbstring from any bstrlib function will lead to
+    BSTR_ERR being returned.  Invoking the bwriteallow macro onto this struct
+    tagbstring has no effect.
+
+    ..........................................................................
+
+    void btfromblktrimws (struct tagbstring& t, void * s, int len);
+
+    Fill in the tagbstring t with the data buffer s with length len after it
+    has been left and right trimmed.  This action is purely reference
+    oriented; no memory management is done.  The data member of t is just
+    assigned to a pointer inside the buffer s.  Note that the buffer is not
+    appended with a '\0' character.  The s and len parameters are accessed
+    exactly once each in this macro.
+
+    The resulting struct tagbstring is permanently write protected.  Attempts
+    to write to this struct tagbstring from any bstrlib function will lead to
+    BSTR_ERR being returned.  Invoking the bwriteallow macro onto this struct
+    tagbstring has no effect.
+
+    ..........................................................................
+
+    void bmid2tbstr (struct tagbstring& t, bstring b, int pos, int len);
+
+    Fill the tagbstring t with the substring from b, starting from position
+    pos with a length len.  The segment is clamped by the boundaries of
+    the bstring b.  This action is purely reference oriented; no memory
+    management is done.  Note that the buffer is not appended with a '\0'
+    character.  Note that the t parameter to this macro may be accessed
+    multiple times.  Note that the contents of t will become undefined
+    if the contents of b change or are destroyed.
+
+    The resulting struct tagbstring is permanently write protected.  Attempts
+    to write to this struct tagbstring in a write protected state from any
+    bstrlib function will lead to BSTR_ERR being returned.  Invoking the
+    bwriteallow macro on this struct tagbstring will have no effect.
+
+    ..........................................................................
+
+    void bvformata (int& ret, bstring b, const char * format, lastarg);
+
+    Append the bstring b with printf like formatting with the format control
+    string, and the arguments taken from the ... list of arguments after
+    lastarg passed to the containing function.  If the containing function
+    does not have ... parameters or lastarg is not the last named parameter
+    before the ... then the results are undefined.  If successful, the
+    results are appended to b and BSTR_OK is assigned to ret.  Otherwise
+    BSTR_ERR is assigned to ret.
+
+    Example:
+
+    void dbgerror (FILE * fp, const char * fmt, ...) {
+        int ret;
+        bstring b;
+        bvformata (ret, b = bfromcstr ("DBG: "), fmt, fmt);
+        if (BSTR_OK == ret) fputs ((char *) bdata (b), fp);
+        bdestroy (b);
+    }
+
+    Note that if the BSTRLIB_NOVSNP macro was set when bstrlib had been
+    compiled the bvformata macro will not link properly.  If the
+    BSTRLIB_NOVSNP macro has been set, the bvformata macro will not be
+    available.
+
+    ..........................................................................
+
+    void bwriteprotect (struct tagbstring& t);
+
+    Disallow bstring from being written to via the bstrlib API.  Attempts to
+    write to the resulting tagbstring from any bstrlib function will lead to
+    BSTR_ERR being returned.
+
+    Note: bstrings which are write protected cannot be destroyed via bdestroy.
+
+    Note to C++ users: Setting a CBString as write protected will not prevent
+    it from being destroyed by the destructor.
+
+    ..........................................................................
+
+    void bwriteallow (struct tagbstring& t);
+
+    Allow bstring to be written to via the bstrlib API.  Note that such an
+    action makes the bstring both writable and destroyable.  If the bstring is
+    not legitimately writable (as is the case for struct tagbstrings
+    initialized with a bsStatic value), the results of this are undefined.
+
+    Note that invoking the bwriteallow macro may increase the number of
+    reallocs by one more than necessary for every call to bwriteallow
+    interleaved with any bstring API which writes to this bstring.
+
+    ..........................................................................
+
+    int biswriteprotected (struct tagbstring& t);
+
+    Returns 1 if the bstring is write protected, otherwise 0 is returned.
+
+===============================================================================
+
+The bstest module
+-----------------
+
+The bstest module is just a unit test for the bstrlib module.  For correct
+implementations of bstrlib, it should execute with 0 failures being reported.
+This test should be utilized if modifications/customizations to bstrlib have
+been performed.  It tests each core bstrlib function with bstrings of every
+mode (read-only, NULL, static and mutable) and ensures that the expected
+semantics are observed (including results that should indicate an error). It
+also tests for aliasing support.  Passing bstest is a necessary but not a
+sufficient condition for ensuring the correctness of the bstrlib module.
+
+
+The test module
+---------------
+
+The test module is just a unit test for the bstrwrap module.  For correct
+implementations of bstrwrap, it should execute with 0 failures being
+reported.  This test should be utilized if modifications/customizations to
+bstrwrap have been performed.  It tests each core bstrwrap function with
+CBStrings write protected or not and ensures that the expected semantics are
+observed (including expected exceptions.)  Note that exceptions cannot be
+disabled to run this test.  Passing test is a necessary but not a sufficient
+condition for ensuring the correctness of the bstrwrap module.
+
+===============================================================================
+
+Using Bstring and CBString as an alternative to the C library
+-------------------------------------------------------------
+
+First let us give a table of C library functions and the alternative bstring
+functions and CBString methods that should be used instead of them.
+
+C-library         Bstring alternative             CBString alternative
+---------         -------------------             --------------------
+gets              bgets                           ::gets
+strcpy            bassign                         = operator
+strncpy           bassignmidstr                   ::midstr
+strcat            bconcat                         += operator
+strncat           bconcat + btrunc                += operator + ::trunc
+strtok            bsplit, bsplits                 ::split
+sprintf           b(assign)format                 ::format
+snprintf          b(assign)format + btrunc        ::format + ::trunc
+vsprintf          bvformata                       bvformata
+
+vsnprintf         bvformata + btrunc              bvformata + btrunc
+vfprintf          bvformata + fputs               use bvformata + fputs
+strcmp            biseq, bstrcmp                  comparison operators.
+strncmp           bstrncmp, memcmp                bstrncmp, memcmp
+strlen            ->slen, blength                 ::length
+strdup            bstrcpy                         constructor
+strset            bpattern                        ::fill
+strstr            binstr                          ::find
+strpbrk           binchr                          ::findchr
+stricmp           bstricmp                        cast & use bstricmp
+strlwr            btolower                        cast & use btolower
+strupr            btoupper                        cast & use btoupper
+strrev            bReverse (aux module)           cast & use bReverse
+strchr            bstrchr                         cast & use bstrchr
+strspnp           use strspn                      use strspn
+ungetc            bsunread                        bsunread
+
+The top 9 C functions listed here are troublesome in that they impose memory
+management in the calling function.  The Bstring and CBString interfaces have
+built-in memory management, so there is far less code with far less potential
+for buffer overrun problems.  strtok can only be reliably called as a "leaf"
+calculation, since it (quite bizarrely) maintains hidden internal state.  And
+gets is well known to be broken no matter what.  The Bstrlib alternatives do
+not suffer from those sorts of problems.
+
+The substitute for strncat can be performed with higher performance by using
+the blk2tbstr macro to create a presized second operand for bconcat.
+
+C-library         Bstring alternative             CBString alternative
+---------         -------------------             --------------------
+strspn            strspn acceptable               strspn acceptable
+strcspn           strcspn acceptable              strcspn acceptable
+strnset           strnset acceptable              strnset acceptable
+printf            printf acceptable               printf acceptable
+puts              puts acceptable                 puts acceptable
+fprintf           fprintf acceptable              fprintf acceptable
+fputs             fputs acceptable                fputs acceptable
+memcmp            memcmp acceptable               memcmp acceptable
+
+Remember that Bstring (and CBString) functions will automatically append the
+'\0' character to the character data buffer.  So by simply accessing the data
+buffer directly, ordinary C string library functions can be called directly
+on them.  Note that bstrcmp is not the same as memcmp in exactly the same way
+that strcmp is not the same as memcmp.
+
+C-library         Bstring alternative             CBString alternative
+---------         -------------------             --------------------
+fread             balloc + fread                  ::alloc + fread
+fgets             balloc + fgets                  ::alloc + fgets
+
+These are odd ones because of the exact sizing of the buffer required.  The
+Bstring and CBString alternatives require that the buffers are forced to
+hold at least the prescribed length, then just use fread or fgets directly.
+However, typically the automatic memory management of Bstring and CBString
+will make the typical use of fgets and fread to read specifically sized
+strings unnecessary.
+
+Implementation Choices
+----------------------
+
+Overhead:
+.........
+
+The bstring library has more overhead versus straight char buffers for most
+functions.  This overhead is essentially just the memory management and
+string header allocation.  This overhead usually only shows up for small
+string manipulations.  The performance loss has to be considered in
+light of the following:
+
+1) What would be the performance loss of trying to write this management
+   code in one's own application?
+2) Since the bstring library source code is given, a sufficiently powerful
+   modern inlining globally optimizing compiler can remove function call
+   overhead.
+
+Since the data type is exposed, a developer can replace any unsatisfactory
+function with their own inline implementation.  And that is besides the main
+point of what the better string library is mainly meant to provide.  Any
+overhead lost has to be compared against the value of the safe abstraction
+for coupling memory management and string functionality.
+
+Performance of the C interface:
+...............................
+
+The algorithms used have performance advantages versus the analogous C
+library functions.  For example:
+
+1. bfromcstr/blk2bstr/bstrcpy versus strcpy/strdup.  By using memmove instead
+   of strcpy, the break condition of the copy loop is based on an independent
+   counter (that should be allocated in a register) rather than having to
+   check the results of the load.  Modern out-of-order executing CPUs can
+   parallelize the final branch mis-predict penalty with the loading of the
+   source string.  Some CPUs will also tend to have better built-in hardware
+   support for counted memory moves than load-compare-store.  (This is a
+   minor, but non-zero gain.)
+2. biseq versus strcmp.  If the strings are unequal in length, biseq will
+   return in O(1) time.  If the strings are aliased, or have aliased data
+   buffers, biseq will return in O(1) time.  strcmp will always be O(k),
+   where k is the length of the common prefix or the whole string if they are
+   identical.
+3. ->slen versus strlen.  ->slen is obviously always O(1), while strlen is
+   always O(n) where n is the length of the string.
+4. bconcat versus strcat.  Both rely on precomputing the length of the
+   destination string argument, which will favor the bstring library.  On
+   iterated concatenations the performance difference can be enormous.
+5. bsreadln versus fgets.  The bsreadln function reads large blocks at a time
+   from the given stream, then parses out lines from the buffers directly.
+   Some C libraries will implement fgets as a loop over single fgetc calls.
+   Testing indicates that the bsreadln approach can be several times faster
+   for fast stream devices (such as a file that has been entirely cached.)
+6. bsplits/bsplitscb versus strspn.  Accelerators for the set of match
+   characters are generated only once.
+7. binstr versus strstr.  The binstr implementation unrolls the loops to
+   help reduce loop overhead.  This will matter if the target string is
+   long and source string is not found very early in the target string.
+   With strstr, while it is possible to unroll the source contents, it is
+   not possible to do so with the destination contents in a way that is
+   effective because every destination character must be tested against
+   '\0' before proceeding to the next character.
+8. bReverse versus strrev.  The C function must find the end of the string
+   first before swapping character pairs.
+9. bstrrchr versus no comparable C function.  It's not hard to write some C
+   code to search for a character from the end going backwards.  But there
+   is no way to do this without computing the length of the string with
+   strlen.
+
+Practical testing indicates that in general Bstrlib is never significantly
+slower than the C library for common operations, while very often having a
+performance advantage that ranges from significant to massive.  Even for
+functions like b(n)inchr versus str(c)spn() (where, in theory, there is no
+advantage for the Bstrlib architecture) the performance of Bstrlib is vastly
+superior to most tested C library implementations.
+
+Some of Bstrlib's extra functionality also leads to inevitable performance
+advantages over typical C solutions.  For example, using the blk2tbstr macro,
+one can (in O(1) time) generate an internal substring by reference while not
+disturbing the original string.  If disturbing the original string is not an
+option, typically, a comparable char * solution would have to make a copy of
+the substring to provide similar functionality.  Another example is reverse
+character set scanning -- the str(c)spn functions only scan in a forward
+direction which can complicate some parsing algorithms.
+
+Where high performance char * based algorithms are available, Bstrlib can
+still leverage them by accessing the ->data field on bstrings.  So
+realistically Bstrlib can never be significantly slower than any standard
+'\0' terminated char * based solutions.
+
+Performance of the C++ interface:
+.................................
+
+The C++ interface has been designed with an emphasis on abstraction and safety
+first.  However, since it is substantially a wrapper for the C bstring
+functions, for longer strings the performance comments described in the
+"Performance of the C interface" section above still apply. Note that the
+(CBString *) type can be directly cast to a (bstring) type, and passed as
+parameters to the C functions (though a CBString must never be passed to
+bdestroy.)
+
+Probably the most controversial choice is performing full bounds checking on
+the [] operator.  This decision was made because 1) the fast alternative of
+not bounds checking is still available by first casting the CBString to a
+(const char *) buffer or to a (struct tagbstring) then dereferencing .data and
+2) because the lack of bounds checking is seen as one of the main weaknesses
+of C/C++ versus other languages.  This check being done on every access leads
+to individual character extraction being actually slower than other languages
+in this one respect (other languages' compilers will normally dedicate more
+resources on hoisting or removing bounds checking as necessary) but otherwise
+brings C++ up to the level of other languages in terms of functionality.
+
+It is common for other C++ libraries to leverage the abstractions provided by
+C++ to use reference counting and "copy on write" policies.  While these
+techniques can speed up some scenarios, they impose a problem with respect to
+thread safety.  bstrings and CBStrings can be properly protected with
+"per-object" mutexes, meaning that two bstrlib calls can be made and execute
+simultaneously, so long as the bstrings and CBStrings are distinct.  With a
+reference count and alias before copy on write policy, global mutexes are
+required that prevent multiple calls to the strings library to execute
+simultaneously regardless of whether or not the strings represent the same
+string.
+
+One interesting trade off in CBString is that the default constructor is not
+trivial.  I.e., it always prepares a ready to use memory buffer.  The purpose
+is to ensure that there is a uniform internal composition for any functioning
+CBString that is compatible with bstrings.  It also means that the other
+methods in the class are not forced to perform "late initialization" checks.
+In the end it means that construction of CBStrings is slower than other
+comparable C++ string classes.  Initial testing, however, indicates that
+CBString outperforms std::string and MFC's CString, for example, in all other
+operations.  So to work around this weakness it is recommended that CBString
+declarations be pushed outside of inner loops.
+
+Practical testing indicates that with the exception of the caveats given
+above (constructors and safe index character manipulations) the C++ API for
+Bstrlib generally outperforms popular standard C++ string classes.  Amongst
+the standard libraries and compilers, the quality of concatenation operations
+varies wildly and very little care has gone into search functions.  Bstrlib
+dominates those performance benchmarks.
+
+Memory management:
+..................
+
+The bstring functions which write and modify bstrings will automatically
+reallocate the backing memory for the char buffer whenever it is required to
+grow.  The algorithm for resizing chosen is to snap up to sizes that are a
+power of two which are sufficient to hold the intended new size.  Memory
+reallocation is not performed when the required size of the buffer is
+decreased.  This behavior can be relied on, and is necessary to make the
+behaviour of balloc deterministic.  This trades off additional memory usage
+for decreasing the frequency for required reallocations:
+
+1. For any bstring whose size never exceeds n, its buffer is not ever
+   reallocated more than log_2(n) times for its lifetime.
+2. For any bstring whose size never exceeds n, its buffer is never more than
+   2*(n+1) in length.  (The extra characters beyond 2*n are to allow for the
+   implicit '\0' which is always added by the bstring modifying functions.)
+
+Decreasing the buffer size when the string decreases in size would violate 1)
+above and in real-world cases lead to pathological heap thrashing.  Similarly,
+allocating more tightly than "least power of 2 greater than necessary" would
+lead to a violation of 1) and have the same potential for heap thrashing.
+
+Property 2) needs emphasizing.  Although the memory allocated is always a
+power of 2, for a bstring that grows linearly in size, its buffer memory also
+grows linearly, not exponentially.  The reason is that the amount of extra
+space increases with each reallocation, which decreases the frequency of
+future reallocations.
+
+Obviously, given that bstring writing functions may reallocate the data
+buffer backing the target bstring, one should not attempt to cache the data
+buffer address and use it after such bstring functions have been called.
+This includes making reference struct tagbstrings which alias to a writable
+bstring.
+
+balloc or bfromcstralloc can be used to preallocate the minimum amount of
+space used for a given bstring.  This will reduce even further the number of
+times the data portion is reallocated.  If the length of the string is never
+more than one less than the memory length then there will be no further
+reallocations.
+
+Note that invoking the bwriteallow macro may increase the number of reallocs
+by one more than necessary for every call to bwriteallow interleaved with any
+bstring API which writes to this bstring.
+
+The library does not use any mechanism for automatic clean up for the C API.
+Thus explicit clean up via calls to bdestroy() are required to avoid memory
+leaks.
+
+Constant and static tagbstrings:
+................................
+
+A struct tagbstring can be write protected from any bstrlib function using
+the bwriteprotect macro.  A write protected struct tagbstring can then be
+reset to being writable via the bwriteallow macro.  There is, of course, no
+protection from attempts to directly access the bstring members.  Modifying a
+bstring which is write protected by direct access has undefined behavior.
+
+static struct tagbstrings can be declared via the bsStatic macro.  They are
+considered permanently unwritable.  Such struct tagbstrings are declared
+such that attempts to write to it are not well defined.  Invoking either
+bwriteallow or bwriteprotect on static struct tagbstrings has no effect.
+
+struct tagbstring's initialized via btfromcstr or blk2tbstr are protected by
+default but can be made writeable via the bwriteallow macro.  If bwriteallow
+is called on such struct tagbstring's, it is the programmer's responsibility
+to ensure that:
+
+1) the buffer supplied was allocated from the heap.
+2) bdestroy is not called on this tagbstring (unless the header itself has
+   also been allocated from the heap.)
+3) free is called on the buffer to reclaim its memory.
+
+bwriteallow and bwriteprotect can be invoked on ordinary bstrings (they have
+to be dereferenced with the (*) operator to get the levels of indirection
+correct) to give them write protection.
+
+Buffer declaration:
+...................
+
+The memory buffer is actually declared "unsigned char *" instead of "char *".
+The reason for this is to trigger compiler warnings whenever uncasted char
+buffers are assigned to the data portion of a bstring.  This will draw more
+diligent programmers into taking a second look at the code where they
+have carelessly left off the typically required cast.  (Research from
+AT&T/Lucent indicates that additional programmer eyeballs is one of the most
+effective mechanisms at ferreting out bugs.)
+
+Function pointers:
+..................
+
+The bgets, bread and bStream functions use function pointers to obtain
+strings from data streams.  The function pointer declarations have been
+specifically chosen to be compatible with the fgetc and fread functions.
+While this may seem to be a convoluted way of implementing fgets and fread
+style functionality, it has been specifically designed this way to ensure
+that there is no dependency on a single narrowly defined set of device
+interfaces, such as just stream I/O.  In the embedded world, it's quite
+possible to have environments where such interfaces may not exist in the
+standard C library form.  Furthermore, the generalization that this opens up
+allows for more sophisticated uses for these functions (performing an fgets
+like function on a socket, for example.) By using function pointers, it also
+allows such abstract stream interfaces to be created using the bstring library
+itself while not creating a circular dependency.
+
+Use of int's for sizes:
+.......................
+
+This is just a recognition that 16bit platforms with requirements for strings
+that are larger than 64K and 32bit+ platforms with requirements for strings
+that are larger than 4GB are pretty marginal.  The main focus is for 32bit
+platforms, and emerging 64bit platforms with reasonable < 4GB string
+requirements.  Using ints allows for negative values which have meaning
+internally to bstrlib.
+
+Semantic consideration:
+.......................
+
+Certain care needs to be taken when copying and aliasing bstrings.  A bstring
+is essentially a pointer type which points to a multipart abstract data
+structure.  Thus usage, and lifetime of bstrings have semantics that follow
+these considerations.  For example:
+
+    bstring a, b;
+    struct tagbstring t;
+
+    a = bfromcstr("Hello"); /* Create new bstring and copy "Hello" into it. */
+    b = a;                  /* Alias b to the contents of a.                */
+    t = *a;                 /* Create a current instance pseudo-alias of a. */
+    bconcat (a, b);         /* Double a and b, t is now undefined.          */
+    bdestroy (a);           /* Destroy the contents of both a and b.        */
+
+Variables of type bstring are really just references that point to real
+bstring objects.  The equal operator (=) creates aliases, and the asterisk
+dereference operator (*) creates a kind of alias to the current instance (which
+is generally not useful for any purpose.)  Using bstrcpy() is the correct way
+of creating duplicate instances.  The ampersand operator (&) is useful for
+creating aliases to struct tagbstrings (remembering that constructed struct
+tagbstrings are not writable by default.)
+
+CBStrings use complete copy semantics for the equal operator (=), and thus do
+not have these sorts of issues.
+
+Debugging:
+..........
+
+Bstrings have a simple, exposed definition and construction, and the library
+itself is open source.  So most debugging is going to be fairly straight-
+forward.  But the memory for bstrings comes from the heap, which can often be
+corrupted indirectly, and it might not be obvious what has happened even from
+direct examination of the contents in a debugger or a core dump.  There are
+some tools such as Purify, Insure++ and Electric Fence which can help solve
+such problems, however another common approach is to directly instrument the
+calls to malloc, realloc, calloc, free, memcpy, memmove and/or other calls
+by overriding them with macro definitions.
+
+Although the user could hack on the Bstrlib sources directly as necessary to
+perform such an instrumentation, Bstrlib comes with a built-in mechanism for
+doing this.  By defining the macro BSTRLIB_MEMORY_DEBUG and providing an
+include file named memdbg.h this will force the core Bstrlib modules to
+attempt to include this file.  In such a file, macros could be defined which
+override Bstrlib's usage of the C standard library.
+
+Rather than calling malloc, realloc, free, memcpy or memmove directly, Bstrlib
+emits the macros bstr__alloc, bstr__realloc, bstr__free, bstr__memcpy and
+bstr__memmove in their place respectively.  By default these macros are simply
+assigned to be equivalent to their corresponding C standard library function
+call.  However, if they are given earlier macro definitions (via the back
+door include file) they will not be given their default definition.  In this
+way Bstrlib's interface to the standard library can be changed but without
+having to directly redefine or link standard library symbols (both of which
+are not strictly ANSI C compliant.)
+
+An example definition might include:
+
+    #define bstr__alloc(sz) X_malloc ((sz), __LINE__, __FILE__)
+
+which might help contextualize heap entries in a debugging environment.
+
+The NULL parameter and sanity checking of bstrings is part of the Bstrlib
+API, and thus Bstrlib itself does not present any different modes which would
+correspond to "Debug" or "Release" modes.  Bstrlib always contains mechanisms
+which one might think of as debugging features, but retains the performance
+and small memory footprint one would normally associate with release mode
+code.
+
+Integration with Microsoft's Visual Studio debugger:
+....................................................
+
+Microsoft's Visual Studio debugger has a capability of customizable mouse
+float over data type descriptions.  This is accomplished by editing the
+AUTOEXP.DAT file to include the following:
+
+    ; new for CBString
+    tagbstring =slen=<slen> mlen=<mlen> <data,st>
+    Bstrlib::CBStringList =count=<size()>
+
+In Visual C++ 6.0 this file is located in the directory:
+
+    C:\Program Files\Microsoft Visual Studio\Common\MSDev98\Bin
+
+and in Visual Studio .NET 2003 it's located here:
+
+    C:\Program Files\Microsoft Visual Studio .NET 2003\Common7\Packages\Debugger
+
+This will improve the ability of debugging with Bstrlib under Visual Studio.
+
+Security
+--------
+
+Bstrlib does not come with explicit security features outside of its fairly
+comprehensive error detection, coupled with its strict semantic support.
+That is to say that certain common security problems, such as buffer overrun,
+constant overwrite, arbitrary truncation etc, are far less likely to happen
+inadvertently.  Where it does help, Bstrlib maximizes its advantage by
+providing developers a simple adoption path that lets them leave less secure
+string mechanisms behind.  The library will not leave developers wanting, so
+they will be less likely to add new code using a less secure string library
+to add functionality that might be missing from Bstrlib.
+
+That said there are a number of security ideas not addressed by Bstrlib:
+
+1. Race condition exploitation (i.e., verifying a string's contents, then
+raising the privilege level and executing it as a shell command as two
+non-atomic steps) is well beyond the scope of what Bstrlib can provide.  It
+should be noted that MFC's built-in string mutex actually does not solve this
+problem either -- it just removes immediate data corruption as a possible
+outcome of such exploit attempts (it can be argued that this is worse, since
+it will leave no trace of the exploitation).  In general race conditions have
+to be dealt with by careful design and implementation; it cannot be assisted
+by a string library.
+
+2. Any kind of access control or security attributes to prevent usage in
+dangerous interfaces such as system().  Perl includes a "trust" attribute
+which can be endowed upon strings that are intended to be passed to such
+dangerous interfaces.  However, Perl's solution reflects its own limitations
+-- notably that it is not a strongly typed language.  In the example code for
+Bstrlib, there is a module called taint.cpp.  It demonstrates how to write a
+simple wrapper class for managing "untainted" or trusted strings using the
+type system to prevent questionable mixing of ordinary untrusted strings with
+untainted ones then passing them to dangerous interfaces.  In this way the
+security correctness of the code reduces to auditing the direct usages of
+dangerous interfaces or promotions of tainted strings to untainted ones.
+
+3. Encryption of string contents is way beyond the scope of Bstrlib.
+Maintaining encrypted string contents in the futile hopes of thwarting things
+like using system-level debuggers to examine sensitive string data is likely
+to be a wasted effort (imagine a debugger that runs at a higher level than a
+virtual processor where the application runs).  For more standard encryption
+usages, since the bstring contents are simply binary blocks of data, this
+should pose no problem for usage with other standard encryption libraries.
+
+Compatibility
+-------------
+
+The Better String Library is known to compile and function correctly with the
+following compilers:
+
+  - Microsoft Visual C++
+  - Watcom C/C++
+  - Intel's C/C++ compiler (Windows)
+  - The GNU C/C++ compiler (cygwin and Linux on PPC64)
+  - Borland C
+  - Turbo C
+
+Setting of configuration options should be unnecessary for these compilers
+(unless exceptions are being disabled or STLport has been added to WATCOM
+C/C++).  Bstrlib has been developed with an emphasis on portability.  As such
+porting it to other compilers should be straightforward.  This package
+includes a porting guide (called porting.txt) which explains what issues may
+exist for porting Bstrlib to different compilers and environments.
+
+ANSI issues
+-----------
+
+1. The function pointer types bNgetc and bNread have prototypes which are very
+similar to, but not exactly the same as fgetc and fread respectively.
+Basically the FILE * parameter is replaced by void *.  The purpose of this
+was to allow one to create other functions with fgetc and fread like
+semantics without being tied to ANSI C's file streaming mechanism.  I.e., one
+could very easily adapt it to sockets, or simply reading a block of memory,
+or procedurally generated strings (for fractal generation, for example.)
+
+The problem is that invoking the functions (bNgetc)fgetc and (bNread)fread is
+not technically legal in ANSI C.  The reason being that the compiler is only
+able to coerce the function pointers themselves into the target type; however,
+it is unable to perform any cast (implicit or otherwise) on the parameters
+passed once invoked.  I.e., if internally void * and FILE * need some kind of
+mechanical coercion, the compiler will not properly perform this conversion
+and thus lead to undefined behavior.
+
+Apparently a platform from Data General called "Eclipse" and another from
+Tandem called "NonStop" have a different representation for pointers to bytes
+and pointers to words, for example, where coercion via casting is necessary.
+(Actual confirmation of the existence of such machines is hard to come by, so
+it is prudent to be skeptical about this information.)  However, this is not
+an issue for any known contemporary platforms.  One may conclude that such
+platforms are effectively apocryphal even if they do exist.
+
+To correctly work around this problem to the satisfaction of the ANSI
+limitations, one needs to create wrapper functions for fgetc and/or
+fread with the prototypes of bNgetc and/or bNread respectively which perform
+no action other than to explicitly cast the void * parameter to a
+FILE *, and simply pass the remaining parameters straight to the function
+pointer call.
+
+The wrappers themselves are trivial:
+
+    size_t freadWrap (void * buff, size_t esz, size_t eqty, void * parm) {
+        return fread (buff, esz, eqty, (FILE *) parm);
+    }
+
+    int fgetcWrap (void * parm) {
+        return fgetc ((FILE *) parm);
+    }
+
+These have not been supplied in bstrlib or bstraux to prevent unnecessary
+linking with file I/O functions.
+
+2. vsnprintf is not available on all compilers.  Because of this, the bformat
+and bformata functions (and format and formata methods) are not guaranteed to
+work properly.  For those compilers that don't have vsnprintf, the
+BSTRLIB_NOVSNP macro should be set before compiling bstrlib, and the format
+functions/method will be disabled.
+
+The more recent ANSI C standards have specified the required inclusion of a
+vsnprintf function.
+
+3. The bstrlib function names are not unique in the first 6 characters.  This
+is only an issue for older C compiler environments which do not store more
+than 6 characters for function names.
+
+4. The bsafe module defines macros and function names which are part of the
+C library.  This simply overrides the definition as expected on all platforms
+tested, however it is not sanctioned by the ANSI standard.  This module is
+clearly optional and should be omitted on platforms which disallow its
+undefined semantics.
+
+In practice the real issue is that some compilers in some modes of operation
+can/will inline these standard library functions on a module by module basis
+as they appear in each.  The linker will thus have no opportunity to override
+the implementation of these functions for those cases.  This can lead to
+inconsistent behaviour of the bsafe module on different platforms and
+compilers.
+
+===============================================================================
+
+Comparison with Microsoft's CString class
+-----------------------------------------
+
+Although developed independently, CBStrings have very similar functionality to
+Microsoft's CString class.  However, the bstring library has significant
+advantages over CString:
+
+1. Bstrlib is a C-library as well as a C++ library (using the C++ wrapper).
+
+    - Thus it is compatible with more programming environments and
+      available to a wider population of programmers.
+
+2. The internal structure of a bstring is considered exposed.
+
+    - A single contiguous block of data can be cut into read-only pieces by
+      simply creating headers, without allocating additional memory to create
+      reference copies of each of these sub-strings.
+    - In this way, using bstrings in a totally abstracted way becomes a choice
+      rather than an imposition.  Further this choice can be made differently
+      at different layers of applications that use it.
+
+3. Static declaration support precludes the need for constructor
+   invocation.
+
+    - Allows for static declarations of constant strings that have no
+      additional constructor overhead.
+
+4. Bstrlib is not attached to another library.
+
+    - Bstrlib is designed to be easily plugged into any other library
+      collection, without dependencies on other libraries or paradigms (such
+      as "MFC".)
+
+The bstring library also comes with a few additional functions that are not
+available in the CString class:
+
+    - bsetstr
+    - bsplit
+    - bread
+    - breplace (this is different from CString::Replace())
+    - Writable indexed characters (for example a[i]='x')
+
+Interestingly, although Microsoft did implement mid$(), left$() and right$()
+functional analogues (these are functions from GWBASIC) they seem to have
+forgotten that mid$() could be also used to write into the middle of a string.
+This functionality exists in Bstrlib with the bsetstr() and breplace()
+functions.
+
+Among the disadvantages of Bstrlib is that there is no special support for
+localization or wide characters.  Such things are considered beyond the scope
+of what bstrings are trying to deliver.  CString essentially supports the
+older UCS-2 version of Unicode via widechar_t as an application-wide compile
+time switch.
+
+CStrings also use built-in mechanisms for ensuring thread safety under all
+situations.  While this makes writing thread safe code that much easier, this
+built-in safety feature has a price -- the inner loops of each CString method
+runs in its own critical section (grabbing and releasing a light weight mutex
+on every operation.)  The usual way to decrease the impact of a critical
+section performance penalty is to amortize more operations per critical
+section.  But since the implementation of CStrings is fixed as a one critical
+section per-operation cost, there is no way to leverage this common
+performance enhancing idea.
+
+The search facilities in Bstrlib are comparable to those in MFC's CString
+class, though it is missing locale specific collation.  But because Bstrlib
+is interoperable with C's char buffers, it will allow programmers to write
+their own string searching mechanism (such as Boyer-Moore), or be able to
+choose from a variety of available existing string searching libraries (such
+as those for regular expressions) without difficulty.
+
+Microsoft used a very non-ANSI conforming trick in its implementation to
+allow printf() to use the "%s" specifier to output a CString correctly.  This
+can be convenient, but it is inherently not portable.  CBString requires an
+explicit cast, while bstring requires the data member to be dereferenced.
+Microsoft's own documentation recommends casting, instead of relying on this
+feature.
+
+Comparison with C++'s std::string
+---------------------------------
+
+This is the C++ language's standard STL based string class.
+
+1. There is no C implementation.
+2. The [] operator is not bounds checked.
+3. Missing a lot of useful functions like printf-like formatting.
+4. Some sub-standard std::string implementations (SGI) are necessarily unsafe
+   to use with multithreading.
+5. Limited by STL's std::iostream which in turn is limited by ifstream which
+   can only take input from files.  (Compare to CBStream's API which can take
+   abstracted input.)
+6. Extremely uneven performance across implementations.
+
+Comparison with ISO C TR 24731 proposal
+---------------------------------------
+
+Following the ISO C99 standard, Microsoft has proposed a group of C library
+extensions which are supposedly "safer and more secure".  This proposal is
+expected to be adopted by the ISO C standard which follows C99.
+
+The proposal reveals itself to be very similar to Microsoft's "StrSafe"
+library. The functions are basically the same as other standard C library
+string functions except that destination parameters are paired with an
+additional length parameter of type rsize_t.  rsize_t is the same as size_t,
+however, the range is checked to make sure it's between 1 and RSIZE_MAX.  Like
+Bstrlib, the functions perform a "parameter check".  Unlike Bstrlib, when a
+parameter check fails, rather than simply outputting accumulatable error
+statuses, they call a user settable global error function handler, and upon
+return of control performs no (additional) detrimental action.  The proposal
+covers basic string functions as well as a few non-reentrant functions
+(asctime, ctime, and strtok).
+
+1. Still based solely on char * buffers (and therefore strlen() and strcat()
+   is still O(n), and there are no faster streq() comparison functions.)
+2. No growable string semantics.
+3. Requires manual buffer length synchronization in the source code.
+4. No attempt to enhance functionality of the C library.
+5. Introduces a new error scenario (strings exceeding RSIZE_MAX length).
+
+The hope is that by exposing the buffer length requirements there will be
+fewer buffer overrun errors.  However, the error modes are really just
+transformed, rather than removed.  The real problem of buffer overflows is
+that they all happen as a result of erroneous programming.  So forcing
+programmers to manually deal with buffer limits, will make them more aware of
+the problem but doesn't remove the possibility of erroneous programming.  So
+a programmer that erroneously mixes up the rsize_t parameters is no better off
+from a programmer that introduces potential buffer overflows through other
+more typical lapses.  So at best this may reduce the rate of erroneous
+programming, rather than making any attempt at removing failure modes.
+
+The error handler can discriminate between types of failures, but does not
+take into account any callsite context.  So the problem is that the error is
+going to be manifest in a piece of code, but there is no pointer to that
+code.  It would seem that passing in the call site __FILE__, __LINE__ as
+parameters would be very useful, but the API clearly doesn't support such a
+thing (it would increase code bloat even more than the extra length
+parameter does, and would require macro tricks to implement).
+
+The Bstrlib C API takes the position that error handling needs to be done at
+the callsite, and just tries to make it as painless as possible.  Furthermore,
+error modes are removed by supporting auto-growing strings and aliasing.  For
+capturing errors in more central code fragments, Bstrlib's C++ API uses
+exception handling extensively, which is superior to the leaf-only error
+handler approach.
+
+Comparison with Managed String Library CERT proposal
+----------------------------------------------------
+
+The main webpage for the managed string library:
+http://www.cert.org/secure-coding/managedstring.html
+
+Robert Seacord at CERT has proposed a C string library that he calls the
+"Managed String Library" for C. Like Bstrlib, it introduces a new type
+which is called a managed string. The structure of a managed string
+(string_m) is like a struct tagbstring but missing the length field.  This
+internal structure is considered opaque. The length is, like the C standard
+library, always computed on the fly by searching for a terminating NUL on
+every operation that requires it. So it suffers from every performance
+problem that the C standard library suffers from. Interoperating with C
+string APIs (like printf, fopen, or anything else that takes a string
+parameter) requires copying to additionally allocated buffers that have to
+be manually freed -- this makes this library probably slower and more
+cumbersome than any other string library in existence.
+
+The library gives a fully populated error status as the return value of every
+string function.  The hope is to be able to diagnose all problems
+specifically from the return code alone.  Comparing this to Bstrlib, which
+always returns one consistent error message, might make it seem that Bstrlib
+would be harder to debug; but this is not true.  With Bstrlib, if an error
+occurs there is always enough information from just knowing there was an error
+and examining the parameters to deduce exactly what kind of error has
+happened.  The managed string library thus gives up nested function calls
+while achieving little benefit, while Bstrlib does not.
+
+One interesting feature that "managed strings" has is the idea of data
+sanitization via character set whitelisting.  That is to say, a globally
+definable filter that makes any attempt to put invalid characters into strings
+lead to an error and not modify the string.  The author gives the following
+example:
+
+    // create valid char set
+    if (retValue = strcreate_m(&str1, "abc") ) {
+      fprintf(
+        stderr,
+        "Error %d from strcreate_m.\n",
+        retValue
+      );
+    }
+    if (retValue = setcharset(str1)) {
+      fprintf(
+        stderr,
+        "Error %d from  setcharset().\n",
+        retValue
+      );
+    }
+    if (retValue = strcreate_m(&str1, "aabbccabc")) {
+      fprintf(
+        stderr,
+        "Error %d from strcreate_m.\n",
+        retValue
+      );
+    }
+    // create string with invalid char set
+    if (retValue = strcreate_m(&str1, "abbccdabc")) {
+      fprintf(
+        stderr,
+        "Error %d from strcreate_m.\n",
+        retValue
+      );
+    }
+
+Which we can compare with a more Bstrlib way of doing things:
+
+    bstring bCreateWithFilter (const char * cstr, const_bstring filter) {
+      bstring b = bfromcstr (cstr);
+      if (BSTR_ERR != bninchr (b, filter) && NULL != b) {
+        fprintf (stderr, "Filter violation.\n");
+        bdestroy (b);
+        b = NULL;
+      }
+      return b;
+    }
+
+    struct tagbstring charFilter = bsStatic ("abc");
+    bstring str1 = bCreateWithFilter ("aabbccabc", &charFilter);
+    bstring str2 = bCreateWithFilter ("aabbccdabc", &charFilter);
+
+The first thing we should notice is that with the Bstrlib approach you can
+have different filters for different strings if necessary.  Furthermore,
+selecting a charset filter in the Managed String Library is uni-contextual.
+That is to say, there can only be one such filter active for the entire
+program, which means its usage is not well defined for intermediate library
+usage (a library that uses it will interfere with user code that uses it, and
+vice versa.)  It is also likely to be poorly defined in multi-threading
+environments.
+
+There is also a question as to whether the data sanitization filter is checked
+on every operation, or just on creation operations.  Since the charset can be
+set arbitrarily at run time, it might be set *after* some managed strings have
+been created.  This would seem to imply that all functions should run this
+additional check every time if there is an attempt to enforce this.  This
+would make things tremendously slow.  On the other hand, if it is assumed that
+only creates and other operations that take char *'s as input need be checked
+because the charset was only supposed to be called once at and before any
+other managed string was created, then one can see that it's easy to cover
+Bstrlib with equivalent functionality via a few wrapper calls such as the
+example given above.
+
+And finally we have to question the value of sanitation in the first place.
+For example, for httpd servers, there is generally a requirement that the
+URLs parsed have some form that avoids undesirable translation to local file
+system filenames or resources.  The problem is that the way URLs can be
+encoded, it must be completely parsed and translated to know if it is using
+certain invalid character combinations.  That is to say, merely filtering
+each character one at a time is not necessarily the right way to ensure that
+a string has safe contents.
+
+In the article that describes this proposal, it is claimed that it fairly
+closely approximates the existing C API semantics.  On this point we should
+compare this "closeness" with Bstrlib:
+
+                      Bstrlib                     Managed String Library
+                      -------                     ----------------------
+
+Pointer arithmetic    Segment arithmetic          N/A
+
+Use in C Std lib      ->data, or bdata{e}         getstr_m(x,*) ... free(x)
+
+String literals       bsStatic, bsStaticBlk       strcreate_m()
+
+Transparency          Complete                    None
+
+It's pretty clear that the semantic mapping from C strings to Bstrlib is fairly
+straightforward, and that in general semantic capabilities are the same or
+superior in Bstrlib.  On the other hand the Managed String Library is either
+missing semantics or changes things fairly significantly.
+
+Comparison with Annexia's c2lib library
+---------------------------------------
+
+This library is available at:
+http://www.annexia.org/freeware/c2lib
+
+1. Still based solely on char * buffers (and therefore strlen() and strcat()
+   is still O(n), and there are no faster streq() comparison functions.)
+   Their suggestion that alternatives which wrap the string data type (such as
+   bstring does) imposes a difficulty in interoperating with the C language's
+   ordinary C string library is not founded.
+2. Introduction of memory (and vector?) abstractions imposes a learning
+   curve, and some kind of memory usage policy that is outside of the strings
+   themselves (and therefore must be maintained by the developer.)
+3. The API is massive, and filled with all sorts of trivial (pjoin) and
+   controversial (pmatch -- regular expressions are not sufficiently
+   standardized, and there is a very large difference in performance between
+   compiled and non-compiled REs) functions.  Bstrlib takes a decidedly
+   minimal approach -- none of the functionality in c2lib is difficult or
+   challenging to implement on top of Bstrlib (except the regex stuff, which
+   is going to be difficult, and controversial no matter what.)
+4. Understanding why c2lib is the way it is pretty much requires a working
+   knowledge of Perl.  bstrlib requires only knowledge of the C string library
+   while providing just a very select few worthwhile extras.
+5. It is attached to a lot of cruft like a matrix math library (that doesn't
+   include any functions for getting the determinant, eigenvectors,
+   eigenvalues, the matrix inverse, test for singularity, test for
+   orthogonality, a Gram-Schmidt orthogonalization, LU decomposition ... I
+   mean why bother?)
+
+Convincing a development house to use c2lib is likely quite difficult.  It
+introduces too much, while not being part of any kind of standards body.  The
+code must therefore be trusted, or maintained by those that use it.  While
+bstring offers nothing more on this front, since it's so much smaller, covers
+far less in terms of scope, and will typically improve string performance,
+the barrier to usage should be much smaller.
+
+Comparison with stralloc/qmail
+------------------------------
+
+More information about this library can be found here:
+http://www.canonical.org/~kragen/stralloc.html or here:
+http://cr.yp.to/lib/stralloc.html
+
+1. Library is very very minimal.  A little too minimal.
+2. Untargeted source parameters are not declared const.
+3. Slightly different expected emphasis (like _cats function which takes an
+   ordinary C string char buffer as a parameter.)  It's clear that the
+   remainder of the C string library is still required to perform more
+   useful string operations.
+
+The struct declaration for their string header is essentially the same as that
+for bstring.  But it's clear that this was a quickly written hack whose goals
+are clearly a subset of what Bstrlib supplies.  For anyone who is served by
+stralloc, Bstrlib is a complete substitute that just adds more functionality.
+
+stralloc actually uses the interesting policy that a NULL data pointer
+indicates an empty string.  In this way, non-static empty strings can be
+declared without construction.  This advantage is minimal, since static empty
+bstrings can be declared inline without construction, and if the string needs
+to be written to it should be constructed from an empty string (or its first
+initializer) in any event.
+
+wxString class
+--------------
+
+This is the string class used in the wxWindows project.  A description of
+wxString can be found here:
+http://www.wxwindows.org/manuals/2.4.2/wx368.htm#wxstring
+
+This C++ library is similar to CBString.  However, it is littered with
+trivial functions (IsAscii, UpperCase, RemoveLast etc.)
+
+1. There is no C implementation.
+2. The memory management strategy is to allocate a bounded fixed amount of
+   additional space on each resize, meaning that it does not have the
+   log_2(n) property that Bstrlib has (it will thrash very easily, cause
+   massive fragmentation in common heap implementations, and can easily be a
+   common source of performance problems).
+3. The library uses a "copy on write" strategy, meaning that it has to deal
+   with multithreading problems.
+
+Vstr
+----
+
+This is a highly orthogonal C string library with an emphasis on
+networking/realtime programming.  It can be found here:
+http://www.and.org/vstr/
+
+1. The convoluted internal structure does not contain a '\0' char * compatible
+   buffer, so interoperability with the C library is a non-starter.
+2. The API and implementation is very large (owing to its orthogonality) and
+   can lead to difficulty in understanding its exact functionality.
+3. An obvious dependency on gnu tools (confusing make configure step)
+4. Uses a reference counting system, meaning that it is not likely to be
+   thread safe.
+
+The implementation has an extreme emphasis on performance for nontrivial
+actions (adds, inserts and deletes are all constant or roughly O(#operations)
+time) following the "zero copy" principle.  This trades off performance of
+trivial functions (character access, char buffer access/coercion, alias
+detection) which becomes significantly slower, as well as incremental
+accumulative costs for its searching/parsing functions.  Whether or not Vstr
+wins any particular performance benchmark will depend a lot on the benchmark,
+but it should handily win on some, while losing dreadfully on others.
+
+The learning curve for Vstr is very steep, and it doesn't come with any
+obvious way to build for Windows or other platforms without gnu tools.  At
+least one mechanism (the iterator) introduces a new undefined scenario
+(writing to a Vstr while iterating through it.)  Vstr has a very large
+footprint, and is very ambitious in its total functionality.  Vstr has no C++
+API.
+
+Vstr usage requires context initialization via vstr_init() which must be run
+in a thread-local context.  Given the totally reference based architecture
+this means that sharing Vstrings across threads is not well defined, or at
+least not safe from race conditions.  This API is clearly geared to the older
+standard of fork() style multitasking in UNIX, and is not safely transportable
+to modern shared memory multithreading available in Linux and Windows.  There
+is no portable external solution making the library thread safe (since it
+requires a mutex around each Vstr context -- not each string.)
+
+In the documentation for this library, a big deal is made of its self hosted
+s(n)printf-like function.  This is an issue for older compilers that don't
+include vsnprintf(), but also an issue because Vstr has a slow conversion to
+'\0' terminated char * mechanism.  That is to say, using "%s" to format data
+that originates from Vstr would be slow without some sort of native function
+to do so.  Bstrlib sidesteps the issue by relying on what snprintf-like
+functionality does exist and having a high performance conversion to a char *
+compatible string so that "%s" can be used directly.
+
+Str Library
+-----------
+
+This is a fairly extensive string library, that includes full unicode support
+and is targeted at the goal of outperforming MFC and STL.  The architecture,
+similarly to MFC's CStrings, is a copy on write reference counting mechanism.
+
+http://www.utilitycode.com/str/default.aspx
+
+1. Commercial.
+2. C++ only.
+
+This library, like Vstr, uses a ref counting system.  There is only so deeply
+I can analyze it, since I don't have a license for it.  However, performance
+improvements over MFC's and STL, doesn't seem like a sufficient reason to
+move your source base to it.  For example, in the future, Microsoft may
+improve the performance of CString.
+
+It should be pointed out that performance testing of Bstrlib has indicated
+that its relative performance advantage versus MFC's CString and STL's
+std::string is at least as high as that for the Str library.
+
+libmib astrings
+---------------
+
+A handful of functional extensions to the C library that add dynamic string
+functionality.
+http://www.mibsoftware.com/libmib/astring/
+
+This package basically references strings through char ** pointers and assumes
+they are pointing to the top of an allocated heap entry (or NULL, in which
+case memory will be newly allocated from the heap.)  So it's still up to the
+user to mix and match the older C string functions with these functions whenever
+pointer arithmetic is used (i.e., there is no leveraging of the type system
+to assert semantic differences between references and base strings as Bstrlib
+does since no new types are introduced.)  Unlike Bstrlib, exact string length
+meta data is not stored, thus requiring a strlen() call on *every* string
+writing operation.  The library is very small, covering only a handful of C's
+functions.
+
+While this is better than nothing, it is clearly slower than even the
+standard C library, less safe and less functional than Bstrlib.
+
+To explain the advantage of using libmib, their website shows an example of
+how dangerous C code:
+
+    char buf[256];
+    char *pszExtraPath = ";/usr/local/bin";
+
+    strcpy(buf,getenv("PATH")); /* oops! could overrun! */
+    strcat(buf,pszExtraPath); /* Could overrun as well! */
+
+    printf("Checking...%s\n",buf); /* Some printfs overrun too! */
+
+is avoided using libmib:
+
+    char *pasz = 0;      /* Must initialize to 0 */
+    char *paszOut = 0;
+    char *pszExtraPath = ";/usr/local/bin";
+
+    if (!astrcpy(&pasz,getenv("PATH"))) /* malloc error */ exit(-1);
+    if (!astrcat(&pasz,pszExtraPath)) /* malloc error */ exit(-1);
+
+    /* Finally, a "limitless" printf! we can use */
+    asprintf(&paszOut,"Checking...%s\n",pasz);fputs(paszOut,stdout);
+
+    astrfree(&pasz); /* Can use free(pasz) also. */
+    astrfree(&paszOut);
+
+However, compare this to Bstrlib:
+
+    bstring b, out;
+
+    bcatcstr (b = bfromcstr (getenv ("PATH")), ";/usr/local/bin");
+    out = bformat ("Checking...%s\n", bdatae (b, "<Out of memory>"));
+    /* if (out && b) */ fputs (bdatae (out, "<Out of memory>"), stdout);
+    bdestroy (b);
+    bdestroy (out);
+
+Besides being shorter, we can see that error handling can be deferred right
+to the very end.  Also, unlike the above two versions, if getenv() returns
+with NULL, the Bstrlib version will not exhibit undefined behavior.
+Initialization starts with the relevant content rather than an extra
+autoinitialization step.
+
+libclc
+------
+
+An attempt to add to the standard C library with a number of common useful
+functions, including additional string functions.
+http://libclc.sourceforge.net/
+
+1. Uses standard char * buffer, and adopts C 99's usage of "restrict" to pass
+   the responsibility to guard against aliasing to the programmer.
+2. Adds no safety or memory management whatsoever.
+3. Most of the supplied string functions are completely trivial.
+
+The goals of libclc and Bstrlib are clearly quite different.
+
+fireString
+----------
+
+http://firestuff.org/
+
+1. Uses standard char * buffer, and adopts C 99's usage of "restrict" to pass
+   the responsibility to guard against aliasing to the programmer.
+2. Mixes char * and length wrapped buffers (estr) functions, doubling the API
+   size, with safety limited to only half of the functions.
+
+Firestring was originally just a wrapper of char * functionality with extra
+length parameters.  However, it has been augmented with the inclusion of the
+estr type which has similar functionality to stralloc.  But firestring does
+not nearly cover the functional scope of Bstrlib.
+
+Safe C String Library
+---------------------
+
+A library written for the purpose of increasing safety and power to C's string
+handling capabilities.
+http://www.zork.org/safestr/safestr.html
+
+1. While the safestr_* functions are safe in and of themselves, interoperating
+   with char * string has dangerous unsafe modes of operation.
+2. The architecture of safestr's causes the base pointer to change.  Thus,
+   it's not practical/safe to store a safestr in multiple locations if any
+   single instance can be manipulated.
+3. Dependent on an additional error handling library.
+4. Uses reference counting, meaning that it is either not thread safe or
+   slow and not portable.
+
+I think the idea of reallocating (and hence potentially changing) the base
+pointer is a serious design flaw that is fatal to this architecture.  True
+safety is obtained by having automatic handling of all common scenarios
+without creating implicit constraints on the user.
+
+Because of its automatic temporary clean up system, it cannot use "const"
+semantics on input arguments.  Interesting anomalies such as:
+
+    safestr_t s, t;
+    s = safestr_replace (t = SAFESTR_TEMP ("This is a test"),
+                         SAFESTR_TEMP (" "), SAFESTR_TEMP ("."));
+    /* t is now undefined. */
+
+are possible.  If one defines a function which takes a safestr_t as a
+parameter, then the function would not know whether or not the safestr_t is
+defined after it passes it to a safestr library function.  The author
+recommended method for working around this problem is to examine the
+attributes of the safestr_t within the function which is to modify any of
+its parameters and play games with its reference count.  I think, therefore,
+that the whole SAFESTR_TEMP idea is also fatally broken.
+
+The library implements immutability, optional non-resizability, and a "trust"
+flag.  This trust flag is interesting, and suggests that applying any
+arbitrary sequence of safestr_* function calls on any set of trusted strings
+will result in a trusted string.  It seems to me, however, that if one wanted
+to implement a trusted string semantic, one might do so by actually creating
+a different *type* and only implement the subset of string functions that are
+deemed safe (i.e., user input would be excluded, for example.)  This, in
+essence, would allow the compiler to enforce trust propagation at compile
+time rather than run time.  Non-resizability is also interesting, however,
+it seems marginal (i.e., to want a string that cannot be resized, yet can be
+modified and yet where a fixed sized buffer is undesirable.)
+
+===============================================================================
+
+Examples
+--------
+
+    Dumping a line numbered file:
+
+    FILE * fp;
+    int i, ret;
+    struct bstrList * lines;
+    struct tagbstring prefix = bsStatic ("-> ");
+
+    if (NULL != (fp = fopen ("bstrlib.txt", "rb"))) {
+        bstring b = bread ((bNread) fread, fp);
+        fclose (fp);
+        if (NULL != (lines = bsplit (b, '\n'))) {
+            for (i=0; i < lines->qty; i++) {
+                binsert (lines->entry[i], 0, &prefix, '?');
+                printf ("%04d: %s\n", i, bdatae (lines->entry[i], "NULL"));
+            }
+            bstrListDestroy (lines);
+        }
+        bdestroy (b);
+    }
+
+For numerous other examples, see bstraux.c, bstraux.h and the example archive.
+
+===============================================================================
+
+License
+-------
+
+The Better String Library is available under either the BSD license (see the
+accompanying license.txt) or the Gnu Public License version 2 (see the
+accompanying gpl.txt) at the option of the user.
+
+===============================================================================
+
+Acknowledgements
+----------------
+
+The following individuals have made significant contributions to the design
+and testing of the Better String Library:
+
+Bjorn Augestad
+Clint Olsen
+Darryl Bleau
+Fabian Cenedese
+Graham Wideman
+Ignacio Burgueno
+International Business Machines Corporation
+Ira Mica
+John Kortink
+Manuel Woelker
+Marcel van Kervinck
+Michael Hsieh
+Richard A. Smith
+Simon Ekstrom
+Wayne Scott
+
+===============================================================================
diff --git a/doc/likwid-accessD.1 b/doc/likwid-accessD.1
index 7d444af..7285772 100644
--- a/doc/likwid-accessD.1
+++ b/doc/likwid-accessD.1
@@ -1,7 +1,7 @@
 .TH LIKWID-ACCESSD 1 <DATE> likwid\-<VERSION>
 .SH NAME
 likwid-accessD \- This tool forwards the access operations from LIKWID PerfMon tools
-to the MSR device files
+to the MSR and PCI device files
 .SH DESCRIPTION
 .B likwid-accessD
 is a command line application that opens a UNIX file socket and waits for access
@@ -9,14 +9,14 @@ operations from LIKWID tools that require access to the MSR and PCI device
 files. The MSR and PCI device files are only accessible for users with root
 privileges, therefore
 .B likwid-accessD
-requires the suid-bit set.
+requires the suid-bit set or a suitable libcap setting.
 Depending on the current system architecture,
 .B likwid-accessD
 permits only access to registers defined for the architecture.
 
 .SH AUTHOR
-Written by Thomas Roehl <thomas.roehl at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
 .SH "SEE ALSO"
-likwid-perfctr(1), likwid-powermeter(1), likwid-features(1), likwid-pin(1), likwid-topology(1),
+likwid-perfctr(1), likwid-powermeter(1), likwid-features(1)
diff --git a/doc/likwid-agent.1 b/doc/likwid-agent.1
new file mode 100644
index 0000000..f50dbca
--- /dev/null
+++ b/doc/likwid-agent.1
@@ -0,0 +1,94 @@
+.TH LIKWID-AGENT 1 <DATE> likwid\-<VERSION>
+.SH NAME
+likwid-agent \- monitoring daemon for hardware performance counters
+.SH SYNOPSIS
+.B likwid-agent <config_file>
+.SH DESCRIPTION
+.B likwid-agent
+is a daemon application that uses
+.B likwid-perfctr(1)
+to measure hardware performance counters. The basic configuration is in a global configuration file. The configuration of the hardware event sets is done with extra files suitable for each architecture. Besides the hardware event configuration, the raw data can be transformed using formulas into metrics of interest. To avoid outputting too much data, the data can be further filtered or aggregated.
+.B likwid-agent
+provides multiple store backends like logfiles, RRD (Round Robin Database) or gmetric (Ganglia Monitoring System).
+
+.SH CONFIG FILE
+The global configuration file has the following options:
+.TP
+.B GROUPPATH <path>
+Path to the group files containing event set and output definitions. See section
+.B GROUP FILES
+for information.
+.TP
+.B EVENTSET <group1> <group2> ...
+Space separated list of groups (without .txt) that should be monitored.
+.TP
+.B DURATION <time>
+Measurement duration in seconds.
+.TP
+.B LOGPATH <path>
+Specify a logfile.
+.TP
+.B GMETRIC <True/False>
+Activates the output to gmetric.
+.TP
+.B GMETRICPATH <path>
+Set path to the gmetric executable.
+.TP
+.B GMETRICCONFIG <path>
+Set a custom configuration file, if one is needed for gmetric.
+.TP
+.B RRD <True/False>
+Activates the output to RRD files (Round Robin Database).
+.TP
+.B RRDPATH <path>
+Output path for the RRD files. The files are named according to the group and each output metric is saved as DS with function GAUGE. The RRD is configured with RRA entries to store average, minimum and maximum of 10 minutes for one hour, of 60 min for one day and daily data for one month.
+.TP
+.B SYSLOG <True/False>
+Activates the output to system log using logger.
+.TP
+.B SYSLOGPRIO <prio>
+Set the priority string for logger, default is 'local0.notice'.
+
+.SH GROUP FILES
+The group files are adapted performance group files as used by
+.B likwid-perfctr(1).
+This makes it easy to use the predefined and often used performance groups as a basis for the monitoring. The folder structure for the groups is
+.B <GROUPPATH>/<SHORT_ARCH_NAME>/
+with
+.B <SHORT_ARCH_NAME>
+similar to the ones for the performance groups, like 'sandybridge' or 'haswellEP'.
+.TP
+.B SHORT <string>
+A short descriptive information about the group.
+.TP
+.B EVENTSET
+.TP
+.B <counter1> <event1>
+.TP
+.B <counter2>:<option> <event2>
+Definition of the eventset similar to the performance groups.
+.TP
+.B METRICS
+.TP
+.B <metricname> <formula>
+.TP
+.B <filter> <metricname> <formula>
+Definition of the output metrics. The syntax follows the
+.B METRICS
+definition of the performance groups as used by
+.B likwid-perfctr(1).
+If no function is set at the beginning of the line,
+.B <formula>
+is evaluated for every CPU and sent to the output backends. The
+.B <metricname>
+gets the prefix "T<cpuid> ". To avoid writing too much data to the backends, the data can be reduced by
+.B <filter>.
+The possible filter options are MIN, MAX, AVG, SUM, ONCE. The ONCE filter sends only the data from the first CPU to the output backends commonly used for the measurement duration.
+
+
+.SH AUTHOR
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
+.SH BUGS
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
+.SH "SEE ALSO"
+likwid-perfctr(1), rrdtool(1), gmetric(1)
diff --git a/doc/likwid-bench.1 b/doc/likwid-bench.1
index 45d0f6c..3a1d719 100644
--- a/doc/likwid-bench.1
+++ b/doc/likwid-bench.1
@@ -5,29 +5,31 @@ likwid-bench \- low-level benchmark suite and microbenchmarking framework
 .SH SYNOPSIS
 .B likwid-bench
 .RB [\-hap]
-.RB [ \-l
-.IR <testname> ] 
-.RB [ \-i
-.IR <iterations> ]
-.RB [ \-g
-.IR <number_of_workgroups> ]
 .RB [ \-t
 .IR <testname> ]
+.RB [ \-s
+.IR <min_time> ]
 .RB [ \-w
 .IR <workgroup_expression> ]
+.RB [ \-l
+.IR <testname> ]
+.RB [ \-d
+.IR <delimiter> ]
+.RB [ \-i
+.IR <iterations> ]
 .SH DESCRIPTION
 .B likwid-bench
 is a benchmark suite for low-level (assembly) benchmarks to measure bandwidths and instruction throughput for specific instruction code on x86 systems. The currently included benchmark codes include common data access patterns like load and store but also calculations like vector triad and sum.
 .B likwid-bench
-includes architecture specific benchmarks for x86, x86_64 and x86 for Intel Xeon Phi coprocessors. The performance values can either be calculated by 
-.B likwid-bench 
-or measured using performance counters by using.
+includes architecture specific benchmarks for x86, x86_64 and x86 for Intel Xeon Phi coprocessors. The performance values can either be calculated by
+.B likwid-bench
+or measured using performance counters by using
 .B likwid-perfctr
 as a wrapper to
 .B likwid-bench.
 This requires to build
-.B likwid-bench.
-with Instrumentation which can be enabled in config.mk.
+.B likwid-bench
+with instrumentation enabled in config.mk.
 .SH OPTIONS
 .TP
 .B \-\^h
@@ -39,77 +41,130 @@ list available benchmark codes for the current system.
 .B \-\^p
 list available thread domains.
 .TP
-.B \-\^l " <testname>"
-list properties of a benchmark code.
-.TP
-.B \-\^i " <iterations>"
-number of iterations to perform inside the benchmark code.
+.B \-\^s <min_time>
+Run the benchmark for at least
+.B <min_time> seconds.
+The number of iterations is determined using this value. Default: 1 second.
 .TP
-.B \-\^t " <testname>"
+.B \-\^t <testname>
 Name of the benchmark code to run (mandatory).
 .TP
-.B \-\^g " <number_of_workgroups>"
-specify the number of workgroups to perform the benchmark code on (mandatory).
-.TP
-.B \-\^w " <workgroup_expression>"
+.B \-\^w <workgroup_expression>
 Specify the affinity domain, thread count and data set size for the current benchmarking run (mandatory).
+.TP
+.B \-\^l <testname>
+list properties of a benchmark code.
+.TP
+.B \-\^i <iterations>
+Set the number of iterations per thread (optional)
 
 .SH WORKGROUP SYNTAX
 
 .B <thread_domain>:<size> [:<num_threads>[:<chunk_size>:<stride>]] [-<streamId>:<domain_id>]
-with size in kB, MB or GB. Where thread domain is where threads are placed. Size is the total data set size for the benchmark. num_threads specifies how many threads are used. Threads are always placed using a compact policy in
+with size in kB, MB or GB. The
+.B <thread_domain>
+defines where the threads are placed.
+.B <size>
+is the total data set size for the benchmark, the allocated vectors in memory sum up to this size.
+.B <num_threads>
+specifies how many threads are used in the
+.B <thread_domain>.
+Threads are always placed using a compact policy in
 .B likwid-bench.
 This means that per default all SMT threads are used. Optionally similar a the expression based syntax in
 .B likwid-pin
-a chunk size and stride can be provided. Optionally for every stream means array the placement can be controlled. Per default all arrays are placed in the same thread domain the threads are running in. To place the data in a different domain for every stream of a benchmark case (the total number of streams  can be aquired by the \-l option) the domain to place the data in can be specified. Multiple streams are comma separated. Either the placement is provided or all streams have to be ex [...]
+a
+.B <chunk_size>
+and
+.B <stride>
+can be provided. Optionally for every stream (array, vector) the placement can be controlled. Per default all arrays are placed in the same
+.B <thread_domain>
+the threads are running in. To place the data in a different domain for every stream of a benchmark case (the total number of streams can be acquired by the
+.B \-l
+option) the domain to place the data in can be specified. Multiple streams are comma separated. Either the placement is provided or all streams have to be explicitly placed. Please refer to the Wiki pages on
 .B http://code.google.com/p/likwid/wiki/LikwidBench
 for further details and examples on usage.
 
 
 .SH EXAMPLE
 .IP 1. 4
-Run the copy benchmark with 1000 iterations on socket 0 with a total data set size of 100kB.
+Run the
+.B copy
+benchmark on socket 0 (
+.B S0
+) with a total data set size of
+.B 100kB.
 .TP
-.B likwid-bench -t copy -i 1000 -g 1 -w S0:100kB
+.B likwid-bench -t copy -w S0:100kB
 .PP
 Since no 
-.B num_thread
-is given in the workload expression, each core of socket 0 gets one thread. The workload is split up between all threads.
+.B <num_threads>
+is given in the workload expression, each core of socket 0 gets one thread. The workload is split up between all threads and the number of iterations is determined automatically.
 .IP 2. 4
-Run the triad benchmark code with 100 iterations with 2 threads on the socket 0 and a data size of 1 GB.
+Run the
+.B triad
+benchmark code with explicitly
+.B 100
+iterations per thread with
+.B 2
+threads on the socket 0 (
+.B S0
+) and a data size of
+.B 1GB.
 .TP
-.B likwid-bench -t triad -i 100 -g 1 -w S0:1GB:2:1:2
+.B likwid-bench -t triad -i 100 -w S0:1GB:2:1:2
 .PP
-Assuming socket 0 has 4 SMT threads, one thread is assigned to each physical core of socket 0.
+Assuming socket 0 (
+.B S0
+) has 2 physical cores with SMT enabled, hence in total 4 hardware threads, one thread is assigned to each physical core of socket 0.
 .IP 3. 4
-Run the update benchmark with 1000 iterations on socket 0 with a workload of 100kB and on socket 1 with the same workload.
+Run the
+.B update
+benchmark on socket 0 (
+.B S0
+) with a workload of
+.B 100kB
+and on socket 1 (
+.B S1
+) with the same workload.
 .TP
-.B likwid-bench -t update -i 1000 -g 2 -w S0:100kB -w S1:100kB
+.B likwid-bench -t update -w S0:100kB -w S1:100kB
 .PP
 The results of both workgroups are combinded for the output. Hence the workload in each workgroup expression should have the same size.
 .IP 4. 4
-Run the copy benchmark but measure the memory traffic with
+Run the
+.B copy
+benchmark but measure the memory traffic with
 .B likwid-perfctr.
-The option INSTRUMENT_BENCH in config.mk needs to be true at compile time to use that feature.
+The option
+.B INSTRUMENT_BENCH
+in
+.B config.mk
+needs to be true at compile time to use that feature.
 .TP
-.B likwid-perfctr -C E:S0:4 -g MEM -m likwid-bench -t update -i 1000 -g 1 -w S0:100kB
+.B likwid-perfctr -c E:S0:4 -g MEM -m likwid-bench -t update -w S0:100kB
 .PP
-.B likwid-perfctr 
-will configure and start the performance counters on socket 0 with 4 threads prior to the execution of
+.B likwid-perfctr
+will configure and start the performance counters on socket 0 (
+.B S0
+) with 4 threads prior to the execution of
 .B likwid-bench.
-The performance counters are read right before and after running the benchmarking code to 
-minimize the interferences of the measurement.
+The performance counters are read right before and after running the benchmarking code to minimize the interferences of the measurement.
 .IP 5. 4
-Run the copy benchmark and place the data on other socket
+Run the
+.B copy
+benchmark and place the data on another socket
 .TP
-.B likwid-bench -t copy -i 50 -g 1 -w S0:1GB:10:1:2-0:S1,1:S1
+.B likwid-bench -t copy -w S0:1GB:10:1:2-0:S1,1:S1
 .PP
-Stream id 0 and 1 are placed in thread domains S1, which is socket 1. This can be verified as the initialization threads output where they are running.
+Stream id 0 and 1 are placed in thread domains
+.B S1,
+which is socket 1. This can be verified as the initialization threads output where they are running.
 
 
 .SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
 .SH SEE ALSO
-likwid-perfctr(1), likwid-pin(1), likwid-topology(1), likwid-features(1), likwid-setFrequencies(1)
+likwid-perfctr(1), likwid-pin(1), likwid-topology(1), likwid-setFrequencies(1)
diff --git a/doc/likwid-doxygen.md b/doc/likwid-doxygen.md
new file mode 100644
index 0000000..37d505e
--- /dev/null
+++ b/doc/likwid-doxygen.md
@@ -0,0 +1,262 @@
+/*! \mainpage LIKWID - Like I Knew What I Am Doing
+
+\section Introduction
+This is an effort to develop easy to use but yet powerful performance tools for the GNU Linux operating system. While the focus of LIKWID is on x86 processors some of the tools are portable and not limited to any specific architecture. LIKWID follows the philosophy:
+- Simple
+- Efficient
+- Portable
+- Extensible
+
+\ref build
+
+\ref faq
+
+\section Tools LIKWID Tools
+- \ref likwid-topology : A tool to display the thread and cache topology on multicore/multisocket computers.
+- \ref likwid-pin : A tool to pin your threaded application without changing your code. Works for pthreads and OpenMP.
+- \ref likwid-perfctr : A tool to measure hardware performance counters on recent Intel and AMD processors. It can be used as wrapper application without modifying the profiled code or with a marker API to measure only parts of the code.
+- \ref likwid-powermeter : A tool for accessing RAPL counters and query Turbo mode steps on Intel processor. RAPL counters are also available in \ref likwid-perfctr.
+- \ref likwid-setFrequencies : A tool to print and manage the clock frequency of CPU cores.
+- \ref likwid-agent : A monitoring agent for LIKWID with multiple output backends.
+- \ref likwid-memsweeper : A tool to cleanup ccNUMA domains and LLC caches to get a clean environment for benchmarks.
+- \ref likwid-bench : A benchmarking framework for streaming benchmark kernels written in assembly.
+- \ref likwid-genTopoCfg : A config file writer that gets system topology and writes them to file for faster LIKWID startup.
+<!-- - \ref likwid-features : A tool to toggle the prefetchers on Core 2 processors.-->
+
+Wrapper scripts using the basic likwid tools:
+- \ref likwid-mpirun : A wrapper script enabling simple and flexible pinning of MPI and MPI/threaded hybrid applications. With integrated \ref likwid-perfctr support.
+- \ref likwid-perfscope : A frontend application for the timeline mode of \ref likwid-perfctr that performs live plotting using gnuplot.
+
+LIKWID requires in most environments some daemon application to perform its operations with higher privileges:
+- \ref likwid-accessD : Daemon to perform MSR and PCI read/write operations with higher privileges.
+- \ref likwid-setFreq : Daemon to set the CPU frequencies with higher privileges.
+
+Optionally, a global configuration file \ref likwid.cfg can be given to modify some basic run time parameters of LIKWID.
+
+\section Library LIKWID Library
+\subsection C_Interface C/C++ Interface
+- \ref MarkerAPI
+- \ref AccessClient
+- \ref Config
+- \ref CPUTopology
+- \ref NumaTopology
+- \ref AffinityDomains
+- \ref PerfMon
+- \ref PowerMon
+- \ref ThermalMon
+- \ref TimerMon
+- \ref Daemon
+- \ref MemSweep
+
+\subsection Lua_Interface Lua Interface
+- \ref lua_Info
+- \ref lua_InputOutput
+- \ref lua_Config
+- \ref lua_Access
+- \ref lua_CPUTopology
+- \ref lua_NumaInfo
+- \ref lua_AffinityInfo
+- \ref lua_Perfmon
+- \ref lua_PowerInfo
+- \ref lua_ThermalInfo
+- \ref lua_Timer
+- \ref lua_MemSweep
+- \ref lua_Misc (Some functionality not provided by Lua natively)
+
+\subsection Fortran90_Interface Fortran90 Interface
+- \ref Fortran_Interface
+
+\section Architectures Supported Architectures
+\subsection Architectures_Intel Intel®
+- \subpage pentiumm
+- \subpage core2
+- \subpage atom
+- \subpage nehalem
+- \subpage nehalemex
+- \subpage westmere
+- \subpage westmereex
+- \subpage phi
+- \subpage silvermont
+- \subpage sandybridge
+- \subpage sandybridgeep
+- \subpage ivybridge
+- \subpage ivybridgeep
+- \subpage haswell
+- \subpage haswellep
+- \subpage broadwell
+
+\subsection Architectures_AMD AMD®
+- \subpage k8
+- \subpage k10
+- \subpage interlagos
+- \subpage kabini
+
+\section Examples Example Codes
+Using the Likwid API:
+- \ref C-likwidAPI-code
+- \ref Lua-likwidAPI-code
+
+Using the Marker API:
+- \ref C-markerAPI-code
+- \ref F-markerAPI-code
+
+If you have problems with LIKWID:<BR>
+GitHub: <A HREF="https://github.com/RRZE-HPC/likwid">https://github.com/RRZE-HPC/likwid</A><BR>
+Bugs: <A HREF="https://github.com/RRZE-HPC/likwid/issues">https://github.com/RRZE-HPC/likwid/issues</A><BR>
+Mailinglist: <A HREF="http://groups.google.com/group/likwid-users">http://groups.google.com/group/likwid-users</A><BR>
+*/
+
+
+/*! \page build Build and install instructions
+\section allg Introduction
+Likwid is built using GNU make and Perl. Besides the Linux kernel and the standard C library, all required dependencies are shipped with the archive (<A HREF="http://www.lua.org/">Lua</A> and <A HREF="http://www.open-mpi.org/projects/hwloc/">hwloc</A>).
+It should build on any Linux distribution with a recent GCC compiler or CLANG compiler and 2.6 or newer kernel without any changes.
+
+There is one generic top level Makefile and one .mk configuration file for each
+compiler (at the moment GCC, CLANG and ICC). Please note that we test LIKWID only with GCC. CLANG and ICC are only tested for basic functionality.
+
+There is one exception: If you want to use LIKWID on an Intel Xeon Phi card you have to choose the MIC as compiler in config.mk, which is based on the Intel ICC compiler.
+
+\subsection directory Directory structure
+All source files are in the src/ directory. All header files are located in
+src/includes/ . Lua application source files are in src/applications/. All external tools, namely HWLOC and Lua, are located in ext/. The bench/ folder contains all files of the benchmarking suite of LIKWID.
+
+All build products are generated in the directory ./TAG, where TAG is the compiler configuration, default ./GCC.
+
+\subsection config Configuration
+Usually the only thing you have to configure is the PREFIX install path in the build config file config.mk in the top directory.
+
+\subsubsection color Changing color of <CODE>likwid-pin</CODE> output
+Depending on the background of your terminal window you can choose a color for <CODE>likwid-pin</CODE> output.
+
+\subsubsection accessD Usage of the access daemon likwid-accessD
+Usually on your own system, you can use LIKWID with direct access to the MSR files. If you install LIKWID on a shared system such as an HPC compute cluster, you may consider using the access daemon. This is a proxy application which was implemented with security in mind and performs address checks for allowed access. Using the access daemon, the measurements involve more overhead, especially if you use \ref likwid-perfctr in timeline mode or with the marker API.
+
+To enable using the access daemon, configure in config.mk:
+    - Set BUILDDAEMON to true
+    - Configure the path to the accessDaemon binary at ACCESSDAEMON
+    - Set the ACCESSMODE to accessdaemon
+
+ACCESSMODE can be direct, accessdaemon and sysdaemon (not yet officially supported). You can overwrite the default setting on the command line using the -M switch.
+
+If you want to access Uncore performance counters that are located in the PCI memory range, like they are implemented in Intel SandyBridge EP and IvyBridge EP, you have to use the access daemon or have root privileges because access to the PCI space is only permitted for highly privileged users.
+
+\subsubsection setfreqinstall Usage of frequency daemon likwid-setFreq
+The application \ref likwid-setFrequencies uses another daemon to modify the frequency of CPUs. The daemon is built and later installed if BUILDFREQ is set to true in config.mk.
+
+\subsubsection sharedlib Build Likwid as shared library
+Per default the LIKWID library is built as a shared library. You need the library if you want to use the Marker API. You can also use the LIKWID modules like <I>perfmon</I> directly. This is still not officially supported at the moment. In some settings it is necessary to build LIKWID as a shared library. To do so set SHARED_LIBRARY to true.
+
+\subsubsection instr_bench Instrument likwid-bench for usage with likwid-perfctr
+\ref likwid-bench is instrumented for use with \ref likwid-perfctr. This allows you to measure various metrics of your \ref likwid-bench kernels. Enable instrumentation by setting INSTRUMENT_BENCH to true in config.mk.
+
+\subsubsection fortran Enabling Fortran interface for marker API
+If you want to use the Marker API in Fortran programs LIKWID offers a native Fortran90 interface. To enable it set FORTRAN_INTERFACE to true in config.mk.
+
+\subsection targets Build targets
+You have to edit config.mk to configure your build and install path.
+
+The following make targets are available:
+
+- <B>make</B> - Build everything
+- <B>make likwid-bench</B> - Build likwid-bench
+- <B>make likwid-accessD</B> - Build likwid-accessD
+- <B>make likwid-setFreq</B> - Build likwid-setFreq
+- <B>make docs</B> - Create HTML documentation using doxygen
+- <B>make clean</B> - Remove the object file directory *./GCC*, keep the executables
+- <B>make distclean</B> - Remove all generated files
+- <B>make local</B> - Adjust paths in Lua scripts to work from the build directory. Requires the daemons and the pinning library to be already installed. Mainly used for testing.
+
+The build system has a working dependency tracking, therefore <B>make clean</B> is only needed if you change the Makefile configuration.
+
+\subsection installtargets Installing
+
+NOTE: The pinning functionality and the daemons only work if configured in config.mk and
+installed with <B>make install</B>. If you do not use the pinning functionality the tools
+can be used without installation.
+
+ - <B>make install</B> - Installs the executables, libraries, man pages and headers to the path you configured in config.mk.
+ - <B>make uninstall</B> - Delete all installed files.
+
+\subsection accessD Setting up access for hardware performance monitoring
+Hardware performance monitoring on x86 is enabled using model-specific registers (MSR). MSR registers are special registers not part of the instruction set architecture. To read and write to these registers the x86 ISA provides special instructions. These instructions can only be executed in protected mode or in other words only kernel code can execute these instructions. Fortunately, any Linux kernel 2.6 or newer provides access to these registers via a set of device files. This allows  [...]
+
+Per default only root has read/write access to these msr device files. In order to use the LIKWID tools, which need access to these files (likwid-perfctr, likwid-powermeter and likwid-agent) as standard user, you need to setup access rights to these files.
+
+likwid-perfctr, likwid-powermeter and likwid-features require the Linux <CODE>msr</CODE> kernel module. This module is part of most standard distro kernels. You have to be root to do the initial setup.
+
+    - Check if the <CODE>msr</CODE> module is loaded with <CODE>lsmod | grep msr</CODE>. There should be an output.
+    - If the module is not loaded, load it with <CODE>modprobe msr</CODE>. For automatic loading at startup consult your distro's documentation on how to do so.
+    - Adapt the access rights on the MSR device files for normal users. To grant access to anyone, you can use <CODE>chmod o+rw /dev/cpu/*/msr</CODE>. This is only recommended on single user desktop systems.
+
+As in general access to MSRs is not desired on security sensitive systems, you can implement more sophisticated access rights settings, e.g. with setgid. A common solution used on many other device files, e.g. for audio, is to introduce a group and make a <CODE>chown</CODE> on the msr device files to that group. Now if you execute likwid-perfctr with setgid on that group, the executing user can use the tool but cannot directly write or read the MSR device files.
+
+Some distributions backported the capabilities check for the msr device to older kernels. If there are problems with accessing the msr device for older kernels with file system permissions set to read&write, please check your kernel code (<CODE>arch/x86/kernel/msr.c</CODE>) for the backport and set the MSR capabilities in case.
+
+A secure solution is to use the access daemon \ref likwid-accessD, which encapsulates the access to the MSR device files and performs an address check for allowed registers.
+
+Some newer kernels implement the so-called capabilities, a fine-grained permission system that can allow access to the MSR files for common users. On the downside it may no longer be enough to set the suid-root flag for the access daemon; the executable must be registered with <CODE>libcap</CODE>.
+
+<CODE>sudo setcap cap_sys_rawio+ep EXECUTABLE</CODE>
+
+This is only possible on local file systems. A feasible way is to use the \ref likwid-accessD for all accesses and just enable the capabilities for this one binary. This will enable the usage for all LIKWID tools and also for all instrumented binaries. If \ref likwid-perfctr utility should only be used in wrapper mode, it is suitable to set the capabilities for \ref likwid-perfctr only. Please remember to set the file permission of the MSR device files to read/write for all users, even i [...]
+
+\subsubsection depends Dependencies
+Although we tried to minimize the external dependencies of LIKWID, some advanced tools or only specific tool options require external packages.<BR>
+\ref likwid-perfscope uses the Perl script <A HREF="https://github.com/dkogan/feedgnuplot">feedGnuplot</A> to forward the real-time data to gnuplot. <A HREF="https://github.com/dkogan/feedgnuplot">feedGnuplot</A> is included into LIKWID, but <A HREF="http://www.gnuplot.info/">gnuplot</A> itself is not.<BR>
+\ref likwid-agent provides multiple backends to output the periodically measured data. The syslog backend requires the shell tool \a logger to be installed. The <A HREF="https://oss.oetiker.ch/rrdtool/">RRD</A> backend requires \a rrdtool and the GMetric backend the \a gmetric tool, part of the <A HREF="http://ganglia.sourceforge.net/">Ganglia Monitoring System</A>.<BR>
+In order to create the HTML documentation of LIKWID, the tool <A HREF="http://www.doxygen.org">Doxygen</A> is required.
+*/
+
+/*! \page C-markerAPI-code Marker API in a C/C++ application
+\include C-markerAPI.c
+*/
+
+/*! \page F-markerAPI-code Marker API in a Fortran90 application
+\include F-markerAPI.F90
+*/
+
+/*! \page C-likwidAPI-code LIKWID API in a C/C++ application
+\include C-likwidAPI.c
+*/
+/*! \page Lua-likwidAPI-code LIKWID API in a Lua application
+\include Lua-likwidAPI.lua
+*/
+
+/*! \page faq FAQ
+\section faq1 Which architectures are supported?
+LIKWID supports a range of x86 CPU architectures but likely not all. We concentrated the development effort on Intel and AMD machines. Almost all architecture code is tested. For a list of architectures see section \ref Architectures or call <CODE>likwid-perfctr -i</CODE>.
+
+\section faq2 Are all hardware events supported?
+LIKWID offers almost all events that are defined in the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual</A> and the <A HREF="http://developer.amd.com/resources/documentation-articles/developer-guides-manuals/">AMD® BIOS and Kernel Developer’s Guides</A>. Some may be missing caused by special handling likely with additional registers. But, LIKWID also provides some events  [...]
+
+\section faq3 Does LIKWID support Intel's PEBS?
+No, PEBS is an interface that must be initialized at kernel level. Since LIKWID is a user-space tool, there is no possibility to maintain PEBS.
+
+\section faq4 Which unit does LIKWID use internally for B, kB, MB, GB?
+As the units imply, you get from one unit to the other by multiplying or dividing it by 1000. E.g. 1kB = 1000B. There is no kiB or MiB possible by now.
+
+\section faq5 Does LIKWID support power capping (Intel only)?
+No, by now LIKWID does not support limiting the power consumption of your machine using the RAPL interface. We added some functions but they are not exported because they need to be rechecked.
+
+\section faq6 Is LIKWID case-sensitive?
+Yes, all strings are case-sensitive. The only exception is the event options, which are case-insensitive. In upcoming versions we plan to switch to case-insensitive parsing for all strings where possible.
+
+\section faq7 I have given multiple eventsets on the command line but the values are too low? Are they multiplexed?
+LIKWID does not support multiplexing of eventsets. It rotates through its eventset list and measures each for a specific amount of time. The output contains the results of all measurements of that eventset, no interpolation to the complete runtime is done. Since most other tools that support multiplexing use linear interpolation, you can scale the results yourself with <CODE>(1.0 - (measurement_time/all_time)) * result</CODE>. As you can see, the calculation is pretty simple, but it intr [...]
+
+\section faq8 Are there plans to port LIKWID to other operating systems?
+We do not really plan to port LIKWID to other operating systems. We come from the HPC world and there the main operating systems are based on the Linux kernel. The latest Top500 list contains 13 systems using Unix and 1 system with Microsoft® Windows.
+
+\section faq9 Are there plans to port LIKWID to other CPU architectures?
+We would like to port LIKWID to other CPU architectures that support hardware performance measurements but currently there is no time for that and we do not have architectures other than x86 in-house. We follow the developments and if an architecture becomes relevant for HPC, we will likely port LIKWID to make it work. ARM has the highest probability, and with lower probability we will include SPARC.
+
+\section faq10 Do you plan to introduce a graphical frontend for LIKWID?
+No, we do not!
+
+\section faq12 Why does the startup of likwid-perfctr take so long?
+In order to get reliable time measurements, LIKWID must determine the base clock frequency of your CPU. This is done by a measurement loop that takes about 1 second. You can avoid the measurement loop by creating a topology configuration file with \ref likwid-genTopoCfg.
+
+\section faq13 I want to help, where do I start?
+The best way is to talk to us at the <A HREF="http://groups.google.com/group/likwid-users">mailing list</A>. There are a bunch of small work packages on our ToDo list that can be used as a good starting point for learning how LIKWID works. If you are not a programmer but you have a good idea, let us know and we will discuss it.
+*/
diff --git a/doc/likwid-features.1 b/doc/likwid-features.1
index e67cf44..c73caa9 100644
--- a/doc/likwid-features.1
+++ b/doc/likwid-features.1
@@ -1,35 +1,35 @@
 .TH LIKWID-FEATURES 1 <DATE> likwid\-<VERSION>
 .SH NAME
-likwid-features \- print and toggle the flags of the MSR_IA32_MISC_ENABLE model specific register
+likwid-features \- print and manipulate cpu features like hardware prefetchers
 .SH SYNOPSIS
 .B likwid-features 
-.RB [ \-vh ]
+.RB [ \-vhal ]
 .RB [ \-c
-.IR <coreId> ]
-.RB [ \-s
-.IR <prefetcher_tag> ]
-.RB [ \-u
-.IR <prefetcher_tag> ]
+.IR cpus ]
+.RB [ \-e
+.IR taglist ]
+.RB [ \-d
+.IR taglist ]
 .SH DESCRIPTION
 .B likwid-features
 is a command line application to print the flags in the model
-specific register (MSR) MSR_IA32_MISC_ENABLE on Intel x86 processors. On Core2 processors
+specific register (MSR) MSR_IA32_MISC_ENABLE on Intel x86 processors. On Core2 and later processors
 it can be used to toggle the hardware prefetch flags. It does not work on AMD processors.
 For a documentation what flags are supported on which processor refer to the Intel
-Software Developer's Manual Volume 3B, Table B.2. The MSR are set individually for every core.
+Software Developer's Manual Volume 3B, Table B.2 and https://software.intel.com/en-us/articles/disclosure-of-hw-prefetcher-control-on-some-intel-processors. The MSR are set individually for every core.
 The following hardware prefetchers can be toggled:
-.IP \[bu] 
+.IP \[bu]
 .B HW_PREFETCHER:
 Hardware prefetcher.
-.IP \[bu] 
+.IP \[bu]
 .B CL_PREFETCHER:
 Adjacent cache line prefetcher.
-.IP \[bu] 
+.IP \[bu]
 .B DCU_PREFETCHER:
 When the DCU prefetcher detects multiple loads from the same line done within a
 time limit, the DCU prefetcher assumes the next line will be required. The next
 line is prefetched in to the L1 data cache from memory or L2.
-.IP \[bu] 
+.IP \[bu]
 .B IP_PREFETCHER:
 The IP prefetcher is an L1 data cache prefetcher. The IP prefetcher looks for
 sequential load history to determine whether to prefetch the next expected data
@@ -43,18 +43,28 @@ prints version information to standard output, then exits.
 .B \-\^h
 prints a help message to standard output, then exits.
 .TP
-.B \-\^c " <coreId>"
-set on which processor core the MSR should be read
+.B \-\^a
+List out the names of all detected features
 .TP
-.B \-\^u " <HW_PREFETCHER | CL_PREFETCHER | DCU_PREFETCHER | IP_PREFETCHER>"
-specify which prefetcher to unset
+.B \-\^l
+Print the state of all features for the given CPUs
 .TP
-.B \-\^s " <HW_PREFETCHER | CL_PREFETCHER | DCU_PREFETCHER | IP_PREFETCHER>"
-specify which prefetcher to set
+.B \-\^c " cpus"
+set on which processor cores the MSR should be read and written. Syntax according to
+.B likwid-pin(1)
+.TP
+.B \-\^d " HW_PREFETCHER | CL_PREFETCHER | DCU_PREFETCHER | IP_PREFETCHER"
+specify which prefetcher should be disabled. Argument can be a comma-separated list.
+.TP
+.B \-\^e " HW_PREFETCHER | CL_PREFETCHER | DCU_PREFETCHER | IP_PREFETCHER"
+specify which prefetcher should be enabled. Argument can be a comma-separated list.
 
 .SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Röhl <thomas.Roehl at gmail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
 .SH "SEE ALSO"
-likwid-perfctr(1), likwid-pin(1), likwid-powermeter(1), likwid-topology(1), likwid-setFrequencies(1)
+likwid-pin(1), likwid-topology(1), likwid-perfctr(1)
+
+
+
diff --git a/doc/likwid-genCfg.1 b/doc/likwid-genCfg.1
deleted file mode 100644
index 8b7632f..0000000
--- a/doc/likwid-genCfg.1
+++ /dev/null
@@ -1,30 +0,0 @@
-.TH LIKWID-GENCFG 1 <DATE> likwid\-<VERSION>
-.SH NAME
-likwid-genCfg \- Get system topology and write them to file for faster LIKWID startup
-.SH SYNOPSIS
-.B likwid-genCfg
-.RB [\-hv]
-.RB [ \-o
-.IR <filename>]
-.SH DESCRIPTION
-.B likwid-genCfg
-is a command line application that stores the system's CPU and NUMA topology to
-file. LIKWID applications use this file to read in the topology fast instead of
-re-gathering all values. The default output path is /etc/likwid.cfg.
-.SH OPTIONS
-.TP
-.B \-h
-prints a help message to standard output, then exits.
-.TP
-.B \-v
-prints a version message to standard output, then exits.
-.TP
-.B \-\^o " <filename>
-sets output file path (optional)
-
-.SH AUTHOR
-Written by Thomas Roehl <thomas.roehl at gmail.com>.
-.SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
-.SH "SEE ALSO"
-likwid-topology(1), likwid-perfctr(1), likwid-features(1), likwid-pin(1), likwid-powermeter(1)
diff --git a/doc/likwid-genTopoCfg.1 b/doc/likwid-genTopoCfg.1
new file mode 100644
index 0000000..6d0e8b2
--- /dev/null
+++ b/doc/likwid-genTopoCfg.1
@@ -0,0 +1,30 @@
+.TH LIKWID-GENTOPOCFG 1 <DATE> likwid\-<VERSION>
+.SH NAME
+likwid-genTopoCfg \- Get system topology and write it to a file for faster LIKWID startup
+.SH SYNOPSIS
+.B likwid-genTopoCfg
+.RB [\-hv]
+.RB [ \-o
+.IR <filename>]
+.SH DESCRIPTION
+.B likwid-genTopoCfg
+is a command line application that stores the system's CPU and NUMA topology to
+file. LIKWID applications use this file to read in the topology fast instead of
+re-gathering all values.
+.SH OPTIONS
+.TP
+.B \-h, \-\-\^help
+prints a help message to standard output, then exits.
+.TP
+.B \-v, \-\-\^version
+prints a version message to standard output, then exits.
+.TP
+.B \-\^o, \-\-\^output <filename>
+sets output file path (Default: /etc/likwid-topo.cfg)
+
+.SH AUTHOR
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
+.SH BUGS
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
+.SH "SEE ALSO"
+likwid-topology(1), likwid-perfctr(1), likwid-pin(1)
diff --git a/doc/likwid-lua.1 b/doc/likwid-lua.1
new file mode 100644
index 0000000..411531b
--- /dev/null
+++ b/doc/likwid-lua.1
@@ -0,0 +1,111 @@
+.TH LUA 1 "$Date: 2014/12/10 15:55:45 $"
+.SH NAME
+lua \- Lua interpreter
+.SH SYNOPSIS
+.B lua
+[
+.I options
+]
+[
+.I script
+[
+.I args
+]
+]
+.SH DESCRIPTION
+.B lua
+is the standalone Lua interpreter.
+It loads and executes Lua programs,
+either in textual source form or
+in precompiled binary form.
+(Precompiled binaries are output by
+.BR luac ,
+the Lua compiler.)
+.B lua
+can be used as a batch interpreter and also interactively.
+.LP
+The given
+.I options
+are handled in order and then
+the Lua program in file
+.I script
+is loaded and executed.
+The given
+.I args
+are available to
+.I script
+as strings in a global table named
+.BR arg .
+If no options or arguments are given,
+then
+.B "\-v \-i"
+is assumed when the standard input is a terminal;
+otherwise,
+.B "\-"
+is assumed.
+.LP
+In interactive mode,
+.B lua
+prompts the user,
+reads lines from the standard input,
+and executes them as they are read.
+If the line contains an expression or list of expressions,
+then the line is evaluated and the results are printed.
+If a line does not contain a complete statement,
+then a secondary prompt is displayed and
+lines are read until a complete statement is formed or
+a syntax error is found.
+.LP
+At the very start,
+before even handling the command line,
+.B lua
+checks the contents of the environment variables
+.B LUA_INIT_5_3
+or
+.BR LUA_INIT ,
+in that order.
+If the contents is of the form
+.RI '@ filename ',
+then
+.I filename
+is executed.
+Otherwise, the string is assumed to be a Lua statement and is executed.
+.SH OPTIONS
+.TP
+.BI \-e " stat"
+execute statement
+.IR stat .
+.TP
+.B \-i
+enter interactive mode after executing
+.IR script .
+.TP
+.BI \-l " name"
+execute the equivalent of
+.IB name =require(' name ')
+before executing
+.IR script .
+.TP
+.B \-v
+show version information.
+.TP
+.B \-E
+ignore environment variables.
+.TP
+.B \-\-
+stop handling options.
+.TP
+.B \-
+stop handling options and execute the standard input as a file.
+.SH "SEE ALSO"
+.BR luac (1)
+.br
+The documentation at lua.org,
+especially section 7 of the reference manual.
+.SH DIAGNOSTICS
+Error messages should be self explanatory.
+.SH AUTHORS
+R. Ierusalimschy,
+L. H. de Figueiredo,
+W. Celes
+.\" EOF
diff --git a/doc/likwid-memsweeper.1 b/doc/likwid-memsweeper.1
index f474360..fda87f4 100644
--- a/doc/likwid-memsweeper.1
+++ b/doc/likwid-memsweeper.1
@@ -5,24 +5,24 @@ likwid-memsweeper \- A tool to clean up NUMA memory domains and last level cache
 .B likwid-memsweeper
 .RB [\-hv]
 .RB [ \-c
-.IR <NUMA_ID> ]
+.IR <node_list> ]
 .SH DESCRIPTION
 .B likwid-memsweeper
-is a command line application to shrink the file buffer cache by filling the NUMA domain with random pages. Moreover the tool invalidates all cachelines in the LLC for 64 bit x86 systems. If no NUMA domain is specified, all are sweept.
+is a command line application to shrink the file buffer cache by filling the NUMA domain with random pages. Moreover, the tool invalidates all cachelines in the LLC.
 .SH OPTIONS
 .TP
-.B \-h
+.B \-h, \-\-\^help
 prints a help message to standard output, then exits.
 .TP
-.B \-v
+.B \-v, \-\-\^version
 prints a version message to standard output, then exits.
 .TP
-.B \-\^c " <NUMA_ID>
+.B \-\^c <node_list>
 set the NUMA domain for sweeping.
 
 .SH AUTHOR
-Written by Thomas Roehl <thomas.roehl at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
 .SH "SEE ALSO"
-likwid-perfctr(1), likwid-features(1), likwid-pin(1), likwid-powermeter(1), likwid-topology(1),
+likwid-perfctr(1), likwid-features(1), likwid-pin(1), likwid-powermeter(1),
diff --git a/doc/likwid-mpirun.1 b/doc/likwid-mpirun.1
index 765b0c8..e3db441 100644
--- a/doc/likwid-mpirun.1
+++ b/doc/likwid-mpirun.1
@@ -3,7 +3,9 @@
 likwid-mpirun \- A tool to start and monitor MPI applications with LIKWID
 .SH SYNOPSIS
 .B likwid-memsweeper
-.RB [\-hd]
+.RB [\-hvdOm]
+.RB [ \-n
+.IR number_of_processes ]
 .RB [ \-hostfile
 .IR filename ]
 .RB [ \-nperdomain
@@ -11,9 +13,11 @@ likwid-mpirun \- A tool to start and monitor MPI applications with LIKWID
 .RB [ \-pin
 .IR expression ]
 .RB [ \-omp
-.IR expression ]
+.IR omptype ]
 .RB [ \-mpi
-.IR expression ]
+.IR mpitype ]
+.RB [ \-g
+.IR eventset ]
 .RB [\-\-]
 .SH DESCRIPTION
 .B likwid-mpirun
@@ -22,32 +26,51 @@ is a command line application that wraps the vendor-specific mpirun tool and add
 to the execution string. The user-given application is ran, measured and the results returned to the staring node.
 .SH OPTIONS
 .TP
-.B \-h
-prints a help message to standard output, then exits.
+.B \-\^h,\-\-\^help
+prints a help message to standard output, then exits
+.TP
+.B \-\^v,\-\-\^version
+prints version information to standard output, then exits
 .TP
-.B \-d
-prints debug messages to standard output.
+.B \-\^d,\-\-\^debug
+prints debug messages to standard output
 .TP
-.B \-\^hostfile " filename
-specifies the nodes to schedule the MPI processes on
+.B \-\^n,\-\^np,\-\-\^n,\-\-\^np <number_of_processes>
+specifies how many MPI processes should be started
 .TP
-.B \-\^nperdomain " number_of_processes_in_domain
+.B \-\^hostfile <filename>
+specifies the nodes to schedule the MPI processes on. If not given, the environment variables PBS_NODEFILE, LOADL_HOSTFILE and SLURM_HOSTFILE are checked.
+.TP
+.B \-\^nperdomain <number_of_processes_in_domain>
 specifies the processes per affinity domain (see
 .B likwid-pin
 for info about affinity domains)
 .TP
-.B \-\^pin " expression
+.B \-\^pin <expression>
 specifies the pinning for hybrid execution (see
 .B likwid-pin
 for info about affinity domains)
 .TP
-.B \-\^omp " expression
-enables hybrid setup. Can only be used in combination with
-.B -pin.
-The only possible value is: intel
+.B \-\^s, \-\-\^skip <mask>
+Specify skip mask as HEX number. For each set bit the corresponding thread is skipped.
+.TP
+.B \-\^omp <omptype>
+enables hybrid setup. Likwid tries to determine OpenMP type automatically. The only possible values are
+.B intel
+and
+.B gnu
 .TP
-.B \-\^mpi " expression
-specifies the MPI implementation that should be used by the wrapper. Possible values are intelmpi, openmpi and mvapich2
+.B \-\^mpi <mpitype>
+specifies the MPI implementation that should be used by the wrapper. Possible values are
+.B intelmpi, openmpi
+and
+.B mvapich2
+.TP
+.B \-\^m,\-\-\^marker
+activates the Marker API for the executed MPI processes
+.TP
+.B \-\^O
+prints output in CSV not ASCII tables
 .TP
 .B \-\-
 stops parsing arguments for likwid-mpirun, in order to set options for underlying MPI implementation after \-\-.
@@ -56,26 +79,32 @@ stops parsing arguments for likwid-mpirun, in order to set options for underlyin
 .IP 1. 4
 For standard application:
 .TP
-.B likwid-mpirun -np 32  ./myApp
+.B likwid-mpirun -np 32 ./myApp
 .PP
 Will run 32 MPI processes, each host is filled with as much processes as written in ppn
 .IP 2. 4
 With pinning:
 .TP
-.B likwid-mpirun -np 32 -nperdomain S:2  ./myApp
+.B likwid-mpirun -np 32 -nperdomain S:2 ./myApp
 .PP
 Will start 32 MPI processes with 2 processes per socket.
 .IP 3. 4
 For hybrid runs:
 .TP
-.B likwid-mpirun -np 32 -pin M0:0-3_M1:0-3  ./myApp
+.B likwid-mpirun -np 32 -pin M0:0-3_M1:0-3 ./myApp
 .PP
 Will start 32 MPI processes with 2 processes per node. Threads of the first process are pinned to the cores 0-3 in NUMA domain 0 (M0). The OpenMP threads of the second process are pinned to the first four cores in NUMA domain 1 (M1)
-
+.SH BUGS
+When measuring Uncore events it is not possible to select a cpu pin expression
+that covers multiple sockets, e.g. S0:0-1_S0:2@S1:2. This runs two processes,
+each running on two CPUs. But since the first CPU of the second expression is on
+socket 0, which is already handled by S0:0-1, the second MPI process gets a
+event set that does not contain Uncore counters although the second part of the
+second expression would measure the Uncore counters on socket 1.
 
 .SH AUTHOR
-Written by Thomas Roehl <thomas.roehl at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
 .SH "SEE ALSO"
-likwid-pin(1), likwid-perfctr(1), likwid-features(1), likwid-powermeter(1), likwid-topology(1),
+likwid-pin(1), likwid-perfctr(1), likwid-powermeter(1)
diff --git a/doc/likwid-perfctr.1 b/doc/likwid-perfctr.1
index ea3e4f3..321da34 100644
--- a/doc/likwid-perfctr.1
+++ b/doc/likwid-perfctr.1
@@ -1,152 +1,110 @@
 .TH LIKWID-PERFCTR 1 <DATE> likwid\-<VERSION>
 .SH NAME
-likwid-perfctr \- configure and read out hardware performance counters on x86 cpus
+likwid-perfctr \- configure and read out hardware performance counters on x86 CPUs
 .SH SYNOPSIS
 .B likwid-perfctr 
-.RB [\-vhHVmaeiMoO]
-.RB [ \-c/\-C
-.IR <core_list> ]
+.RB [\-vhHmaief]
+.RB [ \-c
+.IR core_list ]
+.RB [ \-C
+.IR core_list_for_pinning ]
 .RB [ \-g
-.IR <performance_group>
+.IR performance_group
 or
-.IR <performance_event_string> ]
+.IR performance_event_string ]
 .RB [ \-t
-.IR <frequency> ]
+.IR timeline_frequency ]
 .RB [ \-S
-.IR <time> ]
-.RB [ \-s
-.IR <skip_mask> ]
+.IR monitoring_time ]
+.RB [ \-T
+.IR group_switch_frequency ]
+.RB [ \-V
+.IR verbosity ]
+.RB [ \-M
+.IR access_mode ]
 .RB [ \-o
-.IR <output_file> ]
+.IR output_file ]
+.RB [ \-s
+.IR skip_mask ]
+.RB [ \-E
+.IR search_str ]
 .SH DESCRIPTION
 .B likwid-perfctr
 is a lightweight command line application to configure and read out hardware performance monitoring data
 on supported x86 processors. It can measure either as wrapper without changing the measured application
 or with marker API functions inside the code, which will turn on and off the counters. There are preconfigured
-groups with useful event sets and derived metrics. Additonally arbitrary events can be measured with
-custom event sets. The marker API can measure mulitple named regions. Results are accumulated on multiple calls.
-The following x86 processors are supported:
-.IP \[bu] 
-.B Intel Core 2:
-all variants. Counters:
-.I PMC[0-1], FIXC[0-2]
-.IP \[bu] 
-.B Intel Nehalem:
-Counters:
-.I PMC[0-3], FIXC[0-2], UPMC[0-7]
-.IP \[bu] 
-.B Intel Nehalem EX:
-Counters:
-.I PMC[0-3], FIXC[0-2], MBOX[0-1]C[0-5], BBOX[0-1]C[0-3], RBOX[0-1]C[0-7], WBOX[0-5], UBOX0, SBOX[0-1]C[0-3], CBOX[0-9]C[0-4]
-.IP \[bu] 
-.B Intel Westmere:
- Counters:
-.I PMC[0-3], FIXC[0-2], UPMC[0-7]
-.IP \[bu] 
-.B Intel Westmere EX:
-Counters:
-.I PMC[0-3], FIXC[0-2], MBOX[0-1]C[0-5], BBOX[0-1]C[0-3], RBOX[0-1]C[0-7], WBOX[0-5], UBOX0, SBOX[0-1]C[0-3], CBOX[0-9]C[0-4]
-.IP \[bu] 
-.B Intel Sandy Bridge:
-full RAPL support. Counters:
-.I PMC[0-3], FIXC[0-2], PWR[0-3]
-.IP \[bu] 
-.B Intel Sandy Bridge EP:
-partial support for uncore, full RAPL support. Counters:
-.I PMC[0-3], FIXC[0-2], PWR[0-3]. MBOX[0-3]C[0-3]
-.IP \[bu] 
-.B Intel Ivy Bridge:
-full RAPL support. Counters:
-.I PMC[0-3], FIXC[0-2], PWR[0-3]
-.IP \[bu] 
-.B Intel Ivy Bridge EP:
-partial support for uncore, full RAPL support. Counters:
-.I PMC[0-3], FIXC[0-2], PWR[0-3], CBOX[0-9]C[0-3], MBOX[0-3]C[0-3], MBOX[0-3]FIX
-.IP \[bu] 
-.B Intel Haswell:
-full RAPL support. Counters:
-.I PMC[0-3], FIXC[0-2], PWR[0-3]
-.IP \[bu] 
-.B Intel Haswell EP:
-no uncore support, full RAPL support. Counters:
-.I PMC[0-3], FIXC[0-2], PWR[0-3]
-.IP \[bu] 
-.B Intel Atom Silvermont:
-full RAPL support. Counters:
-.I PMC[0-1], FIXC[0-2], PWR[0-1]
-.IP \[bu] 
-.B Intel Pentium M:
-Banias and Dothan variants. Counters:
-.I PMC[0-1]
-.IP \[bu] 
-.B Intel P6:
-Tested on P3.
-.IP \[bu] 
-.B AMD K8:
-all variants. Counters:
-.I PMC[0-3]
-.IP \[bu] 
-.B AMD K10:
-Barcelona, Shanghai, Istanbul, MagnyCours based processors. Counters:
-.I PMC[0-3]
+performance groups with useful event sets and derived metrics. Additionally, arbitrary events can be measured with
+custom event sets. The marker API can measure multiple named regions and the results are accumulated over multiple region calls.
 
 .SH OPTIONS
 .TP
-.B \-\^v
+.B \-\^v, \-\-\^version
 prints version information to standard output, then exits.
 .TP
-.B \-\^h
+.B \-\^h, \-\-\^help
 prints a help message to standard output, then exits.
 .TP
 .B \-\^H
 prints group help message (use together with -g switch).
 .TP
-.B \-\^V
-verbose output during execution for debugging.
+.B \-\^V <level>, \-\-\^verbose <level>
+verbose output during execution for debugging. 0 for only errors, 1 for informational output, 2 for detailed output and 3 for developer output
 .TP
 .B \-\^m
 run in marker API mode
 .TP
 .B \-\^a
-print available performance groups for current processor.
+print available performance groups for current processor, then exit.
 .TP
 .B \-\^e
 print available counters and performance events of current processor.
 .TP
-.B \-\^o " <filename>
+.B \-\^o, \-\-\^output <filename>
 store all ouput to a file instead of stdout. For the filename the following placeholders are supported: 
-%j for PBS_JOBID, %r for MPI RANK (only Intel MPI at the moment), %h hostname and %p for process pid.
+%j for PBS_JOBID, %r for MPI RANK (only Intel MPI at the moment), %h host name and %p for process pid.
 The placeholders must be separated by underscore as, e.g., -o test_%h_%p. You must specify a suffix to
 the filename. For txt the output is printed as is to the file. Other suffixes trigger a filter on the output.
 Available filters are csv (comma separated values) and xml at the moment.
 .TP
 .B \-\^O
-Do not print tables for results, use easily parseable CSV instead.
+print output in CSV format (conforming to RFC 4180, see
+.I https://tools.ietf.org/html/rfc4180
+for details).
 .TP
-.B \-\^i
-print cpuid information about processor and on Intel Performance Monitoring features, then exit.
+.B \-\^i, \-\-\^info
+print cpuid information about processor and about Intel Performance Monitoring features, then exit.
 .TP
-.B \-\^c " <processor_list>"
+.B \-\^c <cpu expression>
 specify a numerical list of processors. The list may contain multiple 
 items, separated by comma, and ranges. For example 0,3,9-11.
 .TP
-.B \-\^C " <processor_list>"
+.B \-\^C <cpu expression>
 specify a numerical list of processors. The list may contain multiple 
 items, separated by comma, and ranges. For example 0,3,9-11. This variant will
 also pin the threads to the cores. Also logical numberings can be used.
 .TP
-.B \-\^g " <performance group> or <performance event set string>"
+.B \-\^g, \-\-\^group <performance group> or <performance event set string>
 specify which performance group to measure. This can be one of the tags output with the -a flag.
 Also a custom event set can be specified by a comma separated list of events. Each event has the format
 eventId:register with the the register being one of a architecture supported performance counter registers.
 .TP
-.B \-\^t " <frequency of measurements>"
-timeline mode for time resolved measurements, possible suffixes 's' and 'ms' like 100ms. The output has the format:
+.B \-\^t <frequency of measurements>
+timeline mode for time resolved measurements. The time unit must be given on command line, e.g. 4s, 500ms or 900us.
+.TP
+.B \-\^S <waittime between measurements>
+End-to-end measurement using likwid-perfctr but sleep instead of executing an application. The time unit must be given on command line, e.g. 4s, 500ms or 900us.
+.TP
+.B \-\^T <time between group switches>
+Frequency to switch groups if multiple are given on commandline, default is 2s. Value is ignored for a single event set and default frequency of 30s is used to catch overflows. The time unit must be given on command line, e.g. 4s, 500ms or 900us.
+.TP
+.B \-\^s, \-\-\^skip <mask>
+Specify skip mask as HEX number. For each set bit the corresponding thread is skipped.
 .TP
-.B <Event> <Timestamp> <Result thread0> <Result thread1> ...
+.B \-\^f, \-\-\^force
+Force writing of registers even if they are in use.
 .TP
-.B \-\^S " <time_in_seconds>"
-stethoscope mode with duration in senconds. Can be used to measure an application from the outside.
+.B \-\^E <search_str>
+Print only events and corresponding counters matching <search_str>
 
 .SH EXAMPLE
 Because 
@@ -163,7 +121,7 @@ The parent process is pinned to processor 0, Thread 0 to processor 1 and Thread
 .IP 2. 4
 As wrapper with custom event set on AMD:
 .TP
-.B likwid-perfctr -C 0-4 -g INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3 ./myApp
+.B likwid-perfctr -C 0-4 -g INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3 ./cacheBench
 .PP
 It is specified that the event
 .B INSTRUCTIONS_RETIRED_SSE
@@ -173,84 +131,116 @@ and the event
 .B CPU_CLOCKS_UNHALTED
 on counter
 .B PMC3.
-It is possible calculate the runtime of all threads based on the
+It is possible to calculate the run time of all threads based on the
 .B CPU_CLOCKS_UNHALTED
 event. If you want this you have to include this event in your custom event string as shown above.
 
 .IP 3. 4
 As wrapper with custom event set on Intel:
 .TP
-.B likwid-perfctr -C 0 -g INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1 ./myApp
+.B likwid-perfctr -C 0 -g INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1,UNC_L3_LINES_IN_ANY:UPMC0 ./stream-icc
 .PP
 On Intel processors fixed events are measured on dedicated counters. These are
 .B INSTR_RETIRED_ANY
-,
-.B CPU_CLK_UNHALTED_CORE.
 and
-.B CPU_CLK_UNHALTED_REF
+.B CPU_CLK_UNHALTED_CORE.
 If you configure these fixed counters, 
 .B likwid-perfctr
-will calculate the runtime and CPI metrics for your run.
+will calculate the run time and CPI metrics for your run.
 
 .IP 4. 4
 Using the marker API to measure only parts of your code (this can be used both with groups or custom event sets):
 .TP
 .B likwid-perfctr -m -C 0-4 -g INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3 ./cacheBench
 .PP
-You have to link you code against liblikwid.a/.so and use the marker API calls.
+You have to link your code against liblikwid.so and use the marker API calls.
+Examples can be found in examples folder <INSTALLEDPREFIX>/share/likwid/examples.
 The following code snippet shows the necessary calls:
 
 .nf
 #include <likwid.h>
 
 /* only one thread calls init */
-if (threadId == 0)
-{
-    likwid_markerInit();
-}
-/* if you want to measure an threaded application
- * you have to call likwid_markerThreadInit() for
- * preparation, example with OpenMP */
-#pragma omp parallel
-{
-	likwid_markerThreadInit();
-}
-BARRIER;
-likwid_markerStartRegion("Benchmark");
-/* your code to be measured is here.*/
+LIKWID_MARKER_INIT;
+
+/* Must be called by each thread that should
+ * perform measurements.
+ * If you place it in the same parallel
+ * region as LIKWID_MARKER_START, perform a
+ * barrier between the statements to avoid
+ * timing problems.
+ */
+LIKWID_MARKER_THREADINIT;
+
+/* If you run the code region only once, register
+ * the region tag previously to reduce the overhead
+ * of START and STOP calls. Call it once for each
+ * thread in parallel environment.
+ * Note: No whitespace characters are allowed in the region tags
+ * This call is optional, START will do the same operations.
+ */
+LIKWID_MARKER_REGISTER("name");
 
-likwid_markerStopRegion("Benchmark");
-BARRIER;
-/* again only one thread can close the markers */
-if (threadId == 0)
-{
-    likwid_markerClose();
-}
+/* Start measurement 
+ * Note: No whitespace characters are allowed in the region tags
+ */
+LIKWID_MARKER_START("name");
+/*
+ * Your code to be measured is here
+ * You can also nest named regions
+ * No whitespaces are allowed in the region names!
+ */
+LIKWID_MARKER_STOP("name");
+
+/* If you want to measure multiple groups/event sets
+ * Switches through groups in round-robin fashion
+ */
+LIKWID_MARKER_SWITCH;
+
+/* Finally */
+LIKWID_MARKER_CLOSE;
 .fi
 
 .IP 5. 4
 Using likwid in timeline mode:
 .TP
-.B likwid-perfctr -c 0-3 -g FLOPS_DP -t 300ms  ./myApp > out.txt
+.B likwid-perfctr -c 0-3 -g FLOPS_DP -t 300ms ./cacheBench > out.txt
 .PP
 This will read out the counters every 300ms on physical cores 0-3 and write the results to out.txt.
-For timeline mode there is a frontend application likwid-scope, which enables live plotting of selected events.
-For more code examples have a look at the likwid WIKI pages. The processes are
-.B not
-pinned to the CPUs 0-3.
+The application is not pinned to the CPUs. The output syntax of the timeline
+mode is for custom event sets:
+
+.B <groupID> <numberOfEvents> <numberOfThreads> <Timestamp> <Event1_Thread1> <Event2_Thread1> ... <Event1_Thread2> ... <EventN_ThreadM>
+
+For performance groups with metrics:
+.B <groupID> <numberOfMetrics> <numberOfThreads> <Timestamp> <Metric1_Thread1> <Metric2_Thread1> ... <Metric1_Thread2> ...<MetricN_ThreadM>
+
+For timeline mode there is a frontend application likwid-perfscope(1), which enables live plotting of selected events. Please be aware that with short measurement intervals (<100ms), the absolute values can differ from the real results, but their qualitative behavior is still valid.
 
 .IP 6. 4
 Using likwid in stethoscope mode:
 .TP
 .B likwid-perfctr -c 0-3 -g FLOPS_DP -S 2s
 .PP
-This will start the counters and read them out after 2s on physical cores 0-3 and write the results to stdout. The processes are
-.B not
-pinned to the CPUs 0-3.
+This will start the counters and read them out after 2s on physical cores 0-3 and write the results to stdout.
+
+.IP 7. 4
+Using likwid with counter options:
+.TP
+.B likwid-perfctr -c S0:1@S1:1 -g LLC_LOOKUPS_DATA_READ:CBOX0C0:STATE=0x9 ./cacheBench
+.PP
+This will program the counter
+.B CBOX0C0
+(the counter 0 of the LLC cache box 0) to measure the event
+.B LLC_LOOKUPS_DATA_READ
+and filter the increments by the state of a cacheline.
+.B STATE=0x9
+for this event means all <invalid> and <modified> cachelines. Which options are allowed for which box is listed in LIKWID's html documentation. The values for the options can be found in the vendor's performance monitoring documentation. Likwid measures the first CPU of socket 0 and the first CPU of socket 1. See likwid-pin(1) for details regarding the cpu expressions.
+For more code examples have a look at the likwid WIKI pages and LIKWID's html documentation.
 
 .SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
 .SH SEE ALSO
-likwid-topology(1), likwid-features(1), likwid-pin(1), likwid-bench(1)
+likwid-topology(1), likwid-perfscope(1), likwid-pin(1), likwid-bench(1)
diff --git a/doc/likwid-perfscope.1 b/doc/likwid-perfscope.1
index 2d48e21..19886a8 100644
--- a/doc/likwid-perfscope.1
+++ b/doc/likwid-perfscope.1
@@ -1,55 +1,177 @@
 .TH LIKWID-PERFSCOPE 1 <DATE> likwid\-<VERSION>
 .SH NAME
 likwid-perfscope \- Frontend for the timeline mode of
-.N likwid-perfctr(1)
-that on-the-fly generates pictures from the measurements
+.B likwid-perfctr(1)
+that generates pictures on-the-fly from the measurements
 .SH SYNOPSIS
-.B likwid-perfscope 
-.RB [\-h]
-.RB [ \-cores
+.B likwid-perfscope
+.RB [\-hvadp]
+.RB [ \-c
+.IR <cpu_list> ]
+.RB [ \-C
 .IR <cpu_list> ]
-.RB [ \-freq
+.RB [ \-t
 .IR <frequency> ]
-.RB [ \-group
-.IR <eventset> ]
+.RB [ \-r
+.IR <value> ]
+.RB [ \-g
+.IR <eventset_and_plotconfig> ]
+.RB [ \-\-\^host
+.IR <hostname> ]
+.B <executable>
+
 .SH DESCRIPTION
 .B likwid-perfscope
-is a command line application written in Perl that uses the timeline daemon mode of
+is a command line application written in Lua that uses the timeline daemon mode of
 .B likwid-perfctr(1)
 to create on-the-fly pictures with the current measurements. It uses the
 .B feedGnuplot(1)
-script to send the current data to gnuplot.
+script to send the current data to gnuplot. Since the plot windows are normally closed directly after the execution of the monitored applications,
+.B likwid-perfscope
+waits until Ctrl+c is pressed.
 .SH OPTIONS
 .TP
-.B \-h
-prints a help message to standard output, then exits.
+.B \-\^h,\-\-\^help
+Prints a help message to standard output, then exits.
 .TP
-.B \-\^cores " <cpu_list>
-measures the given group on given CPUs in <cpu_list>
+.B \-\^v,\-\-\^version
+Prints version information to standard output, then exits.
+.TP
+.B \-\^c " <cpu_list>
+Measures on given CPUs in <cpu_list>. See
+.B likwid-pin(1)
+for further information about the syntax.
+.TP
+.B \-\^C " <cpu_list>
+Measures the given group on given CPUs in <cpu_list>. See
+.B likwid-pin(1)
+for further information about the syntax. The application is pinned to those cores.
+.TP
+.B \-\^a,\-\-\^all
+List preconfigured event and plot configurations
+.TP
+.B \-\^d,\-\-\^dump
+Print the measurements of
+.B likwid-perfctr(1)
+to stdout.
 .TP
-.B \-\^freq " <frequency>
-reads the current performance values every <frequency>. Available suffixes are 's' and 'ms', e.g. 500ms. Default value is 1s
+.B \-\^t,\-\-\^time " <frequency>
+Reads the current performance values every <frequency>. Available suffixes are 's', 'ms' or 'us', e.g. 500ms. Default value is 1s.
 .TP
-.B \-\^group " <eventset>
-defines the events and counters that should be read. Possible values can be gathered from
+.B \-\^g,\-\-\^group " <eventset_and_plotconfig>
+Defines the events and counters that should be read. Possible values can be gathered from
 .B likwid-perfctr(1).
-Default is group 'FLOPS_DP'
+You can give multiple
+.B \-\^g
+options on the commandline. They will be measured in a round-robin fashion and one plot generated per option. Moreover, the
+.B \-\^g
+option accepts config options for
+.B feedGnuplot(1),
+see section
+.B EVENTSETS
+.TP
+.B \-\^r,\-\-\^range " <value>
+Plot only the last <value> values. Often referred to as sliding window.
+.TP
+.B \-\^p,\-\-\^plotdump
+Use the dumping feature of feedGnuplot to print out the plot configuration and its data at each timestep.
+Can be used to create file-based plots afterwards.
+.TP
+.B \-\-\^host " <hostname>
+Instead of performing likwid-perfctr on the local machine, execute it on a remote machine and plot data locally. Uses ssh and you probably need to enter the password before starting. You can also give something like user@host.
+
+
+.SH EVENTSETS
+In contrast to the \-\^g option for
+.B likwid-perfctr
+the \-\^g option for
+.B likwid-perfscope
+is extended to accept configuration options for
+.B feedGnuplot.
+There are some predefined plot configurations embedded into
+.B likwid-perfscope
+which can be listed with the
+.B \-\^a
+command line option. They are filtered to show only configs that are available for your current system.
+If you need to measure and plot custom events you can set the plotting options as last entry in your eventset. The plotting config options can be set as a ':' separated list. If you select a preconfigured group, you can overwrite single fields in the config like changing the title or the matching. The following options are available:
 
-.SH EXAMPLE
 .IP 1. 4
-Monitor double precision floating-point operations:
+.B title=<string>, TITLE=<string>
 .TP
-.B likwid-perfscope -group FLOPS_DP -cores 0-3 -freq 500ms
+Use the given title for the plot, use "" to enclose text with spaces and escape characters which could be interpreted by the shell. ':' are not allowed!
+.PP
+.IP 2. 4
+.B xtitle=<string>, XTITLE=<string>
+.TP
+Use the given title for the x-axis of the plot, use "" to enclose text with spaces and escape characters which could be interpreted by the shell. ':' are not allowed!
+.PP
+.IP 3. 4
+.B ytitle=<string>, YTITLE=<string>
+.TP
+Use the given title for the left y-axis of the plot, use "" to enclose text with spaces and escape characters which could be interpreted by the shell. ':' are not allowed!
+.PP
+.IP 4. 4
+.B <string>=<string>
+.TP
+All option string items that are not recognized as keywords like TITLE are used as formulas for the output. You can set multiple of those items in one option string. Each is calculated and integrated in the output plot. The first <string> is used as legend entry. The second <string> is the formula for the function.
+.PP
+.IP 5. 4
+.B y2title=<string>, Y2TITLE=<string>, y2title=<id-string>, Y2TITLE=<id-string>
+.TP
+Use the given title for the right y-axis of the plot. If no id is set, the last y2-axis is related to the last formula. If id is set, the formula with the id is used for the y2-axis. The id starts with index 1 for the first formula. Use "" to enclose text with spaces and escape characters which could be interpreted by the shell with '\'. ':' are not allowed!
+.PP
+
+.SH EXAMPLE
+.IP 1. 5
+Measure and print a preconfigured plotting configuration:
+.TP
+.B likwid-perfscope -g L3 -C 0-2 -t 1s ./a.out
+.PP
+This measures the L3 bandwidth with likwid-perfctr every second on CPU cores 0,1,2 and uses the plotting configuration L3. The plot will have a title and the axes are labeled properly.
+.IP 2. 5
+Measure and print a preconfigured plotting configuration:
+.TP
+.B likwid-perfscope -g L2:TITLE="My Title" -C 0 -t 1s ./a.out
+.PP
+This measures the L2 bandwidth with likwid-perfctr every second on CPU core 0 and uses the plotting configuration L2. The title of the output plot is changed to the custom title "My Title".
+.IP 3. 5
+Custom event set with plotting configuration:
+.TP
+.B likwid-perfscope -g INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1,CPI=FIXC0/FIXC1:YTITLE="Cycles per Instruction" -C 0 --time 500ms ./a.out
 .PP
 Executes
 .B likwid-perfctr
-on the first four cores. The values are read every 500ms are forwarded to gnuplot using the
-.B feedGnuplot
-script.
+on the first core. The values for the events
+.B INSTR_RETIRED_ANY
+and
+.B CPU_CLK_UNHALTED_CORE
+are read every 500ms. The raw values are transformed using the formula
+.B FIXC0/FIXC1
+and forwarded to gnuplot using the
+.B feedGnuplot(1)
+script with the curve name 'CPI' in the legend. The y-axis is labeled with the string "Cycles per Instruction".
+.IP 4. 5
+Custom event set with plotting configuration:
+.TP
+.B likwid-perfscope -g L3,CPI=FIXC0/FIXC1:Y2TITLE="2-Cycles per Instruction" -C 0 --time 500ms ./a.out
+.PP
+This measures the L3 bandwidth for CPU 0 every 500 ms. Additionally, a second curve is plotted with the function
+.B FIXC0/FIXC1
+with the legend entry
+.B CPI.
+The right y-axis is labeled with
+.B 'Cycles per Instruction'
+and is associated to the second formula. The first formula is hidden in the
+.B L3
+plot group. Since the
+.B CPI
+formula is the last in the list, the curve id is not needed in the
+.B Y2TITLE
+as this is the default behavior.
 
 .SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
 .SH "SEE ALSO"
-likwid-perfctr(1), feedGnuplot(1), likwid-pin(1), likwid-powermeter(1), likwid-setFrequencies(1)
+likwid-perfctr(1), feedGnuplot(1)
diff --git a/doc/likwid-pin.1 b/doc/likwid-pin.1
index efea873..4822f8d 100644
--- a/doc/likwid-pin.1
+++ b/doc/likwid-pin.1
@@ -2,138 +2,182 @@
 .SH NAME
 likwid-pin \- pin a sequential or threaded application to dedicated processors
 .SH SYNOPSIS
-.B likwid-pin 
-.RB [\-vhqipS]
+.B likwid-pin
+.RB [\-vhSpqi]
+.RB [ \-V
+.IR verbosity ]
 .RB [ \-c
-.IR <core_list> ]
+.IR corelist ]
 .RB [ \-s
-.IR <skip_mask> ]
-.RB [ \-d
-.IR <delimiter> ]
+.IR skip_mask ]
 .SH DESCRIPTION
 .B likwid-pin
-is a command line application to pin a sequential or multithreaded 
-applications to dedicated processors. It can be used as replacement for 
-.B taskset(1).
+is a command line application to pin a sequential or multithreaded
+application to dedicated processors. It can be used as replacement for taskset.
 Opposite to taskset no affinity mask but single processors are specified.
-For multithreaded applications based on the pthread library the 
-.I pthread_create
+For multithreaded applications based on the pthread library the
+.B pthread_create
 library call is overloaded through LD_PRELOAD and each created thread is pinned
-to a dedicated processor as specified in 
-.I core_list
-.
+to a dedicated processor as specified in
+.I core_list .
 .PP
-Per default every generated thread is pinned to the core in the order of calls 
-to 
-.I pthread_create.
-It is possible to skip single threads using -s commandline option.
+Per default every generated thread is pinned to the core in the order of calls to
+.B pthread_create.
+It is possible to skip single threads.
 .PP
-For OpenMP implementations gcc and icc compilers are explicitly supported. Others may also work.
+The OpenMP implementations of GCC and ICC compilers are explicitly supported.
+Clang's OpenMP backend should also work as it is built on top of Intel's OpenMP runtime library.
+Others may also work.
 .B likwid-pin
-sets the environment variable OMP_NUM_THREADS for you if not already present.
-It will set as many threads as present in the pin expression.  Be aware that
+sets the environment variable
+.B OMP_NUM_THREADS
+for you if not already present.
+It will set as many threads as present in the pin expression. Be aware that
 with pthreads the parent thread is always pinned. If you create for example 4
 threads with
-.I pthread_create 
-and do not use the parent process as worker you
-still have to provide num_threads+1 processor ids.
+.B pthread_create
+and do not use the parent process as worker you still have to provide
+.B num_threads+1
+processor ids.
 .PP
 .B likwid-pin
-supports different numberings for pinning. Per default physical numbering of
-the cores is used.  This is the numbering also 
-.B likwid-topology(1)
-reports. But also logical numbering inside the node or the sockets can be used.  If using
-with a N (e.g. -c N:0-6) the cores are logical numbered over the whole node.
-Physical cores come first. If a system e.g. has 8 cores with 16 SMT threads
-with -c N:0-7 you get all physical cores.  If you specify -c N:0-15 you get all
-physical cores and all SMT threads. With S you can specify logical numberings
-inside sockets, again physical cores come first. You can mix different domains
-separated with @. E.g. -c S0:0-3 at S2:2-3 you pin thread 0-3 to logical cores 0-3 on socket 0
-and threads 4-5 on logical cores 2-3 on socket 2.
+supports different numberings for pinning. See section
+.B CPU EXPRESSION
+for details.
 .PP
-For applications where first touch policy on numa systems cannot be employed
+For applications where first touch policy on NUMA systems cannot be employed
 .B likwid-pin
 can be used to turn on interleave memory placement. This can significantly
-speed up the performance of memory bound multithreaded codes. All numa nodes
+speed up the performance of memory bound multithreaded codes. All NUMA nodes
 the user pinned threads to are used for interleaving.
 
 .SH OPTIONS
 .TP
-.B \-\^v
+.B \-\^h,\-\-\^help
+prints a help message to standard output, then exits.
+.TP
+.B \-\^v,\-\-\^version
 prints version information to standard output, then exits.
 .TP
-.B \-\^h
-prints a help message to standard output, then exits.
+.B \-\^V, \-\-\^verbose <level>
+verbose output during execution for debugging. 0 for only errors, 1 for informational output, 2 for detailed output and 3 for developer output
 .TP
-.B \-\^c " <processor_list> OR <thread_expression> OR <scatter policy> "
-specify a numerical list of processors. The list may contain multiple 
-items, separated by comma, and ranges. For example 0,3,9-11. You can also use
-logical numberings, either within a node (N), a socket (S<id>) or a numa domain (M<id>).
-likwid-pin also supports logical pinning within a cpuset with a L prefix. If you ommit this option
-likwid-pin will pin the threads to the processors on the node with physical cores first.
-See below for details on using a thread expression or scatter policy
+.B \-\^c <cpu expression>
+specify a numerical list of processors. The list may contain multiple items, separated by comma, and ranges. For example 0,3,9-11. Other formats are available, see the
+.B CPU EXPRESSION
+section.
 .TP
-.B \-\^s " <skip_mask>
+.B \-\^s, \-\-\^skip <mask>
 Specify skip mask as HEX number. For each set bit the corresponding thread is skipped.
 .TP
-.B \-\^S
-All ccNUMA memory domains belonging to the specified threadlist will be cleaned before the run. Can solve file buffer cache problems on Linux.
+.B \-\^S,\-\-\^sweep
+All ccNUMA memory domains belonging to the specified thread list will be cleaned before the run. Can solve file buffer cache problems on Linux.
 .TP
 .B \-\^p
-prints the available thread domains for logical pinning. If used in combination with -c, the physical processor IDs are printed to stdout.
+prints the available thread domains for logical pinning
 .TP
 .B \-\^i
-set numa memory policy to interleave spanning all numa nodes involved in pinning
+set NUMA memory policy to interleave involving all NUMA nodes involved in pinning
 .TP
-.B \-\^q
+.B \-\^q,\-\-\^quiet
 silent execution without output
-.TP
-.B \-\^d " <delimiter>
-set delimiter used to output the physical processor list (-p & -c)
 
+.SH CPU EXPRESSION
+.IP 1. 4
+The most intuitive CPU selection method is a comma-separated list of physical CPU IDs. An example for this is
+.B 0,2
+which schedules the threads on CPU cores 
+.B 0
+and
+.B 2.
+The physical numbering also allows the usage of ranges like
+.B 0-2
+which results in the list
+.B 0,1,2.
+.IP 2. 4
+The CPUs can be selected by their indices inside of an affinity domain. The affinity domain is optional and if not given, Likwid assumes the domain
+.B 'N'
+for the whole node. The format is
+.B L:<indexlist>
+for selecting the CPUs inside of domain
+.B 'N'
+or
+.B L:<domain>:<indexlist>
+for selecting the CPUs inside the given domain. Assuming a virtual affinity domain
+.B 'P'
+that contains the CPUs
+.B 0,4,1,5,2,6,3,7.
+After sorting it to have physical cores first we get:
+.B 0,1,2,3,4,5,6,7.
+The logical numbering
+.B L:P:0-2
+results in the selection
+.B 0,1,2
+from the physical cores first list.
+.IP 3. 4
+The expression syntax enables the selection according to a selection function with variable input parameters. The format is either
+.B E:<affinity domain>:<numberOfThreads>
+to use the first <numberOfThreads> threads in affinity domain <affinity domain> or
+.B E:<affinity domain>:<numberOfThreads>:<chunksize>:<stride>
+to use <numberOfThreads> threads with <chunksize> threads selected in row while skipping <stride> threads in affinity domain <affinity domain>. Examples are
+.B E:N:4:1:2
+for selecting the first four physical CPUs on a system with 2 SMT threads per core or
+.B E:P:4:2:4
+for choosing the first two threads in affinity domain
+.B P,
+skipping 2 threads and selecting again two threads. The resulting CPU list for virtual affinity domain
+.B P
+is
+.B 0,4,2,6
+.IP 4. 4
+The last format schedules the threads not only in a single affinity domain but distributes them evenly over all available affinity domains of the same kind. In contrast to the other formats, the selection is done using the physical cores first and then the SMT threads. The format is
+.B <affinity domain without number>:scatter
+like
+.B M:scatter
+to schedule the threads evenly in all available memory affinity domains. Assuming the two socket domains
+.B S0 = 0,4,1,5
+and
+.B S1 = 2,6,3,7
+the expression
+.B S:scatter
+results in the CPU list
+.B 0,2,1,3,4,6,5,7
 
 .SH EXAMPLE
-.IP 1. 4
+.IP 1. 5
 For standard pthread application:
 .TP
-.B likwid-pin -c 0,2,4-6  ./myApp
+.B likwid-pin -c 0,2,4-6 ./myApp
 .PP
-The parent process is pinned to processor 0. Thread 0 to processor 2, thread
-1 to processor 4, thread 2 to processor 5 and thread 3 to processor 6. If more threads
-are created than specified in the processor list, these threads are pinned to processor 0
-as fallback.
-.IP 2. 4
-For gcc OpenMP as many ids must be specified in processor list as there are threads: 
-.TP
-.B OMP_NUM_THREADS=4; likwid-pin -c 0,2,1,3  ./myApp
-.IP 3. 4
-Full control over the pinning can be achieved by specifying a skip mask.
-For example the following command skips the pinning of thread 1:
+The parent process is pinned to processor 0 which is likely to be thread 0 in
+.B ./myApp.
+Thread 1 is pinned to processor 2, thread 2 to processor 4, thread 3 to processor 5 and thread 4 to processor 6. If more threads
+are created than specified in the processor list, these threads are pinned to processor 0 as fallback.
+.IP 2. 5
+For selection of CPUs inside of a CPUset only the logical numbering is allowed. Assuming CPUset
+.B 0,4,1,5:
 .TP
-.B OMP_NUM_THREADS=4; likwid-pin -s 0x1 -c 0,2,1,3  ./myApp
-.IP 4. 4
-The -c switch supports the definition of threads in a specific affinity domain like
-NUMA node or cache group. The available affinity domains can be retrieved with the -p switch 
-and no further option on the commandline. The common affinity domains are N (whole Node), 
-SX (socket X), CX (cache group X) and MX (memory group X). Multiple affinity domains 
-can be set separated by @. In order to pin 2 threads on each socket of a 2-socket system:
-.TP
-.B OMP_NUM_THREADS=4; likwid-pin -c S0:0-1 at S1:0-1  ./myApp
-.IP 5. 4
-Another argument definition of the -c switch allows the threads to be pinned according
-to an expression like E:N:4:1:2. The syntax is E:<thread domain>:<number of threads>(:<chunk size>:<stride>).
-The example pins 8 threads with 2 SMT threads per core on a SMT 4 machine:
-.TP
-.B OMP_NUM_THREADS=4; likwid-pin -c E:N:8:2:4  ./myApp
-.IP 6. 4
-The last alternative for the -c switch is the automatic scattering of threads on affinity domains.
-For example to scatter the threads over all memory domains in a system:
+.B likwid-pin -c L:1,3 ./myApp
+.PP
+This command pins
+.B ./myApp
+on CPU
+.B 4
+and the thread started by
+.B ./myApp
+on CPU
+.B 5
+.IP 3. 5
+A common use-case for the numbering by expression is pinning of an application on the Intel Xeon Phi coprocessor with its 60 cores each having 4 SMT threads.
 .TP
-.B OMP_NUM_THREADS=4; likwid-pin -c M:scatter  ./myApp
+.B likwid-pin -c E:N:60:1:4 ./myApp
+.PP
+This command schedules one thread per physical CPU core for
+.B ./myApp.
 
 .SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
 .SH "SEE ALSO"
-taskset(1), likwid-perfctr(1), likwid-features(1), likwid-powermeter(1), likwid-setFrequencies(1)
+taskset(1), likwid-perfctr(1), likwid-features(1), likwid-topology(1)
diff --git a/doc/likwid-powermeter.1 b/doc/likwid-powermeter.1
index f4a3ba2..9f35ceb 100644
--- a/doc/likwid-powermeter.1
+++ b/doc/likwid-powermeter.1
@@ -3,49 +3,72 @@
 likwid-powermeter \- A tool to print power and clocking information on Intel CPUs
 .SH SYNOPSIS
 .B likwid-powermeter 
-.RB [ \-vhip ]
+.RB [ \-vhpitf ]
+.RB [ \-V
+.IR verbosity_level ]
 .RB [ \-c
-.IR <socket_list> ]
+.IR socket_list ]
 .RB [ \-s
-.IR <duration_in_seconds> ]
+.IR duration ]
 .RB [ \-M
-.IR <access_mode>]
+.IR <0|1> ]
 .SH DESCRIPTION
 .B likwid-powermeter
-is a command line application to get the energy comsumption of Intel RAPL capable processors. 
-It also prints information about TDP and Turbo Mode steps supported.
+is a command line application to get the energy consumption on Intel RAPL capable processors. Currently
+only Intel SandyBridge is supported. It also prints information about TDP and Turbo Mode steps supported.
 The Turbo Mode information works on all Turbo mode enabled Intel processors. The tool can be either used
 in stethoscope mode for a specified duration or as a wrapper to your application measuring your complete 
 run. RAPL works on a per package (socket) base.
-Please note that the RAPL counters are also accessible as normal events within
-.B likwid-perfctr.
+Please note that the RAPL counters are also accessible as normal events within likwid-perfctr.
 .SH OPTIONS
 .TP
-.B \-\^v
+.B \-\^h,\-\-\^help
+prints a help message to standard output, then exits.
+.TP
+.B \-\^v,\-\-\^version
 prints version information to standard output, then exits.
 .TP
-.B \-\^h
-prints a help message to standard output, then exits.
+.B \-\^V, \-\-\^verbose <level>
+verbose output during execution for debugging. 0 for only errors, 1 for informational output, 2 for detailed output and 3 for developer output
+.TP
+.B \-\^c <socket_list>
+set on which socket(s) the RAPL interface is accessed. List of sockets like 0,1,2 or 0-2 are allowed.
 .TP
-.B \-\^c " <socket_list>"
-set on which sockets the RAPL interface is accessed. comma-separated list of socket IDs
+.B \-\^M <0|1>
+set how MSR registers are accessed, 0=direct, 1=accessDaemon.
+.TP
+.B \-\^s <duration>
+set measure duration in us, ms or s. (default 2s)
 .TP
 .B \-\^p
-prints out information about dynamic clocks and CPI information on the socket measured. Uses likwid-perfctr internally.
+prints out information about dynamic clocks and CPI information on the socket(s) measured.
 .TP
-.B \-\^i
-prints out information TDP and Turbo mode steps
+.B \-\^i,\-\-\^info
+prints out information TDP and Turbo mode steps of all RAPL domains supporting it.
 .TP
-.B \-\^M " <access_mode>"
-set the access method. 0 for direct access to MSR/RAPL registers, 1 for using the accessDaemon.
+.B \-\^t
+prints out the temperature of all CPUs in the system.
 .TP
-.B \-\^s " <duration_in_seconds>
-measure the power for a specific time (default 2s)
+.B \-\^f
+prints out the temperature like
+.B \-\^t
+but uses Fahrenheit as temperature unit.
 
+.SH EXAMPLE
+.IP 1. 3
+Measure the power consumption for 4 seconds on socket 1
+.TP
+.B likwid-powermeter -s 4 -c 1
+.PP
+.IP 2. 3
+Use it as wrapper for an application to measure the energy for the whole execution
+.TP
+.B likwid-powermeter -c 1 ./a.out
+.PP
 
 .SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
 .SH "SEE ALSO"
-likwid-topology(1), likwid-perfctr(1), likwid-pin(1), likwid-features(1), likwid-setFrequencies(1)
+likwid-topology(1), likwid-perfctr(1), likwid-pin(1)
diff --git a/doc/likwid-setFreq.1 b/doc/likwid-setFreq.1
index 87054c7..1ef598c 100644
--- a/doc/likwid-setFreq.1
+++ b/doc/likwid-setFreq.1
@@ -4,7 +4,7 @@ likwid-setFreq \- Mediator for
 .B likwid-setFrequencies(1)
 that performs the actual setting of CPU cores' frequency and governor.
 .SH SYNOPSIS
-.B likwid-setFreq 
+.B likwid-setFreq
 .IR <coreId>
 .IR <frequency>
 .IR [<governor>]
@@ -14,11 +14,13 @@ that performs the actual setting of CPU cores' frequency and governor.
 is a command line application that mediates the request from
 .B likwid-setFrequencies(1)
 because setting a CPU core's frequency and/or governor requires root privileges. This executable must be suid-root.
+.B likwid-setFreq
+works only with the kernel module acpi-cpufreq. The recent intel_pstate module does not allow to set fixed frequencies.
 
 
 .SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
 .SH "SEE ALSO"
-likwid-setFrequencies(1), likwid-perfctr(1), feedGnuplot(1), likwid-pin(1), likwid-powermeter(1)
+likwid-setFrequencies(1)
diff --git a/doc/likwid-setFrequencies.1 b/doc/likwid-setFrequencies.1
index b268280..b45fcbe 100644
--- a/doc/likwid-setFrequencies.1
+++ b/doc/likwid-setFrequencies.1
@@ -3,20 +3,30 @@
 likwid-setFrequencies \- print and manage the clock frequency of CPU cores
 .SH SYNOPSIS
 .B likwid-setFrequencies 
-.RB [\-hpl]
+.RB [\-hvplm]
 .RB [ \-c
-.IR <cpu_list,_socket_list_or_expression> ]
+.IR <cpu_list> ]
 .RB [ \-g
 .IR <governor> ]
-.RB [ \-f
+.RB [ \-f,\-\-\^freq
 .IR <frequency> ]
 .SH DESCRIPTION
 .B likwid-setFrequencies
-is a command line application to set the clock frequency of CPU cores. Since only priviledged users are allowed to change the frequency of CPU cores, the application works in combination with a daemon 
-.B likwid-setFreq.
-The daemon needs the suid permission bit to be set in order to manipulate the sysfs entries. With 
+is a command line application to set the clock frequency of CPU cores. Since only privileged users are allowed to change the frequency of CPU cores, the application works in combination with a daemon
+.B likwid-setFreq(1).
+The daemon needs the suid permission bit to be set in order to manipulate the sysfs entries. With
 .B likwid-setFrequencies
 the clock of all cores inside a the cpu_list or affinity domain can be set to a specific frequency or governor at once.
+.B likwid-setFrequencies
+works only with the kernel module
+.B acpi-cpufreq.
+The recent
+.B intel_pstate
+module does not allow to set fixed frequencies. In order to deactivate
+.B intel_pstate
+add 'intel_pstate=disable' to your kernel boot commandline (commonly in grub) and load the
+.B acpi-cpufreq
+module.
 .SH OPTIONS
 .TP
 .B \-h
@@ -28,19 +38,23 @@ prints the current frequencies for all CPU cores
 .B \-l
 prints all configurable frequencies
 .TP
-.B \-\^c " <cpu_list,_socket_list_or_expression>
-set the affinity domain where to set the frequencies. Common are N (Node), SX (Socket X), CX (Cache Group X) and MX (Memory Group X). For detailed information about affinity domains see
+.B \-m
+prints all configurable governors
+.TP
+.B \-\^c <cpu_list>
+set the affinity domain where to set the frequencies. Common are N (Node), SX (Socket X), CX (Cache Group X) and MX (Memory Group X).
+For detailed information about affinity domains see
 .B likwid-pin(1)
 .TP
-.B \-\^g " <governor>
+.B \-\^g <governor>
 set the governor of all CPU cores inside the affinity domain. Current governors are ondemand, performance, turbo. Default is ondemand
 .TP
-.B \-\^f " <frequency>
+.B \-\^f, \-\-\^freq <frequency>
 set a fixed frequency at all CPU cores inside the affinity domain. Implicitly sets userspace governor for the cores.
 
 .SH AUTHOR
-Written by Thomas Roehl <thomas.roehl at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
 .SH "SEE ALSO"
-likwid-perfctr(1), likwid-features(1), likwid-pin(1), likwid-powermeter(1), likwid-topology(1),
+likwid-pin(1), likwid-perfctr(1), likwid-powermeter(1)
diff --git a/doc/likwid-topology.1 b/doc/likwid-topology.1
index 64bc8b4..04ebdc4 100644
--- a/doc/likwid-topology.1
+++ b/doc/likwid-topology.1
@@ -2,41 +2,47 @@
 .SH NAME
 likwid-topology \- print thread and cache topology
 .SH SYNOPSIS
-.B likwid-topology 
+.B likwid-topology
 .RB [\-hvgcC]
+.RB [ \-V
+.IR level ]
 .RB [ \-o
-.IR <filename> ]
+.IR output_file ]
 .SH DESCRIPTION
 .B likwid-topology
-is a command line application to print the thread and cache topology on multicore x86 processors. Used with mono spaced fonts it can
-draw the processor topology of a machine in ASCII art. Beyond topology
-.B likwid-topology
-determines the clock of a processor and prints detailed informations about the caches hierarchy and NUMA structure.
+is a command line application to print the thread and cache
+topology on multicore x86 processors. Used with mono spaced fonts it can
+draw the processor topology of a machine in ascii art. Beyond topology
+likwid-topology determines the clock of a processor and prints detailed
+information about the cache hierarchy.
 .SH OPTIONS
 .TP
-.B \-v
+.B \-h, \-\-\^help
+prints a help message to standard output, then exits.
+.TP
+.B \-v, \-\-\^version
 prints version information to standard output, then exits.
 .TP
-.B \-h
-prints a help message to standard output, then exits.
+.B \-V, \-\-\^verbose <level>
+sets the verbosity level of LIKWID's topology backend. Possible levels range from 0 to 3.
 .TP
 .B \-g
 prints topology information in ASCII art. Best viewed with monospaced font.
 .TP
-.B \-c
-prints detailed informations about cache hierarchy
+.B \-c, \-\-\^caches
+prints detailed information about cache hierarchy
 .TP
-.B \-C
-measures and output the processor clock. This involves a longer runtime of
-.B likwid-topology.
+.B \-C, \-\-\^clock
+measures and outputs the processor clock. This involves a longer run time of likwid-topology.
 .TP
-.B \-\^f " <filename>
-Specify output file for topology information. According to the file suffix, the information
-is converted using converter scripts installed at <PREFIX>/share/likwid
+.B \-o, \-\-\^output <file>
+write the output to file instead of stdout.
+Likwid applies filter scripts according to filename suffix.
+Currently available scripts are csv. You can place additional filter scripts in <INSTALLEDPREFIX>/share/likwid/filter.
 
 .SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
 .SH "SEE ALSO"
-likwid-perfctr(1), likwid-features(1), likwid-pin(1), likwid-powermeter(1), likwid-setFrequencies(1)
+likwid-perfctr(1), likwid-features(1), likwid-pin(1)
diff --git a/doc/likwid.cfg.md b/doc/likwid.cfg.md
new file mode 100644
index 0000000..2122dee
--- /dev/null
+++ b/doc/likwid.cfg.md
@@ -0,0 +1,38 @@
+/*! \page likwid.cfg <CODE>likwid.cfg</CODE>
+<H1>Information</H1>
+<CODE>likwid.cfg</CODE> is the global configuration file for LIKWID but it is optional. The configuration is normally defined at compile time. It allows to set the path to the access mode for the MSR/PCI access daemon and some other basic options.<BR>
+LIKWID searches for the configuration file at different paths like <CODE>/usr/local/etc/likwid.cfg</CODE>.<BR>
+<B>Note: It was introduced with version 4 and is not fully integrated in the LIKWID code.</B>
+
+<H1>Config file options</H1>
+<H1>Config file</H1>
+The global configuration file has the following options:
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>topology_file = <path></TD>
+  <TD>Path to the topology file created with \ref likwid-genTopoCfg</TD>
+</TR>
+<TR>
+  <TD>access_mode = <daemon|direct></TD>
+  <TD>Set access mode. The direct mode can only be used by users with root privileges. The daemon uses \ref likwid-accessD.</TD>
+</TR>
+<TR>
+  <TD>daemon_path = <path></TD>
+  <TD>Path to the access daemon.</TD>
+</TR>
+<TR>
+  <TD>max_threads = <arg></TD>
+  <TD>Adjust maximally supported threads/CPUs. <B>Note:</B> not used by now, fixed at compile time.</TD>
+</TR>
+<TR>
+  <TD>max_nodes = <arg></TD>
+  <TD>Adjust maximally supported NUMA nodes. <B>Note:</B> not used by now, fixed at compile time.</TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/logo.png b/doc/logo.png
new file mode 100644
index 0000000..048ed9a
Binary files /dev/null and b/doc/logo.png differ
diff --git a/doc/lua-doxygen.md b/doc/lua-doxygen.md
new file mode 100644
index 0000000..c00b992
--- /dev/null
+++ b/doc/lua-doxygen.md
@@ -0,0 +1,2592 @@
+/*! \page lua_Info Information about LIKWID's Lua API
+<H1>How to include Lua API into own Lua applications</H1>
+<CODE>
+package.path = package.path .. ';<PREFIX>/share/lua/?.lua'<BR>
+local likwid = require("likwid")<BR>
+</CODE>
+<P></P>
+Now all functions and variables can be accessed with<BR>
+<CODE>likwid.<I>functionname()</I></CODE><BR>
+or<BR>
+<CODE>likwid.<I>variable</I></CODE>
+
+<H1>Global variables defined by LIKWID's Lua API</H1>
+<TABLE>
+<TR>
+  <TH>Variablename</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>\a groupfolder</TD>
+  <TD>Path to the folder containing the definitions of the performance groups</TD>
+</TR>
+<TR>
+  <TD>\a version</TD>
+  <TD>Version of LIKWID</TD>
+</TR>
+<TR>
+  <TD>\a release</TD>
+  <TD>Release number of LIKWID</TD>
+</TR>
+<TR>
+  <TD>\a pinlibpath</TD>
+  <TD>Path to the pinning library. Is added automatically to $LD_PRELOAD by \ref likwid-pin and \ref likwid-perfctr</TD>
+</TR>
+<TR>
+  <TD>\a hline</TD>
+  <TD>Horizontal line with 80 '-' characters</TD>
+</TR>
+<TR>
+  <TD>\a sline</TD>
+  <TD>Horizontal line with 80 '*' characters</TD>
+</TR>
+<TR>
+  <TD>\a dline</TD>
+  <TD>Horizontal line with 80 '=' characters</TD>
+</TR>
+</TABLE>
+*/
+
+/*! \page lua_Config Config file module
+<H1>Data type definition for Lua config file module in the Lua API</H1>
+\anchor lua_config
+<H2>Config file read</H2>
+<P>This structure is returned by \ref getConfiguration function<BR>The config file can be created with \ref likwid-genTopoCfg executable. It searches the files /etc/likwid.cfg and <PREFIX>/etc/likwid.cfg. Other configuration file paths can be set in config.mk before building LIKWID.</P>
+<TABLE>
+<TR>
+  <TH>Membername</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>\a configFile</TD>
+  <TD>Path to the config file</TD>
+</TR>
+<TR>
+  <TD>\a topologyFile</TD>
+  <TD>Path to the config file containing topology information</TD>
+</TR>
+<TR>
+  <TD>\a daemonPath</TD>
+  <TD>Path to the access daemon</TD>
+</TR>
+<TR>
+  <TD>\a daemonMode</TD>
+  <TD>Access mode for LIKWID (0 = direct access, 1 = access daemon)</TD>
+</TR>
+<TR>
+  <TD>\a maxNumThreads</TD>
+  <TD>Maximal amount of hardware threads in the system</TD>
+</TR>
+<TR>
+  <TD>\a maxNumNodes</TD>
+  <TD>Maximal amount of NUMA nodes in the system</TD>
+</TR>
+<TR>
+  <TD>\a maxHashTableSize</TD>
+  <TD>Maximal size for the internally used hash table</TD>
+</TR>
+</TABLE>
+
+<H1>Function definitions for Lua config file module in the Lua API</H1>
+\anchor getConfiguration
+<H2>getConfiguration()</H2>
+<P>Read the configuration file and return a list of config options</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>List of configuration options, see \ref lua_config</TD>
+</TR>
+</TABLE>
+
+\anchor setVerbosity
+<H2>setVerbosity(verbosity)</H2>
+<P>Define and/or change the verbosity level of LIKWID</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a verbosity</TD>
+      <TD>0 = only errors<BR>1 = infos<BR>2 = detail<BR>3 = developer<BR>Other flags are rejected.</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor putConfiguration
+<H2>putConfiguration()</H2>
+<P>Frees the C-structures that were created by \ref getConfiguration function.</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+*/
+
+/*! \page lua_Access Access client module
+<H1>Data type definition for Lua access client module in the Lua API</H1>
+<H1>Function definitions for Lua access client module in the Lua API</H1>
+\anchor setAccessMode
+<H2>setAccessMode(accessFlag)</H2>
+<P>Define and/or change the access mode to the MSR and PCI registers</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a accessFlag</TD>
+      <TD>0 = direct access<BR>1 = access daemon<BR>Other flags are rejected.</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Always 0</TD>
+</TR>
+</TABLE>
+
+*/
+
+/*! \page lua_CPUTopology CPU information module
+<H1>Data type definition for CPU information module in the Lua API</H1>
+\anchor lua_cpuinfo
+<H2>Cpu Info</H2>
+<P>This structure is returned by \ref getCpuInfo function<BR>It is similar to the C struct CpuInfo</P>
+<TABLE>
+<TR>
+  <TH>Membername</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>\a family</TD>
+  <TD>Family ID of CPU</TD>
+</TR>
+<TR>
+  <TD>\a model</TD>
+  <TD>Model ID of CPU</TD>
+</TR>
+<TR>
+  <TD>\a stepping</TD>
+  <TD>Revision of CPU</TD>
+</TR>
+<TR>
+  <TD>\a clock</TD>
+  <TD>Base clock frequency</TD>
+</TR>
+<TR>
+  <TD>\a turbo</TD>
+  <TD>Flag if the system supports the Turbo mode</TD>
+</TR>
+<TR>
+  <TD>\a name</TD>
+  <TD>Name of the microarchitecture</TD>
+</TR>
+<TR>
+  <TD>\a osname</TD>
+  <TD>Name of the CPU as given by manufacturer</TD>
+</TR>
+<TR>
+  <TD>\a short_name</TD>
+  <TD>Short name of microarchitecture</TD>
+</TR>
+<TR>
+  <TD>\a features</TD>
+  <TD>String with all interesting CPU feature flags as a space separated list</TD>
+</TR>
+<TR>
+  <TD>\a featureFlags</TD>
+  <TD>Bitmask with all interesting CPU feature flags<BR>Bit positions can be retrieved from the FeatureBit enum</TD>
+</TR>
+<TR>
+  <TD>\a isIntel</TD>
+  <TD>Flag to check if the system is using Intel CPUs</TD>
+</TR>
+<TR>
+  <TD>\a perf_version</TD>
+  <TD>Version of architectural performance monitoring capabilities</TD>
+</TR>
+<TR>
+  <TD>\a perf_num_ctr</TD>
+  <TD>Amount of core-local general-purpose counters</TD>
+</TR>
+<TR>
+  <TD>\a perf_num_fixed_ctr</TD>
+  <TD>Amount of core-local fixed-purpose counters</TD>
+</TR>
+<TR>
+  <TD>\a perf_width_ctr</TD>
+  <TD>Register width of core-local counters</TD>
+</TR>
+</TABLE>
+
+
+\anchor lua_cputopo
+<H2>Cpu Topology</H2>
+<P>This structure is returned by \ref getCpuTopology function<BR>The nested list structure is similar to the C struct CpuTopology.</P>
+<TABLE>
+<TR>
+  <TH>Membername</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>\a numHWThreads</TD>
+  <TD>Total amount of hardware threads in the system</TD>
+</TR>
+<TR>
+  <TD>\a activeHWThreads</TD>
+  <TD>Amount of active hardware threads in the system</TD>
+</TR>
+<TR>
+  <TD>\a numSockets</TD>
+  <TD>Number of CPU sockets in the system</TD>
+</TR>
+<TR>
+  <TD>\a numCoresPerSocket</TD>
+  <TD>Number of physical cores of each socket in the system</TD>
+</TR>
+<TR>
+  <TD>\a numThreadsPerCore</TD>
+  <TD>Number of hardware threads of each core in the system</TD>
+</TR>
+<TR>
+  <TD>\a numCacheLevels</TD>
+  <TD>Amount of cache levels in the system</TD>
+</TR>
+<TR>
+  <TD>\a threadPool<BR>(List with<BR>\a numHWThreads entries)</TD>
+    <TD>
+    <TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>\a threadId</TD>
+      <TD>Thread ID</TD>
+    </TR>
+    <TR>
+      <TD>\a coreId</TD>
+      <TD>ID of physical CPU core</TD>
+    </TR>
+    <TR>
+      <TD>\a apicId</TD>
+      <TD>ID of the interrupt line for the hardware thread as defined by APIC</TD>
+    </TR>
+    <TR>
+      <TD>\a packageId</TD>
+      <TD>ID of CPU socket for the current thread</TD>
+    </TR>
+    <TR>
+      <TD>\a inCpuSet</TD>
+      <TD>Defines whether the thread is available in current cpuset</TD>
+    </TR>
+    </TABLE>
+    </TD>
+</TR>
+<TR>
+  <TD>\a cacheLevels<BR>(List with<BR>\a numCacheLevels entries)</TD>
+    <TD>
+    <TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>\a level</TD>
+      <TD>Level of cache</TD>
+    </TR>
+    <TR>
+      <TD>\a associativity</TD>
+      <TD>Associativity in cache level</TD>
+    </TR>
+    <TR>
+      <TD>\a sets</TD>
+      <TD>Sets in cache level</TD>
+    </TR>
+    <TR>
+      <TD>\a lineSize</TD>
+      <TD>Size of a cache line in cache level</TD>
+    </TR>
+    <TR>
+      <TD>\a size</TD>
+      <TD>Size in bytes of cache level</TD>
+    </TR>
+    <TR>
+      <TD>\a threads</TD>
+      <TD>Amount of threads sharing the cache</TD>
+    </TR>
+    <TR>
+      <TD>\a inclusive</TD>
+      <TD>Inclusiveness of cache</TD>
+    </TR>
+    <TR>
+      <TD>\a type</TD>
+      <TD>
+        <TABLE>
+        <TR>
+          <TH>Typename</TH>
+          <TH>Comment</TH>
+        </TR>
+        <TR>
+          <TD>DATACACHE</TD>
+          <TD>Cache manages only data</TD>
+        </TR>
+        <TR>
+          <TD>INSTRUCTIONCACHE</TD>
+          <TD>Cache manages only instructions</TD>
+        </TR>
+        <TR>
+          <TD>UNIFIEDCACHE</TD>
+          <TD>Cache manages data and instructions</TD>
+        </TR>
+        <TR>
+          <TD>ITLB</TD>
+          <TD>Translation Lookaside Buffer for instruction page addresses</TD>
+        </TR>
+        <TR>
+          <TD>DTLB</TD>
+          <TD>Translation Lookaside Buffer for data page addresses</TD>
+        </TR>
+        <TR>
+          <TD>NOCACHE</TD>
+          <TD>Type cannot be determined</TD>
+        </TR>
+        </TABLE>
+      </TD>
+    </TR>
+    </TABLE>
+    </TD>
+</TR>
+<TR>
+  <TD>\a topologyTree</TD>
+  <TD><TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>\a ID</TD>
+      <TD>ID of socket</TD>
+    </TR>
+    <TR>
+      <TD>\a Childs</TD>
+      <TD><TABLE>
+        <TR>
+            <TH>Membername</TH>
+            <TH>Comment</TH>
+        </TR>
+        <TR>
+            <TD>\a ID</TD>
+            <TD>ID of CPU core</TD>
+        </TR>
+        <TR>
+            <TD>\a Childs</TD>
+            <TD>List of thread IDs for the current CPU core</TD>
+        </TR>
+      </TABLE></TD>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+
+<H1>Function definitions for Lua CPU information module in the Lua API</H1>
+\anchor getCpuInfo
+<H2>getCpuInfo()</H2>
+<P>Get basic information about the CPUs in the system</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Cpu Info \ref lua_cpuinfo</TD>
+</TR>
+</TABLE>
+
+\anchor getCpuTopology
+<H2>getCpuTopology()</H2>
+<P>Get the topology information about the CPUs in the system</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Return</TD>
+  <TD>Cpu Topology \ref lua_cputopo</TD>
+</TR>
+</TABLE>
+
+<H2>putTopology()</H2>
+<P>Frees C struct CpuInfo and CpuTopology. You can still use the lua_cpuinfo and lua_cputopo data structures<BR>If you call \ref getCpuInfo or \ref getCpuTopology functions again after calling this function, the topology information will be read again.</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Return</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor cpustr_to_cpulist
+<H2>cpustr_to_cpulist(cpuexpression)</H2>
+<P>Resolve the given CPU expression string to a list of CPUs as available in the system</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuexpression</TD>
+      <TD>CPU expression string. Look at \ref likwid-pin for possible formats</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Return</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a nrCPUs</TD>
+      <TD>Number of CPUs in the \a cpulist</TD>
+    </TR>
+    <TR>
+      <TD>\a cpulist</TD>
+      <TD>List containing the CPU IDs after resolution of the cpu expression</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+<H2>printSupportedCPUs()</H2>
+<P>Print all Intel and AMD CPU types that are supported by Likwid</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Return</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+*/
+
+
+/*! \page lua_NumaInfo NUMA memory topology module
+
+<H1>Data type definition for Lua NUMA topology module in the Lua API</H1>
+\anchor lua_numainfo
+<H2>NUMA Info</H2>
+<P>This structure is returned by \ref getNumaInfo function<BR>It is similar to the C struct NumaTopology</P>
+<TABLE>
+<TR>
+  <TH>Membername</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>\a numberOfNodes</TD>
+  <TD>Amount of NUMA nodes in the system</TD>
+</TR>
+<TR>
+  <TD>\a nodes</TD>
+    <TD><TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>id</TD>
+      <TD>ID of NUMA node</TD>
+    </TR>
+    <TR>
+      <TD>totalMemory</TD>
+      <TD>Total amount of memory in the NUMA domain</TD>
+    </TR>
+    <TR>
+      <TD>freeMemory</TD>
+      <TD>Free amount of memory in the NUMA domain</TD>
+    </TR>
+    <TR>
+      <TD>numberOfProcessors</TD>
+      <TD>Amount of CPUs in the NUMA domain</TD>
+    </TR>
+    <TR>
+      <TD>numberOfDistances</TD>
+      <TD>Amount of distances to local and remote NUMA nodes</TD>
+    </TR>
+    <TR>
+      <TD>processors</TD>
+      <TD>List of CPU IDs in the NUMA domain</TD>
+    </TR>
+    <TR>
+      <TD>distances</TD>
+      <TD>Two dimensional list of distances to NUMA nodes in the system</TD>
+    </TR>
+    </TABLE></TD>
+</TR>
+</TABLE>
+
+<H1>Function definitions for Lua NUMA topology module in the Lua API</H1>
+\anchor getNumaInfo
+<H2>getNumaInfo()</H2>
+<P>Get information about the NUMA domains in the system</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>NUMA Info \ref lua_numainfo</TD>
+</TR>
+</TABLE>
+
+
+<H2>putNumaInfo()</H2>
+<P>Frees C struct NumaTopology. You can still use the lua_numainfo data structure<BR>If you call \ref getNumaInfo function again after calling this function, the NUMA topology information will be read again.</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Return</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+<H2>setMemInterleaved(nrThreads, threads2Cpus)</H2>
+<P>Set the 'Interleaved' memory policy to allocate data only on given CPUs</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a nrThreads</TD>
+      <TD>Amount of threads in the \a threads2Cpus list</TD>
+    </TR>
+    <TR>
+      <TD>\a threads2Cpus</TD>
+      <TD>List of thread to CPU relations</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Return</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+<H2>nodestr_to_nodelist(nodeexpression)</H2>
+<P>Resolve the given node expression in NUMA affinity domain</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a nodeexpression</TD>
+      <TD>List of CPUs in NUMA node</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Return</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a nrThreads</TD>
+      <TD>Amount of threads in the \a threads2Cpus list</TD>
+    </TR>
+    <TR>
+      <TD>\a threads2Cpus</TD>
+      <TD>List of thread to CPU relations</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+<H2>sockstr_to_socklist(socketexpression)</H2>
+<P>Resolve the given socket expression in socket affinity domain</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a socketexpression</TD>
+      <TD>List of CPUs in socket affinity domain</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Return</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a nrThreads</TD>
+      <TD>Amount of threads in the \a threads2Cpus list</TD>
+    </TR>
+    <TR>
+      <TD>\a threads2Cpus</TD>
+      <TD>List of thread to CPU relations</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+*/
+
+/*! \page lua_AffinityInfo Thread affinity module
+
+<H1>Data type definition for Lua thread affinity module in the Lua API</H1>
+\anchor lua_affinityinfo
+<H2>Affinity Info</H2>
+<P>This structure is returned by \ref getAffinityInfo function<BR>It is similar to the C struct AffinityDomains</P>
+<TABLE>
+<TR>
+  <TH>Membername</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>\a numberOfAffinityDomains</TD>
+  <TD>Total amount of affinity domains in the system</TD>
+</TR>
+<TR>
+  <TD>\a numberOfSocketDomains</TD>
+  <TD>Amount of affinity domains for CPU sockets in the system</TD>
+</TR>
+<TR>
+  <TD>\a numberOfNumaDomains</TD>
+  <TD>Amount of affinity domains for NUMA domains in the system</TD>
+</TR>
+<TR>
+  <TD>\a numberOfCacheDomains</TD>
+  <TD>Amount of affinity domains for LLC domains in the system</TD>
+</TR>
+<TR>
+  <TD>\a numberOfProcessorsPerSocket</TD>
+  <TD>Amount of hardware threads for each CPU socket in the system</TD>
+</TR>
+<TR>
+  <TD>\a numberOfCoresPerCache</TD>
+  <TD>Amount of physical CPU cores for each LLC in the system</TD>
+</TR>
+<TR>
+  <TD>\a numberOfProcessorsPerCache</TD>
+  <TD>Amount of hardware threads for each LLC in the system</TD>
+</TR>
+<TR>
+  <TD>\a domains</TD>
+    <TD><TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>tag</TD>
+      <TD>Tag identifying the affinity domain</TD>
+    </TR>
+    <TR>
+      <TD>numberOfCores</TD>
+      <TD>Amount of physical CPU cores in the affinity domain</TD>
+    </TR>
+    <TR>
+      <TD>numberOfProcessors</TD>
+      <TD>Amount of hardware threads in the affinity domain</TD>
+    </TR>
+    <TR>
+      <TD>processorList</TD>
+      <TD>List with hardware thread IDs that are in the affinity domain</TD>
+    </TR>
+    </TABLE></TD>
+</TR>
+</TABLE>
+<H1>Function definitions for Lua thread affinity module in the Lua API</H1>
+\anchor getAffinityInfo
+<H2>getAffinityInfo()</H2>
+<P>Get information about the affinity domains in the system</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Affinity Info \ref lua_affinityinfo</TD>
+</TR>
+</TABLE>
+<H2>putAffinityInfo()</H2>
+<P>Frees C struct AffinityDomains. You can still use the lua_affinityinfo data structure<BR>If you call \ref getAffinityInfo function again after calling this function, the thread affinity information will be read again.</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+\anchor pinProcess
+<H2>pinProcess(cpuID, silent)</H2>
+<P>Pins the current process to the given CPU ID</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuID</TD>
+      <TD>CPU to pin the process on</TD>
+    </TR>
+    <TR>
+      <TD>\a silent</TD>
+      <TD>Verbosity of pinning method</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+*/
+
+
+/*! \page lua_Perfmon Performance monitoring module
+<H1>Data type definition for Lua performance monitoring module in the Lua API</H1>
+\anchor lua_counterinfo
+<H2>Event and Counter Info</H2>
+<P>This structure is returned by \ref getEventsAndCounters function</P>
+<TABLE>
+<TR>
+  <TH>Membername</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>\a Counters</TD>
+  <TD><TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>Name</TD>
+      <TD>Counter name as used by LIKWID</TD>
+    </TR>
+    <TR>
+      <TD>Index</TD>
+      <TD>Index of counter definition in internal list of counters</TD>
+    </TR>
+    <TR>
+      <TD>Type</TD>
+      <TD>ID number of counter type, use TypeName to get a human-readable name</TD>
+    </TR>
+    <TR>
+      <TD>TypeName</TD>
+      <TD>Name of counter type</TD>
+    </TR>
+    <TR>
+      <TD>Options</TD>
+      <TD>String with the options available for the counter</TD>
+    </TR>
+    </TABLE></TD>
+</TR>
+<TR>
+  <TD>\a Events</TD>
+  <TD><TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>Name</TD>
+      <TD>Event name as used by LIKWID</TD>
+    </TR>
+    <TR>
+      <TD>ID</TD>
+      <TD>Event ID as defined by CPU vendor</TD>
+    </TR>
+    <TR>
+      <TD>Umask</TD>
+      <TD>Umask further restricting the event defined by ID</TD>
+    </TR>
+    <TR>
+      <TD>Limit</TD>
+      <TD>String containing the name(s) of registers the event can be programmed on</TD>
+    </TR>
+    <TR>
+      <TD>Options</TD>
+      <TD>String with the options available for the event</TD>
+    </TR>
+    </TABLE></TD>
+</TR>
+</TABLE>
+
+\anchor lua_groupdata
+<H2>Info about a performance group</H2>
+<P>This structure is returned by \ref get_groupdata function</P>
+<TABLE>
+<TR>
+  <TH>Membername</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>EventString</TD>
+  <TD>Event set used for the performance group. Well formatted for \ref addEventSet function</TD>
+</TR>
+<TR>
+  <TD>GroupString</TD>
+  <TD>Name of the performance group</TD>
+</TR>
+<TR>
+  <TD>LongDescription</TD>
+  <TD>Description of the group. The 'LONG' section in the performance group file</TD>
+</TR>
+<TR>
+  <TD>\a Events</TD>
+  <TD><TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>Event ID</TD>
+      <TD><TABLE>
+      <TR>
+        <TD>\a Event</TD>
+        <TD>Name of event</TD>
+      </TR>
+      <TR>
+        <TD>\a Counter</TD>
+        <TD>LIKWID's name of the counter register</TD>
+      </TR>
+      </TABLE></TD>
+    </TR>
+    </TABLE></TD>
+</TR>
+<TR>
+  <TD>\a Metrics</TD>
+  <TD><TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>Metric ID</TD>
+      <TD><TABLE>
+      <TR>
+        <TD>\a description</TD>
+        <TD>Descriptive information of the metric</TD>
+      </TR>
+      <TR>
+        <TD>\a formula</TD>
+        <TD>Formula for calculating the metrics value</TD>
+      </TR>
+      </TABLE></TD>
+    </TR>
+    </TABLE></TD>
+</TR>
+</TABLE>
+
+
+\anchor lua_pcidevinfo
+<H2>Info about online PCI devices used for performance monitoring</H2>
+<P>This structure is returned by \ref getOnlineDevices function</P>
+<TABLE>
+<TR>
+  <TH>Membername</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>\a Name (used by LIKWID)</TD>
+  <TD><TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>Name</TD>
+      <TD>Name of PCI device</TD>
+    </TR>
+    <TR>
+      <TD>Path</TD>
+      <TD>Path to PCI device</TD>
+    </TR>
+    <TR>
+      <TD>Type</TD>
+      <TD>Human-readable name of the PCI device type</TD>
+    </TR>
+    <TR>
+      <TD>TypeDescription</TD>
+      <TD>Description about the PCI device</TD>
+    </TR>
+    </TABLE></TD>
+</TR>
+</TABLE>
+
+<H1>Function definitions for Lua performance monitoring module in the Lua API</H1>
+\anchor init
+<H2>init(nrThreads, thread2Cpus)</H2>
+<P>Initializes the Perfmon module of LIKWID, like opening the MSR files and check the PCI devices<BR>If in access daemon mode, a single daemon instance is started to forward measurements on all given CPUs</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a nrThreads</TD>
+      <TD>Number of CPUs that should be measured</TD>
+    </TR>
+    <TR>
+      <TD>\a thread2Cpus</TD>
+      <TD>List with length \a nrThreads containing the relation between thread number and measured CPU</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor addEventSet
+<H2>addEventSet(eventSet)</H2>
+<P>Creates the internal management structures for the given event set. Checks the registers and if needed PCI device access<BR>The \ref init function has to be called previously</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a eventSet</TD>
+      <TD>String composed of all events in the event set. Format is Event1:Counter1(:Option11:Options12:...),Event2:Counter2...</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>The group ID of the added event set</TD>
+</TR>
+</TABLE>
+
+
+\anchor setupCounters
+<H2>setupCounters(groupID)</H2>
+<P>Setup the config registers to measure the events defined by group</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groupID</TD>
+      <TD>ID of group returned by \ref addEventSet function.</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor startCounters
+<H2>startCounters()</H2>
+<P>Starts the perfmon group previously set up with \ref setupCounters function.</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor stopCounters
+<H2>stopCounters()</H2>
+<P>Stops the perfmon group and reads the counters into the internal result section. Use the \ref getResult or \ref getResults functions to get the results.</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor readCounters
+<H2>readCounters()</H2>
+<P>Reads the perfmon group into the internal result section. Use the \ref getResult or \ref getResults functions to get the results.<BR>The counters will be stopped shortly and started after reading to exclude the LIKWID code from measurements.</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor switchGroup
+<H2>switchGroup(newgroup)</H2>
+<P>Switches the currently active group in the perfmon module. If the given group ID does not exist, it falls back to group ID 1.</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a newgroup</TD>
+      <TD>Switch active group to \a newgroup</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor finalize
+<H2>finalize()</H2>
+<P>Destroy internal structures and clean all used registers</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Always 0</TD>
+</TR>
+</TABLE>
+
+\anchor getResult
+<H2>getResult(groupID, eventID, threadID)</H2>
+<P>Get result for a group, event, thread combination. All options must be given</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groupID</TD>
+      <TD>Return result from group defined by \a groupID</TD>
+    </TR>
+    <TR>
+      <TD>\a eventID</TD>
+      <TD>Return result for event with \a eventID. Position in string given to \ref addEventSet function</TD>
+    </TR>
+    <TR>
+      <TD>\a threadID</TD>
+      <TD>Return result for thread with \a threadID as defined by the \a thread2Cpus input parameter for \ref init function</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Result</TD>
+</TR>
+</TABLE>
+
+\anchor getResults
+<H2>getResults()</H2>
+<P>Get all results for all group, event, thread combinations</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Three-dimensional list with results. First dim. is groups, second dim. is events and third dim. are the threads</TD>
+</TR>
+</TABLE>
+
+\anchor getMarkerResults
+<H2>getMarkerResults(filename, group_list, num_cpus)</H2>
+<P>Get the results for an output file written by \ref MarkerAPI</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a filename</TD>
+      <TD>Filename written by \ref MarkerAPI</TD>
+    </TR>
+    <TR>
+      <TD>\a group_list</TD>
+      <TD>List of defined groups</TD>
+    </TR>
+    <TR>
+      <TD>\a num_cpus</TD>
+      <TD>Amount of defined CPUs. Is just used for checking if the \ref MarkerAPI run is valid. If LIKWID_MARKER_THREADINIT is not called properly the tests will fail</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Four-dimensional list with results. First dim. is groups, second dim. is management regions, and third dim. are the events and fourth dim. are the threads</TD>
+</TR>
+</TABLE>
+
+\anchor getEventsAndCounters
+<H2>getEventsAndCounters()</H2>
+<P>Get a list containing all event and counter definitions</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Event and counter info like \ref lua_counterinfo</TD>
+</TR>
+</TABLE>
+
+\anchor getOnlineDevices
+<H2>getOnlineDevices()</H2>
+<P>Get a list containing all online PCI devices</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>PCI device info like \ref lua_pcidevinfo</TD>
+</TR>
+</TABLE>
+
+\anchor getNumberOfGroups
+<H2>getNumberOfGroups()</H2>
+<P>Returns the number of event sets (groups) added to the perfmon module</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Amount of configured groups</TD>
+</TR>
+</TABLE>
+
+\anchor getIdOfActiveGroup
+<H2>getIdOfActiveGroup()</H2>
+<P>Returns the ID of the currently active group</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>ID of active group</TD>
+</TR>
+</TABLE>
+
+\anchor getRuntimeOfGroup
+<H2>getRuntimeOfGroup(groupID)</H2>
+<P>Returns the measurement time of the given groupID</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groupID</TD>
+      <TD>Return the measurement time for group defined by \a groupID</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Measurement time of group</TD>
+</TR>
+</TABLE>
+
+\anchor getNumberOfEvents
+<H2>getNumberOfEvents(groupID)</H2>
+<P>Returns the amount of events for the given groupID</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groupID</TD>
+      <TD>Return the amount of events for group defined by \a groupID</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Amount of events in group</TD>
+</TR>
+</TABLE>
+
+\anchor getNumberOfThreads
+<H2>getNumberOfThreads()</H2>
+<P>Returns the number of threads as given to \ref init function</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Amount of measurement threads</TD>
+</TR>
+</TABLE>
+
+\anchor get_groups
+<H2>get_groups()</H2>
+<P>Returns a list of all performance groups in \a groupfolder</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a numberOfGroups</TD>
+      <TD>Amount of groups in \a groupfolder for given \a architecture</TD>
+    </TR>
+    <TR>
+      <TD>\a groups</TD>
+      <TD>List with the names of all performance groups in \a groupfolder for given \a architecture</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+\anchor get_groupdata
+<H2>get_groupdata(group)</H2>
+<P>Read in the performance group \a group</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a group</TD>
+      <TD>Get group data for \a group </TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groupdata</TD>
+      <TD>Structure with all group information found for the performance group \a group, see \ref lua_groupdata</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+*/
+
+/*! \page lua_PowerInfo Power and Energy monitoring module
+<H1>Data type definition for Lua power and energy monitoring module in the Lua API</H1>
+\anchor lua_powerinfo
+<H2>Power Information</H2>
+<P>This structure is returned by \ref getPowerInfo function<BR>The nested list structure is almost similar to the C struct PowerInfo.</P>
+<TABLE>
+<TR>
+  <TH>Membername</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>\a hasRAPL</TD>
+  <TD>If set, the system supports power readings through the RAPL interface</TD>
+</TR>
+<TR>
+  <TD>\a baseFrequency</TD>
+  <TD>Nominal clock frequency of the system</TD>
+</TR>
+<TR>
+  <TD>\a minFrequency</TD>
+  <TD>Minimal supported clock frequency of the system</TD>
+</TR>
+<TR>
+  <TD>\a powerUnit</TD>
+  <TD>Multiplier for power readings</TD>
+</TR>
+<TR>
+  <TD>\a timeUnit</TD>
+  <TD>Multiplier for time readings from RAPL</TD>
+</TR>
+<TR>
+  <TD>\a turbo</TD>
+    <TD>
+    <TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>\a numSteps</TD>
+      <TD>Amount of turbo mode steps</TD>
+    </TR>
+    <TR>
+      <TD>\a steps</TD>
+      <TD>List containing the turbo mode steps</TD>
+    </TR>
+    </TABLE></TD>
+</TR>
+<TR>
+  <TD>\a domains</TD>
+    <TD>
+    <TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>\a RAPL domain</TD>
+      <TD>
+        <TABLE>
+        <TR>
+          <TH>Typename</TH>
+          <TH>comment</TH>
+        </TR>
+        <TR>
+          <TD>ID</TD>
+          <TD>Type of domain (PKG, PP0, PP1, DRAM)</TD>
+        </TR>
+        <TR>
+          <TD>energyUnit</TD>
+          <TD>Multiplier for energy readings for RAPL domain</TD>
+        </TR>
+        <TR>
+          <TD>supportStatus</TD>
+          <TD>RAPL domain has a status register to read energy values</TD>
+        </TR>
+        <TR>
+          <TD>supportPerf</TD>
+          <TD>RAPL domain has a perf register</TD>
+        </TR>
+        <TR>
+          <TD>supportPolicy</TD>
+          <TD>RAPL domain has a policy register to define a global energy policy</TD>
+        </TR>
+        <TR>
+          <TD>supportLimit</TD>
+          <TD>RAPL domain has a limit register to define a limit for the energy consumption</TD>
+        </TR>
+        <TR>
+          <TD>supportInfo</TD>
+          <TD>RAPL domain has an info register providing the thermal design power, the minimal and maximal power consumption and the maximal time window</TD>
+        </TR>
+        <TR>
+          <TD>tdp</TD>
+          <TD>Thermal Design Power<BR>Only if supportInfo is set</TD>
+        </TR>
+        <TR>
+          <TD>minPower</TD>
+          <TD>Minimal power consumption for the RAPL domain<BR>Only if supportInfo is set</TD>
+        </TR>
+        <TR>
+          <TD>maxPower</TD>
+          <TD>Maximal power consumption for the RAPL domain<BR>Only if supportInfo is set</TD>
+        </TR>
+        <TR>
+          <TD>maxTimeWindow</TD>
+          <TD>Maximal duration between updates of the RAPL status registers<BR>Only if supportInfo is set</TD>
+        </TR>
+        </TABLE>
+        </TD>
+    </TR>
+    </TABLE>
+    </TD>
+</TR>
+</TABLE>
+<H1>Function definitions for Lua power and energy monitoring module in the Lua API</H1>
+\anchor getPowerInfo
+<H2>getPowerInfo()</H2>
+<P>Get information about the RAPL interface in the system</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Power Info \ref lua_powerinfo</TD>
+</TR>
+</TABLE>
+\anchor putPowerInfo
+<H2>putPowerInfo()</H2>
+<P>Frees C struct PowerInfo. You can still use the lua_powerinfo data structure<BR>If you call \ref getPowerInfo function again after calling this function, the power information struct will be filled again.</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor startPower
+<H2>startPower(cpuID, domainID)</H2>
+<P>Start measuring given RAPL domain on given CPU</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuID</TD>
+      <TD>Start the power measurement on CPU \a cpuID</TD>
+    </TR>
+    <TR>
+      <TD>\a domainID</TD>
+      <TD>Start the power measurement for domain domainID<BR>Possible values: 0=PKG, 1=PP0, 2=PP1, 3=DRAM</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Power value at start</TD>
+</TR>
+</TABLE>
+
+\anchor stopPower
+<H2>stopPower(cpuID, domainID)</H2>
+<P>Stop measuring given RAPL domain on given CPU</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuID</TD>
+      <TD>Stop the power measurement on CPU \a cpuID</TD>
+    </TR>
+    <TR>
+      <TD>\a domainID</TD>
+      <TD>Stop the power measurement for domain domainID<BR>Possible values: 0=PKG, 1=PP0, 2=PP1, 3=DRAM</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Power value at stop</TD>
+</TR>
+</TABLE>
+
+
+\anchor printEnergy
+<H2>printEnergy(before, after, domainID)</H2>
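+<P>Evaluate the energy consumed by the given RAPL domain between a \ref startPower and a \ref stopPower reading</P>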
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a before</TD>
+      <TD>Result from \ref startPower function</TD>
+    </TR>
+    <TR>
+      <TD>\a after</TD>
+      <TD>Result from \ref stopPower function</TD>
+    </TR>
+    <TR>
+      <TD>\a domainID</TD>
+      <TD>Print the power result for domain domainID<BR>Possible values: 0=PKG, 1=PP0, 2=PP1, 3=DRAM</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Energy consumed between \a before and \a after</TD>
+</TR>
+</TABLE>
+
+\anchor limitGet
+<H2>limitGet(cpuID, domainID) (EXPERIMENTAL)</H2>
+<P>Get the current limit in the limit register of domain. The limit is defined as maximal power consumption in a time window</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuID</TD>
+      <TD>Get limit for CPU \a cpuID</TD>
+    </TR>
+    <TR>
+      <TD>\a domainID</TD>
+      <TD>Get limit for domain domainID<BR>Possible values: 0=PKG, 1=PP0, 2=PP1, 3=DRAM</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a power</TD>
+      <TD>Power limit value</TD>
+    </TR>
+    <TR>
+      <TD>\a time</TD>
+      <TD>Duration of time window</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+
+\anchor limitSet
+<H2>limitSet(cpuID, domainID, power, time, clamp) (EXPERIMENTAL)</H2>
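+<P>Set the limit in the limit register of domain. The limit is defined as maximal power consumption in a time window</P>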
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuID</TD>
+      <TD>Set limit for CPU \a cpuID</TD>
+    </TR>
+    <TR>
+      <TD>\a domainID</TD>
+      <TD>Set limit for domain domainID<BR>Possible values: 0=PKG, 1=PP0, 2=PP1, 3=DRAM</TD>
+    </TR>
+    <TR>
+      <TD>\a power</TD>
+      <TD>Set power value to \a power</TD>
+    </TR>
+    <TR>
+      <TD>\a time</TD>
+      <TD>Set time window value to \a time</TD>
+    </TR>
+    <TR>
+      <TD>\a clamp</TD>
+      <TD>Should the limit be clamped or can it sometimes exceed the power limit if in total the limit is satisfied</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor limitState
+<H2>limitState(cpuID, domainID) (EXPERIMENTAL)</H2>
+<P>Get the state of the limit</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuID</TD>
+      <TD>Get the state on CPU \a cpuID</TD>
+    </TR>
+    <TR>
+      <TD>\a domainID</TD>
+      <TD>Get the state for domain domainID<BR>Possible values: 0=PKG, 1=PP0, 2=PP1, 3=DRAM</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>State, 0 for off, 1 for on</TD>
+</TR>
+</TABLE>
+*/
+
+/*! \page lua_ThermalInfo Thermal monitoring module
+<H1>Data type definition for Lua thermal monitoring module in the Lua API</H1>
+<H1>Function definitions for Lua thermal monitoring module in the Lua API</H1>
+\anchor initTemp
+<H2>initTemp(cpuID)</H2>
+<P>Initialize the thermal measurements on given CPU</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuID</TD>
+      <TD>Initialize thermal readings on CPU \a cpuID</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor readTemp
+<H2>readTemp(cpuID)</H2>
+<P>Measure the temperature on given CPU</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuID</TD>
+      <TD>Read the temperature on CPU \a cpuID</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Temperature</TD>
+</TR>
+</TABLE>
+*/
+
+/*! \page lua_Timer Time measurement module
+<H1>Data type definition for Lua time measurement module in the Lua API</H1>
+<H1>Function definitions for Lua time measurement module in the Lua API</H1>
+\anchor getCpuClock
+<H2>getCpuClock()</H2>
+<P>Returns the nominal clock speed</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Clock speed in Hz</TD>
+</TR>
+</TABLE>
+
+\anchor startClock
+<H2>startClock()</H2>
+<P>Start the TSC clock</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Current timestamp</TD>
+</TR>
+</TABLE>
+
+\anchor stopClock
+<H2>stopClock()</H2>
+<P>Stop the TSC clock</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Current timestamp</TD>
+</TR>
+</TABLE>
+
+\anchor getClockCycles
+<H2>getClockCycles(start, stop)</H2>
+<P>Return the amount of cycles between start and stop timestamps</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a start</TD>
+      <TD>Start timestamp</TD>
+    </TR>
+    <TR>
+      <TD>\a stop</TD>
+      <TD>Stop timestamp</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Amount of cycles between start and stop</TD>
+</TR>
+</TABLE>
+
+\anchor getClock
+<H2>getClock(start, stop)</H2>
+<P>Return the time in seconds between start and stop timestamps</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a start</TD>
+      <TD>Start timestamp</TD>
+    </TR>
+    <TR>
+      <TD>\a stop</TD>
+      <TD>Stop timestamp</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Time in seconds between start and stop</TD>
+</TR>
+</TABLE>
+
+\anchor sleep
+<H2>sleep(usecs)</H2>
+<P>Sleep for specified amount of microseconds</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a usecs</TD>
+      <TD>Sleep for \a usecs microseconds</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Remaining time to sleep. >0 if sleep is interrupted</TD>
+</TR>
+</TABLE>
+
+
+*/
+
+/*! \page lua_MemSweep Memory sweeping module
+<H1>Data type definition for Lua memory sweeping module in the Lua API</H1>
+<H1>Function definitions for Lua memory sweeping module in the Lua API</H1>
+\anchor memSweep
+<H2>memSweep(nrThreads, Cpus)</H2>
+<P>Sweep the memory and LLC for given threads</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a nrThreads</TD>
+      <TD>Amount of threads in the \a Cpus list</TD>
+    </TR>
+    <TR>
+      <TD>\a Cpus</TD>
+      <TD>List with thread to CPU relations</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor memSweepDomain
+<H2>memSweepDomain(domainID)</H2>
+<P>Sweep the memory and LLC for a given NUMA domain</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a domainID</TD>
+      <TD>Sweep the memory and LLC at the NUMA domain specified by \a domainID</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+*/
+
+/*! \page lua_Misc Miscellaneous functions module
+<H1>Data type definition for Lua miscellaneous functions module in the Lua API</H1>
+<H1>Function definitions for Lua miscellaneous functions module in the Lua API</H1>
+\anchor startProgram
+<H2>startProgram(Exec)</H2>
+<P>Start an executable in a new thread</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a Exec</TD>
+      <TD>String containing the executable and its options</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>PID of newly created thread</TD>
+</TR>
+</TABLE>
+
+\anchor checkProgram
+<H2>checkProgram()</H2>
+<P>Check if the executable is running</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>True/False</TD>
+</TR>
+</TABLE>
+
+\anchor killProgram
+<H2>killProgram(PID)</H2>
+<P>Kill the executable with SIGTERM</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a PID</TD>
+      <TD>PID to send the SIGTERM signal</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+
+\anchor setenv
+<H2>setenv(Name, Value)</H2>
+<P>Set environment variable. Lua only provides getenv()</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a Name</TD>
+      <TD>Name of environment variable</TD>
+    </TR>
+    <TR>
+      <TD>\a Value</TD>
+      <TD>Value for the environment variable</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor getpid
+<H2>getpid()</H2>
+<P>Get the PID of the current process</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>PID number</TD>
+</TR>
+</TABLE>
+
+\anchor access
+<H2>access(Filepath, perm)</H2>
+<P>Check the file existence for a given filepath</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a Filepath</TD>
+      <TD>Name of Filepath to check</TD>
+    </TR>
+    <TR>
+      <TD>\a perm</TD>
+      <TD>Check for specified attribute<BR>r: read, w: write, x: executable, e: existence</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Result of the access check</TD>
+</TR>
+</TABLE>
+
+\anchor msr_available
+<H2>msr_available()</H2>
+<P>Check whether the msr files are available. Basically checks whether the msr kernel module is loaded properly</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>True/False</TD>
+</TR>
+</TABLE>
+
+\anchor gethostname
+<H2>gethostname()</H2>
+<P>Returns the hostname of the system in short format</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a Hostname</TD>
+      <TD>Hostname in short format</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+\anchor getjid
+<H2>getjid()</H2>
+<P>Returns the job ID if running in a batch environment. Basically reads the <CODE>PBS_JOBID</CODE> environment variable</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a Job ID</TD>
+      <TD>Job ID or 'X' if not in batch environment</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+\anchor getMPIrank
+<H2>getMPIrank()</H2>
+<P>Returns the MPI rank of the current process. Basically reads the <CODE>PMI_RANK</CODE> and <CODE>OMPI_COMM_WORLD_RANK</CODE> environment variables</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a MPI Rank</TD>
+      <TD>MPI rank or 'X' if not in MPI environment</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+*/
+
+
+/*! \page lua_InputOutput Input and output functions module
+<H1>Data type definition for Lua output functions module in the Lua API</H1>
+<H1>Function definitions for Lua output functions module in the Lua API</H1>
+\anchor getopt
+<H2>getopt(commandline, optionlist)</H2>
+<P>Read commandline parameters and split them to the given options. The version LIKWID uses was originally taken from the web but extended to handle short '-o' and long "--option" options. It returns an iterator for the commandline options.<BR>Basic usage:<BR></P>
+<CODE>
+for opt,arg in likwid.getopt(arg, {"n:","h"}) do<BR>
+    if (type(arg) == "string") then<BR>
+        local s,e = arg:find("-")<BR>
+        if s == 1 then<BR>
+            print(string.format("ERROR: Argmument %s to option -%s starts with invalid character -.", arg, opt))<BR>
+            print("ERROR: Did you forget an argument to an option?")<BR>
+            os.exit(1)<BR>
+        end<BR>
+    end<BR>
+    --parse options<BR>
+end<BR>
+</CODE><BR>
+The option 'n' takes an argument, as indicated by the trailing ':'. The option 'h' takes no argument; if it is found on the commandline, its argument is true. The type check on the argument is recommended to catch the case where an option that expects an argument is given without one.
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a commandline</TD>
+      <TD>Normally, Lua saves the commandline parameters in variable 'arg'</TD>
+    </TR>
+    <TR>
+      <TD>\a optionlist</TD>
+      <TD>List of options that should be recognized. Options with ':' as last character need an argument<BR>Example {"h","v","cpu:"}</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a option</TD>
+      <TD>Option string found on the commandline without leading '-'</TD>
+    </TR>
+    <TR>
+      <TD>\a argument</TD>
+      <TD>Argument to the \a option. If \a option does not require an argument, true or false is returned in \a argument</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+\anchor parse_time
+<H2>parse_time(timestr)</H2>
+<P>Parses time interval describing strings like 2s, 100ms or 250us</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a timestr</TD>
+      <TD>String describing a time interval</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a duration</TD>
+      <TD>Time string \a timestr resolved to usecs</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+\anchor printtable
+<H2>printtable(table)</H2>
+<P>Prints the given two dimensional table as fancy ASCII table. For CSV output use \ref printcsv</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a table</TD>
+      <TD>Two dimensional list with table entries. First dim. are columns and second dim. the lines</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor printcsv
+<H2>printcsv(table)</H2>
+<P>Prints the given two dimensional table in CSV format. For ASCII table output see \ref printtable</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a table</TD>
+      <TD>Two dimensional list with table entries. First dim. are columns and second dim. the lines</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor stringsplit
+<H2>stringsplit(str, sSeparator,( nMax, bRegexp))</H2>
+<P>Splits the given string at separating character</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a str</TD>
+      <TD>String to split</TD>
+    </TR>
+    <TR>
+      <TD>\a sSeparator</TD>
+      <TD>String with separating character</TD>
+    </TR>
+    <TR>
+      <TD>\a nMax</TD>
+      <TD>Split string maximally \a nMax times (optional)</TD>
+    </TR>
+    <TR>
+      <TD>\a bRegexp</TD>
+      <TD>Lua RegEx string for separation (optional)</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>List of \a str split at \a sSeparator or \a bRegexp</TD>
+</TR>
+</TABLE>
+
+\anchor printOutput
+<H2>printOutput(groups, results, groupData, cpulist)</H2>
+<P>Prints results</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groups</TD>
+      <TD>List of groups for printing</TD>
+    </TR>
+    <TR>
+      <TD>\a results</TD>
+      <TD>List of results as returned by \ref getResults function</TD>
+    </TR>
+    <TR>
+      <TD>\a groupData</TD>
+      <TD>List of group data structures</TD>
+    </TR>
+    <TR>
+      <TD>\a cpulist</TD>
+      <TD>List of thread ID to CPU ID relations</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor print_markerOutput
+<H2>print_markerOutput(groups, results, groupData, cpulist)</H2>
+<P>Prints results of a Marker API run. This is different to \ref printOutput because we have to resolve the measurement regions</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groups</TD>
+      <TD>List of groups for printing</TD>
+    </TR>
+    <TR>
+      <TD>\a results</TD>
+      <TD>List of results as returned by \ref getMarkerResults function</TD>
+    </TR>
+    <TR>
+      <TD>\a groupData</TD>
+      <TD>List of group data structures</TD>
+    </TR>
+    <TR>
+      <TD>\a cpulist</TD>
+      <TD>List of thread ID to CPU ID relations</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+
+\anchor addSimpleAsciiBox
+<H2>addSimpleAsciiBox(container, lineIdx, colIdx, label)</H2>
+<P>Add a simple ASCII box with given label to box container. This function is only used by \ref likwid-topology</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a container</TD>
+      <TD>Box container containing all boxes</TD>
+    </TR>
+    <TR>
+      <TD>\a lineIdx</TD>
+      <TD>Add box at line index \a lineIdx</TD>
+    </TR>
+    <TR>
+      <TD>\a colIdx</TD>
+      <TD>Add box at column index \a colIdx</TD>
+    </TR>
+    <TR>
+      <TD>\a label</TD>
+      <TD>Content of the box</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor addJoinedAsciiBox
+<H2>addJoinedAsciiBox(container, lineIdx, startColIdx, endColIdx, label)</H2>
+<P>Add a joined ASCII box with given label to box container. Joined boxes can span the space of multiple simple boxes. This function is only used by \ref likwid-topology</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a container</TD>
+      <TD>Box container containing all boxes</TD>
+    </TR>
+    <TR>
+      <TD>\a lineIdx</TD>
+      <TD>Add box at line index \a lineIdx</TD>
+    </TR>
+    <TR>
+      <TD>\a startColIdx</TD>
+      <TD>Start joined box at column index \a startColIdx</TD>
+    </TR>
+    <TR>
+      <TD>\a endColIdx</TD>
+      <TD>End joined box at column index \a endColIdx</TD>
+    </TR>
+    <TR>
+      <TD>\a label</TD>
+      <TD>Content of the box</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor printAsciiBox
+<H2>printAsciiBox(container)</H2>
+<P>Print the box container previously filled with \ref addSimpleAsciiBox and \ref addJoinedAsciiBox. This function is only used by \ref likwid-topology</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a container</TD>
+      <TD>Box container containing all boxes</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+*/
diff --git a/examples/C-internalMarkerAPI.c b/examples/C-internalMarkerAPI.c
new file mode 100644
index 0000000..b5a0c4f
--- /dev/null
+++ b/examples/C-internalMarkerAPI.c
@@ -0,0 +1,152 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <omp.h>
+
+#include <likwid.h>
+
+
+void dummy() /* No-op helper; main() calls it behind an (A[i] < 0) check after each copy loop. */
+{
+    ;; /* two empty statements; presumably exists so the compiler cannot discard the measured loops — confirm */
+}
+
+int main(int argc, char* argv[])
+{ /* Example: measure the regions "Total", "Calc1" and "Calc2" with the LIKWID Marker API and print the results. */
+    int i, k;
+    char group[] = "L3"; /* exported as LIKWID_EVENTS below */
+    int gid = 0;
+    char cpulist[] = "0,1,2"; /* exported as LIKWID_THREADS below */
+    int cpus[3] =  {0,1,2}; /* CPU IDs used for pinning and for HPMaddThread() */
+    char filepath[] = "/tmp/test-marker.out"; /* exported as LIKWID_FILEPATH; read back and removed below */
+    char accessmode[] = "1"; /* exported as LIKWID_MODE */
+    double *A, *B;
+    size_t asize = 1024*1024; /* number of doubles in each of A and B */
+    
+
+    setenv("LIKWID_EVENTS", group, 1);
+    setenv("LIKWID_THREADS", cpulist, 1);
+    setenv("LIKWID_FILEPATH", filepath, 1);
+    setenv("LIKWID_MODE", accessmode, 1);
+    /* If the NMI watchdog is enabled or the application does not call
+     * perfmon_finalize(), e.g. because of some error, LIKWID will fail with
+     * a message "Counter in use". By setting LIKWID_FORCE you can overwrite
+     * the registers.
+     */
+    //setenv("LIKWID_FORCE", "1", 1);
+    
+    A = malloc(asize * sizeof(double));
+    if (A==NULL)
+        return 1;
+    B = malloc(asize * sizeof(double));
+    if (B==NULL)
+    {
+        free(A);
+        return 1;
+    }
+    for (i=0; i<asize;i++) /* fill the source array B; A is written by the measured loops below */
+        B[i] = ((double)i)+1.5;
+
+    /* This is only for showcase. If your application pins them already, you
+     * don't need this
+     */
+#pragma omp parallel
+{
+    likwid_pinThread(cpus[omp_get_thread_num()]); /* NOTE(review): cpus has 3 entries, so this assumes at most 3 OpenMP threads — confirm */
+}
+
+    /* Calls perfmon_init() and perfmon_addEventSet */
+    LIKWID_MARKER_INIT;
+    /* Setup and start manually. We use group ID 0, we can switch later */
+    perfmon_setupCounters(0);
+    perfmon_startCounters();
+
+    printf("Getting results during the measurements with LIKWID_MARKER_GET\n");
+#pragma omp parallel private(k,i)
+{
+    int nr_events = 20; /* element count of events[]; passed by address to LIKWID_MARKER_GET and printed afterwards */
+    double time = 0;
+    int count = 0;
+    double *events = malloc(nr_events * sizeof(double)); /* NOTE(review): result is not checked for NULL before memset() */
+    memset(events, 0, nr_events * sizeof(double)); /* NOTE(review): <string.h> is not included directly in this file — confirm it arrives via another header */
+    LIKWID_MARKER_START("Total"); /* "Total" is stopped only after the Calc2 loop, so it spans both loops */
+    for (k=0; k<10; k++)
+    {
+        
+        LIKWID_MARKER_START("Calc1");
+#pragma omp for
+        for (i=0; i< asize; i++)
+            A[i] = B[i];
+        if (A[i] < 0) dummy(); /* NOTE(review): executed after the loop, so i is not a valid index into A here — confirm */
+        LIKWID_MARKER_STOP("Calc1");
+    }
+    LIKWID_MARKER_GET("Calc1", &nr_events, events, &time, &count);
+    printf("Calc1 Thread %d got %d events, runtime %f s, call count %d\n", omp_get_thread_num(), nr_events, time, count);
+    nr_events = 20; /* reset to the size of events[] before the next LIKWID_MARKER_GET */
+    memset(events, 0, nr_events * sizeof(double));
+    for (k=0; k<10; k++)
+    {
+        LIKWID_MARKER_START("Calc2");
+#pragma omp for
+        for (i=0; i< asize; i++)
+            A[i] = A[i] + B[i];
+        if (A[i] < 0) dummy(); /* NOTE(review): same post-loop use of i as in the Calc1 loop — confirm */
+        LIKWID_MARKER_STOP("Calc2");
+    }
+    LIKWID_MARKER_STOP("Total");
+    LIKWID_MARKER_GET("Calc2", &nr_events, events, &time, &count);
+    printf("Calc2 Thread %d got %d events, runtime %f s, call count %d\n", omp_get_thread_num(), nr_events, time, count);
+    nr_events = 20; /* reset to the size of events[] before the next LIKWID_MARKER_GET */
+    memset(events, 0, nr_events * sizeof(double));
+    LIKWID_MARKER_GET("Total", &nr_events, events, &time, &count);
+    printf("Total Thread %d got %d events, runtime %f s, call count %d\n", omp_get_thread_num(), nr_events, time, count);
+    free(events);
+}
+
+
+
+    perfmon_stopCounters();
+    LIKWID_MARKER_CLOSE;
+
+    
+
+    perfmon_readMarkerFile(filepath); /* read back the marker file at the path exported as LIKWID_FILEPATH */
+    printf("\nMarker API measured %d regions\n", perfmon_getNumberOfRegions());
+    for (i=0;i<perfmon_getNumberOfRegions();i++) /* first pass: one summary line per region */
+    {
+        gid = perfmon_getGroupOfRegion(i);
+        printf("Region %s with %d events and %d metrics\n",perfmon_getTagOfRegion(i),
+                                                           perfmon_getEventsOfRegion(i),
+                                                           perfmon_getMetricsOfRegion(i));
+    }
+    printf("\nExample metrics output for thread 0\n");
+    
+    
+    for (i=0;i<perfmon_getNumberOfRegions();i++) /* second pass: per-region event and metric values, thread index 0 */
+    {
+        printf("Region %s\n", perfmon_getTagOfRegion(i)); /* NOTE(review): gid used below still holds the group of the last region from the loop above */
+        for (k=0;k<perfmon_getEventsOfRegion(i);k++)
+            printf("Event %s:%s: %f\n", perfmon_getEventName(gid, k),
+                                        perfmon_getCounterName(gid, k),
+                                        perfmon_getResultOfRegionThread(i, k, 0));
+        for (k=0;k<perfmon_getNumberOfMetrics(gid);k++)
+            printf("Metric %s: %f\n", perfmon_getMetricName(gid, k),
+                                      perfmon_getMetricOfRegionThread(i, k, 0));
+        printf("\n");
+    }
+    remove(filepath); /* delete the marker file once its contents have been printed */
+    
+    /* Reinitialize access to HPM registers, LIKWID_MARKER_CLOSE closed the connection */
+    HPMinit();
+    for (i=0;i<3; i++) /* 3 matches the number of entries in cpus[] */
+        HPMaddThread(cpus[i]);
+    /* Finalize perfmon sets all used counters to zero and deletes marker results, so no
+       perfmon_destroyMarkerResults() required */
+    perfmon_finalize();
+    HPMfinalize();
+    free(A);
+    free(B);
+    return 0;
+
+}
diff --git a/examples/C-likwidAPI.c b/examples/C-likwidAPI.c
new file mode 100644
index 0000000..aa6ed4e
--- /dev/null
+++ b/examples/C-likwidAPI.c
@@ -0,0 +1,149 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  C-likwidAPI.c
+ *
+ *      Description:  Example how to use the LIKWID API in C/C++ applications
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:  Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <likwid.h>
+
+
+int main(int argc, char* argv[])
+{
+    int i, j;
+    int err;
+    int* cpus;
+    int gid;
+    double result = 0.0;
+    char estr[] = "L2_LINES_IN_ALL:PMC0,L2_TRANS_L2_WB:PMC1";
+    //perfmon_setVerbosity(3);
+    // Load the topology module and print some values.
+    err = topology_init();
+    if (err < 0)
+    {
+        printf("Failed to initialize LIKWID's topology module\n");
+        return 1;
+    }
+    // CpuInfo_t contains global information like name, CPU family, ...
+    CpuInfo_t info = get_cpuInfo();
+    // CpuTopology_t contains information about the topology of the CPUs.
+    CpuTopology_t topo = get_cpuTopology();
+    // Create affinity domains. Commonly only needed when reading Uncore counters
+    affinity_init();
+
+    printf("Likwid example on a %s with %d CPUs\n", info->name, topo->numHWThreads);
+
+    cpus = (int*)malloc(topo->numHWThreads * sizeof(int));
+    if (!cpus)
+        return 1;
+
+    for (i=0;i<topo->numHWThreads;i++)
+    {
+        cpus[i] = topo->threadPool[i].apicId;
+    }
+
+    // Must be called before perfmon_init() but only if you want to use another
+    // access mode as the pre-configured one. For direct access (0) you have to
+    // be root.
+    //accessClient_setaccessmode(0);
+
+    // Initialize the perfmon module.
+    err = perfmon_init(topo->numHWThreads, cpus);
+    if (err < 0)
+    {
+        printf("Failed to initialize LIKWID's performance monitoring module\n");
+        topology_finalize();
+        return 1;
+    }
+
+    // Add eventset string to the perfmon module.
+    gid = perfmon_addEventSet(estr);
+    if (gid < 0)
+    {
+        printf("Failed to add event string %s to LIKWID's performance monitoring module\n", estr);
+        perfmon_finalize();
+        topology_finalize();
+        return 1;
+    }
+
+    // Setup the eventset identified by group ID (gid).
+    err = perfmon_setupCounters(gid);
+    if (err < 0)
+    {
+        printf("Failed to setup group %d in LIKWID's performance monitoring module\n", gid);
+        perfmon_finalize();
+        topology_finalize();
+        return 1;
+    }
+    // Start all counters in the previously set up event set.
+    err = perfmon_startCounters();
+    if (err < 0)
+    {
+        printf("Failed to start counters for group %d for thread %d\n",gid, (-1*err)-1);
+        perfmon_finalize();
+        topology_finalize();
+        return 1;
+    }
+    // Perform something
+    sleep(10);
+    // Stop all counters in the previously started event set.
+    err = perfmon_stopCounters();
+    if (err < 0)
+    {
+        printf("Failed to stop counters for group %d for thread %d\n",gid, (-1*err)-1);
+        perfmon_finalize();
+        topology_finalize();
+        return 1;
+    }
+
+
+    // Print the result of every thread/CPU for all events in estr.
+    char* ptr = strtok(estr,",");
+    j = 0;
+    while (ptr != NULL)
+    {
+        for (i = 0;i < topo->numHWThreads; i++)
+        {
+            result = perfmon_getResult(gid, j, i);
+            printf("Measurement result for event set %s at CPU %d: %f\n", ptr, cpus[i], result);
+        }
+        ptr = strtok(NULL,",");
+        j++;
+    }
+
+
+    free(cpus);
+    // Uninitialize the perfmon module.
+    perfmon_finalize();
+    affinity_finalize();
+    // Uninitialize the topology module.
+    topology_finalize();
+    return 0;
+}
diff --git a/examples/C-markerAPI.c b/examples/C-markerAPI.c
new file mode 100644
index 0000000..84f97a4
--- /dev/null
+++ b/examples/C-markerAPI.c
@@ -0,0 +1,87 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  C-markerAPI.c
+ *
+ *      Description:  Example how to use the C/C++ Marker API
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:  Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <omp.h>
+#include <likwid.h>
+
+#define SLEEPTIME 2
+
+int main(int argc, char* argv[])
+{
+    // Init Marker API in serial region once in the beginning
+    LIKWID_MARKER_INIT;
+    #pragma omp parallel
+    {
+        // Each thread must add itself to the Marker API, therefore must be
+        // in parallel region
+        LIKWID_MARKER_THREADINIT;
+        // Optional. Register region name
+        LIKWID_MARKER_REGISTER("example");
+    }
+
+
+    #pragma omp parallel
+    {
+        // Result buffers are declared inside the parallel region so that every
+        // thread owns a private copy; shared ones would race in LIKWID_MARKER_GET.
+        int i, count;
+        int nevents = 10;
+        double events[10], time;
+        printf("Thread %d sleeps now for %d seconds\n", omp_get_thread_num(), SLEEPTIME);
+        // Start measurements inside a parallel region
+        LIKWID_MARKER_START("example");
+        // Insert your code here.
+        // Often contains an OpenMP for pragma. Regions can be nested.
+        sleep(SLEEPTIME);
+        // Stop measurements inside a parallel region
+        LIKWID_MARKER_STOP("example");
+        printf("Thread %d wakes up again\n", omp_get_thread_num());
+        // If multiple groups given, you can switch to the next group
+        LIKWID_MARKER_SWITCH;
+        // If you need the performance data inside your application, use
+        LIKWID_MARKER_GET("example", &nevents, events, &time, &count);
+        // where events is an array of doubles with nevents entries,
+        // time is a double* and count an int*.
+        printf("Region example measures %d events, total measurement time is %f\n", nevents, time);
+        printf("The region was called %d times\n", count);
+        for (i = 0; i < nevents; i++)
+        {
+            printf("Event %d: %f\n", i, events[i]);
+        }
+    }
+
+    // Close Marker API and write results to file for further evaluation done
+    // by likwid-perfctr
+    LIKWID_MARKER_CLOSE;
+    return 0;
+}
diff --git a/examples/F-markerAPI.F90 b/examples/F-markerAPI.F90
new file mode 100644
index 0000000..5e2ff4b
--- /dev/null
+++ b/examples/F-markerAPI.F90
@@ -0,0 +1,79 @@
+! =======================================================================================
+!
+!      Filename:  F-markerAPI.F90
+!
+!      Description:  Example how to use the Fortran90 Marker API
+!
+!      Version:   4.1
+!      Released:  19.5.2016
+!
+!      Author:  Thomas Roehl (tr), thomas.roehl at googlemail.com
+!      Project:  likwid
+!
+!      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+!
+!      This program is free software: you can redistribute it and/or modify it under
+!      the terms of the GNU General Public License as published by the Free Software
+!      Foundation, either version 3 of the License, or (at your option) any later
+!      version.
+!
+!      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+!      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+!      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+!
+!      You should have received a copy of the GNU General Public License along with
+!      this program.  If not, see <http://www.gnu.org/licenses/>.
+!
+! =======================================================================================
+
+#define SLEEPTIME 2
+
+program FmarkerAPI
+    use likwid
+    include "omp_lib.h"
+    INTEGER :: nr_events
+    DOUBLE PRECISION, DIMENSION(10) :: events
+    DOUBLE PRECISION :: time
+    INTEGER :: c
+    nr_events = 10
+    ! Init Marker API in serial region once in the beginning.
+    call likwid_markerInit()
+
+!$OMP PARALLEL
+    ! Each thread must add itself to the Marker API, therefore must be
+    ! in parallel region.
+    call likwid_markerthreadInit()
+    ! Optional. Register region name and initialize hash table entries.
+    call likwid_markerRegisterRegion("example")
+!$OMP END PARALLEL
+
+! Per-thread result buffers: shared ones would be written by all threads at once.
+!$OMP PARALLEL FIRSTPRIVATE(nr_events) PRIVATE(events, time, c)
+    print '(a,i0,a,i0,a)', "Thread ", omp_get_thread_num()," sleeps now for ", SLEEPTIME," seconds"
+    ! Start measurements inside a parallel region.
+    call likwid_markerStartRegion("example")
+    ! Insert your code here
+    ! Often contains an OpenMP for pragma. Regions can be nested.
+    call Sleep(SLEEPTIME)
+    ! Stop measurements inside a parallel region.
+    call likwid_markerStopRegion("example")
+    print '(a,i0,a)', "Thread ", omp_get_thread_num()," wakes up again"
+    ! If multiple groups given, you can switch to the next group.
+    call likwid_markerNextGroup();
+    ! If you need the performance data inside your application, use
+    call likwid_markerGetRegion("example", nr_events, events, time, c)
+    ! Events is an array of DOUBLE PRECISION with nr_events (INTEGER) entries,
+    ! time is a DOUBLE PRECISION and count an INTEGER.
+    ! After returning the events array contains maximally nr_events results.
+    print '(a,i0,a,f9.3)', "Region example measures ", nr_events, " events, total measurement time is ", time
+    print '(a,i0,a)', "The region was called ", c, " times"
+    do i=1,nr_events
+        print '(a,i0,a,e13.7)', "Event ",i,": ",events(i)
+    end do
+!$OMP END PARALLEL
+
+! Close Marker API and write results to file for further evaluation done
+! by likwid-perfctr.
+call likwid_markerClose()
+
+end program FmarkerAPI
diff --git a/examples/Lua-likwidAPI.lua b/examples/Lua-likwidAPI.lua
new file mode 100644
index 0000000..a77cdb8
--- /dev/null
+++ b/examples/Lua-likwidAPI.lua
@@ -0,0 +1,93 @@
+#!<PREFIX>/bin/likwid-lua
+--[[
+ * =======================================================================================
+
+ *
+ *      Filename:  Lua-likwidAPI.lua
+ *
+ *      Description:  Example how to use the LIKWID API in Lua scripts
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+
+package.path = package.path .. ';<PREFIX>/share/lua/?.lua'
+
+local likwid = require("likwid")
+
+EVENTSET = "INSTR_RETIRED_ANY:FIXC0"
+
+cpuinfo = likwid.getCpuInfo()
+cputopo = likwid.getCpuTopology()
+
+print(string.format("Likwid example on a %s with %d CPUs", cpuinfo.name, cputopo.numHWThreads))
+
+local cpus = {}
+for i, cpu in pairs(cputopo.threadPool) do
+    table.insert(cpus, cpu.apicId)
+end
+
+if likwid.init(#cpus, cpus) ~= 0 then
+    print("Failed to initialize LIKWID's performance monitoring module")
+    likwid.putTopology()
+    os.exit(1)
+end
+
+local gid = likwid.addEventSet(EVENTSET)
+if gid <= 0 then
+    print(string.format("Failed to add events %s to LIKWID's performance monitoring module", EVENTSET))
+    likwid.finalize()
+    likwid.putTopology()
+    os.exit(1)
+end
+
+
+if likwid.setupCounters(gid) < 0 then  -- a negative return is treated as failure
+    print(string.format("Failed to setup group %d in LIKWID's performance monitoring module", gid))
+    likwid.finalize()
+    likwid.putTopology()
+    os.exit(1)
+end
+if likwid.startCounters() < 0 then  -- same cleanup-and-exit pattern as above
+    print(string.format("Failed to start group %d in LIKWID's performance monitoring module", gid))
+    likwid.finalize()
+    likwid.putTopology()
+    os.exit(1)
+end
+-- Application code
+likwid.sleep(2)
+if likwid.stopCounters() < 0 then  -- report with print(); standard Lua has no printf()
+    print(string.format("Failed to stop group %d in LIKWID's performance monitoring module", gid))
+    likwid.finalize()
+    likwid.putTopology()
+    os.exit(1)
+end
+
+
+for i,cpu in pairs(cpus) do
+    result = likwid.getResult(gid, 1, i)
+    print(string.format("Measurement result for event set %s at CPU %d: %f", EVENTSET, cpu, result))
+end
+
+
+likwid.putTopology()
+likwid.finalize()
diff --git a/examples/Makefile b/examples/Makefile
new file mode 100644
index 0000000..cc21c3c
--- /dev/null
+++ b/examples/Makefile
@@ -0,0 +1,64 @@
+
+include ../config.mk
+include ../make/include_$(COMPILER).mk
+
+LIKWID_INCLUDE ?= -I$(PREFIX)/include
+LIKWID_LIB ?= -L$(PREFIX)/lib -llikwid
+
+all: C-markerAPI C-likwidAPI F-markerAPI Lua-likwidAPI C-markerAPI-run C-likwidAPI-run F-markerAPI-run Lua-likwidAPI-run
+
+help: # Print an overview of the example targets and how to build and run them
+	@echo "Help message for examples included in LIKWID"
+	@echo
+	@echo "This folder contains examples of how you can use the LIKWID API"
+	@echo "Possible examples are:"
+	@echo "- Marker API in C applications: C-markerAPI"
+	@echo "- Marker API in Fortran applications: F-markerAPI"
+	@echo "- Self Monitoring in C applications: C-likwidAPI"
+	@echo "- Using the LIKWID API in Lua scripts: Lua-likwidAPI"
+	@echo "- Monitoring a system with LIKWID: monitoring"
+	@echo
+	@echo "To build an example put the name behind make, e.g. make C-likwidAPI"
+	@echo "To run the built example append '-run' to the name and add it to make: make C-likwidAPI-run"
+
+C-markerAPI:
+	$(CC) -fopenmp -DLIKWID_PERFMON -I$(PREFIX)/include -L$(PREFIX)/lib C-markerAPI.c -o C-markerAPI -llikwid -lm
+
+C-markerAPI-run: C-markerAPI
+	$(PREFIX)/bin/likwid-perfctr -C 0 -g INSTR_RETIRED_ANY:FIXC0 -m ./C-markerAPI
+
+C-likwidAPI:
+	$(CC) -fopenmp -I$(PREFIX)/include -L$(PREFIX)/lib C-likwidAPI.c -o C-likwidAPI -llikwid -lm
+
+C-likwidAPI-run: C-likwidAPI
+	./C-likwidAPI
+
+C-internalMarkerAPI:
+	$(CC) -g -fopenmp -DLIKWID_PERFMON -I$(PREFIX)/include -L$(PREFIX)/lib C-internalMarkerAPI.c -o C-internalMarkerAPI -llikwid -lm
+
+C-internalMarkerAPI-run: C-internalMarkerAPI
+	OMP_NUM_THREADS=3 ./C-internalMarkerAPI
+
+monitoring:
+	$(CC) -I$(PREFIX)/include -L$(PREFIX)/lib monitoring.c -o monitoring -llikwid -lm
+
+monitoring-run: monitoring
+	./monitoring
+
+F-markerAPI: # same include/ and lib/ sub-directories as the C rules; presumably likwid.mod is installed under include/ - confirm
+	$(FC) -fopenmp -DLIKWID_PERFMON -I$(PREFIX)/include -L$(PREFIX)/lib F-markerAPI.F90 -o F-markerAPI -llikwid -lm
+
+F-markerAPI-run: F-markerAPI
+	$(PREFIX)/bin/likwid-perfctr -C 0 -g INSTR_RETIRED_ANY:FIXC0 -m ./F-markerAPI
+
+Lua-likwidAPI:
+	sed -e "s+<PREFIX>+$(PREFIX)+g" Lua-likwidAPI.lua > Lua-likwidAPI
+	chmod +x Lua-likwidAPI
+
+Lua-likwidAPI-run: Lua-likwidAPI
+	./Lua-likwidAPI
+
+clean:
+	rm -f C-markerAPI C-likwidAPI F-markerAPI Lua-likwidAPI monitoring C-internalMarkerAPI
+
+.PHONY: clean C-markerAPI C-likwidAPI F-markerAPI Lua-likwidAPI monitoring C-internalMarkerAPI
diff --git a/examples/monitoring.c b/examples/monitoring.c
new file mode 100644
index 0000000..ddddcb4
--- /dev/null
+++ b/examples/monitoring.c
@@ -0,0 +1,118 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <signal.h>
+#include <likwid.h>
+
+
+static int sleeptime = 1;
+
+static volatile sig_atomic_t run = 1; /* loop flag, cleared asynchronously by INThandler */
+/* Signal handler: ignore further deliveries of sig and ask the main loop to stop. */
+void  INThandler(int sig)
+{
+    signal(sig, SIG_IGN);
+    run = 0; /* volatile sig_atomic_t is the type a handler may portably write */
+}
+
+
+int main (int argc, char* argv[])
+{
+    int i, c, err = 0;
+    double timer = 0.0;
+    topology_init();
+    numa_init();
+    affinity_init();
+    timer_init();
+    CpuInfo_t cpuinfo = get_cpuInfo();
+    CpuTopology_t cputopo = get_cpuTopology();
+    int numCPUs = cputopo->activeHWThreads;
+    int* cpus = malloc(numCPUs * sizeof(int));
+    if (!cpus)
+    {
+        affinity_finalize();
+        numa_finalize();
+        topology_finalize();
+        return 1;
+    }
+    c = 0;
+    for (i=0;i<cputopo->numHWThreads;i++)
+    {
+        if (cputopo->threadPool[i].inCpuSet)
+        {
+            cpus[c] = cputopo->threadPool[i].apicId;
+            c++;
+        }
+    }
+    NumaTopology_t numa = get_numaTopology();
+    AffinityDomains_t affi = get_affinityDomains();
+    timer = timer_getCpuClock();
+    perfmon_init(numCPUs, cpus);
+    int gid1 = perfmon_addEventSet("L2");
+    if (gid1 < 0)
+    {
+        printf("Failed to add performance group L2\n");
+        err = 1;
+        goto monitor_exit;
+    }
+    int gid2 = perfmon_addEventSet("L3");
+    if (gid2 < 0)
+    {
+        printf("Failed to add performance group L3\n");
+        err = 1;
+        goto monitor_exit;
+    }
+    int gid3 = perfmon_addEventSet("ENERGY");
+    if (gid3 < 0)
+    {
+        printf("Failed to add performance group ENERGY\n");
+        err = 1;
+        goto monitor_exit;
+    }
+    signal(SIGINT, INThandler);
+    /* Measure each group for sleeptime seconds in turn until SIGINT clears run. */
+    while (run)
+    {
+        perfmon_setupCounters(gid1);
+        perfmon_startCounters();
+        sleep(sleeptime);
+        perfmon_stopCounters();
+        for (c = 0; c < numCPUs; c++) /* bounded by the size of cpus[] */
+        {
+            for (i = 0; i< perfmon_getNumberOfMetrics(gid1); i++)
+            {
+                printf("%s,cpu=%d %f\n", perfmon_getMetricName(gid1, i), cpus[c], perfmon_getLastMetric(gid1, i, c));
+            }
+        }
+        perfmon_setupCounters(gid2);
+        perfmon_startCounters();
+        sleep(sleeptime);
+        perfmon_stopCounters();
+        for (c = 0; c < numCPUs; c++)
+        {
+            for (i = 0; i< perfmon_getNumberOfMetrics(gid2); i++)
+            {
+                printf("%s,cpu=%d %f\n", perfmon_getMetricName(gid2, i), cpus[c], perfmon_getLastMetric(gid2, i, c));
+            }
+        }
+        perfmon_setupCounters(gid3);
+        perfmon_startCounters();
+        sleep(sleeptime);
+        perfmon_stopCounters();
+        for (c = 0; c < numCPUs; c++)
+        {
+            for (i = 0; i< perfmon_getNumberOfMetrics(gid3); i++)
+            {
+                printf("%s,cpu=%d %f\n", perfmon_getMetricName(gid3, i), cpus[c], perfmon_getLastMetric(gid3, i, c));
+            }
+        }
+    }
+monitor_exit:
+    free(cpus);
+    perfmon_finalize();
+    affinity_finalize();
+    numa_finalize();
+    topology_finalize();
+    return err; /* non-zero when adding a performance group failed */
+}
diff --git a/ext/hwloc/AUTHORS b/ext/hwloc/AUTHORS
new file mode 100644
index 0000000..837b27f
--- /dev/null
+++ b/ext/hwloc/AUTHORS
@@ -0,0 +1,8 @@
+Cédric Augonnet <Cedric.Augonnet at labri.fr>
+Jérôme Clet-Ortega <Jerome.Clet-Ortega at labri.fr>
+Ludovic Courtès <Ludovic.Courtes at inria.fr>
+Brice Goglin <Brice.Goglin at inria.fr>
+Nathalie Furmento <Nathalie.Furmento at labri.fr>
+Samuel Thibault <Samuel.Thibault at labri.fr>
+Jeff Squyres <jsquyres at cisco.com>
+Alexey Kardashevskiy <aik at au1.ibm.com>
diff --git a/ext/hwloc/COPYING b/ext/hwloc/COPYING
new file mode 100644
index 0000000..32128c7
--- /dev/null
+++ b/ext/hwloc/COPYING
@@ -0,0 +1,28 @@
+Copyright © 2009 CNRS
+Copyright © 2009 inria.  All rights reserved.
+Copyright © 2009 Université Bordeaux 1
+Copyright © 2009 Cisco Systems, Inc.  All rights reserved.
+Copyright © 2012 Blue Brain Project, EPFL. All rights reserved.
+See COPYING in top-level directory.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+3. The name of the author may not be used to endorse or promote products
+   derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/ext/hwloc/Makefile b/ext/hwloc/Makefile
new file mode 100644
index 0000000..1fd564c
--- /dev/null
+++ b/ext/hwloc/Makefile
@@ -0,0 +1,73 @@
+SRC_DIRS    = ./hwloc
+MAKE_DIR   = ../../make
+
+#DO NOT EDIT BELOW
+
+include ../../config.mk
+include $(MAKE_DIR)/include_$(COMPILER).mk
+include $(MAKE_DIR)/config_checks.mk
+include $(MAKE_DIR)/config_defines.mk
+
+CFLAGS    = -O2 -Wall -fPIC -fvisibility=hidden
+INCLUDES  += -I./include
+#DEFINES   =
+LIBS      = -L. -lm
+LFLAGS    = -fPIC -fvisibility=hidden
+Q         ?= @
+DEFINES := $(filter-out -DVERSION=$(VERSION),$(DEFINES))
+ifeq ($(DEBUG),true)
+DEBUG_FLAGS = -g
+else
+DEBUG_FLAGS =
+endif
+ifeq ($(COMPILER),MIC)
+CFLAGS += -mmic
+LFLAGS += -mmic
+endif
+ifeq ($(COMPILER),GCC)
+CFLAGS += -Wno-unused-result
+LFLAGS += -Wno-unused-result
+endif
+
+#CONFIGURE BUILD SYSTEM
+BUILD_DIR  = ./$(COMPILER)
+
+VPATH     = $(SRC_DIRS)
+FILES     = $(notdir $(foreach dir,$(SRC_DIRS),$(wildcard $(dir)/*.c)))
+OBJ       = $(patsubst %.c, $(BUILD_DIR)/%.o, $(FILES))
+LIBHWLOC = $(shell basename $(TARGET_HWLOC_LIB))
+
+CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES)
+
+all: $(BUILD_DIR) $(OBJ) $(LIBHWLOC)
+
+$(BUILD_DIR):
+	@mkdir $(BUILD_DIR)
+
+
+$(STATIC_LIBHWLOC): $(OBJ)
+	$(Q)${AR} -cq $(LIBHWLOC) $(OBJ)
+
+$(SHARED_LIBHWLOC): $(OBJ)
+	$(Q)$(CC) $(DEBUG_FLAGS) $(LFLAGS) -Wl,-soname,$(LIBHWLOC).$(VERSION).$(RELEASE)  -Wall -shared -fPIC -o $(LIBHWLOC) $(OBJ) $(LIBS) $(RPATHS)
+
+#PATTERN RULES
+$(BUILD_DIR)/%.o:  %.c
+	$(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(CPPFLAGS) $< -o $@
+	$(Q)$(CC) $(DEBUG_FLAGS) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
+
+ifeq ($(findstring $(MAKECMDGOALS),clean),)
+-include $(OBJ:.o=.d)
+endif
+
+.PHONY: clean distclean
+
+clean:
+	@rm -f $(TARGET) $(SHARED_LIBHWLOC) $(STATIC_LIBHWLOC) $(LIBHWLOC).$(VERSION).$(RELEASE) $(LIBHWLOC).$(VERSION)
+
+distclean: clean
+	@rm -f $(TARGET) $(SHARED_LIBHWLOC) $(STATIC_LIBHWLOC) $(LIBHWLOC).$(VERSION).$(RELEASE) $(LIBHWLOC).$(VERSION)
+	@rm -rf $(BUILD_DIR)
+
+
+
diff --git a/ext/hwloc/hwloc/base64.c b/ext/hwloc/hwloc/base64.c
new file mode 100644
index 0000000..7a3392f
--- /dev/null
+++ b/ext/hwloc/hwloc/base64.c
@@ -0,0 +1,306 @@
+/*
+ * Copyright © 2012 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ *
+ * Modifications after import:
+ * - removed all #if
+ * - updated prototypes
+ * - updated #include
+ */
+
+/*	$OpenBSD: base64.c,v 1.5 2006/10/21 09:55:03 otto Exp $	*/
+
+/*
+ * Copyright (c) 1996 by Internet Software Consortium.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM DISCLAIMS
+ * ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL INTERNET SOFTWARE
+ * CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
+ * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
+ * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+/*
+ * Portions Copyright (c) 1995 by International Business Machines, Inc.
+ *
+ * International Business Machines, Inc. (hereinafter called IBM) grants
+ * permission under its copyrights to use, copy, modify, and distribute this
+ * Software with or without fee, provided that the above copyright notice and
+ * all paragraphs of this notice appear in all copies, and that the name of IBM
+ * not be used in connection with the marketing of any product incorporating
+ * the Software or modifications thereof, without specific, written prior
+ * permission.
+ *
+ * To the extent it has a right to do so, IBM grants an immunity from suit
+ * under its patents, if any, for the use, sale or manufacture of products to
+ * the extent that such products are used for performing Domain Name System
+ * dynamic updates in TCP/IP networks by means of the Software.  No immunity is
+ * granted for any product per se or for any other function of any product.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", AND IBM DISCLAIMS ALL WARRANTIES,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE.  IN NO EVENT SHALL IBM BE LIABLE FOR ANY SPECIAL,
+ * DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE, EVEN
+ * IF IBM IS APPRISED OF THE POSSIBILITY OF SUCH DAMAGES.
+ */
+
+/* OPENBSD ORIGINAL: lib/libc/net/base64.c */
+
+static const char Base64[] =
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+static const char Pad64 = '=';
+
+/* (From RFC1521 and draft-ietf-dnssec-secext-03.txt)
+   The following encoding technique is taken from RFC 1521 by Borenstein
+   and Freed.  It is reproduced here in a slightly edited form for
+   convenience.
+
+   A 65-character subset of US-ASCII is used, enabling 6 bits to be
+   represented per printable character. (The extra 65th character, "=",
+   is used to signify a special processing function.)
+
+   The encoding process represents 24-bit groups of input bits as output
+   strings of 4 encoded characters. Proceeding from left to right, a
+   24-bit input group is formed by concatenating 3 8-bit input groups.
+   These 24 bits are then treated as 4 concatenated 6-bit groups, each
+   of which is translated into a single digit in the base64 alphabet.
+
+   Each 6-bit group is used as an index into an array of 64 printable
+   characters. The character referenced by the index is placed in the
+   output string.
+
+                         Table 1: The Base64 Alphabet
+
+      Value Encoding  Value Encoding  Value Encoding  Value Encoding
+          0 A            17 R            34 i            51 z
+          1 B            18 S            35 j            52 0
+          2 C            19 T            36 k            53 1
+          3 D            20 U            37 l            54 2
+          4 E            21 V            38 m            55 3
+          5 F            22 W            39 n            56 4
+          6 G            23 X            40 o            57 5
+          7 H            24 Y            41 p            58 6
+          8 I            25 Z            42 q            59 7
+          9 J            26 a            43 r            60 8
+         10 K            27 b            44 s            61 9
+         11 L            28 c            45 t            62 +
+         12 M            29 d            46 u            63 /
+         13 N            30 e            47 v
+         14 O            31 f            48 w         (pad) =
+         15 P            32 g            49 x
+         16 Q            33 h            50 y
+
+   Special processing is performed if fewer than 24 bits are available
+   at the end of the data being encoded.  A full encoding quantum is
+   always completed at the end of a quantity.  When fewer than 24 input
+   bits are available in an input group, zero bits are added (on the
+   right) to form an integral number of 6-bit groups.  Padding at the
+   end of the data is performed using the '=' character.
+
+   Since all base64 input is an integral number of octets, only the
+         -------------------------------------------------
+   following cases can arise:
+
+       (1) the final quantum of encoding input is an integral
+           multiple of 24 bits; here, the final unit of encoded
+	   output will be an integral multiple of 4 characters
+	   with no "=" padding,
+       (2) the final quantum of encoding input is exactly 8 bits;
+           here, the final unit of encoded output will be two
+	   characters followed by two "=" padding characters, or
+       (3) the final quantum of encoding input is exactly 16 bits;
+           here, the final unit of encoded output will be three
+	   characters followed by one "=" padding character.
+   */
+
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include <private/private.h>
+
+int
+hwloc_encode_to_base64(const char *src, size_t srclength, char *target, size_t targsize)
+{
+	size_t datalength = 0;
+	unsigned char input[3];
+	unsigned char output[4];
+	unsigned int i;
+
+	while (2 < srclength) {
+		input[0] = *src++;
+		input[1] = *src++;
+		input[2] = *src++;
+		srclength -= 3;
+
+		output[0] = input[0] >> 2;
+		output[1] = ((input[0] & 0x03) << 4) + (input[1] >> 4);
+		output[2] = ((input[1] & 0x0f) << 2) + (input[2] >> 6);
+		output[3] = input[2] & 0x3f;
+
+		if (datalength + 4 > targsize)
+			return (-1);
+		target[datalength++] = Base64[output[0]];
+		target[datalength++] = Base64[output[1]];
+		target[datalength++] = Base64[output[2]];
+		target[datalength++] = Base64[output[3]];
+	}
+
+	/* Now we worry about padding. */
+	if (0 != srclength) {
+		/* Get what's left. */
+		input[0] = input[1] = input[2] = '\0';
+		for (i = 0; i < srclength; i++)
+			input[i] = *src++;
+
+		output[0] = input[0] >> 2;
+		output[1] = ((input[0] & 0x03) << 4) + (input[1] >> 4);
+		output[2] = ((input[1] & 0x0f) << 2) + (input[2] >> 6);
+
+		if (datalength + 4 > targsize)
+			return (-1);
+		target[datalength++] = Base64[output[0]];
+		target[datalength++] = Base64[output[1]];
+		if (srclength == 1)
+			target[datalength++] = Pad64;
+		else
+			target[datalength++] = Base64[output[2]];
+		target[datalength++] = Pad64;
+	}
+	if (datalength >= targsize)
+		return (-1);
+	target[datalength] = '\0';	/* Returned value doesn't count \0. */
+	return (datalength);
+}
+
+/* skips all whitespace anywhere.
+   converts characters, four at a time, starting at (or after)
+   src from base - 64 numbers into three 8 bit bytes in the target area.
+   it returns the number of data bytes stored at the target, or -1 on error.
+ */
+
+int
+hwloc_decode_from_base64(char const *src, char *target, size_t targsize)
+{
+	unsigned int tarindex, state;
+	int ch;
+	char *pos;
+
+	state = 0;
+	tarindex = 0;
+	/* Bytes are read as unsigned char: isspace() is undefined for negative values. */
+	while ((ch = (unsigned char)*src++) != '\0') {
+		if (isspace(ch))	/* Skip whitespace anywhere. */
+			continue;
+
+		if (ch == Pad64)
+			break;
+
+		pos = strchr(Base64, ch);
+		if (pos == 0) 		/* A non-base64 character. */
+			return (-1);
+
+		switch (state) {
+		case 0:
+			if (target) {
+				if (tarindex >= targsize)
+					return (-1);
+				target[tarindex] = (pos - Base64) << 2;
+			}
+			state = 1;
+			break;
+		case 1:
+			if (target) {
+				if (tarindex + 1 >= targsize)
+					return (-1);
+				target[tarindex]   |=  (pos - Base64) >> 4;
+				target[tarindex+1]  = ((pos - Base64) & 0x0f)
+							<< 4 ;
+			}
+			tarindex++;
+			state = 2;
+			break;
+		case 2:
+			if (target) {
+				if (tarindex + 1 >= targsize)
+					return (-1);
+				target[tarindex]   |=  (pos - Base64) >> 2;
+				target[tarindex+1]  = ((pos - Base64) & 0x03)
+							<< 6;
+			}
+			tarindex++;
+			state = 3;
+			break;
+		case 3:
+			if (target) {
+				if (tarindex >= targsize)
+					return (-1);
+				target[tarindex] |= (pos - Base64);
+			}
+			tarindex++;
+			state = 0;
+			break;
+		}
+	}
+
+	/*
+	 * We are done decoding Base-64 chars.  Let's see if we ended
+	 * on a byte boundary, and/or with erroneous trailing characters.
+	 */
+
+	if (ch == Pad64) {		/* We got a pad char. */
+		ch = (unsigned char)*src++;	/* Skip it, get next. */
+		switch (state) {
+		case 0:		/* Invalid = in first position */
+		case 1:		/* Invalid = in second position */
+			return (-1);
+
+		case 2:		/* Valid, means one byte of info */
+			/* Skip any number of spaces. */
+			for (; ch != '\0'; ch = (unsigned char)*src++)
+				if (!isspace(ch))
+					break;
+			/* Make sure there is another trailing = sign. */
+			if (ch != Pad64)
+				return (-1);
+			ch = (unsigned char)*src++;	/* Skip the = */
+			/* Fall through to "single trailing =" case. */
+			/* FALLTHROUGH */
+
+		case 3:		/* Valid, means two bytes of info */
+			/*
+			 * We know this char is an =.  Is there anything but
+			 * whitespace after it?
+			 */
+			for (; ch != '\0'; ch = (unsigned char)*src++)
+				if (!isspace(ch))
+					return (-1);
+
+			/*
+			 * Now make sure for cases 2 and 3 that the "extra"
+			 * bits that slopped past the last full byte were
+			 * zeros.  If we don't check them, they become a
+			 * subliminal channel.
+			 */
+			if (target && target[tarindex] != 0)
+				return (-1);
+		}
+	} else {
+		/*
+		 * We ended by seeing the end of the string.  Make sure we
+		 * have no partial bytes lying around.
+		 */
+		if (state != 0)
+			return (-1);
+	}
+
+	return (tarindex);
+}
diff --git a/ext/hwloc/hwloc/bind.c b/ext/hwloc/hwloc/bind.c
new file mode 100644
index 0000000..e2b5a06
--- /dev/null
+++ b/ext/hwloc/hwloc/bind.c
@@ -0,0 +1,781 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2011 inria.  All rights reserved.
+ * Copyright © 2009-2010, 2012 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <hwloc/helper.h>
+#ifdef HAVE_SYS_MMAN_H
+#  include <sys/mman.h>
+#endif
+/* <malloc.h> is only needed if we don't have posix_memalign() */
+#if defined(hwloc_getpagesize) && !defined(HAVE_POSIX_MEMALIGN) && defined(HAVE_MEMALIGN) && defined(HAVE_MALLOC_H)
+#include <malloc.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <stdlib.h>
+#include <errno.h>
+
+/* TODO: HWLOC_GNU_SYS, HWLOC_IRIX_SYS,
+ *
+ * IRIX: see MP_MUSTRUN / _DSM_MUSTRUN, pthread_setrunon_np, /hw, process_cpulink, numa_create
+ *
+ * We could use glibc's sched_setaffinity generically when it is available
+ *
+ * Darwin and OpenBSD don't seem to have binding facilities.
+ */
+
+/*
+ * Sanity-check a CPU set before it is handed to a binding hook.
+ *
+ * Returns NULL with errno set to EXDEV when the topology has no topology
+ * cpuset, or to EINVAL when `set` is empty or not included in the complete
+ * cpuset.  Otherwise returns the complete cpuset when `set` covers the whole
+ * topology cpuset, and `set` itself in every other case.
+ */
+static hwloc_const_bitmap_t
+hwloc_fix_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t set)
+{
+  hwloc_const_bitmap_t topo_cpus = hwloc_topology_get_topology_cpuset(topology);
+  hwloc_const_bitmap_t all_cpus = hwloc_topology_get_complete_cpuset(topology);
+
+  if (!topo_cpus) {
+    /* The topology is composed of several systems, the cpuset is ambiguous. */
+    errno = EXDEV;
+    return NULL;
+  }
+
+  /* An empty set, or one reaching outside the complete cpuset, is invalid. */
+  if (hwloc_bitmap_iszero(set) || !hwloc_bitmap_isincluded(set, all_cpus)) {
+    errno = EINVAL;
+    return NULL;
+  }
+
+  /* A request covering every topology CPU is widened to the complete cpuset. */
+  return hwloc_bitmap_isincluded(topo_cpus, set) ? all_cpus : set;
+}
+
+/*
+ * Bind the current process or thread to `set`.
+ *
+ * The set is first validated by hwloc_fix_cpubind(); -1 is returned if it is
+ * rejected.  HWLOC_CPUBIND_PROCESS selects the process hook only,
+ * HWLOC_CPUBIND_THREAD the thread hook only; with neither flag the process
+ * hook is preferred and the thread hook is the fallback.  Returns the hook's
+ * result, or -1 with errno set to ENOSYS when no eligible hook is installed.
+ */
+int
+hwloc_set_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t set, int flags)
+{
+  int proc_requested, thread_requested;
+
+  set = hwloc_fix_cpubind(topology, set);
+  if (!set)
+    return -1;
+
+  /* HWLOC_CPUBIND_PROCESS takes precedence when both flags are given. */
+  proc_requested = (flags & HWLOC_CPUBIND_PROCESS) != 0;
+  thread_requested = !proc_requested && (flags & HWLOC_CPUBIND_THREAD) != 0;
+
+  /* The process hook is eligible unless only the thread was asked for. */
+  if (!thread_requested && topology->binding_hooks.set_thisproc_cpubind)
+    return topology->binding_hooks.set_thisproc_cpubind(topology, set, flags);
+
+  /* The thread hook is eligible unless the process was asked for. */
+  if (!proc_requested && topology->binding_hooks.set_thisthread_cpubind)
+    return topology->binding_hooks.set_thisthread_cpubind(topology, set, flags);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+/*
+ * Retrieve the CPU binding of the current process or thread into `set`.
+ *
+ * HWLOC_CPUBIND_PROCESS selects the process hook only, HWLOC_CPUBIND_THREAD
+ * the thread hook only; with neither flag the process hook is preferred and
+ * the thread hook is the fallback.  Returns the hook's result, or -1 with
+ * errno set to ENOSYS when no eligible hook is installed.
+ */
+int
+hwloc_get_cpubind(hwloc_topology_t topology, hwloc_bitmap_t set, int flags)
+{
+  /* HWLOC_CPUBIND_PROCESS takes precedence when both flags are given. */
+  const int proc_requested = (flags & HWLOC_CPUBIND_PROCESS) != 0;
+  const int thread_requested = !proc_requested && (flags & HWLOC_CPUBIND_THREAD) != 0;
+
+  /* The process hook is eligible unless only the thread was asked for. */
+  if (!thread_requested && topology->binding_hooks.get_thisproc_cpubind)
+    return topology->binding_hooks.get_thisproc_cpubind(topology, set, flags);
+
+  /* The thread hook is eligible unless the process was asked for. */
+  if (!proc_requested && topology->binding_hooks.get_thisthread_cpubind)
+    return topology->binding_hooks.get_thisthread_cpubind(topology, set, flags);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+/*
+ * Bind process `pid` to `set`.
+ *
+ * Returns -1 when hwloc_fix_cpubind() rejects the set, -1 with errno set to
+ * ENOSYS when no set_proc_cpubind hook is installed, and the hook's result
+ * otherwise.
+ */
+int
+hwloc_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_bitmap_t set, int flags)
+{
+  set = hwloc_fix_cpubind(topology, set);
+  if (!set)
+    return -1;
+
+  if (!topology->binding_hooks.set_proc_cpubind) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  return topology->binding_hooks.set_proc_cpubind(topology, pid, set, flags);
+}
+
+/*
+ * Retrieve the CPU binding of process `pid` into `set`.
+ *
+ * Returns the get_proc_cpubind hook's result, or -1 with errno set to ENOSYS
+ * when that hook is not installed.
+ */
+int
+hwloc_get_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_t set, int flags)
+{
+  if (!topology->binding_hooks.get_proc_cpubind) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  return topology->binding_hooks.get_proc_cpubind(topology, pid, set, flags);
+}
+
+#ifdef hwloc_thread_t
+/*
+ * Bind thread `tid` to `set`.
+ *
+ * Returns -1 when hwloc_fix_cpubind() rejects the set, -1 with errno set to
+ * ENOSYS when no set_thread_cpubind hook is installed, and the hook's result
+ * otherwise.
+ */
+int
+hwloc_set_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t tid, hwloc_const_bitmap_t set, int flags)
+{
+  set = hwloc_fix_cpubind(topology, set);
+  if (!set)
+    return -1;
+
+  if (!topology->binding_hooks.set_thread_cpubind) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  return topology->binding_hooks.set_thread_cpubind(topology, tid, set, flags);
+}
+
+/*
+ * Retrieve the CPU binding of thread `tid` into `set`.
+ *
+ * Returns the get_thread_cpubind hook's result, or -1 with errno set to
+ * ENOSYS when that hook is not installed.
+ */
+int
+hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t tid, hwloc_bitmap_t set, int flags)
+{
+  if (!topology->binding_hooks.get_thread_cpubind) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  return topology->binding_hooks.get_thread_cpubind(topology, tid, set, flags);
+}
+#endif
+
+/*
+ * Retrieve into `set` where the current process or thread last ran.
+ *
+ * HWLOC_CPUBIND_PROCESS selects the process hook only, HWLOC_CPUBIND_THREAD
+ * the thread hook only; with neither flag the process hook is preferred and
+ * the thread hook is the fallback.  Returns the hook's result, or -1 with
+ * errno set to ENOSYS when no eligible hook is installed.
+ */
+int
+hwloc_get_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t set, int flags)
+{
+  /* HWLOC_CPUBIND_PROCESS takes precedence when both flags are given. */
+  const int proc_requested = (flags & HWLOC_CPUBIND_PROCESS) != 0;
+  const int thread_requested = !proc_requested && (flags & HWLOC_CPUBIND_THREAD) != 0;
+
+  /* The process hook is eligible unless only the thread was asked for. */
+  if (!thread_requested && topology->binding_hooks.get_thisproc_last_cpu_location)
+    return topology->binding_hooks.get_thisproc_last_cpu_location(topology, set, flags);
+
+  /* The thread hook is eligible unless the process was asked for. */
+  if (!proc_requested && topology->binding_hooks.get_thisthread_last_cpu_location)
+    return topology->binding_hooks.get_thisthread_last_cpu_location(topology, set, flags);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+/*
+ * Retrieve into `set` where process `pid` last ran.
+ *
+ * Returns the get_proc_last_cpu_location hook's result, or -1 with errno set
+ * to ENOSYS when that hook is not installed.
+ */
+int
+hwloc_get_proc_last_cpu_location(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_t set, int flags)
+{
+  if (!topology->binding_hooks.get_proc_last_cpu_location) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  return topology->binding_hooks.get_proc_last_cpu_location(topology, pid, set, flags);
+}
+
+/*
+ * Sanity-check a nodeset before it is handed to a memory-binding hook.
+ *
+ * Returns NULL with errno set to EXDEV when the topology has no topology
+ * cpuset, ENODEV when it has no complete nodeset, or EINVAL when `nodeset`
+ * is empty or not included in the complete nodeset.  Otherwise returns the
+ * complete nodeset when `nodeset` covers the whole topology nodeset, and
+ * `nodeset` itself in every other case.
+ */
+static hwloc_const_nodeset_t
+hwloc_fix_membind(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset)
+{
+  hwloc_const_bitmap_t topo_nodes = hwloc_topology_get_topology_nodeset(topology);
+  hwloc_const_bitmap_t all_nodes = hwloc_topology_get_complete_nodeset(topology);
+  hwloc_const_nodeset_t chosen = nodeset;
+
+  if (!hwloc_topology_get_topology_cpuset(topology)) {
+    /* The topology is composed of several systems, the nodeset is thus
+     * ambiguous. */
+    errno = EXDEV;
+    return NULL;
+  }
+
+  if (!all_nodes) {
+    /* There is no NUMA node */
+    errno = ENODEV;
+    return NULL;
+  }
+
+  /* An empty nodeset, or one reaching outside the complete nodeset, is invalid. */
+  if (hwloc_bitmap_iszero(nodeset) || !hwloc_bitmap_isincluded(nodeset, all_nodes)) {
+    errno = EINVAL;
+    return NULL;
+  }
+
+  /* A request covering every topology node is widened to the complete nodeset. */
+  if (hwloc_bitmap_isincluded(topo_nodes, nodeset))
+    chosen = all_nodes;
+
+  return chosen;
+}
+
+/*
+ * Validate `cpuset` and convert it into `nodeset` for memory binding.
+ *
+ * Returns -1 with errno set to EXDEV when the topology has no topology
+ * cpuset, ENODEV when it has no complete nodeset, or EINVAL when `cpuset` is
+ * empty or not included in the complete cpuset.  On success returns 0 after
+ * filling `nodeset` with the complete nodeset (when `cpuset` covers the whole
+ * topology cpuset) or with the result of hwloc_cpuset_to_nodeset().
+ */
+static int
+hwloc_fix_membind_cpuset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_const_cpuset_t cpuset)
+{
+  hwloc_const_bitmap_t topo_cpus = hwloc_topology_get_topology_cpuset(topology);
+  hwloc_const_bitmap_t all_cpus = hwloc_topology_get_complete_cpuset(topology);
+  hwloc_const_bitmap_t all_nodes = hwloc_topology_get_complete_nodeset(topology);
+
+  if (!topo_cpus) {
+    /* The topology is composed of several systems, the cpuset is thus
+     * ambiguous. */
+    errno = EXDEV;
+    return -1;
+  }
+
+  if (!all_nodes) {
+    /* There is no NUMA node */
+    errno = ENODEV;
+    return -1;
+  }
+
+  /* An empty cpuset, or one reaching outside the complete cpuset, is invalid. */
+  if (hwloc_bitmap_iszero(cpuset) || !hwloc_bitmap_isincluded(cpuset, all_cpus)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (hwloc_bitmap_isincluded(topo_cpus, cpuset))
+    /* Whole topology requested: use every node rather than converting. */
+    hwloc_bitmap_copy(nodeset, all_nodes);
+  else
+    hwloc_cpuset_to_nodeset(topology, cpuset, nodeset);
+
+  return 0;
+}
+
+/*
+ * Set the memory binding of the current process or thread to `nodeset`.
+ *
+ * The nodeset is first validated by hwloc_fix_membind(); -1 is returned if it
+ * is rejected.  HWLOC_MEMBIND_PROCESS selects the process hook only,
+ * HWLOC_MEMBIND_THREAD the thread hook only; with neither flag the process
+ * hook is preferred and the thread hook is the fallback.  Returns the hook's
+ * result, or -1 with errno set to ENOSYS when no eligible hook is installed.
+ */
+int
+hwloc_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  int proc_requested, thread_requested;
+
+  nodeset = hwloc_fix_membind(topology, nodeset);
+  if (!nodeset)
+    return -1;
+
+  /* HWLOC_MEMBIND_PROCESS takes precedence when both flags are given. */
+  proc_requested = (flags & HWLOC_MEMBIND_PROCESS) != 0;
+  thread_requested = !proc_requested && (flags & HWLOC_MEMBIND_THREAD) != 0;
+
+  /* The process hook is eligible unless only the thread was asked for. */
+  if (!thread_requested && topology->binding_hooks.set_thisproc_membind)
+    return topology->binding_hooks.set_thisproc_membind(topology, nodeset, policy, flags);
+
+  /* The thread hook is eligible unless the process was asked for. */
+  if (!proc_requested && topology->binding_hooks.set_thisthread_membind)
+    return topology->binding_hooks.set_thisthread_membind(topology, nodeset, policy, flags);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+/*
+ * Cpuset flavour of hwloc_set_membind_nodeset(): `set` is converted to a
+ * temporary nodeset by hwloc_fix_membind_cpuset() and forwarded.
+ *
+ * Returns -1 when the conversion fails, otherwise the result of
+ * hwloc_set_membind_nodeset().  The temporary nodeset is always freed.
+ */
+int
+hwloc_set_membind(hwloc_topology_t topology, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+{
+  hwloc_nodeset_t nodes = hwloc_bitmap_alloc();
+  int rc = -1;
+
+  if (hwloc_fix_membind_cpuset(topology, nodes, set) == 0)
+    rc = hwloc_set_membind_nodeset(topology, nodes, policy, flags);
+
+  hwloc_bitmap_free(nodes);
+  return rc;
+}
+
+/*
+ * Retrieve the memory binding of the current process or thread into
+ * `nodeset` and `policy`.
+ *
+ * HWLOC_MEMBIND_PROCESS selects the process hook only, HWLOC_MEMBIND_THREAD
+ * the thread hook only; with neither flag the process hook is preferred and
+ * the thread hook is the fallback.  Returns the hook's result, or -1 with
+ * errno set to ENOSYS when no eligible hook is installed.
+ */
+int
+hwloc_get_membind_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  /* HWLOC_MEMBIND_PROCESS takes precedence when both flags are given. */
+  const int proc_requested = (flags & HWLOC_MEMBIND_PROCESS) != 0;
+  const int thread_requested = !proc_requested && (flags & HWLOC_MEMBIND_THREAD) != 0;
+
+  /* The process hook is eligible unless only the thread was asked for. */
+  if (!thread_requested && topology->binding_hooks.get_thisproc_membind)
+    return topology->binding_hooks.get_thisproc_membind(topology, nodeset, policy, flags);
+
+  /* The thread hook is eligible unless the process was asked for. */
+  if (!proc_requested && topology->binding_hooks.get_thisthread_membind)
+    return topology->binding_hooks.get_thisthread_membind(topology, nodeset, policy, flags);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+/*
+ * Cpuset flavour of hwloc_get_membind_nodeset(): the binding is fetched into
+ * a temporary nodeset and, on success (0), converted into `set` with
+ * hwloc_cpuset_from_nodeset().
+ *
+ * Returns the result of hwloc_get_membind_nodeset().  The temporary nodeset
+ * is always freed.
+ */
+int
+hwloc_get_membind(hwloc_topology_t topology, hwloc_cpuset_t set, hwloc_membind_policy_t * policy, int flags)
+{
+  hwloc_nodeset_t nodes = hwloc_bitmap_alloc();
+  int rc = hwloc_get_membind_nodeset(topology, nodes, policy, flags);
+
+  if (rc == 0)
+    hwloc_cpuset_from_nodeset(topology, set, nodes);
+
+  hwloc_bitmap_free(nodes);
+  return rc;
+}
+
+/*
+ * Set the memory binding of process `pid` to `nodeset`.
+ *
+ * Returns -1 when hwloc_fix_membind() rejects the nodeset, -1 with errno set
+ * to ENOSYS when no set_proc_membind hook is installed, and the hook's result
+ * otherwise.
+ */
+int
+hwloc_set_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  nodeset = hwloc_fix_membind(topology, nodeset);
+  if (!nodeset)
+    return -1;
+
+  if (!topology->binding_hooks.set_proc_membind) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  return topology->binding_hooks.set_proc_membind(topology, pid, nodeset, policy, flags);
+}
+
+
+/*
+ * Cpuset flavour of hwloc_set_proc_membind_nodeset(): `set` is converted to a
+ * temporary nodeset by hwloc_fix_membind_cpuset() and forwarded.
+ *
+ * Returns -1 when the conversion fails, otherwise the result of
+ * hwloc_set_proc_membind_nodeset().  The temporary nodeset is always freed.
+ */
+int
+hwloc_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+{
+  hwloc_nodeset_t nodes = hwloc_bitmap_alloc();
+  int rc = -1;
+
+  if (hwloc_fix_membind_cpuset(topology, nodes, set) == 0)
+    rc = hwloc_set_proc_membind_nodeset(topology, pid, nodes, policy, flags);
+
+  hwloc_bitmap_free(nodes);
+  return rc;
+}
+
+/*
+ * Retrieve the memory binding of process `pid` into `nodeset` and `policy`.
+ *
+ * Returns the get_proc_membind hook's result, or -1 with errno set to ENOSYS
+ * when that hook is not installed.
+ */
+int
+hwloc_get_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  if (!topology->binding_hooks.get_proc_membind) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  return topology->binding_hooks.get_proc_membind(topology, pid, nodeset, policy, flags);
+}
+
+/*
+ * Cpuset flavour of hwloc_get_proc_membind_nodeset(): the binding is fetched
+ * into a temporary nodeset and, on success (0), converted into `set` with
+ * hwloc_cpuset_from_nodeset().
+ *
+ * Returns the result of hwloc_get_proc_membind_nodeset().  The temporary
+ * nodeset is always freed.
+ */
+int
+hwloc_get_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, hwloc_membind_policy_t * policy, int flags)
+{
+  hwloc_nodeset_t nodes = hwloc_bitmap_alloc();
+  int rc = hwloc_get_proc_membind_nodeset(topology, pid, nodes, policy, flags);
+
+  if (rc == 0)
+    hwloc_cpuset_from_nodeset(topology, set, nodes);
+
+  hwloc_bitmap_free(nodes);
+  return rc;
+}
+
+/*
+ * Set the memory binding of the area [addr, addr+len) to `nodeset`.
+ *
+ * Returns -1 when hwloc_fix_membind() rejects the nodeset, -1 with errno set
+ * to ENOSYS when no set_area_membind hook is installed, and the hook's result
+ * otherwise.
+ */
+int
+hwloc_set_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  nodeset = hwloc_fix_membind(topology, nodeset);
+  if (!nodeset)
+    return -1;
+
+  if (!topology->binding_hooks.set_area_membind) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  return topology->binding_hooks.set_area_membind(topology, addr, len, nodeset, policy, flags);
+}
+
+/*
+ * Cpuset flavour of hwloc_set_area_membind_nodeset(): `set` is converted to a
+ * temporary nodeset by hwloc_fix_membind_cpuset() and forwarded.
+ *
+ * Returns -1 when the conversion fails, otherwise the result of
+ * hwloc_set_area_membind_nodeset().  The temporary nodeset is always freed.
+ */
+int
+hwloc_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+{
+  hwloc_nodeset_t nodes = hwloc_bitmap_alloc();
+  int rc = -1;
+
+  if (hwloc_fix_membind_cpuset(topology, nodes, set) == 0)
+    rc = hwloc_set_area_membind_nodeset(topology, addr, len, nodes, policy, flags);
+
+  hwloc_bitmap_free(nodes);
+  return rc;
+}
+
+/*
+ * Retrieve the memory binding of the area [addr, addr+len) into `nodeset`
+ * and `policy`.
+ *
+ * Returns the get_area_membind hook's result, or -1 with errno set to ENOSYS
+ * when that hook is not installed.
+ */
+int
+hwloc_get_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  if (!topology->binding_hooks.get_area_membind) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  return topology->binding_hooks.get_area_membind(topology, addr, len, nodeset, policy, flags);
+}
+
+/*
+ * Cpuset flavour of hwloc_get_area_membind_nodeset(): the binding is fetched
+ * into a temporary nodeset and, on success (0), converted into `set` with
+ * hwloc_cpuset_from_nodeset().
+ *
+ * Returns the result of hwloc_get_area_membind_nodeset().  The temporary
+ * nodeset is always freed.
+ */
+int
+hwloc_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_cpuset_t set, hwloc_membind_policy_t * policy, int flags)
+{
+  hwloc_nodeset_t nodes = hwloc_bitmap_alloc();
+  int rc = hwloc_get_area_membind_nodeset(topology, addr, len, nodes, policy, flags);
+
+  if (rc == 0)
+    hwloc_cpuset_from_nodeset(topology, set, nodes);
+
+  hwloc_bitmap_free(nodes);
+  return rc;
+}
+
+/*
+ * Allocate `len` bytes from the heap; the topology argument is not used.
+ *
+ * When hwloc_getpagesize is defined and posix_memalign() or memalign() is
+ * available, the buffer is aligned on hwloc_getpagesize(); otherwise plain
+ * malloc() is used.  Returns the buffer, or NULL on failure.
+ */
+void *
+hwloc_alloc_heap(hwloc_topology_t topology __hwloc_attribute_unused, size_t len)
+{
+  void *buffer;
+#if defined(hwloc_getpagesize) && defined(HAVE_POSIX_MEMALIGN)
+  /* posix_memalign() reports its status through the return value; it is
+   * stored in errno, and the output pointer is forced to NULL on failure. */
+  errno = posix_memalign(&buffer, hwloc_getpagesize(), len);
+  if (errno != 0)
+    buffer = NULL;
+#elif defined(hwloc_getpagesize) && defined(HAVE_MEMALIGN)
+  buffer = memalign(hwloc_getpagesize(), len);
+#else
+  buffer = malloc(len);
+#endif
+  return buffer;
+}
+
+#ifdef MAP_ANONYMOUS
+/*
+ * Allocate `len` bytes of anonymous, private, read-write memory with mmap();
+ * the topology argument is not used.
+ *
+ * Returns the mapping address, or NULL on failure (errno is left as set by
+ * mmap()).  mmap() signals failure with MAP_FAILED rather than NULL, so the
+ * value is translated here: the allocation callers in this file test the
+ * result against NULL.
+ */
+void *
+hwloc_alloc_mmap(hwloc_topology_t topology __hwloc_attribute_unused, size_t len)
+{
+  void *buffer = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+
+  return buffer == MAP_FAILED ? NULL : buffer;
+}
+#endif
+
+int
+hwloc_free_heap(hwloc_topology_t topology __hwloc_attribute_unused, void *addr, size_t len __hwloc_attribute_unused)
+{
+  free(addr); /* heap buffers are released with free(); topology and len are not needed */
+  return 0; /* always reports success */
+}
+
+#ifdef MAP_ANONYMOUS
+/*
+ * Release a mapping of `len` bytes at `addr` with munmap(); the topology
+ * argument is not used.
+ *
+ * A NULL address is accepted and reported as success (0) without calling
+ * munmap(); otherwise the result of munmap() is returned.
+ */
+int
+hwloc_free_mmap(hwloc_topology_t topology __hwloc_attribute_unused, void *addr, size_t len)
+{
+  return addr ? munmap(addr, len) : 0;
+}
+#endif
+
+/*
+ * Allocate `len` bytes without any binding request.
+ *
+ * Uses the topology's alloc hook when one is installed and falls back to
+ * hwloc_alloc_heap() otherwise.  Returns whatever the chosen allocator
+ * returns.
+ */
+void *
+hwloc_alloc(hwloc_topology_t topology, size_t len)
+{
+  if (!topology->binding_hooks.alloc)
+    return hwloc_alloc_heap(topology, len);
+
+  return topology->binding_hooks.alloc(topology, len);
+}
+
+/*
+ * Allocate `len` bytes bound to `nodeset` with the given policy.
+ *
+ * Strategy, in order:
+ *  - the alloc_membind hook when installed;
+ *  - otherwise hwloc_alloc() followed by the set_area_membind hook;
+ *  - otherwise errno is set to ENOSYS and the fallback path is taken.
+ * The fallback path is also taken when hwloc_fix_membind() rejects the
+ * nodeset, or when HWLOC_MEMBIND_MIGRATE is requested (errno = EINVAL).
+ *
+ * On the fallback path, and when binding the freshly allocated area fails,
+ * HWLOC_MEMBIND_STRICT decides the outcome: with it NULL is returned, without
+ * it an unbound (or not successfully bound) buffer is returned anyway.
+ */
+void *
+hwloc_alloc_membind_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  void *p;
+  nodeset = hwloc_fix_membind(topology, nodeset);
+  if (!nodeset)
+    goto fallback;
+  if (flags & HWLOC_MEMBIND_MIGRATE) {
+    errno = EINVAL;
+    goto fallback;
+  }
+
+  if (topology->binding_hooks.alloc_membind)
+    return topology->binding_hooks.alloc_membind(topology, len, nodeset, policy, flags);
+  else if (topology->binding_hooks.set_area_membind) {
+    p = hwloc_alloc(topology, len);
+    if (!p)
+      return NULL;
+    if (topology->binding_hooks.set_area_membind(topology, p, len, nodeset, policy, flags) && (flags & HWLOC_MEMBIND_STRICT)) {
+      /* Keep the binding error visible to the caller across the release. */
+      int error = errno;
+      /* The buffer came from hwloc_alloc(), which may have used the alloc
+       * hook instead of the heap, so release it through hwloc_free() rather
+       * than a bare free(). */
+      hwloc_free(topology, p, len);
+      errno = error;
+      return NULL;
+    }
+    return p;
+  } else {
+    errno = ENOSYS;
+  }
+
+fallback:
+  if (flags & HWLOC_MEMBIND_STRICT)
+    /* Report error */
+    return NULL;
+  /* Never mind, allocate anyway */
+  return hwloc_alloc(topology, len);
+}
+
+/*
+ * Cpuset flavour of hwloc_alloc_membind_nodeset(): `set` is converted to a
+ * temporary nodeset by hwloc_fix_membind_cpuset() and forwarded.
+ *
+ * When the conversion fails, NULL is returned if HWLOC_MEMBIND_STRICT is set
+ * and an unbound hwloc_alloc() buffer otherwise.  The temporary nodeset is
+ * always freed.
+ */
+void *
+hwloc_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+{
+  hwloc_nodeset_t nodes = hwloc_bitmap_alloc();
+  void *buffer;
+
+  if (!hwloc_fix_membind_cpuset(topology, nodes, set))
+    buffer = hwloc_alloc_membind_nodeset(topology, len, nodes, policy, flags);
+  else if (flags & HWLOC_MEMBIND_STRICT)
+    buffer = NULL;
+  else
+    buffer = hwloc_alloc(topology, len);
+
+  hwloc_bitmap_free(nodes);
+  return buffer;
+}
+
+/*
+ * Release `len` bytes at `addr`.
+ *
+ * Uses the topology's free_membind hook when one is installed and falls back
+ * to hwloc_free_heap() otherwise.  Returns whatever the chosen release
+ * function returns.
+ */
+int
+hwloc_free(hwloc_topology_t topology, void *addr, size_t len)
+{
+  if (!topology->binding_hooks.free_membind)
+    return hwloc_free_heap(topology, addr, len);
+
+  return topology->binding_hooks.free_membind(topology, addr, len);
+}
+
+/*
+ * Empty binding hooks always returning success
+ */
+
+/*
+ * Helper for the dummy "get" hooks: copy the topology's complete cpuset into
+ * `set`.  Returns 0 on success, -1 when the topology has no complete cpuset.
+ */
+static int dontset_return_complete_cpuset(hwloc_topology_t topology, hwloc_cpuset_t set)
+{
+  hwloc_const_cpuset_t complete = hwloc_topology_get_complete_cpuset(topology);
+
+  if (!complete)
+    return -1;
+
+  hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+  return 0;
+}
+
+static int dontset_thisthread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0; /* dummy hook: the request is accepted but nothing is bound */
+}
+static int dontget_thisthread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_bitmap_t set, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_cpuset(topology, set); /* dummy hook: report the complete cpuset (topology is used despite its attribute) */
+}
+static int dontset_thisproc_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0; /* dummy hook: the request is accepted but nothing is bound */
+}
+static int dontget_thisproc_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_bitmap_t set, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_cpuset(topology, set); /* dummy hook: report the complete cpuset (topology is used despite its attribute) */
+}
+static int dontset_proc_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_pid_t pid __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0; /* dummy hook: the request is accepted but nothing is bound */
+}
+static int dontget_proc_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_pid_t pid __hwloc_attribute_unused, hwloc_bitmap_t cpuset, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_cpuset(topology, cpuset); /* dummy hook: report the complete cpuset, whatever the pid */
+}
+#ifdef hwloc_thread_t
+static int dontset_thread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_thread_t tid __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0; /* dummy hook: the request is accepted but nothing is bound */
+}
+static int dontget_thread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_thread_t tid __hwloc_attribute_unused, hwloc_bitmap_t cpuset, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_cpuset(topology, cpuset); /* dummy hook: report the complete cpuset, whatever the tid */
+}
+#endif
+
+/*
+ * Helper for the dummy "get" hooks: copy the topology's complete nodeset into
+ * `set` and report HWLOC_MEMBIND_DEFAULT through `policy`.  Returns 0 on
+ * success, -1 (leaving both outputs untouched) when the topology has no
+ * complete nodeset.
+ */
+static int dontset_return_complete_nodeset(hwloc_topology_t topology, hwloc_nodeset_t set, hwloc_membind_policy_t *policy)
+{
+  hwloc_const_nodeset_t complete = hwloc_topology_get_complete_nodeset(topology);
+
+  if (!complete)
+    return -1;
+
+  hwloc_bitmap_copy(set, hwloc_topology_get_complete_nodeset(topology));
+  *policy = HWLOC_MEMBIND_DEFAULT;
+  return 0;
+}
+
+static int dontset_thisproc_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0; /* dummy hook: the request is accepted but no memory binding is applied */
+}
+static int dontget_thisproc_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_nodeset(topology, set, policy); /* dummy hook: complete nodeset with the default policy */
+}
+
+static int dontset_thisthread_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0; /* dummy hook: the request is accepted but no memory binding is applied */
+}
+static int dontget_thisthread_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_nodeset(topology, set, policy); /* dummy hook: complete nodeset with the default policy */
+}
+
+static int dontset_proc_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_pid_t pid __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0; /* dummy hook: the request is accepted but no memory binding is applied */
+}
+static int dontget_proc_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_pid_t pid __hwloc_attribute_unused, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_nodeset(topology, set, policy); /* dummy hook: complete nodeset with the default policy, whatever the pid */
+}
+
+static int dontset_area_membind(hwloc_topology_t topology __hwloc_attribute_unused, const void *addr __hwloc_attribute_unused, size_t size __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0; /* dummy hook: the request is accepted but no memory binding is applied */
+}
+static int dontget_area_membind(hwloc_topology_t topology __hwloc_attribute_unused, const void *addr __hwloc_attribute_unused, size_t size __hwloc_attribute_unused, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_nodeset(topology, set, policy); /* dummy hook: complete nodeset with the default policy, whatever the area */
+}
+
+static void * dontalloc_membind(hwloc_topology_t topology __hwloc_attribute_unused, size_t size __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return malloc(size); /* dummy hook: plain malloc(), the binding arguments are ignored (size is used despite its attribute) */
+}
+static int dontfree_membind(hwloc_topology_t topology __hwloc_attribute_unused, void *addr __hwloc_attribute_unused, size_t size __hwloc_attribute_unused)
+{
+  free(addr); /* counterpart of dontalloc_membind(): the buffer came from malloc() (addr is used despite its attribute) */
+  return 0; /* always reports success */
+}
+
+/*
+ * Install the do-nothing hooks defined above into `hooks`.  The `support`
+ * argument is not used.
+ */
+static void hwloc_set_dummy_hooks(struct hwloc_binding_hooks *hooks,
+				  struct hwloc_topology_support *support __hwloc_attribute_unused)
+{
+  /* CPU binding and last location of the current process */
+  hooks->set_thisproc_cpubind = dontset_thisproc_cpubind;
+  hooks->get_thisproc_cpubind = dontget_thisproc_cpubind;
+  hooks->get_thisproc_last_cpu_location = dontget_thisproc_cpubind; /* cpubind instead of last_cpu_location is ok */
+
+  /* CPU binding and last location of the current thread */
+  hooks->set_thisthread_cpubind = dontset_thisthread_cpubind;
+  hooks->get_thisthread_cpubind = dontget_thisthread_cpubind;
+  hooks->get_thisthread_last_cpu_location = dontget_thisthread_cpubind; /* cpubind instead of last_cpu_location is ok */
+
+  /* CPU binding and last location of another process */
+  hooks->set_proc_cpubind = dontset_proc_cpubind;
+  hooks->get_proc_cpubind = dontget_proc_cpubind;
+  hooks->get_proc_last_cpu_location = dontget_proc_cpubind; /* cpubind instead of last_cpu_location is ok */
+
+#ifdef hwloc_thread_t
+  /* CPU binding of another thread */
+  hooks->set_thread_cpubind = dontset_thread_cpubind;
+  hooks->get_thread_cpubind = dontget_thread_cpubind;
+#endif
+  /* TODO: get_thread_last_cpu_location */
+
+  /* Memory binding */
+  hooks->set_thisproc_membind = dontset_thisproc_membind;
+  hooks->get_thisproc_membind = dontget_thisproc_membind;
+  hooks->set_thisthread_membind = dontset_thisthread_membind;
+  hooks->get_thisthread_membind = dontget_thisthread_membind;
+  hooks->set_proc_membind = dontset_proc_membind;
+  hooks->get_proc_membind = dontget_proc_membind;
+  hooks->set_area_membind = dontset_area_membind;
+  hooks->get_area_membind = dontget_area_membind;
+
+  /* Bound allocation and release */
+  hooks->alloc_membind = dontalloc_membind;
+  hooks->free_membind = dontfree_membind;
+}
+
+/*
+ * Let the backend of each operating system selected at build time (one
+ * HWLOC_*_SYS macro per system) install its binding hooks into `hooks` and
+ * update `support`.  Nothing is installed when none of the macros is defined.
+ */
+void
+hwloc_set_native_binding_hooks(struct hwloc_binding_hooks *hooks, struct hwloc_topology_support *support)
+{
+#if defined(HWLOC_LINUX_SYS)
+  hwloc_set_linuxfs_hooks(hooks, support);
+#endif /* HWLOC_LINUX_SYS */
+
+#if defined(HWLOC_BGQ_SYS)
+  hwloc_set_bgq_hooks(hooks, support);
+#endif /* HWLOC_BGQ_SYS */
+
+#if defined(HWLOC_AIX_SYS)
+  hwloc_set_aix_hooks(hooks, support);
+#endif /* HWLOC_AIX_SYS */
+
+#if defined(HWLOC_OSF_SYS)
+  hwloc_set_osf_hooks(hooks, support);
+#endif /* HWLOC_OSF_SYS */
+
+#if defined(HWLOC_SOLARIS_SYS)
+  hwloc_set_solaris_hooks(hooks, support);
+#endif /* HWLOC_SOLARIS_SYS */
+
+#if defined(HWLOC_WIN_SYS)
+  hwloc_set_windows_hooks(hooks, support);
+#endif /* HWLOC_WIN_SYS */
+
+#if defined(HWLOC_DARWIN_SYS)
+  hwloc_set_darwin_hooks(hooks, support);
+#endif /* HWLOC_DARWIN_SYS */
+
+#if defined(HWLOC_FREEBSD_SYS)
+  hwloc_set_freebsd_hooks(hooks, support);
+#endif /* HWLOC_FREEBSD_SYS */
+
+#if defined(HWLOC_NETBSD_SYS)
+  hwloc_set_netbsd_hooks(hooks, support);
+#endif /* HWLOC_NETBSD_SYS */
+
+#if defined(HWLOC_HPUX_SYS)
+  hwloc_set_hpux_hooks(hooks, support);
+#endif /* HWLOC_HPUX_SYS */
+}
+
+/*
+ * Install the binding hooks of `topology` and publish which of them are
+ * supported.
+ *
+ * If the represented system is actually not this system, dummy binding hooks
+ * are used and no support flag is raised.  Otherwise the native hooks are
+ * installed and every hook that ended up non-NULL has its matching
+ * support.cpubind / support.membind flag set to 1.
+ */
+void
+hwloc_set_binding_hooks(struct hwloc_topology *topology)
+{
+  if (!topology->is_thissystem) {
+    /* not this system, use dummy binding hooks that do nothing (but don't return ENOSYS) */
+    hwloc_set_dummy_hooks(&topology->binding_hooks, &topology->support);
+    /* set_cpubind is fake and get_cpubind returns the whole system cpuset,
+     * so don't report that set/get_cpubind as supported
+     */
+    return;
+  }
+
+  hwloc_set_native_binding_hooks(&topology->binding_hooks, &topology->support);
+  /* every hook not set above will return ENOSYS */
+
+  /* Wrapped in do/while(0) so that "DO(...);" is exactly one statement, and
+   * undefined right after use so the short name does not leak. */
+#define DO(which,kind) do { \
+    if (topology->binding_hooks.kind) \
+      topology->support.which##bind->kind = 1; \
+  } while (0)
+  DO(cpu,set_thisproc_cpubind);
+  DO(cpu,get_thisproc_cpubind);
+  DO(cpu,set_proc_cpubind);
+  DO(cpu,get_proc_cpubind);
+  DO(cpu,set_thisthread_cpubind);
+  DO(cpu,get_thisthread_cpubind);
+#ifdef hwloc_thread_t
+  DO(cpu,set_thread_cpubind);
+  DO(cpu,get_thread_cpubind);
+#endif
+  DO(cpu,get_thisproc_last_cpu_location);
+  DO(cpu,get_proc_last_cpu_location);
+  DO(cpu,get_thisthread_last_cpu_location);
+  DO(mem,set_thisproc_membind);
+  DO(mem,get_thisproc_membind);
+  DO(mem,set_thisthread_membind);
+  DO(mem,get_thisthread_membind);
+  DO(mem,set_proc_membind);
+  DO(mem,get_proc_membind);
+  DO(mem,set_area_membind);
+  DO(mem,get_area_membind);
+  DO(mem,alloc_membind);
+#undef DO
+}
diff --git a/ext/hwloc/hwloc/bitmap.c b/ext/hwloc/hwloc/bitmap.c
new file mode 100644
index 0000000..e2b807a
--- /dev/null
+++ b/ext/hwloc/hwloc/bitmap.c
@@ -0,0 +1,1492 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2011 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc.h>
+#include <private/misc.h>
+#include <private/private.h>
+#include <hwloc/bitmap.h>
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <assert.h>
+#include <errno.h>
+#include <ctype.h>
+
+/*
+ * possible improvements:
+ * - have a way to change the initial allocation size:
+ *   add hwloc_bitmap_set_foo() to change a global here,
+ *   and make the hwloc core call based on the early number of PUs
+ * - preallocate inside the bitmap structure (so that the whole structure is a cacheline for instance)
+ *   and allocate a dedicated array only later when reallocating larger
+ * - add a bitmap->ulongs_empty_first which guarantees that some first ulongs are empty,
+ *   making tests much faster for big bitmaps since there's no need to look at first ulongs.
+ *   no need for ulongs_empty_first to be exactly the max number of empty ulongs,
+ *   clearing bits that were set earlier isn't very common.
+ */
+
+/* magic number */
+#define HWLOC_BITMAP_MAGIC 0x20091007
+
+/* actual opaque type internals: an array of ulong bitmasks that can be reallocated larger, plus a flag for all the bits beyond it */
+struct hwloc_bitmap_s {
+  unsigned ulongs_count; /* how many ulong bitmasks are valid, >= 1 */
+  unsigned ulongs_allocated; /* how many ulong bitmasks are allocated, >= ulongs_count */
+  unsigned long *ulongs; /* the bitmasks, ulongs[i] holds the bits starting at index i*HWLOC_BITS_PER_LONG */
+  int infinite; /* set to 1 if all bits beyond ulongs are set */
+#ifdef HWLOC_DEBUG
+  int magic; /* HWLOC_BITMAP_MAGIC between alloc/dup and free, verified by HWLOC__BITMAP_CHECK() */
+#endif
+};
+
+/* overzealous check in debug-mode (magic and counts), not as powerful as valgrind but still useful; expands to nothing without HWLOC_DEBUG */
+#ifdef HWLOC_DEBUG
+#define HWLOC__BITMAP_CHECK(set) do {				\
+  assert((set)->magic == HWLOC_BITMAP_MAGIC);			\
+  assert((set)->ulongs_count >= 1);				\
+  assert((set)->ulongs_allocated >= (set)->ulongs_count);	\
+} while (0)
+#else
+#define HWLOC__BITMAP_CHECK(set)
+#endif
+
+/* map a bit (cpu) index to the ulong that holds it (INDEX) and to its position inside that ulong (CPU_ULBIT) */
+#define HWLOC_SUBBITMAP_INDEX(cpu)		((cpu)/(HWLOC_BITS_PER_LONG))
+#define HWLOC_SUBBITMAP_CPU_ULBIT(cpu)		((cpu)%(HWLOC_BITS_PER_LONG))
+/* Read from a bitmap ulong without knowing whether x is valid: beyond ulongs_count the value is implied by the infinite flag.
+ * Writers should make sure that x is valid and modify set->ulongs[x] directly.
+ */
+#define HWLOC_SUBBITMAP_READULONG(set,x)	((x) < (set)->ulongs_count ? (set)->ulongs[x] : (set)->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO)
+
+/* predefined subset values and masks: ULBIT_TO(bit) covers bits 0 to bit, ULBIT_FROM(bit) covers bit and above, FROMTO is their intersection */
+#define HWLOC_SUBBITMAP_ZERO			0UL
+#define HWLOC_SUBBITMAP_FULL			(~0UL)
+#define HWLOC_SUBBITMAP_ULBIT(bit)		(1UL<<(bit))
+#define HWLOC_SUBBITMAP_CPU(cpu)		HWLOC_SUBBITMAP_ULBIT(HWLOC_SUBBITMAP_CPU_ULBIT(cpu))
+#define HWLOC_SUBBITMAP_ULBIT_TO(bit)		(HWLOC_SUBBITMAP_FULL>>(HWLOC_BITS_PER_LONG-1-(bit)))
+#define HWLOC_SUBBITMAP_ULBIT_FROM(bit)		(HWLOC_SUBBITMAP_FULL<<(bit))
+#define HWLOC_SUBBITMAP_ULBIT_FROMTO(begin,end)	(HWLOC_SUBBITMAP_ULBIT_TO(end) & HWLOC_SUBBITMAP_ULBIT_FROM(begin))
+
+struct hwloc_bitmap_s * hwloc_bitmap_alloc(void)
+{
+  /* Create an empty bitmap: one valid ulong inside a 64-byte array; NULL if out of memory. */
+  struct hwloc_bitmap_s * bitmap = malloc(sizeof(*bitmap));
+  unsigned long * words;
+
+  if (bitmap == NULL)
+    return NULL;
+  words = malloc(64);
+  if (words == NULL) {
+    free(bitmap);
+    return NULL;
+  }
+  words[0] = HWLOC_SUBBITMAP_ZERO;
+  bitmap->ulongs = words;
+  bitmap->ulongs_allocated = 64/sizeof(unsigned long);
+  bitmap->ulongs_count = 1;
+  bitmap->infinite = 0;
+#ifdef HWLOC_DEBUG
+  bitmap->magic = HWLOC_BITMAP_MAGIC;
+#endif
+  return bitmap;
+}
+
+struct hwloc_bitmap_s * hwloc_bitmap_alloc_full(void)
+{
+  /* Start from an empty bitmap, then turn on its first ulong and its infinite tail. */
+  struct hwloc_bitmap_s * full = hwloc_bitmap_alloc();
+  if (full == NULL) return NULL;
+  full->ulongs[0] = HWLOC_SUBBITMAP_FULL;
+  full->infinite = 1;
+  return full;
+}
+
+void hwloc_bitmap_free(struct hwloc_bitmap_s * set)
+{
+  /* Release a bitmap and its ulong array; a NULL bitmap is silently accepted. */
+  if (set != NULL) {
+    HWLOC__BITMAP_CHECK(set);
+#ifdef HWLOC_DEBUG
+    /* drop the magic so that HWLOC__BITMAP_CHECK() no longer accepts this bitmap */
+    set->magic = 0;
+#endif
+    free(set->ulongs);
+    free(set);
+  }
+}
+
+/* enlarge the allocation until it can hold at least needed_count ulongs (size rounded to a power of two); ulongs_count is not modified. */
+static void
+hwloc_bitmap_enlarge_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count)
+{
+  unsigned tmp = 1U << hwloc_flsl((unsigned long) needed_count - 1);
+  if (tmp > set->ulongs_allocated) {
+    unsigned long *grown = realloc(set->ulongs, tmp * sizeof(unsigned long));
+    if (!grown) abort(); /* no way to report failure from here; never go on with a NULL array, even when NDEBUG removes assert() */
+    set->ulongs = grown;
+    set->ulongs_allocated = tmp;
+  }
+}
+
+/* Grow the valid part of the bitmap to needed_count ulongs (it never shrinks).
+ * Every ulong that becomes valid takes the value implied by the infinite flag.
+ */
+static void
+hwloc_bitmap_realloc_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count)
+{
+  unsigned long filler;
+
+  HWLOC__BITMAP_CHECK(set);
+
+  if (set->ulongs_count >= needed_count)
+    return; /* already large enough */
+
+  /* make room first, the array may be reallocated */
+  hwloc_bitmap_enlarge_by_ulongs(set, needed_count);
+
+  /* then extend the valid area one ulong at a time */
+  filler = set->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+  while (set->ulongs_count < needed_count)
+    set->ulongs[set->ulongs_count++] = filler;
+}
+
+/* realloc until it contains at least cpu+1 bits, i.e. until the ulong that holds bit cpu is valid */
+#define hwloc_bitmap_realloc_by_cpu_index(set, cpu) hwloc_bitmap_realloc_by_ulongs(set, ((cpu)/HWLOC_BITS_PER_LONG)+1)
+
+/* reset a bitmap to exactly the needed size: ulongs_count becomes needed_count, the allocation grows if it is too small.
+ * the caller must reinitialize all ulongs and the infinite flag later.
+ */
+static void
+hwloc_bitmap_reset_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count)
+{
+  hwloc_bitmap_enlarge_by_ulongs(set, needed_count);
+  set->ulongs_count = needed_count;
+}
+
+/* reset until it contains exactly cpu+1 bits (roundup to a ulong), i.e. the ulong holding bit cpu becomes the last valid one.
+ * the caller must reinitialize all ulongs and the infinite flag later.
+ */
+#define hwloc_bitmap_reset_by_cpu_index(set, cpu) hwloc_bitmap_reset_by_ulongs(set, ((cpu)/HWLOC_BITS_PER_LONG)+1)
+
+struct hwloc_bitmap_s * hwloc_bitmap_dup(const struct hwloc_bitmap_s * old)
+{ /* Return a newly allocated copy of old, or NULL if old is NULL or memory runs out. */
+  struct hwloc_bitmap_s * copy;
+  unsigned long * words;
+  if (old == NULL)
+    return NULL;
+
+  HWLOC__BITMAP_CHECK(old);
+  copy = malloc(sizeof(*copy));
+  if (copy == NULL)
+    return NULL;
+
+  words = malloc(old->ulongs_allocated * sizeof(unsigned long)); /* same capacity as the source */
+  if (words == NULL) {
+    free(copy);
+    return NULL;
+  }
+  memcpy(words, old->ulongs, old->ulongs_count * sizeof(unsigned long)); /* only the valid ulongs are copied */
+  copy->ulongs = words;
+  copy->ulongs_count = old->ulongs_count;
+  copy->ulongs_allocated = old->ulongs_allocated;
+  copy->infinite = old->infinite;
+#ifdef HWLOC_DEBUG
+  copy->magic = HWLOC_BITMAP_MAGIC;
+#endif
+  return copy;
+}
+
+void hwloc_bitmap_copy(struct hwloc_bitmap_s * dst, const struct hwloc_bitmap_s * src)
+{ /* Overwrite dst with the valid ulongs and the infinite flag of src. */
+  const unsigned nr_words = src->ulongs_count;
+
+  HWLOC__BITMAP_CHECK(dst);
+  HWLOC__BITMAP_CHECK(src);
+  hwloc_bitmap_reset_by_ulongs(dst, nr_words);
+  dst->infinite = src->infinite;
+  memcpy(dst->ulongs, src->ulongs, nr_words * sizeof(unsigned long));
+}
+
+/* Strings always use 32bit groups, each printed as 8 hexadecimal digits; one ulong holds HWLOC_BITMAP_STRING_PER_LONG of them */
+#define HWLOC_PRIxSUBBITMAP		"%08lx"
+#define HWLOC_BITMAP_SUBSTRING_SIZE	32
+#define HWLOC_BITMAP_SUBSTRING_LENGTH	(HWLOC_BITMAP_SUBSTRING_SIZE/4)
+#define HWLOC_BITMAP_STRING_PER_LONG	(HWLOC_BITS_PER_LONG/HWLOC_BITMAP_SUBSTRING_SIZE)
+
+int hwloc_bitmap_snprintf(char * __hwloc_restrict buf, size_t buflen, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{ /* Print set as comma-separated 32bit hexadecimal groups, highest first, after "0xf...f" if infinite. Returns the length of the complete string even if buf is too small, or -1 if hwloc_snprintf() fails. */
+  ssize_t size = buflen; /* room left in buf */
+  char *tmp = buf; /* where the next piece is written */
+  int res, ret = 0; /* ret accumulates the length of the complete string */
+  int needcomma = 0;
+  int i;
+  unsigned long accum = 0; /* ulong being printed, the next group to print sits in its highest bits */
+  int accumed = 0; /* how many bits of accum still have to be printed */
+#if HWLOC_BITS_PER_LONG == HWLOC_BITMAP_SUBSTRING_SIZE
+  const unsigned long accum_mask = ~0UL;
+#else /* HWLOC_BITS_PER_LONG != HWLOC_BITMAP_SUBSTRING_SIZE */
+  const unsigned long accum_mask = ((1UL << HWLOC_BITMAP_SUBSTRING_SIZE) - 1) << (HWLOC_BITS_PER_LONG - HWLOC_BITMAP_SUBSTRING_SIZE);
+#endif /* HWLOC_BITS_PER_LONG != HWLOC_BITMAP_SUBSTRING_SIZE */
+
+  HWLOC__BITMAP_CHECK(set);
+
+  /* mark the end in case we do nothing later */
+  if (buflen > 0)
+    tmp[0] = '\0';
+
+  if (set->infinite) {
+    res = hwloc_snprintf(tmp, size, "0xf...f");
+    needcomma = 1;
+    if (res < 0)
+      return -1;
+    ret += res;
+    if (res >= size) /* truncated: advance only by what fits, ret keeps the full length */
+      res = size>0 ? size - 1 : 0;
+    tmp += res;
+    size -= res;
+  }
+
+  i=set->ulongs_count-1;
+
+  if (set->infinite) {
+    /* ignore starting FULL since we have 0xf...f already */
+    while (i>=0 && set->ulongs[i] == HWLOC_SUBBITMAP_FULL)
+      i--;
+  } else {
+    /* ignore starting ZERO; if every ulong is ZERO the final fallback below prints 0x0 */
+    while (i>=0 && set->ulongs[i] == HWLOC_SUBBITMAP_ZERO)
+      i--;
+  }
+
+  while (i>=0 || accumed) {
+    /* Refill accumulator */
+    if (!accumed) {
+      accum = set->ulongs[i--];
+      accumed = HWLOC_BITS_PER_LONG;
+    }
+
+    if (accum & accum_mask) {
+      /* print the whole subset if not empty */
+        res = hwloc_snprintf(tmp, size, needcomma ? ",0x" HWLOC_PRIxSUBBITMAP : "0x" HWLOC_PRIxSUBBITMAP,
+		     (accum & accum_mask) >> (HWLOC_BITS_PER_LONG - HWLOC_BITMAP_SUBSTRING_SIZE));
+      needcomma = 1;
+    } else if (i == -1 && accumed == HWLOC_BITMAP_SUBSTRING_SIZE) {
+      /* print a single 0 to mark the last subset */
+      res = hwloc_snprintf(tmp, size, needcomma ? ",0x0" : "0x0");
+    } else if (needcomma) {
+      res = hwloc_snprintf(tmp, size, ","); /* empty group after a printed one: only its separator */
+    } else {
+      res = 0; /* empty group before anything was printed: skipped */
+    }
+    if (res < 0)
+      return -1;
+    ret += res;
+
+#if HWLOC_BITS_PER_LONG == HWLOC_BITMAP_SUBSTRING_SIZE
+    accum = 0;
+    accumed = 0;
+#else
+    accum <<= HWLOC_BITMAP_SUBSTRING_SIZE; /* bring the next group into the highest bits */
+    accumed -= HWLOC_BITMAP_SUBSTRING_SIZE;
+#endif
+
+    if (res >= size) /* truncated: advance only by what fits, ret keeps the full length */
+      res = size>0 ? size - 1 : 0;
+
+    tmp += res;
+    size -= res;
+  }
+
+  /* if didn't display anything, display 0x0 */
+  if (!ret) {
+    res = hwloc_snprintf(tmp, size, "0x0");
+    if (res < 0)
+      return -1;
+    ret += res;
+  }
+
+  return ret;
+}
+
+int hwloc_bitmap_asprintf(char ** strp, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{ /* Print set into a newly malloc'ed string stored in *strp. Returns its length, or -1 with *strp set to NULL if the length cannot be computed or memory runs out. */
+  int len;
+  char *buf;
+
+  HWLOC__BITMAP_CHECK(set);
+  len = hwloc_bitmap_snprintf(NULL, 0, set);
+  *strp = buf = len < 0 ? NULL : malloc(len+1);
+  if (!buf)
+    return -1; /* do not let the printing routine write through a NULL buffer */
+  return hwloc_bitmap_snprintf(buf, len+1, set);
+}
+
+int hwloc_bitmap_sscanf(struct hwloc_bitmap_s *set, const char * __hwloc_restrict string)
+{ /* Parse what hwloc_bitmap_snprintf() prints (comma-separated hexadecimal groups, optional "0xf...f" prefix) into set. Returns 0, or -1 with set zeroed if the string cannot be parsed. */
+  const char * current = string;
+  unsigned long accum = 0;
+  int count=1; /* number of comma-separated substrings, at least one */
+  int infinite = 0;
+
+  /* count the commas (one in first position is not counted); never search beyond the end of an empty string */
+  while (*current != '\0' && (current = strchr(current+1, ',')) != NULL)
+    count++;
+
+  current = string;
+  if (!strncmp("0xf...f", current, 7)) {
+    current += 7;
+    if (*current != ',') {
+      /* special case for infinite/full bitmap */
+      hwloc_bitmap_fill(set);
+      return 0;
+    }
+    current++;
+    infinite = 1;
+    count--;
+  }
+
+  hwloc_bitmap_reset_by_ulongs(set, (count + HWLOC_BITMAP_STRING_PER_LONG - 1) / HWLOC_BITMAP_STRING_PER_LONG);
+  set->infinite = 0;
+
+  while (*current != '\0') {
+    unsigned long val;
+    char *next;
+    val = strtoul(current, &next, 16);
+
+    if (count <= 0)
+	goto failed; /* more substrings than counted, i.e. the string starts with a comma */
+    count--;
+    accum |= (val << ((count * HWLOC_BITMAP_SUBSTRING_SIZE) % HWLOC_BITS_PER_LONG));
+    if (!(count % HWLOC_BITMAP_STRING_PER_LONG)) {
+      set->ulongs[count / HWLOC_BITMAP_STRING_PER_LONG] = accum;
+      accum = 0;
+    }
+
+    if (*next != ',') {
+      if (*next || count > 0)
+	goto failed;
+      else
+	break;
+    }
+    current = (const char*) next+1;
+  }
+  if (count > 0)
+	goto failed; /* string ended early (empty string, trailing comma): some ulongs were never written */
+
+  set->infinite = infinite; /* set at the end, to avoid spurious realloc with filled new ulongs */
+  return 0;
+
+ failed:
+  /* failure to parse */
+  hwloc_bitmap_zero(set);
+  return -1;
+}
+
+int hwloc_bitmap_list_snprintf(char * __hwloc_restrict buf, size_t buflen, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{ /* Print set as a comma-separated list of indexes and ranges ("1,3-5,8-"). Returns the length of the complete string even if buf is too small, or -1 on allocation or printing failure. */
+  int prev = -1;
+  hwloc_bitmap_t reverse;
+  ssize_t size = buflen;
+  char *tmp = buf;
+  int res, ret = 0;
+  int needcomma = 0;
+
+  HWLOC__BITMAP_CHECK(set);
+
+  /* mark the end in case we do nothing later */
+  if (buflen > 0)
+    tmp[0] = '\0';
+
+  /* the complement of set tells where each run of consecutive set bits ends */
+  reverse = hwloc_bitmap_alloc(); /* FIXME: add hwloc_bitmap_alloc_size() + hwloc_bitmap_init_allocated() to avoid malloc? */
+  if (!reverse)
+    return -1;
+  hwloc_bitmap_not(reverse, set);
+
+  while (1) {
+    int begin, end;
+
+    begin = hwloc_bitmap_next(set, prev);
+    if (begin == -1)
+      break;
+    end = hwloc_bitmap_next(reverse, begin);
+
+    if (end == begin+1) {
+      res = hwloc_snprintf(tmp, size, needcomma ? ",%d" : "%d", begin);
+    } else if (end == -1) {
+      res = hwloc_snprintf(tmp, size, needcomma ? ",%d-" : "%d-", begin);
+    } else {
+      res = hwloc_snprintf(tmp, size, needcomma ? ",%d-%d" : "%d-%d", begin, end-1);
+    }
+    if (res < 0) {
+      hwloc_bitmap_free(reverse);
+      return -1;
+    }
+    ret += res;
+    if (res >= size) /* truncated: advance only by what fits, ret keeps the full length */
+      res = size>0 ? size - 1 : 0;
+    tmp += res;
+    size -= res;
+    needcomma = 1;
+
+    if (end == -1)
+      break;
+    else
+      prev = end - 1;
+  }
+
+  hwloc_bitmap_free(reverse);
+  return ret;
+}
+
+int hwloc_bitmap_list_asprintf(char ** strp, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{ /* Print set in list format into a newly malloc'ed string stored in *strp. Returns its length, or -1 with *strp set to NULL if the length cannot be computed or memory runs out. */
+  int len;
+  char *buf;
+
+  HWLOC__BITMAP_CHECK(set);
+  len = hwloc_bitmap_list_snprintf(NULL, 0, set);
+  *strp = buf = len < 0 ? NULL : malloc(len+1);
+  if (!buf)
+    return -1; /* do not let the printing routine write through a NULL buffer */
+  return hwloc_bitmap_list_snprintf(buf, len+1, set);
+}
+
+int hwloc_bitmap_list_sscanf(struct hwloc_bitmap_s *set, const char * __hwloc_restrict string)
+{ /* Parse a comma-separated list of indexes and ranges ("1,3-5,8-") into set. Returns 0, or -1 with set zeroed when a number is expected but no digit is found. */
+  const char * current = string;
+  char *next;
+  long begin = -1, val; /* begin != -1 while the first bound of a range is waiting for its end */
+
+  hwloc_bitmap_zero(set);
+
+  while (*current != '\0') {
+
+    /* ignore empty ranges */
+    while (*current == ',')
+      current++;
+
+    val = strtoul(current, &next, 0); /* base 0: decimal, or hexadecimal/octal with the usual 0x/0 prefixes */
+    /* make sure we got at least one digit */
+    if (next == current)
+      goto failed;
+
+    if (begin != -1) {
+      /* finishing a range */
+      hwloc_bitmap_set_range(set, begin, val);
+      begin = -1;
+
+    } else if (*next == '-') {
+      /* starting a new range */
+      if (*(next+1) == '\0') {
+	/* infinite range */
+	hwloc_bitmap_set_range(set, val, -1);
+        break;
+      } else {
+	/* normal range */
+	begin = val;
+      }
+
+    } else if (*next == ',' || *next == '\0') {
+      /* single index */
+      hwloc_bitmap_set(set, val);
+    } /* NOTE(review): a number followed by any other character is dropped and that character skipped, without error - confirm this leniency is intended */
+
+    if (*next == '\0')
+      break;
+    current = next+1; /* skip the separator that ended the number */
+  }
+
+  return 0;
+
+ failed:
+  /* failure to parse */
+  hwloc_bitmap_zero(set);
+  return -1;
+}
+
+int hwloc_bitmap_taskset_snprintf(char * __hwloc_restrict buf, size_t buflen, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{ /* Print set as a single hexadecimal number ("0x..."), after "0xf...f" if infinite. Returns the length of the complete string even if buf is too small, or -1 if hwloc_snprintf() fails. */
+  ssize_t size = buflen; /* room left in buf */
+  char *tmp = buf; /* where the next piece is written */
+  int res, ret = 0; /* ret accumulates the length of the complete string */
+  int started = 0; /* set once something was printed: the following ulongs are then printed with all their digits */
+  int i;
+
+  HWLOC__BITMAP_CHECK(set);
+
+  /* mark the end in case we do nothing later */
+  if (buflen > 0)
+    tmp[0] = '\0';
+
+  if (set->infinite) {
+    res = hwloc_snprintf(tmp, size, "0xf...f");
+    started = 1;
+    if (res < 0)
+      return -1;
+    ret += res;
+    if (res >= size) /* truncated: advance only by what fits, ret keeps the full length */
+      res = size>0 ? size - 1 : 0;
+    tmp += res;
+    size -= res;
+  }
+
+  i=set->ulongs_count-1;
+
+  if (set->infinite) {
+    /* ignore starting FULL since we have 0xf...f already */
+    while (i>=0 && set->ulongs[i] == HWLOC_SUBBITMAP_FULL)
+      i--;
+  } else {
+    /* ignore starting ZERO except the last one */
+    while (i>=1 && set->ulongs[i] == HWLOC_SUBBITMAP_ZERO)
+      i--;
+  }
+
+  while (i>=0) {
+    unsigned long val = set->ulongs[i--];
+    if (started) {
+      /* print the whole subset */
+#if HWLOC_BITS_PER_LONG == 64
+      res = hwloc_snprintf(tmp, size, "%016lx", val);
+#else
+      res = hwloc_snprintf(tmp, size, "%08lx", val);
+#endif
+    } else if (val || i == -1) {
+      res = hwloc_snprintf(tmp, size, "0x%lx", val); /* first printed ulong: prefix and no zero padding */
+      started = 1;
+    } else {
+      res = 0;
+    }
+    if (res < 0)
+      return -1;
+    ret += res;
+    if (res >= size) /* truncated: advance only by what fits, ret keeps the full length */
+      res = size>0 ? size - 1 : 0;
+    tmp += res;
+    size -= res;
+  }
+
+  /* if didn't display anything, display 0x0 */
+  if (!ret) {
+    res = hwloc_snprintf(tmp, size, "0x0");
+    if (res < 0)
+      return -1;
+    ret += res;
+  }
+
+  return ret;
+}
+
+int hwloc_bitmap_taskset_asprintf(char ** strp, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{ /* Print set in taskset format into a newly malloc'ed string stored in *strp. Returns its length, or -1 with *strp set to NULL if the length cannot be computed or memory runs out. */
+  int len;
+  char *buf;
+
+  HWLOC__BITMAP_CHECK(set);
+  len = hwloc_bitmap_taskset_snprintf(NULL, 0, set);
+  *strp = buf = len < 0 ? NULL : malloc(len+1);
+  if (!buf)
+    return -1; /* do not let the printing routine write through a NULL buffer */
+  return hwloc_bitmap_taskset_snprintf(buf, len+1, set);
+}
+
+int hwloc_bitmap_taskset_sscanf(struct hwloc_bitmap_s *set, const char * __hwloc_restrict string)
+{ /* Parse a single hexadecimal number ("0x..." or "0xf...f..." for an infinite bitmap) into set. Returns 0, or -1 with set zeroed if a chunk is not entirely consumed by strtoul(). */
+  const char * current = string;
+  int chars; /* characters that remain to be parsed */
+  int count; /* ulongs that remain to be filled, the string starts with the highest one */
+  int infinite = 0;
+
+  current = string;
+  if (!strncmp("0xf...f", current, 7)) {
+    /* infinite bitmap */
+    infinite = 1;
+    current += 7;
+    if (*current == '\0') {
+      /* special case for infinite/full bitmap */
+      hwloc_bitmap_fill(set);
+      return 0;
+    }
+  } else {
+    /* finite bitmap */
+    if (!strncmp("0x", current, 2))
+      current += 2;
+    if (*current == '\0') {
+      /* special case for empty bitmap */
+      hwloc_bitmap_zero(set);
+      return 0;
+    }
+  }
+  /* we know there are other characters now */
+
+  chars = strlen(current);
+  count = (chars * 4 + HWLOC_BITS_PER_LONG - 1) / HWLOC_BITS_PER_LONG; /* 4 bits per character, rounded up to entire ulongs */
+
+  hwloc_bitmap_reset_by_ulongs(set, count);
+  set->infinite = 0;
+
+  while (*current != '\0') {
+    int tmpchars;
+    char ustr[17]; /* one chunk of at most HWLOC_BITS_PER_LONG/4 characters plus the ending \0; enough as long as a ulong has at most 64 bits */
+    unsigned long val;
+    char *next;
+
+    tmpchars = chars % (HWLOC_BITS_PER_LONG/4); /* the leftover characters go first, so that all the next chunks are entire ulongs */
+    if (!tmpchars)
+      tmpchars = (HWLOC_BITS_PER_LONG/4);
+
+    memcpy(ustr, current, tmpchars);
+    ustr[tmpchars] = '\0';
+    val = strtoul(ustr, &next, 16);
+    if (*next != '\0')
+      goto failed;
+
+    set->ulongs[count-1] = val;
+
+    current += tmpchars;
+    chars -= tmpchars;
+    count--;
+  }
+
+  set->infinite = infinite; /* set at the end, to avoid spurious realloc with filled new ulongs */
+
+  return 0;
+
+ failed:
+  /* failure to parse */
+  hwloc_bitmap_zero(set);
+  return -1;
+}
+
+static void hwloc_bitmap__zero(struct hwloc_bitmap_s *set)
+{ /* clear every valid ulong and the infinite tail, the size is left untouched */
+	unsigned remaining = set->ulongs_count;
+	while (remaining-- > 0)
+		set->ulongs[remaining] = HWLOC_SUBBITMAP_ZERO;
+	set->infinite = 0;
+}
+
+void hwloc_bitmap_zero(struct hwloc_bitmap_s * set)
+{ /* shrink to a single valid ulong, then empty it along with the infinite tail */
+	HWLOC__BITMAP_CHECK(set);
+	hwloc_bitmap_reset_by_ulongs(set, 1);
+	set->ulongs[0] = HWLOC_SUBBITMAP_ZERO;
+	set->infinite = 0;
+}
+
+static void hwloc_bitmap__fill(struct hwloc_bitmap_s * set)
+{ /* set every valid ulong and the infinite tail, the size is left untouched */
+	unsigned remaining = set->ulongs_count;
+	while (remaining-- > 0)
+		set->ulongs[remaining] = HWLOC_SUBBITMAP_FULL;
+	set->infinite = 1;
+}
+
+void hwloc_bitmap_fill(struct hwloc_bitmap_s * set)
+{ /* shrink to a single valid ulong, then fill it along with the infinite tail */
+	HWLOC__BITMAP_CHECK(set);
+	hwloc_bitmap_reset_by_ulongs(set, 1);
+	set->ulongs[0] = HWLOC_SUBBITMAP_FULL;
+	set->infinite = 1;
+}
+
+void hwloc_bitmap_from_ulong(struct hwloc_bitmap_s *set, unsigned long mask)
+{
+	/* the bitmap becomes exactly mask: one valid ulong and nothing set beyond it */
+	HWLOC__BITMAP_CHECK(set);
+	hwloc_bitmap_reset_by_ulongs(set, 1);
+	set->infinite = 0;
+	set->ulongs[0] = mask; /* index 0 always exists */
+}
+
+void hwloc_bitmap_from_ith_ulong(struct hwloc_bitmap_s *set, unsigned i, unsigned long mask)
+{
+	/* the bitmap becomes mask placed in the i-th ulong, everything else is cleared */
+	unsigned below = i;
+
+	HWLOC__BITMAP_CHECK(set);
+	hwloc_bitmap_reset_by_ulongs(set, i+1);
+	set->infinite = 0;
+	set->ulongs[i] = mask;
+	while (below-- > 0)
+		set->ulongs[below] = HWLOC_SUBBITMAP_ZERO;
+}
+
+unsigned long hwloc_bitmap_to_ulong(const struct hwloc_bitmap_s *set)
+{
+	HWLOC__BITMAP_CHECK(set);
+	/* return the first ulong of the bitmap; the other ulongs and the infinite flag are ignored */
+	return set->ulongs[0]; /* there's always at least one ulong allocated */
+}
+
+unsigned long hwloc_bitmap_to_ith_ulong(const struct hwloc_bitmap_s *set, unsigned i)
+{
+	HWLOC__BITMAP_CHECK(set);
+	/* a valid ulong is read directly, beyond them the value is implied by the infinite flag */
+	return i < set->ulongs_count ? set->ulongs[i] : (set->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO);
+}
+
+void hwloc_bitmap_only(struct hwloc_bitmap_s * set, unsigned cpu)
+{
+	/* empty the bitmap, sized so that the ulong holding cpu is the last valid one, then set that single bit */
+	const unsigned word = HWLOC_SUBBITMAP_INDEX(cpu);
+
+	HWLOC__BITMAP_CHECK(set);
+	hwloc_bitmap_reset_by_cpu_index(set, cpu);
+	hwloc_bitmap__zero(set);
+	set->ulongs[word] = HWLOC_SUBBITMAP_CPU(cpu);
+}
+
+void hwloc_bitmap_allbut(struct hwloc_bitmap_s * set, unsigned cpu)
+{
+	/* fill the bitmap, sized so that the ulong holding cpu is the last valid one, then clear that single bit */
+	const unsigned word = HWLOC_SUBBITMAP_INDEX(cpu);
+
+	HWLOC__BITMAP_CHECK(set);
+	hwloc_bitmap_reset_by_cpu_index(set, cpu);
+	hwloc_bitmap__fill(set);
+	set->ulongs[word] = ~HWLOC_SUBBITMAP_CPU(cpu);
+}
+
+void hwloc_bitmap_set(struct hwloc_bitmap_s * set, unsigned cpu)
+{
+	const unsigned word = HWLOC_SUBBITMAP_INDEX(cpu);
+
+	HWLOC__BITMAP_CHECK(set);
+
+	/* a bit located in the infinitely-set tail is already set, only touch the bitmap otherwise */
+	if (!set->infinite || cpu < set->ulongs_count * HWLOC_BITS_PER_LONG) {
+		/* the ulong must be valid before it is modified */
+		hwloc_bitmap_realloc_by_cpu_index(set, cpu);
+		set->ulongs[word] |= HWLOC_SUBBITMAP_CPU(cpu);
+	}
+}
+
+void hwloc_bitmap_set_range(struct hwloc_bitmap_s * set, unsigned begincpu, int _endcpu)
+{ /* Set the bits from begincpu to _endcpu (both included); _endcpu == -1 means up to infinity. */
+	unsigned i, beginset, endset;
+	unsigned endcpu = (unsigned) _endcpu; /* -1 becomes the largest unsigned, above any bit index */
+
+	HWLOC__BITMAP_CHECK(set);
+	if (endcpu < begincpu)
+		return;
+	if (set->infinite && begincpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+		return; /* the whole range is inside the already-set infinite part */
+	if (_endcpu == -1) {
+		/* infinite range: make the ulong holding begincpu valid BEFORE raising the infinite flag, so that new ulongs are zero-filled and bits below begincpu are not set by mistake */
+		hwloc_bitmap_realloc_by_cpu_index(set, begincpu);
+		beginset = HWLOC_SUBBITMAP_INDEX(begincpu);
+		set->ulongs[beginset] |= HWLOC_SUBBITMAP_ULBIT_FROM(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu));
+		for(i=beginset+1; i<set->ulongs_count; i++)
+			set->ulongs[i] = HWLOC_SUBBITMAP_FULL;
+		set->infinite = 1;
+		return;
+	}
+	/* finite range: ignore the part that overlaps the already-set infinite part */
+	if (set->infinite && endcpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+		endcpu = set->ulongs_count * HWLOC_BITS_PER_LONG - 1;
+	hwloc_bitmap_realloc_by_cpu_index(set, endcpu);
+	beginset = HWLOC_SUBBITMAP_INDEX(begincpu);
+	endset = HWLOC_SUBBITMAP_INDEX(endcpu);
+	for(i=beginset+1; i<endset; i++)
+		set->ulongs[i] = HWLOC_SUBBITMAP_FULL;
+	if (beginset == endset) {
+		set->ulongs[beginset] |= HWLOC_SUBBITMAP_ULBIT_FROMTO(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu), HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+	} else {
+		set->ulongs[beginset] |= HWLOC_SUBBITMAP_ULBIT_FROM(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu));
+		set->ulongs[endset] |= HWLOC_SUBBITMAP_ULBIT_TO(HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+	}
+}
+
+void hwloc_bitmap_set_ith_ulong(struct hwloc_bitmap_s *set, unsigned i, unsigned long mask)
+{
+	HWLOC__BITMAP_CHECK(set);
+	/* replace the i-th ulong with mask and keep the rest; the bitmap grows if that ulong was not valid yet */
+	hwloc_bitmap_realloc_by_ulongs(set, i+1);
+	set->ulongs[i] = mask;
+}
+
+void hwloc_bitmap_clr(struct hwloc_bitmap_s * set, unsigned cpu)
+{
+	const unsigned word = HWLOC_SUBBITMAP_INDEX(cpu);
+
+	HWLOC__BITMAP_CHECK(set);
+
+	/* a bit located in the infinitely-unset tail is already clear, only touch the bitmap otherwise */
+	if (set->infinite || cpu < set->ulongs_count * HWLOC_BITS_PER_LONG) {
+		/* the ulong must be valid before it is modified */
+		hwloc_bitmap_realloc_by_cpu_index(set, cpu);
+		set->ulongs[word] &= ~HWLOC_SUBBITMAP_CPU(cpu);
+	}
+}
+
+void hwloc_bitmap_clr_range(struct hwloc_bitmap_s * set, unsigned begincpu, int _endcpu)
+{ /* Clear the bits from begincpu to _endcpu (both included); _endcpu == -1 means up to infinity. */
+	unsigned i, beginset, endset;
+	unsigned endcpu = (unsigned) _endcpu; /* -1 becomes the largest unsigned, above any bit index */
+
+	HWLOC__BITMAP_CHECK(set);
+	if (endcpu < begincpu)
+		return;
+	if (!set->infinite && begincpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+		return; /* the whole range is inside the already-unset infinite part */
+	if (_endcpu == -1) {
+		/* infinite range: make the ulong holding begincpu valid BEFORE dropping the infinite flag, so that new ulongs are filled with ones and bits below begincpu are not cleared by mistake */
+		hwloc_bitmap_realloc_by_cpu_index(set, begincpu);
+		beginset = HWLOC_SUBBITMAP_INDEX(begincpu);
+		set->ulongs[beginset] &= ~HWLOC_SUBBITMAP_ULBIT_FROM(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu));
+		for(i=beginset+1; i<set->ulongs_count; i++)
+			set->ulongs[i] = HWLOC_SUBBITMAP_ZERO;
+		set->infinite = 0;
+		return;
+	}
+	/* finite range: ignore the part that overlaps the already-unset infinite part */
+	if (!set->infinite && endcpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+		endcpu = set->ulongs_count * HWLOC_BITS_PER_LONG - 1;
+	hwloc_bitmap_realloc_by_cpu_index(set, endcpu);
+	beginset = HWLOC_SUBBITMAP_INDEX(begincpu);
+	endset = HWLOC_SUBBITMAP_INDEX(endcpu);
+	for(i=beginset+1; i<endset; i++)
+		set->ulongs[i] = HWLOC_SUBBITMAP_ZERO;
+	if (beginset == endset) {
+		set->ulongs[beginset] &= ~HWLOC_SUBBITMAP_ULBIT_FROMTO(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu), HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+	} else {
+		set->ulongs[beginset] &= ~HWLOC_SUBBITMAP_ULBIT_FROM(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu));
+		set->ulongs[endset] &= ~HWLOC_SUBBITMAP_ULBIT_TO(HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+	}
+}
+
+int hwloc_bitmap_isset(const struct hwloc_bitmap_s * set, unsigned cpu)
+{
+	/* 1 if bit cpu is set, 0 otherwise; bits beyond the valid ulongs follow the infinite flag */
+	const unsigned word = HWLOC_SUBBITMAP_INDEX(cpu);
+
+	HWLOC__BITMAP_CHECK(set);
+	return (HWLOC_SUBBITMAP_READULONG(set, word) & HWLOC_SUBBITMAP_CPU(cpu)) ? 1 : 0;
+}
+
+int hwloc_bitmap_iszero(const struct hwloc_bitmap_s *set)
+{
+	/* 1 only if no valid ulong has a bit set and the infinite tail is unset */
+	unsigned word = 0;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	if (set->infinite)
+		return 0;
+	while (word < set->ulongs_count && set->ulongs[word] == HWLOC_SUBBITMAP_ZERO)
+		word++;
+	return word == set->ulongs_count;
+}
+
+int hwloc_bitmap_isfull(const struct hwloc_bitmap_s *set)
+{
+	/* 1 only if every valid ulong is entirely set and the infinite tail is set too */
+	unsigned word = 0;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	if (!set->infinite)
+		return 0;
+	while (word < set->ulongs_count && set->ulongs[word] == HWLOC_SUBBITMAP_FULL)
+		word++;
+	return word == set->ulongs_count;
+}
+
+int hwloc_bitmap_isequal (const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{
+	/* Two bitmaps are equal when all their bits match, including the bits
+	 * that are only implied by the infinite flags. Returns 1 or 0. */
+	const unsigned nr1 = set1->ulongs_count;
+	const unsigned nr2 = set2->ulongs_count;
+	const unsigned common = nr1 < nr2 ? nr1 : nr2;
+	const unsigned long tail1 = set1->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+	const unsigned long tail2 = set2->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+	unsigned w;
+
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	/* ulongs that are valid in both bitmaps */
+	for(w=0; w<common; w++)
+		if (set1->ulongs[w] != set2->ulongs[w])
+			return 0;
+
+	/* ulongs valid in set1 only are compared with what set2 implies there */
+	for(w=common; w<nr1; w++)
+		if (set1->ulongs[w] != tail2)
+			return 0;
+
+	/* and the other way around (at most one of these two loops iterates) */
+	for(w=common; w<nr2; w++)
+		if (set2->ulongs[w] != tail1)
+			return 0;
+
+	/* finally the infinite tails must agree */
+	return set1->infinite == set2->infinite;
+}
+
+int hwloc_bitmap_intersects (const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{
+	/* 1 if at least one bit is set in both bitmaps, 0 otherwise */
+	const unsigned nr1 = set1->ulongs_count;
+	const unsigned nr2 = set2->ulongs_count;
+	const unsigned common = nr1 < nr2 ? nr1 : nr2;
+	unsigned w;
+
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	/* ulongs that are valid on both sides */
+	for(w=0; w<common; w++)
+		if ((set1->ulongs[w] & set2->ulongs[w]) != 0)
+			return 1;
+
+	/* the extra ulongs of set1 meet the tail of set2
+	 * only if that tail is infinitely set */
+	if (set2->infinite)
+		for(w=common; w<nr1; w++)
+			if (set1->ulongs[w] != 0)
+				return 1;
+
+	/* same for the extra ulongs of set2 against the tail of set1 */
+	if (set1->infinite)
+		for(w=common; w<nr2; w++)
+			if (set2->ulongs[w] != 0)
+				return 1;
+
+	/* beyond all valid ulongs, only two infinitely-set tails can meet */
+	return set1->infinite && set2->infinite;
+}
+
+int hwloc_bitmap_isincluded (const struct hwloc_bitmap_s *sub_set, const struct hwloc_bitmap_s *super_set)
+{
+	/* 1 if every bit set in sub_set is also set in super_set, 0 otherwise */
+	const unsigned nr_super = super_set->ulongs_count;
+	const unsigned nr_sub = sub_set->ulongs_count;
+	const unsigned common = nr_super < nr_sub ? nr_super : nr_sub;
+	unsigned w;
+
+	HWLOC__BITMAP_CHECK(sub_set);
+	HWLOC__BITMAP_CHECK(super_set);
+
+	/* no common ulong of sub_set may have a bit that super_set lacks */
+	for(w=0; w<common; w++)
+		if ((sub_set->ulongs[w] & ~super_set->ulongs[w]) != 0)
+			return 0;
+
+	/* the extra ulongs of sub_set must be empty unless super_set is infinitely set there */
+	if (!super_set->infinite)
+		for(w=common; w<nr_sub; w++)
+			if (sub_set->ulongs[w] != 0)
+				return 0;
+
+	/* the extra ulongs of super_set must be full if sub_set is infinitely set there */
+	if (sub_set->infinite)
+		for(w=common; w<nr_super; w++)
+			if (super_set->ulongs[w] != HWLOC_SUBBITMAP_FULL)
+				return 0;
+	/* an infinitely-set sub_set only fits in an infinitely-set super_set */
+	return !sub_set->infinite || super_set->infinite;
+}
+
+void hwloc_bitmap_or (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{ /* res = set1 | set2 */
+	/* cache counts so that we can reset res even if it's also set1 or set2 */
+	unsigned count1 = set1->ulongs_count;
+	unsigned count2 = set2->ulongs_count;
+	unsigned max_count = count1 > count2 ? count1 : count2;
+	unsigned min_count = count1 + count2 - max_count; /* the smaller of the two counts */
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(res);
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	hwloc_bitmap_reset_by_ulongs(res, max_count);
+	/* ulongs that are valid in both operands */
+	for(i=0; i<min_count; i++)
+		res->ulongs[i] = set1->ulongs[i] | set2->ulongs[i];
+	/* ulongs that are valid in the longer operand only */
+	if (count1 != count2) {
+		if (min_count < count1) { /* set1 is the longer one */
+			if (set2->infinite) {
+				res->ulongs_count = min_count; /* set2 is all ones there: truncate, the infinite flag set below covers these bits */
+			} else {
+				for(i=min_count; i<max_count; i++)
+					res->ulongs[i] = set1->ulongs[i];
+			}
+		} else { /* set2 is the longer one */
+			if (set1->infinite) {
+				res->ulongs_count = min_count; /* same truncation, with set1 all ones there */
+			} else {
+				for(i=min_count; i<max_count; i++)
+					res->ulongs[i] = set2->ulongs[i];
+			}
+		}
+	}
+
+	res->infinite = set1->infinite || set2->infinite;
+}
+
+void hwloc_bitmap_and (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{ /* res = set1 & set2 */
+	/* cache counts so that we can reset res even if it's also set1 or set2 */
+	unsigned count1 = set1->ulongs_count;
+	unsigned count2 = set2->ulongs_count;
+	unsigned max_count = count1 > count2 ? count1 : count2;
+	unsigned min_count = count1 + count2 - max_count; /* the smaller of the two counts */
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(res);
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	hwloc_bitmap_reset_by_ulongs(res, max_count);
+	/* ulongs that are valid in both operands */
+	for(i=0; i<min_count; i++)
+		res->ulongs[i] = set1->ulongs[i] & set2->ulongs[i];
+	/* ulongs that are valid in the longer operand only */
+	if (count1 != count2) {
+		if (min_count < count1) { /* set1 is the longer one */
+			if (set2->infinite) {
+				for(i=min_count; i<max_count; i++)
+					res->ulongs[i] = set1->ulongs[i];
+			} else {
+				res->ulongs_count = min_count; /* set2 is all zeroes there: truncate, the infinite flag cleared below covers these bits */
+			}
+		} else { /* set2 is the longer one */
+			if (set1->infinite) {
+				for(i=min_count; i<max_count; i++)
+					res->ulongs[i] = set2->ulongs[i];
+			} else {
+				res->ulongs_count = min_count; /* same truncation, with set1 all zeroes there */
+			}
+		}
+	}
+
+	res->infinite = set1->infinite && set2->infinite;
+}
+
+void hwloc_bitmap_andnot (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{ /* res = set1 & ~set2 */
+	/* cache counts so that we can reset res even if it's also set1 or set2 */
+	unsigned count1 = set1->ulongs_count;
+	unsigned count2 = set2->ulongs_count;
+	unsigned max_count = count1 > count2 ? count1 : count2;
+	unsigned min_count = count1 + count2 - max_count; /* the smaller of the two counts */
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(res);
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	hwloc_bitmap_reset_by_ulongs(res, max_count);
+	/* ulongs that are valid in both operands */
+	for(i=0; i<min_count; i++)
+		res->ulongs[i] = set1->ulongs[i] & ~set2->ulongs[i];
+	/* ulongs that are valid in the longer operand only */
+	if (count1 != count2) {
+		if (min_count < count1) { /* set1 is the longer one */
+			if (!set2->infinite) {
+				for(i=min_count; i<max_count; i++)
+					res->ulongs[i] = set1->ulongs[i];
+			} else {
+				res->ulongs_count = min_count; /* set2 is all ones there, nothing of set1 remains: truncate, the infinite flag cleared below covers these bits */
+			}
+		} else { /* set2 is the longer one */
+			if (set1->infinite) {
+				for(i=min_count; i<max_count; i++)
+					res->ulongs[i] = ~set2->ulongs[i];
+			} else {
+				res->ulongs_count = min_count; /* set1 is all zeroes there: truncate, the infinite flag cleared below covers these bits */
+			}
+		}
+	}
+
+	res->infinite = set1->infinite && !set2->infinite;
+}
+
+void hwloc_bitmap_xor (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{ /* res = set1 ^ set2 */
+	/* cache counts so that we can reset res even if it's also set1 or set2 */
+	unsigned count1 = set1->ulongs_count;
+	unsigned count2 = set2->ulongs_count;
+	unsigned max_count = count1 > count2 ? count1 : count2;
+	unsigned min_count = count1 + count2 - max_count; /* the smaller of the two counts */
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(res);
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	hwloc_bitmap_reset_by_ulongs(res, max_count);
+	/* ulongs that are valid in both operands */
+	for(i=0; i<min_count; i++)
+		res->ulongs[i] = set1->ulongs[i] ^ set2->ulongs[i];
+	/* ulongs that are valid in the longer operand only: xor them with what the infinite flag of the shorter one implies */
+	if (count1 != count2) {
+		if (min_count < count1) { /* set1 is the longer one */
+			unsigned long w2 = set2->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+			for(i=min_count; i<max_count; i++)
+				res->ulongs[i] = set1->ulongs[i] ^ w2;
+		} else { /* set2 is the longer one */
+			unsigned long w1 = set1->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+			for(i=min_count; i<max_count; i++)
+				res->ulongs[i] = set2->ulongs[i] ^ w1;
+		}
+	}
+
+	res->infinite = (!set1->infinite) != (!set2->infinite); /* the tail is set when exactly one operand is infinitely set */
+}
+
+void hwloc_bitmap_not (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set)
+{
+	/* res becomes the complement of set, infinite tail included */
+	const unsigned nr_words = set->ulongs_count;
+	unsigned w;
+
+	HWLOC__BITMAP_CHECK(res);
+	HWLOC__BITMAP_CHECK(set);
+
+	hwloc_bitmap_reset_by_ulongs(res, nr_words);
+	for(w=nr_words; w-- > 0; )
+		res->ulongs[w] = ~set->ulongs[w];
+
+	res->infinite = !set->infinite;
+}
+
+int hwloc_bitmap_first(const struct hwloc_bitmap_s * set)
+{
+	/* Index of the lowest set bit, or -1 if the bitmap is empty. */
+	unsigned word = 0;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	/* skip the empty ulongs, then let ffsl find the bit inside the first non-empty one */
+	while (word < set->ulongs_count && !set->ulongs[word])
+		word++;
+	if (word < set->ulongs_count)
+		return hwloc_ffsl(set->ulongs[word]) - 1 + HWLOC_BITS_PER_LONG*word;
+
+	/* nothing in the valid ulongs: an infinitely-set tail starts right after them */
+	if (set->infinite)
+		return set->ulongs_count * HWLOC_BITS_PER_LONG;
+	return -1;
+}
+
+int hwloc_bitmap_last(const struct hwloc_bitmap_s * set)
+{
+	/* Index of the highest set bit; -1 if the bitmap is empty or infinitely set. */
+	int word;
+
+	HWLOC__BITMAP_CHECK(set);
+	if (set->infinite)
+		return -1;
+
+	/* walk down to the highest non-empty ulong, then let flsl find the bit inside it */
+	word = set->ulongs_count-1;
+	while (word >= 0 && !set->ulongs[word])
+		word--;
+
+	if (word < 0)
+		return -1;
+	return hwloc_flsl(set->ulongs[word]) - 1 + HWLOC_BITS_PER_LONG*word;
+}
+
+/* Return the index of the first bit set after prev_cpu, or -1 if there is none.
+ * Passing prev_cpu == -1 starts the search at bit 0.
+ */
+int hwloc_bitmap_next(const struct hwloc_bitmap_s * set, int prev_cpu)
+{
+	unsigned idx = HWLOC_SUBBITMAP_INDEX(prev_cpu + 1);
+
+	HWLOC__BITMAP_CHECK(set);
+
+	/* the candidate bit lies beyond the stored words */
+	if (idx >= set->ulongs_count)
+		return set->infinite ? prev_cpu + 1 : -1;
+
+	while (idx < set->ulongs_count) {
+		/* words are unsigned longs, hence ffsl */
+		unsigned long word = set->ulongs[idx];
+
+		/* when prev_cpu lives in this very word, mask out the previous cpus */
+		if (prev_cpu >= 0 && HWLOC_SUBBITMAP_INDEX((unsigned) prev_cpu) == idx)
+			word &= ~HWLOC_SUBBITMAP_ULBIT_TO(HWLOC_SUBBITMAP_CPU_ULBIT(prev_cpu));
+
+		if (word != 0)
+			return hwloc_ffsl(word) - 1 + HWLOC_BITS_PER_LONG*idx;
+
+		idx++;
+	}
+
+	/* stored words exhausted: an infinite bitmap continues right after them */
+	if (!set->infinite)
+		return -1;
+
+	return set->ulongs_count * HWLOC_BITS_PER_LONG;
+}
+
+/* Reduce the bitmap to its lowest set bit only; an empty bitmap stays empty.
+ * The infinite flag is always cleared on return.
+ */
+void hwloc_bitmap_singlify(struct hwloc_bitmap_s * set)
+{
+	unsigned idx;
+	int kept = 0;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	for(idx=0; idx<set->ulongs_count; idx++) {
+		unsigned long word;
+		int lowest;
+
+		/* a bit was already kept: clear everything after it */
+		if (kept) {
+			set->ulongs[idx] = HWLOC_SUBBITMAP_ZERO;
+			continue;
+		}
+
+		/* words are unsigned longs, hence ffsl */
+		word = set->ulongs[idx];
+		if (!word)
+			continue;
+
+		lowest = hwloc_ffsl(word);
+		set->ulongs[idx] = HWLOC_SUBBITMAP_CPU(lowest-1);
+		kept = 1;
+	}
+
+	if (!set->infinite)
+		return;
+
+	if (kept) {
+		set->infinite = 0;
+		return;
+	}
+
+	/* no stored bit was set: keep the first bit located after the stored words */
+	{
+		unsigned first = set->ulongs_count * HWLOC_BITS_PER_LONG;
+		set->infinite = 0; /* do not let realloc fill the newly allocated sets */
+		hwloc_bitmap_set(set, first);
+	}
+}
+
+/* Compare two bitmaps according to their first (lowest) set bit.
+ *
+ * The words stored by both bitmaps are scanned from index 0 up; the first
+ * index where either word is non-zero decides the result.
+ * If the counts differ, the extra words of the longer bitmap are then
+ * examined against the infinite flag of the shorter one.
+ * Finally, the infinite flags alone decide.
+ */
+int hwloc_bitmap_compare_first(const struct hwloc_bitmap_s * set1, const struct hwloc_bitmap_s * set2)
+{
+	unsigned count1 = set1->ulongs_count;
+	unsigned count2 = set2->ulongs_count;
+	unsigned max_count = count1 > count2 ? count1 : count2;
+	unsigned min_count = count1 + count2 - max_count;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	/* words stored by both bitmaps */
+	for(i=0; i<min_count; i++) {
+		unsigned long w1 = set1->ulongs[i];
+		unsigned long w2 = set2->ulongs[i];
+		if (w1 || w2) {
+			int _ffs1 = hwloc_ffsl(w1);
+			int _ffs2 = hwloc_ffsl(w2);
+			/* if both have a bit set, compare for real */
+			if (_ffs1 && _ffs2)
+				return _ffs1-_ffs2;
+			/* one is empty, and it is considered higher, so reverse-compare them */
+			return _ffs2-_ffs1;
+		}
+	}
+
+	if (count1 != count2) {
+		if (min_count < count2) {
+			/* set2 stores more words than set1 */
+			for(i=min_count; i<count2; i++) {
+				unsigned long w2 = set2->ulongs[i];
+				if (set1->infinite)
+					/* 0 if set2 has the lowest bit of this word set too, -1 otherwise */
+					return -!(w2 & 1);
+				else if (w2)
+					return 1;
+			}
+		} else {
+			/* set1 stores more words than set2 */
+			for(i=min_count; i<count1; i++) {
+				unsigned long w1 = set1->ulongs[i];
+				if (set2->infinite)
+					/* 0 if set1 has the lowest bit of this word set too, 1 otherwise */
+					return !(w1 & 1);
+				else if (w1)
+					return -1;
+			}
+		}
+	}
+
+	/* NOTE(review): this returns +1 when only set1 is infinite, whereas the loops
+	 * above return a negative value when set1 owns the lower first bit; the sign
+	 * convention looks inconsistent -- confirm against the public API contract.
+	 */
+	return !!set1->infinite - !!set2->infinite;
+}
+
+/* Compare two bitmaps starting from their highest bits.
+ * Returns -1, 0 or 1; a bitmap whose infinite flag is set while the other's
+ * is not always compares as the greater one.
+ */
+int hwloc_bitmap_compare(const struct hwloc_bitmap_s * set1, const struct hwloc_bitmap_s * set2)
+{
+	unsigned count1 = set1->ulongs_count;
+	unsigned count2 = set2->ulongs_count;
+	unsigned max_count = count1 > count2 ? count1 : count2;
+	unsigned min_count = count1 + count2 - max_count;
+	int idx;
+
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	/* differing infinite flags settle the comparison immediately */
+	if ((!set1->infinite) != (!set2->infinite))
+		return !!set1->infinite - !!set2->infinite;
+
+	/* words stored by the longer bitmap only, highest first;
+	 * the shorter bitmap contributes its infinite fill for them */
+	if (count1 != count2) {
+		const int set2_is_longer = min_count < count2;
+		const struct hwloc_bitmap_s *shorter = set2_is_longer ? set1 : set2;
+		const unsigned long fill = shorter->infinite ? HWLOC_SUBBITMAP_FULL :  HWLOC_SUBBITMAP_ZERO;
+
+		for(idx=max_count-1; idx>=(signed) min_count; idx--) {
+			unsigned long word1 = set2_is_longer ? fill : set1->ulongs[idx];
+			unsigned long word2 = set2_is_longer ? set2->ulongs[idx] : fill;
+			if (word1 != word2)
+				return word1 < word2 ? -1 : 1;
+		}
+	}
+
+	/* words stored by both bitmaps, highest first */
+	for(idx=min_count-1; idx>=0; idx--) {
+		unsigned long word1 = set1->ulongs[idx];
+		unsigned long word2 = set2->ulongs[idx];
+		if (word1 != word2)
+			return word1 < word2 ? -1 : 1;
+	}
+
+	return 0;
+}
+
+/* Return the number of bits set in the bitmap, or -1 if it is infinitely set. */
+int hwloc_bitmap_weight(const struct hwloc_bitmap_s * set)
+{
+	unsigned idx = 0;
+	int total = 0;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	/* an infinite bitmap cannot be counted */
+	if (set->infinite)
+		return -1;
+
+	while (idx < set->ulongs_count) {
+		total += hwloc_weight_long(set->ulongs[idx]);
+		idx++;
+	}
+
+	return total;
+}
+
+/* Classify how set1 relates to set2.
+ *
+ * Returns one of HWLOC_BITMAP_EQUAL, HWLOC_BITMAP_INCLUDED (set1 within set2),
+ * HWLOC_BITMAP_CONTAINS (set2 within set1), HWLOC_BITMAP_INTERSECTS or
+ * HWLOC_BITMAP_DIFFERENT. Two empty bitmaps are reported as equal.
+ *
+ * The words are visited in increasing index order while "result" holds the
+ * relation established so far; empty1/empty2 remember whether any non-zero
+ * word of set1/set2 has been seen yet. HWLOC_BITMAP_INTERSECTS is returned as
+ * soon as it is established since no later word can change it.
+ */
+int hwloc_bitmap_compare_inclusion(const struct hwloc_bitmap_s * set1, const struct hwloc_bitmap_s * set2)
+{
+	unsigned max_count = set1->ulongs_count > set2->ulongs_count ? set1->ulongs_count : set2->ulongs_count;
+	int result = HWLOC_BITMAP_EQUAL; /* means empty sets return equal */
+	int empty1 = 1;
+	int empty2 = 1;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	for(i=0; i<max_count; i++) {
+	  /* presumably READULONG yields the infinite fill past the stored words -- macro defined outside this chunk, confirm */
+	  unsigned long val1 = HWLOC_SUBBITMAP_READULONG(set1, (unsigned) i);
+	  unsigned long val2 = HWLOC_SUBBITMAP_READULONG(set2, (unsigned) i);
+
+	  if (!val1) {
+	    if (!val2)
+	      /* both empty, no change */
+	      continue;
+
+	    /* val1 empty, val2 not */
+	    if (result == HWLOC_BITMAP_CONTAINS) {
+	      if (!empty2)
+		return HWLOC_BITMAP_INTERSECTS;
+	      result = HWLOC_BITMAP_DIFFERENT;
+	    } else if (result == HWLOC_BITMAP_EQUAL) {
+	      result = HWLOC_BITMAP_INCLUDED;
+	    }
+	    /* no change otherwise */
+
+	  } else if (!val2) {
+	    /* val2 empty, val1 not */
+	    if (result == HWLOC_BITMAP_INCLUDED) {
+	      if (!empty1)
+		return HWLOC_BITMAP_INTERSECTS;
+	      result = HWLOC_BITMAP_DIFFERENT;
+	    } else if (result == HWLOC_BITMAP_EQUAL) {
+	      result = HWLOC_BITMAP_CONTAINS;
+	    }
+	    /* no change otherwise */
+
+	  } else if (val1 == val2) {
+	    /* equal and not empty */
+	    if (result == HWLOC_BITMAP_DIFFERENT)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    /* equal/contains/included unchanged */
+
+	  } else if ((val1 & val2) == val1) {
+	    /* included and not empty */
+	    if (result == HWLOC_BITMAP_CONTAINS || result == HWLOC_BITMAP_DIFFERENT)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    /* equal/included unchanged */
+	    result = HWLOC_BITMAP_INCLUDED;
+
+	  } else if ((val1 & val2) == val2) {
+	    /* contains and not empty */
+	    if (result == HWLOC_BITMAP_INCLUDED || result == HWLOC_BITMAP_DIFFERENT)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    /* equal/contains unchanged */
+	    result = HWLOC_BITMAP_CONTAINS;
+
+	  } else if ((val1 & val2) != 0) {
+	    /* intersects and not empty */
+	    return HWLOC_BITMAP_INTERSECTS;
+
+	  } else {
+	    /* different and not empty */
+
+	    /* equal/included/contains with non-empty sets means intersects */
+	    if (result == HWLOC_BITMAP_EQUAL && !empty1 /* implies !empty2 */)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    if (result == HWLOC_BITMAP_INCLUDED && !empty1)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    if (result == HWLOC_BITMAP_CONTAINS && !empty2)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    /* otherwise means different */
+	    result = HWLOC_BITMAP_DIFFERENT;
+	  }
+
+	  /* remember whether a non-zero word was seen in each set (skipped when both words are zero) */
+	  empty1 &= !val1;
+	  empty2 &= !val2;
+	}
+
+	/* finally account for the infinite parts, treated like one more word */
+	if (!set1->infinite) {
+	  if (set2->infinite) {
+	    /* set2 infinite only */
+	    if (result == HWLOC_BITMAP_CONTAINS) {
+	      if (!empty2)
+		return HWLOC_BITMAP_INTERSECTS;
+	      result = HWLOC_BITMAP_DIFFERENT;
+	    } else if (result == HWLOC_BITMAP_EQUAL) {
+	      result = HWLOC_BITMAP_INCLUDED;
+	    }
+	    /* no change otherwise */
+	  }
+	} else if (!set2->infinite) {
+	  /* set1 infinite only */
+	  if (result == HWLOC_BITMAP_INCLUDED) {
+	    if (!empty1)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    result = HWLOC_BITMAP_DIFFERENT;
+	  } else if (result == HWLOC_BITMAP_EQUAL) {
+	    result = HWLOC_BITMAP_CONTAINS;
+	  }
+	  /* no change otherwise */
+	} else {
+	  /* both infinite */
+	  if (result == HWLOC_BITMAP_DIFFERENT)
+	    return HWLOC_BITMAP_INTERSECTS;
+	  /* equal/contains/included unchanged */
+	}
+
+	return result;
+}
diff --git a/ext/hwloc/hwloc/components.c b/ext/hwloc/hwloc/components.c
new file mode 100644
index 0000000..7aa3b9d
--- /dev/null
+++ b/ext/hwloc/hwloc/components.c
@@ -0,0 +1,792 @@
+/*
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2012 Université Bordeau 1
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/xml.h>
+
+/* Syntax of the HWLOC_COMPONENTS environment variable (parsed in
+ * hwloc_disc_components_enable_others()):
+ * - names are separated by one of the HWLOC_COMPONENT_SEPS characters,
+ * - a name prefixed with HWLOC_COMPONENT_EXCLUDE_CHAR excludes that component,
+ * - HWLOC_COMPONENT_STOP_NAME prevents the remaining components from being tried.
+ * Component names containing these reserved characters, or equal to the stop
+ * name, are rejected at registration.
+ */
+#define HWLOC_COMPONENT_STOP_NAME "stop"
+#define HWLOC_COMPONENT_EXCLUDE_CHAR '-'
+#define HWLOC_COMPONENT_SEPS ","
+
+/* list of all registered discovery components, sorted by priority, higher priority first.
+ * noos is last because its priority is 0.
+ * others' priority is 10.
+ */
+static struct hwloc_disc_component * hwloc_disc_components = NULL;
+
+static unsigned hwloc_components_users = 0; /* first one initializes, last one destroys */
+
+/* verbosity levels, read from the HWLOC_COMPONENTS_VERBOSE and
+ * HWLOC_PLUGINS_VERBOSE environment variables respectively */
+static int hwloc_components_verbose = 0;
+#ifdef HWLOC_HAVE_PLUGINS
+static int hwloc_plugins_verbose = 0;
+#endif
+
+/* hwloc_components_mutex serializes:
+ * - loading/unloading plugins, and modifications of the hwloc_plugins list
+ * - calls to ltdl, including in hwloc_check_plugin_namespace()
+ * - registration of components with hwloc_disc_component_register()
+ *   and hwloc_xml_callbacks_register()
+ *
+ * HWLOC_COMPONENTS_LOCK() acquires it and HWLOC_COMPONENTS_UNLOCK() releases it.
+ * Compilation fails if neither implementation below is available.
+ */
+#ifdef HWLOC_WIN_SYS
+/* Basic mutex on top of InterlockedCompareExchange() on windows,
+ * Far from perfect, but easy to maintain, and way enough given that this code will never be needed for real. */
+#include <windows.h>
+static LONG hwloc_components_mutex = 0;
+/* spin until the value is switched from 0 to 1, yielding the processor between attempts */
+#define HWLOC_COMPONENTS_LOCK() do {						\
+  while (InterlockedCompareExchange(&hwloc_components_mutex, 1, 0) != 0)	\
+    SwitchToThread();								\
+} while (0)
+/* NOTE(review): the release is a plain store, not an Interlocked operation -- confirm this is sufficient */
+#define HWLOC_COMPONENTS_UNLOCK() do {						\
+  assert(hwloc_components_mutex == 1);						\
+  hwloc_components_mutex = 0;							\
+} while (0)
+
+#elif defined HWLOC_HAVE_PTHREAD_MUTEX
+/* pthread mutex if available (except on windows) */
+#include <pthread.h>
+static pthread_mutex_t hwloc_components_mutex = PTHREAD_MUTEX_INITIALIZER;
+#define HWLOC_COMPONENTS_LOCK() pthread_mutex_lock(&hwloc_components_mutex)
+#define HWLOC_COMPONENTS_UNLOCK() pthread_mutex_unlock(&hwloc_components_mutex)
+
+#else /* HWLOC_WIN_SYS || HWLOC_HAVE_PTHREAD_MUTEX */
+#error No mutex implementation available
+#endif
+
+
+#ifdef HWLOC_HAVE_PLUGINS
+
+#include <ltdl.h>
+
+/* singly-linked list of descriptors of dynamically loaded plugins,
+ * in the order they were found by hwloc__dlforeach_cb() */
+static struct hwloc__plugin_desc {
+  char *name; /* copy of the basename of the plugin file, owned by the descriptor */
+  struct hwloc_component *component; /* component structure found in the plugin */
+  char *filename; /* copy of the filename given by ltdl, owned by the descriptor */
+  lt_dlhandle handle; /* ltdl handle, closed in hwloc_plugins_exit() */
+  struct hwloc__plugin_desc *next; /* next descriptor, NULL for the last one */
+} *hwloc_plugins = NULL;
+
+/* Callback invoked by lt_dlforeachfile() for each candidate plugin file.
+ *
+ * Opens the file, looks up the "<basename>_component" symbol, checks its ABI,
+ * checks that the file name prefix matches the component type, and appends a
+ * descriptor to the hwloc_plugins list.
+ * Any failure only causes this file to be ignored (and its handle closed).
+ *
+ * filename: path of the candidate, without extension handling (lt_dlopenext() adds it).
+ * _data: unused.
+ * Always returns 0 so that the iteration goes on with the next file.
+ */
+static int
+hwloc__dlforeach_cb(const char *filename, void *_data __hwloc_attribute_unused)
+{
+  const char *basename;
+  lt_dlhandle handle;
+  char *componentsymbolname = NULL;
+  struct hwloc_component *component;
+  struct hwloc__plugin_desc *desc, **prevdesc;
+
+  if (hwloc_plugins_verbose)
+    fprintf(stderr, "Plugin dlforeach found `%s'\n", filename);
+
+  basename = strrchr(filename, '/');
+  if (!basename)
+    basename = filename;
+  else
+    basename++;
+
+  /* dlopen and get the component structure */
+  handle = lt_dlopenext(filename);
+  if (!handle) {
+    if (hwloc_plugins_verbose)
+      fprintf(stderr, "Failed to load plugin: %s\n", lt_dlerror());
+    goto out;
+  }
+  /* room for the "_component" suffix (10 characters) and the ending \0 */
+  componentsymbolname = malloc(strlen(basename)+10+1);
+  if (!componentsymbolname) {
+    if (hwloc_plugins_verbose)
+      fprintf(stderr, "Failed to allocate component symbol name for `%s'\n", basename);
+    goto out_with_handle;
+  }
+  sprintf(componentsymbolname, "%s_component", basename);
+  component = lt_dlsym(handle, componentsymbolname);
+  if (!component) {
+    if (hwloc_plugins_verbose)
+      fprintf(stderr, "Failed to find component symbol `%s'\n",
+	      componentsymbolname);
+    goto out_with_handle;
+  }
+  if (component->abi != HWLOC_COMPONENT_ABI) {
+    if (hwloc_plugins_verbose)
+      fprintf(stderr, "Plugin symbol ABI %u instead of %u\n",
+	      component->abi, HWLOC_COMPONENT_ABI);
+    goto out_with_handle;
+  }
+  if (hwloc_plugins_verbose)
+    fprintf(stderr, "Plugin contains expected symbol `%s'\n",
+	    componentsymbolname);
+  free(componentsymbolname);
+  componentsymbolname = NULL;
+
+  /* the file name prefix must match the component type */
+  if (HWLOC_COMPONENT_TYPE_DISC == component->type) {
+    if (strncmp(basename, "hwloc_", 6)) {
+      if (hwloc_plugins_verbose)
+	fprintf(stderr, "Plugin name `%s' doesn't match its type DISCOVERY\n", basename);
+      goto out_with_handle;
+    }
+  } else if (HWLOC_COMPONENT_TYPE_XML == component->type) {
+    if (strncmp(basename, "hwloc_xml_", 10)) {
+      if (hwloc_plugins_verbose)
+	fprintf(stderr, "Plugin name `%s' doesn't match its type XML\n", basename);
+      goto out_with_handle;
+    }
+  } else {
+    if (hwloc_plugins_verbose)
+      fprintf(stderr, "Plugin name `%s' has invalid type %u\n",
+	      basename, (unsigned) component->type);
+    goto out_with_handle;
+  }
+
+  /* allocate a plugin_desc and queue it */
+  desc = malloc(sizeof(*desc));
+  if (!desc)
+    goto out_with_handle;
+  desc->name = strdup(basename);
+  desc->filename = strdup(filename);
+  if (!desc->name || !desc->filename) {
+    /* never queue a descriptor whose strings would later be printed or used as NULL */
+    if (hwloc_plugins_verbose)
+      fprintf(stderr, "Failed to allocate plugin descriptor for `%s'\n", basename);
+    free(desc->name);
+    free(desc->filename);
+    free(desc);
+    goto out_with_handle;
+  }
+  desc->component = component;
+  desc->handle = handle;
+  desc->next = NULL;
+  if (hwloc_plugins_verbose)
+    fprintf(stderr, "Plugin descriptor `%s' ready\n", basename);
+
+  /* append to the list */
+  prevdesc = &hwloc_plugins;
+  while (*prevdesc)
+    prevdesc = &((*prevdesc)->next);
+  *prevdesc = desc;
+  if (hwloc_plugins_verbose)
+    fprintf(stderr, "Plugin descriptor `%s' queued\n", basename);
+  return 0;
+
+ out_with_handle:
+  lt_dlclose(handle);
+  free(componentsymbolname); /* NULL if already freed */
+ out:
+  return 0;
+}
+
+/* Close every loaded plugin, release the descriptor list and shut ltdl down. */
+static void
+hwloc_plugins_exit(void)
+{
+  struct hwloc__plugin_desc *cur;
+
+  if (hwloc_plugins_verbose)
+    fprintf(stderr, "Closing all plugins\n");
+
+  /* release the descriptors one at a time, saving the link before freeing */
+  cur = hwloc_plugins;
+  while (cur != NULL) {
+    struct hwloc__plugin_desc *following = cur->next;
+    lt_dlclose(cur->handle);
+    free(cur->name);
+    free(cur->filename);
+    free(cur);
+    cur = following;
+  }
+  hwloc_plugins = NULL;
+
+  lt_dlexit();
+}
+
+/* Initialize ltdl and look for plugins.
+ *
+ * The verbosity is read from the HWLOC_PLUGINS_VERBOSE environment variable.
+ * Plugins are searched in HWLOC_PLUGINS_PATH, which the environment variable
+ * of the same name overrides.
+ *
+ * Returns 0 on success, -1 if ltdl could not be initialized or if the
+ * iteration over plugin files failed (plugins are then all closed again).
+ */
+static int
+hwloc_plugins_init(void)
+{
+  const char *verboseenv;
+  /* const because it may designate the getenv() result, which must not be modified */
+  const char *path = HWLOC_PLUGINS_PATH;
+  const char *env;
+  int err;
+
+  verboseenv = getenv("HWLOC_PLUGINS_VERBOSE");
+  hwloc_plugins_verbose = verboseenv ? atoi(verboseenv) : 0;
+
+  err = lt_dlinit();
+  if (err)
+    goto out;
+
+  env = getenv("HWLOC_PLUGINS_PATH");
+  if (env)
+    path = env;
+
+  hwloc_plugins = NULL;
+
+  if (hwloc_plugins_verbose)
+    fprintf(stderr, "Starting plugin dlforeach in %s\n", path);
+  err = lt_dlforeachfile(path, hwloc__dlforeach_cb, NULL);
+  if (err)
+    goto out_with_init;
+
+  return 0;
+
+ out_with_init:
+  hwloc_plugins_exit();
+ out:
+  return -1;
+}
+
+#endif /* HWLOC_HAVE_PLUGINS */
+
+/* Return a printable name for a discovery component type, for verbose messages. */
+static const char *
+hwloc_disc_component_type_string(hwloc_disc_component_type_t type)
+{
+  if (type == HWLOC_DISC_COMPONENT_TYPE_CPU)
+    return "cpu";
+  if (type == HWLOC_DISC_COMPONENT_TYPE_GLOBAL)
+    return "global";
+  if (type == HWLOC_DISC_COMPONENT_TYPE_MISC)
+    return "misc";
+  return "**unknown**";
+}
+
+/* Register a discovery component in the priority-sorted hwloc_disc_components list.
+ *
+ * component: the component to insert; its name must not be the reserved stop
+ *   name nor contain the exclude or separator characters, and its type must be
+ *   one of the known discovery component types.
+ * filename: file of the plugin providing the component, or NULL for a
+ *   statically built component (only used in verbose messages).
+ *
+ * If a component with the same name is already registered, only the one with
+ * the highest priority is kept (the existing one wins ties).
+ *
+ * Returns 0 if the component was inserted, -1 if it was rejected.
+ */
+static int
+hwloc_disc_component_register(struct hwloc_disc_component *component,
+			      const char *filename)
+{
+  struct hwloc_disc_component **prev;
+
+  /* check that the component name is valid */
+  if (!strcmp(component->name, HWLOC_COMPONENT_STOP_NAME)) {
+    if (hwloc_components_verbose)
+      fprintf(stderr, "Cannot register discovery component with reserved name `" HWLOC_COMPONENT_STOP_NAME "'\n");
+    return -1;
+  }
+  if (strchr(component->name, HWLOC_COMPONENT_EXCLUDE_CHAR)
+      || strcspn(component->name, HWLOC_COMPONENT_SEPS) != strlen(component->name)) {
+    if (hwloc_components_verbose)
+      fprintf(stderr, "Cannot register discovery component with name `%s' containing reserved characters `%c" HWLOC_COMPONENT_SEPS "'\n",
+	      component->name, HWLOC_COMPONENT_EXCLUDE_CHAR);
+    return -1;
+  }
+  /* check that the component type is valid */
+  switch ((unsigned) component->type) {
+  case HWLOC_DISC_COMPONENT_TYPE_CPU:
+  case HWLOC_DISC_COMPONENT_TYPE_GLOBAL:
+  case HWLOC_DISC_COMPONENT_TYPE_MISC:
+    break;
+  default:
+    fprintf(stderr, "Cannot register discovery component `%s' with unknown type %u\n",
+	    component->name, (unsigned) component->type);
+    return -1;
+  }
+
+  /* look for an already registered component with the same name */
+  prev = &hwloc_disc_components;
+  while (NULL != *prev) {
+    if (!strcmp((*prev)->name, component->name)) {
+      /* if two components have the same name, only keep the highest priority one */
+      if ((*prev)->priority < component->priority) {
+	/* drop the existing component */
+	if (hwloc_components_verbose)
+	  fprintf(stderr, "Dropping previously registered discovery component `%s', priority %u lower than new one %u\n",
+		  (*prev)->name, (*prev)->priority, component->priority);
+	*prev = (*prev)->next;
+	/* *prev now designates the successor, possibly NULL when the dropped
+	 * component was the last one: re-test it instead of stepping over it */
+	continue;
+      } else {
+	/* drop the new one */
+	if (hwloc_components_verbose)
+	  fprintf(stderr, "Ignoring new discovery component `%s', priority %u lower than previously registered one %u\n",
+		  component->name, component->priority, (*prev)->priority);
+	return -1;
+      }
+    }
+    prev = &((*prev)->next);
+  }
+  if (hwloc_components_verbose)
+    fprintf(stderr, "Registered %s discovery component `%s' with priority %u (%s%s)\n",
+	    hwloc_disc_component_type_string(component->type), component->name, component->priority,
+	    filename ? "from plugin " : "statically build", filename ? filename : "");
+
+  /* insert before the first component with a strictly lower priority */
+  prev = &hwloc_disc_components;
+  while (NULL != *prev) {
+    if ((*prev)->priority < component->priority)
+      break;
+    prev = &((*prev)->next);
+  }
+  component->next = *prev;
+  *prev = component;
+  return 0;
+}
+
+#include <static-components.h>
+
+/* finalize() callbacks of the successfully initialized components, in
+ * initialization order; filled in hwloc_components_init() and invoked in
+ * reverse order in hwloc_components_destroy_all() */
+static void (**hwloc_component_finalize_cbs)(unsigned long);
+/* number of callbacks currently stored in hwloc_component_finalize_cbs */
+static unsigned hwloc_component_finalize_cb_count;
+
+/* Take a reference on the components core and initialize it on first use.
+ *
+ * Only the first user (hwloc_components_users going from 0 to 1) actually
+ * loads plugins, initializes the static and plugin components, queues their
+ * finalize() callbacks and registers the discovery components; this is done
+ * with the components mutex held.
+ * In all cases, the topology list of backends is reset to empty.
+ */
+void
+hwloc_components_init(struct hwloc_topology *topology __hwloc_attribute_unused)
+{
+#ifdef HWLOC_HAVE_PLUGINS
+  struct hwloc__plugin_desc *desc;
+#endif
+  const char *verboseenv;
+  unsigned i;
+
+  HWLOC_COMPONENTS_LOCK();
+  /* make sure the users counter cannot wrap around */
+  assert((unsigned) -1 != hwloc_components_users);
+  if (0 != hwloc_components_users++) {
+    /* somebody else already initialized everything */
+    HWLOC_COMPONENTS_UNLOCK();
+    goto ok;
+  }
+
+  verboseenv = getenv("HWLOC_COMPONENTS_VERBOSE");
+  hwloc_components_verbose = verboseenv ? atoi(verboseenv) : 0;
+
+#ifdef HWLOC_HAVE_PLUGINS
+  /* the return value is ignored: a failure leaves the plugin list empty */
+  hwloc_plugins_init();
+#endif
+
+  hwloc_component_finalize_cbs = NULL;
+  hwloc_component_finalize_cb_count = 0;
+  /* count the max number of finalize callbacks */
+  for(i=0; NULL != hwloc_static_components[i]; i++)
+    hwloc_component_finalize_cb_count++;
+#ifdef HWLOC_HAVE_PLUGINS
+  for(desc = hwloc_plugins; NULL != desc; desc = desc->next)
+    hwloc_component_finalize_cb_count++;
+#endif
+  if (hwloc_component_finalize_cb_count) {
+    hwloc_component_finalize_cbs = calloc(hwloc_component_finalize_cb_count,
+					  sizeof(*hwloc_component_finalize_cbs));
+    /* NOTE(review): allocation failure is only caught by this assert, which disappears with NDEBUG */
+    assert(hwloc_component_finalize_cbs);
+    /* forget that max number and recompute the real one below */
+    hwloc_component_finalize_cb_count = 0;
+  }
+
+  /* hwloc_static_components is created by configure in static-components.h */
+  for(i=0; NULL != hwloc_static_components[i]; i++) {
+    if (hwloc_static_components[i]->flags) {
+      fprintf(stderr, "Ignoring static component with invalid flags %lx\n",
+	      hwloc_static_components[i]->flags);
+      continue;
+    }
+
+    /* initialize the component */
+    if (hwloc_static_components[i]->init && hwloc_static_components[i]->init(0) < 0) {
+      if (hwloc_components_verbose)
+	fprintf(stderr, "Ignoring static component, failed to initialize\n");
+      continue;
+    }
+    /* queue ->finalize() callback if any */
+    if (hwloc_static_components[i]->finalize)
+      hwloc_component_finalize_cbs[hwloc_component_finalize_cb_count++] = hwloc_static_components[i]->finalize;
+
+    /* register for real now */
+    if (HWLOC_COMPONENT_TYPE_DISC == hwloc_static_components[i]->type)
+      hwloc_disc_component_register(hwloc_static_components[i]->data, NULL);
+    /*else if (HWLOC_COMPONENT_TYPE_XML == hwloc_static_components[i]->type)
+      hwloc_xml_callbacks_register(hwloc_static_components[i]->data);*/
+    else
+      /* XML registration is commented out above, so only discovery components are accepted here */
+      assert(0);
+  }
+
+  /* dynamic plugins */
+#ifdef HWLOC_HAVE_PLUGINS
+  for(desc = hwloc_plugins; NULL != desc; desc = desc->next) {
+    if (desc->component->flags) {
+      fprintf(stderr, "Ignoring plugin `%s' component with invalid flags %lx\n",
+	      desc->name, desc->component->flags);
+      continue;
+    }
+
+    /* initialize the component */
+    if (desc->component->init && desc->component->init(0) < 0) {
+      if (hwloc_components_verbose)
+	fprintf(stderr, "Ignoring plugin `%s', failed to initialize\n", desc->name);
+      continue;
+    }
+    /* queue ->finalize() callback if any */
+    if (desc->component->finalize)
+      hwloc_component_finalize_cbs[hwloc_component_finalize_cb_count++] = desc->component->finalize;
+
+    /* register for real now */
+    if (HWLOC_COMPONENT_TYPE_DISC == desc->component->type)
+      hwloc_disc_component_register(desc->component->data, desc->filename);
+    /*else if (HWLOC_COMPONENT_TYPE_XML == desc->component->type)
+      hwloc_xml_callbacks_register(desc->component->data);*/
+    else
+      /* XML registration is commented out above, so only discovery components are accepted here */
+      assert(0);
+  }
+#endif
+
+  HWLOC_COMPONENTS_UNLOCK();
+
+ ok:
+  /* no backend enabled yet for this topology */
+  topology->backends = NULL;
+}
+
+/* Return the first registered discovery component matching both the given
+ * type and name, or NULL if there is none.
+ */
+static struct hwloc_disc_component *
+hwloc_disc_component_find(int type /* hwloc_disc_component_type_t or -1 if any */,
+			       const char *name /* name or NULL if any */)
+{
+  struct hwloc_disc_component *cur;
+
+  for (cur = hwloc_disc_components; NULL != cur; cur = cur->next) {
+    /* skip components of another type unless any type is accepted */
+    if (-1 != type && type != (int) cur->type)
+      continue;
+    /* skip components with another name unless any name is accepted */
+    if (NULL != name && strcmp(name, cur->name))
+      continue;
+    return cur;
+  }
+  return NULL;
+}
+
+/* Force the first backend of a topology; used by set_xml(), set_synthetic(), ...
+ * and by environment variables.
+ *
+ * The first component matching type and name is instantiated with data1-3.
+ * Backends that were already enabled are disabled before the new one is enabled.
+ * Returns the result of hwloc_backend_enable(), or -1 with errno set to EBUSY
+ * if the topology is already loaded, with errno set to ENOSYS if no component
+ * matches, or -1 if the instantiation fails.
+ */
+int
+hwloc_disc_component_force_enable(struct hwloc_topology *topology,
+				  int envvar_forced,
+				  int type, const char *name,
+				  const void *data1, const void *data2, const void *data3)
+{
+  struct hwloc_disc_component *found;
+  struct hwloc_backend *created;
+
+  if (topology->is_loaded) {
+    errno = EBUSY;
+    return -1;
+  }
+
+  found = hwloc_disc_component_find(type, name);
+  if (NULL == found) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  created = found->instantiate(found, data1, data2, data3);
+  if (NULL == created)
+    return -1;
+
+  created->envvar_forced = envvar_forced;
+  /* the forced backend replaces whatever was enabled before */
+  if (topology->backends)
+    hwloc_backends_disable_all(topology);
+  return hwloc_backend_enable(topology, created);
+}
+
+/* Try to instantiate and enable one discovery component.
+ *
+ * comparg is passed to the component instantiate() callback.
+ * *excludes is the mask of component types excluded so far; the component is
+ * refused if its type is in there, and its own excludes are added on success.
+ * verbose_errors forces failure messages even when components are not verbose.
+ * Returns 0 on success, -1 otherwise.
+ */
+static int
+hwloc_disc_component_try_enable(struct hwloc_topology *topology,
+				struct hwloc_disc_component *comp,
+				const char *comparg,
+				unsigned *excludes,
+				int envvar_forced,
+				int verbose_errors)
+{
+  const int report = hwloc_components_verbose || verbose_errors;
+  struct hwloc_backend *created;
+
+  /* refuse components whose type conflicts with already enabled ones */
+  if ((*excludes) & comp->type) {
+    if (report)
+      fprintf(stderr, "Excluding %s discovery component `%s', conflicts with excludes 0x%x\n",
+	      hwloc_disc_component_type_string(comp->type), comp->name, *excludes);
+    return -1;
+  }
+
+  created = comp->instantiate(comp, comparg, NULL, NULL);
+  if (NULL == created) {
+    if (report)
+      fprintf(stderr, "Failed to instantiate discovery component `%s'\n", comp->name);
+    return -1;
+  }
+
+  created->envvar_forced = envvar_forced;
+  if (hwloc_backend_enable(topology, created) < 0)
+    return -1;
+
+  /* this component now restricts which other types may be enabled */
+  *excludes |= comp->excludes;
+  return 0;
+}
+
+/* Enable the discovery components that were not explicitly enabled yet.
+ *
+ * If the HWLOC_COMPONENTS environment variable is set, already enabled
+ * backends are disabled and the variable is parsed as a list of names
+ * separated by HWLOC_COMPONENT_SEPS:
+ * - a plain name enables the corresponding component,
+ * - HWLOC_COMPONENT_STOP_NAME ends the list and prevents the remaining
+ *   registered components from being tried,
+ * - a name prefixed with HWLOC_COMPONENT_EXCLUDE_CHAR excludes that component
+ *   from the default ones tried afterwards.
+ * Then, unless stopped, all registered components that are not excluded are
+ * tried in list order.
+ *
+ * The stop keyword and the excluded names must match entirely: a mere prefix
+ * (or an empty name after the exclude character) does not match anything.
+ */
+void
+hwloc_disc_components_enable_others(struct hwloc_topology *topology)
+{
+  struct hwloc_disc_component *comp;
+  struct hwloc_backend *backend;
+  unsigned excludes = 0;
+  int tryall = 1;
+  const char *_env;
+  char *env; /* we'll modify the env value, so duplicate it */
+
+  _env = getenv("HWLOC_COMPONENTS");
+  env = _env ? strdup(_env) : NULL;
+
+  /* compute current excludes */
+  backend = topology->backends;
+  while (backend) {
+    excludes |= backend->component->excludes;
+    backend = backend->next;
+  }
+
+  /* enable explicitly listed components */
+  if (env) {
+    char *curenv = env;
+    size_t s;
+
+    if (topology->backends) {
+      hwloc_backends_disable_all(topology);
+      excludes = 0;
+    }
+
+    while (*curenv) {
+      s = strcspn(curenv, HWLOC_COMPONENT_SEPS);
+      if (s) {
+	char c;
+
+	/* exclusions are handled in the second loop below */
+	if (curenv[0] == HWLOC_COMPONENT_EXCLUDE_CHAR)
+	  goto nextname;
+
+	/* only the entire stop keyword stops, not one of its prefixes */
+	if (s == strlen(HWLOC_COMPONENT_STOP_NAME)
+	    && !strncmp(curenv, HWLOC_COMPONENT_STOP_NAME, s)) {
+	  tryall = 0;
+	  break;
+	}
+
+	/* save the last char and replace with \0 */
+	c = curenv[s];
+	curenv[s] = '\0';
+
+	comp = hwloc_disc_component_find(-1, curenv);
+	if (comp) {
+	  hwloc_disc_component_try_enable(topology, comp, NULL, &excludes, 1 /* envvar forced */, 1 /* envvar forced need warnings */);
+	} else {
+	  fprintf(stderr, "Cannot find discovery component `%s'\n", curenv);
+	}
+
+	/* restore chars (the second loop below needs env to be unmodified) */
+	curenv[s] = c;
+      }
+
+nextname:
+      curenv += s;
+      if (*curenv)
+	/* Skip comma */
+	curenv++;
+    }
+  }
+
+  /* env is still the same, the above loop didn't modify it */
+
+  /* now enable remaining components (except the explicitly '-'-listed ones) */
+  if (tryall) {
+    comp = hwloc_disc_components;
+    while (NULL != comp) {
+      /* check if this component was explicitly excluded in env */
+      if (env) {
+	char *curenv = env;
+	while (*curenv) {
+	  size_t s = strcspn(curenv, HWLOC_COMPONENT_SEPS);
+	  /* the exclude character is tested first, hence s >= 1 when s-1 is computed;
+	   * the length test makes sure the whole name matches, not only a prefix */
+	  if (curenv[0] == HWLOC_COMPONENT_EXCLUDE_CHAR
+	      && s-1 == strlen(comp->name)
+	      && !strncmp(curenv+1, comp->name, s-1)) {
+	    if (hwloc_components_verbose)
+	      fprintf(stderr, "Excluding %s discovery component `%s' because of HWLOC_COMPONENTS environment variable\n",
+	    hwloc_disc_component_type_string(comp->type), comp->name);
+	    goto nextcomp;
+	  }
+	  curenv += s;
+	  if (*curenv)
+	    /* Skip comma */
+	    curenv++;
+	}
+      }
+      hwloc_disc_component_try_enable(topology, comp, NULL, &excludes, 0 /* defaults, not envvar forced */, 0 /* defaults don't need warnings on conflicts */);
+nextcomp:
+      comp = comp->next;
+    }
+  }
+
+  if (hwloc_components_verbose) {
+    /* print a summary */
+    int first = 1;
+    backend = topology->backends;
+    fprintf(stderr, "Final list of enabled discovery components: ");
+    while (backend != NULL) {
+      fprintf(stderr, "%s%s", first ? "" : ",", backend->component->name);
+      backend = backend->next;
+      first = 0;
+    }
+    fprintf(stderr, "\n");
+  }
+
+  if (env)
+    free(env);
+}
+
+/* Drop a reference on the components core and tear it down when the last
+ * user goes away: finalize callbacks are invoked in reverse queueing order,
+ * the list of discovery components is forgotten and plugins are closed.
+ */
+void
+hwloc_components_destroy_all(struct hwloc_topology *topology __hwloc_attribute_unused)
+{
+  unsigned remaining;
+
+  HWLOC_COMPONENTS_LOCK();
+  assert(0 != hwloc_components_users);
+  hwloc_components_users--;
+  if (0 != hwloc_components_users) {
+    /* other users remain, nothing to tear down yet */
+    HWLOC_COMPONENTS_UNLOCK();
+    return;
+  }
+
+  /* last queued, first finalized */
+  for(remaining=hwloc_component_finalize_cb_count; remaining>0; remaining--)
+    hwloc_component_finalize_cbs[remaining-1](0);
+  free(hwloc_component_finalize_cbs);
+  hwloc_component_finalize_cbs = NULL;
+  hwloc_component_finalize_cb_count = 0;
+
+  /* no need to unlink/free the list of components, they'll be unloaded below */
+
+  hwloc_disc_components = NULL;
+  /* hwloc_xml_callbacks_reset() is disabled here */
+
+#ifdef HWLOC_HAVE_PLUGINS
+  hwloc_plugins_exit();
+#endif
+
+  HWLOC_COMPONENTS_UNLOCK();
+}
+
+/* Allocate a backend attached to the given discovery component, with no
+ * callback, no flag, an undefined (-1) is_thissystem and not forced by envvar.
+ * Returns NULL with errno set to ENOMEM if the allocation fails.
+ */
+struct hwloc_backend *
+hwloc_backend_alloc(struct hwloc_disc_component *component)
+{
+  struct hwloc_backend * fresh;
+
+  fresh = malloc(sizeof(*fresh));
+  if (NULL == fresh) {
+    errno = ENOMEM;
+    return NULL;
+  }
+
+  /* identity and list linkage */
+  fresh->component = component;
+  fresh->next = NULL;
+
+  /* state */
+  fresh->flags = 0;
+  fresh->is_thissystem = -1;
+  fresh->envvar_forced = 0;
+
+  /* callbacks, to be filled by the component */
+  fresh->discover = NULL;
+  fresh->get_obj_cpuset = NULL;
+  fresh->notify_new_object = NULL;
+  fresh->disable = NULL;
+
+  return fresh;
+}
+
+/* Invoke the backend disable() callback, if it has one, and free the backend. */
+static void
+hwloc_backend_disable(struct hwloc_backend *backend)
+{
+  if (NULL != backend->disable) {
+    backend->disable(backend);
+  }
+  free(backend);
+}
+
+/* Append a backend at the end of the topology list of enabled backends.
+ *
+ * The backend is refused if it carries unknown flags, or if a backend of the
+ * same component is already enabled (errno is then set to EBUSY).
+ * Returns 0 on success. On failure, returns -1 and the backend is disabled
+ * and freed: the caller must not use it anymore in either case.
+ */
+int
+hwloc_backend_enable(struct hwloc_topology *topology, struct hwloc_backend *backend)
+{
+  struct hwloc_backend **pprev;
+
+  /* check backend flags */
+  if (backend->flags & (~(HWLOC_BACKEND_FLAG_NEED_LEVELS))) {
+    fprintf(stderr, "Cannot enable %s discovery component `%s' with unknown flags %lx\n",
+	    hwloc_disc_component_type_string(backend->component->type), backend->component->name, backend->flags);
+    /* release the backend like the duplicate case below does, callers drop their pointer on error */
+    hwloc_backend_disable(backend);
+    return -1;
+  }
+
+  /* make sure we didn't already enable this backend, we don't want duplicates */
+  pprev = &topology->backends;
+  while (NULL != *pprev) {
+    if ((*pprev)->component == backend->component) {
+      if (hwloc_components_verbose)
+	fprintf(stderr, "Cannot enable %s discovery component `%s' twice\n",
+		hwloc_disc_component_type_string(backend->component->type), backend->component->name);
+      hwloc_backend_disable(backend);
+      errno = EBUSY;
+      return -1;
+    }
+    pprev = &((*pprev)->next);
+  }
+
+  if (hwloc_components_verbose)
+    fprintf(stderr, "Enabling %s discovery component `%s'\n",
+	    hwloc_disc_component_type_string(backend->component->type), backend->component->name);
+
+  /* enqueue at the end: the loop above left pprev on the final NULL link */
+  backend->next = NULL;
+  *pprev = backend;
+
+  backend->topology = topology;
+
+  return 0;
+}
+
+/* Compute topology->is_thissystem from, in increasing order of precedence:
+ * the backends that were not forced by environment variables, the
+ * HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM topology flag, the backends forced by
+ * environment variables, and the HWLOC_THISSYSTEM environment variable.
+ */
+void
+hwloc_backends_is_thissystem(struct hwloc_topology *topology)
+{
+  struct hwloc_backend *cur;
+  const char *thissystem_env;
+
+  /* Apply is_thissystem topology flag before we enforce envvar backends.
+   * If the application changed the backend with set_foo(),
+   * it may use set_flags() to update the is_thissystem flag here.
+   * If it changes the backend with environment variables below,
+   * it may use HWLOC_THISSYSTEM envvar below as well.
+   */
+
+  topology->is_thissystem = 1;
+
+  /* apply thissystem from normally-given backends (envvar_forced=0, either set_foo() or defaults) */
+  for (cur = topology->backends; cur != NULL; cur = cur->next) {
+    if (cur->envvar_forced != 0 || cur->is_thissystem == -1)
+      continue;
+    assert(cur->is_thissystem == 0);
+    topology->is_thissystem = 0;
+  }
+
+  /* override set_foo() with flags */
+  if (topology->flags & HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM)
+    topology->is_thissystem = 1;
+
+  /* now apply envvar-forced backend (envvar_forced=1) */
+  for (cur = topology->backends; cur != NULL; cur = cur->next) {
+    if (cur->envvar_forced != 1 || cur->is_thissystem == -1)
+      continue;
+    assert(cur->is_thissystem == 0);
+    topology->is_thissystem = 0;
+  }
+
+  /* override with envvar-given flag */
+  thissystem_env = getenv("HWLOC_THISSYSTEM");
+  if (thissystem_env)
+    topology->is_thissystem = atoi(thissystem_env);
+}
+
+/* Ask the first enabled backend that provides a get_obj_cpuset() callback
+ * for the cpuset of obj, on behalf of the caller backend.
+ * Returns the callback result, or -1 if no backend provides that callback.
+ */
+int
+hwloc_backends_get_obj_cpuset(struct hwloc_backend *caller, struct hwloc_obj *obj, hwloc_bitmap_t cpuset)
+{
+  struct hwloc_backend *cur;
+
+  /* only the first backend with a callback is consulted */
+  for (cur = caller->topology->backends; cur != NULL; cur = cur->next) {
+    if (cur->get_obj_cpuset)
+      return cur->get_obj_cpuset(cur, caller, obj, cpuset);
+  }
+  return -1;
+}
+
+/* Notify every enabled backend except the caller, through its
+ * notify_new_object() callback if any, that the caller added obj.
+ * Returns the sum of the callbacks' return values.
+ */
+int
+hwloc_backends_notify_new_object(struct hwloc_backend *caller, struct hwloc_obj *obj)
+{
+  struct hwloc_backend *cur;
+  int total = 0;
+
+  for (cur = caller->topology->backends; NULL != cur; cur = cur->next) {
+    /* the caller does not get notified about its own object */
+    if (cur == caller || !cur->notify_new_object)
+      continue;
+    total += cur->notify_new_object(cur, caller, obj);
+  }
+
+  return total;
+}
+
+/* Disable and free every backend enabled in the topology, leaving its list empty. */
+void
+hwloc_backends_disable_all(struct hwloc_topology *topology)
+{
+  struct hwloc_backend *cur = topology->backends;
+
+  while (NULL != cur) {
+    /* save the link before the backend gets freed */
+    struct hwloc_backend *following = cur->next;
+    if (hwloc_components_verbose)
+      fprintf(stderr, "Disabling %s discovery component `%s'\n",
+	      hwloc_disc_component_type_string(cur->component->type), cur->component->name);
+    hwloc_backend_disable(cur);
+    /* keep the list head valid while backends are being removed */
+    topology->backends = following;
+    cur = following;
+  }
+  topology->backends = NULL;
+}
diff --git a/ext/hwloc/hwloc/diff.c b/ext/hwloc/hwloc/diff.c
new file mode 100644
index 0000000..ee401d2
--- /dev/null
+++ b/ext/hwloc/hwloc/diff.c
@@ -0,0 +1,426 @@
+/*
+ * Copyright © 2013-2015 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <private/private.h>
+#include <private/misc.h>
+
+/* Free a whole list of diff entries.
+ * For object-attribute entries of kind NAME or INFO, the three strings
+ * (name, old value, new value) are freed as well. The topology argument
+ * is not used. Always returns 0.
+ */
+int hwloc_topology_diff_destroy(hwloc_topology_t topology __hwloc_attribute_unused,
+				hwloc_topology_diff_t diff)
+{
+	while (diff) {
+		/* save the link before the entry goes away */
+		hwloc_topology_diff_t following = diff->generic.next;
+
+		if (diff->generic.type == HWLOC_TOPOLOGY_DIFF_OBJ_ATTR
+		    && (diff->obj_attr.diff.generic.type == HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME
+			|| diff->obj_attr.diff.generic.type == HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO)) {
+			/* only string-based attribute diffs own heap strings */
+			free(diff->obj_attr.diff.string.name);
+			free(diff->obj_attr.diff.string.oldvalue);
+			free(diff->obj_attr.diff.string.newvalue);
+		}
+
+		free(diff);
+		diff = following;
+	}
+	return 0;
+}
+
+/************************
+ * Computing diffs
+ */
+
+/* Append newdiff at the end of the singly-linked diff list described by
+ * *firstdiffp (head) and *lastdiffp (tail). *lastdiffp is only read when
+ * the list is not empty, so it may be uninitialized for an empty list.
+ */
+static void hwloc_append_diff(hwloc_topology_diff_t newdiff,
+			      hwloc_topology_diff_t *firstdiffp,
+			      hwloc_topology_diff_t *lastdiffp)
+{
+	if (*firstdiffp == NULL) {
+		/* empty list, the new entry becomes the head */
+		*firstdiffp = newdiff;
+	} else {
+		(*lastdiffp)->generic.next = newdiff;
+	}
+	*lastdiffp = newdiff;
+	newdiff->generic.next = NULL;
+}
+
+/* Append a TOO_COMPLEX entry that identifies obj1 by depth and logical index.
+ * Returns 0 on success, -1 if the entry could not be allocated.
+ */
+static int hwloc_append_diff_too_complex(hwloc_obj_t obj1,
+					 hwloc_topology_diff_t *firstdiffp,
+					 hwloc_topology_diff_t *lastdiffp)
+{
+	hwloc_topology_diff_t entry = malloc(sizeof(*entry));
+
+	if (entry == NULL)
+		return -1;
+
+	entry->too_complex.type = HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX;
+	entry->too_complex.obj_depth = obj1->depth;
+	entry->too_complex.obj_index = obj1->logical_index;
+
+	hwloc_append_diff(entry, firstdiffp, lastdiffp);
+	return 0;
+}
+
+/* Append an object-attribute diff that carries strings (NAME or INFO kinds).
+ *
+ * obj identifies the object (by depth and logical index), type is the
+ * attribute kind, name/oldvalue/newvalue are copied when non-NULL and
+ * stored as NULL otherwise.
+ *
+ * Returns 0 on success, -1 if the entry or one of the string copies could
+ * not be allocated; nothing is appended in that case.
+ */
+static int hwloc_append_diff_obj_attr_string(hwloc_obj_t obj,
+					     hwloc_topology_diff_obj_attr_type_t type,
+					     const char *name,
+					     const char *oldvalue,
+					     const char *newvalue,
+					     hwloc_topology_diff_t *firstdiffp,
+					     hwloc_topology_diff_t *lastdiffp)
+{
+	hwloc_topology_diff_t newdiff;
+	char *namecopy = NULL, *oldcopy = NULL, *newcopy = NULL;
+
+	/* a failed strdup() must not be mistaken for "no string given" */
+	if (name && !(namecopy = strdup(name)))
+		goto out_with_copies;
+	if (oldvalue && !(oldcopy = strdup(oldvalue)))
+		goto out_with_copies;
+	if (newvalue && !(newcopy = strdup(newvalue)))
+		goto out_with_copies;
+
+	newdiff = malloc(sizeof(*newdiff));
+	if (!newdiff)
+		goto out_with_copies;
+
+	newdiff->obj_attr.type = HWLOC_TOPOLOGY_DIFF_OBJ_ATTR;
+	newdiff->obj_attr.obj_depth = obj->depth;
+	newdiff->obj_attr.obj_index = obj->logical_index;
+	newdiff->obj_attr.diff.string.type = type;
+	newdiff->obj_attr.diff.string.name = namecopy;
+	newdiff->obj_attr.diff.string.oldvalue = oldcopy;
+	newdiff->obj_attr.diff.string.newvalue = newcopy;
+	hwloc_append_diff(newdiff, firstdiffp, lastdiffp);
+	return 0;
+
+out_with_copies:
+	free(namecopy);
+	free(oldcopy);
+	free(newcopy);
+	return -1;
+}
+
+/* Append an object-attribute diff that carries 64-bit integers.
+ * obj identifies the object (by depth and logical index), type is the
+ * attribute kind, idx/oldvalue/newvalue are stored as given.
+ * Returns 0 on success, -1 if the entry could not be allocated.
+ */
+static int hwloc_append_diff_obj_attr_uint64(hwloc_obj_t obj,
+					     hwloc_topology_diff_obj_attr_type_t type,
+					     hwloc_uint64_t idx,
+					     hwloc_uint64_t oldvalue,
+					     hwloc_uint64_t newvalue,
+					     hwloc_topology_diff_t *firstdiffp,
+					     hwloc_topology_diff_t *lastdiffp)
+{
+	hwloc_topology_diff_t entry = malloc(sizeof(*entry));
+
+	if (entry == NULL)
+		return -1;
+
+	/* which object is concerned */
+	entry->obj_attr.type = HWLOC_TOPOLOGY_DIFF_OBJ_ATTR;
+	entry->obj_attr.obj_depth = obj->depth;
+	entry->obj_attr.obj_index = obj->logical_index;
+
+	/* what changed */
+	entry->obj_attr.diff.uint64.type = type;
+	entry->obj_attr.diff.uint64.index = idx;
+	entry->obj_attr.diff.uint64.oldvalue = oldvalue;
+	entry->obj_attr.diff.uint64.newvalue = newvalue;
+
+	hwloc_append_diff(entry, firstdiffp, lastdiffp);
+	return 0;
+}
+
+/* Recursively compare obj1 (from topo1) with obj2 (from topo2) and append
+ * the differences to the list described by *firstdiffp / *lastdiffp.
+ *
+ * A different name, local memory size or info value is recorded as an
+ * object-attribute diff. Any other difference (depth, type, os_index,
+ * cpusets/nodesets, type-specific attributes, distances, info names or
+ * counts, number of children) makes the comparison of this object stop
+ * with a single TOO_COMPLEX entry for obj1.
+ *
+ * Returns 0 on success, including when a TOO_COMPLEX entry was appended,
+ * and a negative value when a diff entry could not be allocated.
+ */
+static int
+hwloc_diff_trees(hwloc_topology_t topo1, hwloc_obj_t obj1,
+		 hwloc_topology_t topo2, hwloc_obj_t obj2,
+		 unsigned flags,
+		 hwloc_topology_diff_t *firstdiffp, hwloc_topology_diff_t *lastdiffp)
+{
+	unsigned i;
+	int err;
+	hwloc_obj_t child1, child2;
+
+	if (obj1->depth != obj2->depth)
+		goto out_too_complex;
+	if (obj1->type != obj2->type)
+		goto out_too_complex;
+
+	if (obj1->os_index != obj2->os_index)
+		/* we could allow different os_index for non-PU non-NUMAnode objects
+		 * but it's likely useless anyway */
+		goto out_too_complex;
+
+	/* two sets differ if only one of them exists or if both exist with different contents */
+#define _SETS_DIFFERENT(_set1, _set2) \
+ (   ( !(_set1) != !(_set2) ) \
+  || ( (_set1) && !hwloc_bitmap_isequal(_set1, _set2) ) )
+#define SETS_DIFFERENT(_set, _obj1, _obj2) _SETS_DIFFERENT((_obj1)->_set, (_obj2)->_set)
+	if (SETS_DIFFERENT(cpuset, obj1, obj2)
+	    || SETS_DIFFERENT(complete_cpuset, obj1, obj2)
+	    || SETS_DIFFERENT(allowed_cpuset, obj1, obj2)
+	    || SETS_DIFFERENT(nodeset, obj1, obj2)
+	    || SETS_DIFFERENT(complete_nodeset, obj1, obj2)
+	    || SETS_DIFFERENT(allowed_nodeset, obj1, obj2))
+		goto out_too_complex;
+
+	/* no need to check logical_index, sibling_rank, symmetric_subtree,
+	 * the parents did it */
+
+	/* name: record a diff when only one side has a name or when both names differ */
+	if ((!obj1->name) != (!obj2->name)
+	    || (obj1->name && strcmp(obj1->name, obj2->name))) {
+		err = hwloc_append_diff_obj_attr_string(obj1,
+						       HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME,
+						       NULL,
+						       obj1->name,
+						       obj2->name,
+						       firstdiffp, lastdiffp);
+		if (err < 0)
+			return err;
+	}
+
+	/* memory */
+	if (obj1->memory.local_memory != obj2->memory.local_memory) {
+		err = hwloc_append_diff_obj_attr_uint64(obj1,
+						       HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE,
+						       0,
+						       obj1->memory.local_memory,
+						       obj2->memory.local_memory,
+						       firstdiffp, lastdiffp);
+		if (err < 0)
+			return err;
+	}
+	/* ignore memory page_types */
+
+	/* type-specific attrs, compared bytewise */
+	switch (obj1->type) {
+	default:
+		break;
+	case HWLOC_OBJ_CACHE:
+		if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->cache)))
+			goto out_too_complex;
+		break;
+	case HWLOC_OBJ_GROUP:
+		if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->group)))
+			goto out_too_complex;
+		break;
+	case HWLOC_OBJ_PCI_DEVICE:
+		if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->pcidev)))
+			goto out_too_complex;
+		break;
+	case HWLOC_OBJ_BRIDGE:
+		if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->bridge)))
+			goto out_too_complex;
+		break;
+	case HWLOC_OBJ_OS_DEVICE:
+		if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->osdev)))
+			goto out_too_complex;
+		break;
+	}
+
+	/* distances */
+	if (obj1->distances_count != obj2->distances_count)
+		goto out_too_complex;
+	for(i=0; i<obj1->distances_count; i++) {
+		struct hwloc_distances_s *d1 = obj1->distances[i], *d2 = obj2->distances[i];
+		if (d1->relative_depth != d2->relative_depth
+		    || d1->nbobjs != d2->nbobjs
+		    || d1->latency_max != d2->latency_max
+		    || d1->latency_base != d2->latency_base
+		    || memcmp(d1->latency, d2->latency, d1->nbobjs * d1->nbobjs * sizeof(*d1->latency)))
+			goto out_too_complex;
+	}
+
+	/* infos: names must match pairwise in order, only values may differ */
+	if (obj1->infos_count != obj2->infos_count)
+		goto out_too_complex;
+	for(i=0; i<obj1->infos_count; i++) {
+		if (strcmp(obj1->infos[i].name, obj2->infos[i].name))
+			goto out_too_complex;
+		if (strcmp(obj1->infos[i].value, obj2->infos[i].value)) {
+			err = hwloc_append_diff_obj_attr_string(obj1,
+							       HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO,
+							       obj1->infos[i].name,
+							       obj1->infos[i].value,
+							       obj2->infos[i].value,
+							       firstdiffp, lastdiffp);
+			if (err < 0)
+				return err;
+		}
+	}
+
+	/* ignore userdata */
+
+	/* children, compared pairwise in sibling order */
+	for(child1 = obj1->first_child, child2 = obj2->first_child;
+	    child1 != NULL && child2 != NULL;
+	    child1 = child1->next_sibling, child2 = child2->next_sibling) {
+		err = hwloc_diff_trees(topo1, child1,
+				       topo2, child2,
+				       flags,
+				       firstdiffp, lastdiffp);
+		if (err < 0)
+			return err;
+	}
+	/* one side has more children than the other */
+	if (child1 || child2)
+		goto out_too_complex;
+
+	/* I/O children */
+	for(child1 = obj1->io_first_child, child2 = obj2->io_first_child;
+	    child1 != NULL && child2 != NULL;
+	    child1 = child1->next_sibling, child2 = child2->next_sibling) {
+		err = hwloc_diff_trees(topo1, child1,
+				       topo2, child2,
+				       flags,
+				       firstdiffp, lastdiffp);
+		if (err < 0)
+			return err;
+	}
+	if (child1 || child2)
+		goto out_too_complex;
+
+	/* misc children */
+	for(child1 = obj1->misc_first_child, child2 = obj2->misc_first_child;
+	    child1 != NULL && child2 != NULL;
+	    child1 = child1->next_sibling, child2 = child2->next_sibling) {
+		err = hwloc_diff_trees(topo1, child1,
+				       topo2, child2,
+				       flags,
+				       firstdiffp, lastdiffp);
+		if (err < 0)
+			return err;
+	}
+	if (child1 || child2)
+		goto out_too_complex;
+
+	return 0;
+
+out_too_complex:
+	/* report an allocation failure instead of pretending the entry was recorded,
+	 * otherwise the caller could not tell that the diff list is incomplete */
+	return hwloc_append_diff_too_complex(obj1, firstdiffp, lastdiffp);
+}
+
+/* Build the list of differences between topo1 and topo2 and store its
+ * head in *diffp.
+ *
+ * No flag is supported: a non-zero flags value fails with -1 and errno
+ * set to EINVAL, and *diffp is left untouched.
+ *
+ * Returns the non-zero value reported by the tree comparison if it failed,
+ * 1 if the list contains at least one TOO_COMPLEX entry, 0 otherwise.
+ */
+int hwloc_topology_diff_build(hwloc_topology_t topo1,
+			      hwloc_topology_t topo2,
+			      unsigned long flags,
+			      hwloc_topology_diff_t *diffp)
+{
+	hwloc_topology_diff_t tail, cur;
+	int err;
+
+	if (flags != 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	*diffp = NULL;
+	err = hwloc_diff_trees(topo1, hwloc_get_root_obj(topo1),
+			       topo2, hwloc_get_root_obj(topo2),
+			       flags,
+			       diffp, &tail);
+	if (err)
+		return err;
+
+	/* tell the caller when part of the difference could not be expressed */
+	for (cur = *diffp; cur; cur = cur->generic.next)
+		if (cur->generic.type == HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX)
+			return 1;
+
+	return 0;
+}
+
+/********************
+ * Applying diffs
+ */
+
+/* Apply a single diff entry to the topology.
+ *
+ * Only object-attribute entries (SIZE, NAME, INFO) can be applied. With
+ * HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE in flags, the old and new values of
+ * the entry are swapped. The entry is only applied if the current value
+ * in the topology matches the expected old value.
+ *
+ * Returns 0 on success, -1 if the object is not found, the entry kind is
+ * not supported, the current value does not match, or memory for the new
+ * string cannot be allocated. The topology is left unchanged on failure.
+ */
+static int
+hwloc_apply_diff_one(hwloc_topology_t topology,
+		     hwloc_topology_diff_t diff,
+		     unsigned long flags)
+{
+	int reverse = !!(flags & HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE);
+
+	switch (diff->generic.type) {
+	case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR: {
+		struct hwloc_topology_diff_obj_attr_s *obj_attr = &diff->obj_attr;
+		hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, obj_attr->obj_depth, obj_attr->obj_index);
+		if (!obj)
+			return -1;
+
+		switch (obj_attr->diff.generic.type) {
+		case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE: {
+			hwloc_obj_t tmpobj;
+			hwloc_uint64_t oldvalue = reverse ? obj_attr->diff.uint64.newvalue : obj_attr->diff.uint64.oldvalue;
+			hwloc_uint64_t newvalue = reverse ? obj_attr->diff.uint64.oldvalue : obj_attr->diff.uint64.newvalue;
+			/* unsigned difference, wraps around when the size decreases
+			 * and still gives the right total after the addition below */
+			hwloc_uint64_t valuediff = newvalue - oldvalue;
+			if (obj->memory.local_memory != oldvalue)
+				return -1;
+			obj->memory.local_memory = newvalue;
+			/* propagate the change to the total memory of the object and all its ancestors */
+			tmpobj = obj;
+			while (tmpobj) {
+				tmpobj->memory.total_memory += valuediff;
+				tmpobj = tmpobj->parent;
+			}
+			break;
+		}
+		case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME: {
+			const char *oldvalue = reverse ? obj_attr->diff.string.newvalue : obj_attr->diff.string.oldvalue;
+			const char *newvalue = reverse ? obj_attr->diff.string.oldvalue : obj_attr->diff.string.newvalue;
+			char *newname = NULL;
+			/* either side of a name diff may be NULL (object without a name),
+			 * so compare NULL-ness before comparing contents */
+			if ((!obj->name) != (!oldvalue)
+			    || (obj->name && strcmp(obj->name, oldvalue)))
+				return -1;
+			/* copy first so that the old name survives an allocation failure */
+			if (newvalue) {
+				newname = strdup(newvalue);
+				if (!newname)
+					return -1;
+			}
+			free(obj->name);
+			obj->name = newname;
+			break;
+		}
+		case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO: {
+			const char *name = obj_attr->diff.string.name;
+			const char *oldvalue = reverse ? obj_attr->diff.string.newvalue : obj_attr->diff.string.oldvalue;
+			const char *newvalue = reverse ? obj_attr->diff.string.oldvalue : obj_attr->diff.string.newvalue;
+			unsigned i;
+			int found = 0;
+			/* an info diff without name or values cannot be matched against anything */
+			if (!name || !oldvalue || !newvalue)
+				return -1;
+			/* replace the first info whose name and current value both match */
+			for(i=0; i<obj->infos_count; i++) {
+				if (!strcmp(obj->infos[i].name, name)
+				    && !strcmp(obj->infos[i].value, oldvalue)) {
+					char *newinfo = strdup(newvalue);
+					if (!newinfo)
+						return -1;
+					free(obj->infos[i].value);
+					obj->infos[i].value = newinfo;
+					found = 1;
+					break;
+				}
+			}
+			if (!found)
+				return -1;
+			break;
+		}
+		default:
+			return -1;
+		}
+
+		break;
+	}
+	default:
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Apply every entry of a diff list to the topology, in list order.
+ *
+ * The only accepted flag is HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE; any other
+ * bit fails with -1 and errno set to EINVAL.
+ *
+ * If an entry cannot be applied, the entries applied before it are undone
+ * by applying them again with the REVERSE flag toggled, errno is set to
+ * EINVAL, and the negated 1-based index of the failing entry is returned.
+ * Returns 0 when every entry was applied.
+ */
+int hwloc_topology_diff_apply(hwloc_topology_t topology,
+			      hwloc_topology_diff_t diff,
+			      unsigned long flags)
+{
+	hwloc_topology_diff_t cur, failed;
+	int position = 0;
+
+	if (flags & ~HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	for (cur = diff; cur; cur = cur->generic.next) {
+		position++;
+		if (hwloc_apply_diff_one(topology, cur, flags) < 0)
+			break;
+	}
+	if (!cur)
+		/* reached the end of the list without failure */
+		return 0;
+
+	/* undo everything that precedes the failing entry */
+	failed = cur;
+	for (cur = diff; cur != failed; cur = cur->generic.next)
+		hwloc_apply_diff_one(topology, cur, flags ^ HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE);
+
+	errno = EINVAL;
+	return -position; /* return the index (starting at 1) of the first element that couldn't be applied */
+}
diff --git a/ext/hwloc/hwloc/distances.c b/ext/hwloc/hwloc/distances.c
new file mode 100644
index 0000000..51382b1
--- /dev/null
+++ b/ext/hwloc/hwloc/distances.c
@@ -0,0 +1,995 @@
+/*
+ * Copyright © 2010-2015 Inria.  All rights reserved.
+ * Copyright © 2011-2012 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+
+#include <float.h>
+#include <math.h>
+
+/**************************
+ * Main Init/Clear/Destroy
+ */
+
+/* called during topology init: start with an empty list of OS distances */
+void hwloc_distances_init(struct hwloc_topology *topology)
+{
+  topology->last_osdist = NULL;
+  topology->first_osdist = NULL;
+}
+
+/* called during topology destroy: free every element of the list of
+ * OS distances together with its indexes, objs and distances arrays,
+ * and leave the list empty.
+ */
+void hwloc_distances_destroy(struct hwloc_topology * topology)
+{
+  struct hwloc_os_distances_s *cur = topology->first_osdist;
+
+  while (cur != NULL) {
+    /* read the link before the element is freed */
+    struct hwloc_os_distances_s *following = cur->next;
+    free(cur->indexes);
+    free(cur->objs);
+    free(cur->distances);
+    free(cur);
+    cur = following;
+  }
+
+  topology->last_osdist = NULL;
+  topology->first_osdist = NULL;
+}
+
+/******************************************************
+ * Inserting distances in the topology
+ * from a backend, from the environment or by the user
+ */
+
+/* insert a distance matrix in the topology.
+ * the caller gives us those pointers, we take care of freeing them later and so on.
+ *
+ * Existing matrices for the same type are handled first:
+ * - if one of them is forced and the new one is not, the new arrays are
+ *   freed and nothing is inserted;
+ * - if the new one is forced, every existing matrix of that type is removed.
+ * When nbobjs is 0, nothing is inserted (this is how matrices get cleared).
+ * If the new list element cannot be allocated, the given arrays are freed
+ * and nothing is inserted.
+ */
+void hwloc_distances_set(hwloc_topology_t __hwloc_restrict topology, hwloc_obj_type_t type,
+			 unsigned nbobjs, unsigned *indexes, hwloc_obj_t *objs, float *distances,
+			 int force)
+{
+  struct hwloc_os_distances_s *osdist, *next = topology->first_osdist;
+  /* look for existing distances for the same type */
+  while ((osdist = next) != NULL) {
+    next = osdist->next;
+    if (osdist->type == type) {
+      if (osdist->forced && !force) {
+        /* there is a forced distance element, ignore the new non-forced one */
+        free(indexes);
+        free(objs);
+        free(distances);
+        return;
+      } else if (force) {
+        /* we're forcing a new distance, remove the old ones */
+        free(osdist->indexes);
+        free(osdist->objs);
+        free(osdist->distances);
+        /* remove current object */
+        if (osdist->prev)
+          osdist->prev->next = next;
+        else
+          topology->first_osdist = next;
+        if (next)
+          next->prev = osdist->prev;
+        else
+          topology->last_osdist = osdist->prev;
+        /* free current object */
+        free(osdist);
+      }
+    }
+  }
+
+  if (!nbobjs)
+    /* we're just clearing, return now */
+    return;
+
+  /* create the new element */
+  osdist = malloc(sizeof(struct hwloc_os_distances_s));
+  if (!osdist) {
+    /* we own the arrays, don't leak them if they cannot be stored */
+    free(indexes);
+    free(objs);
+    free(distances);
+    return;
+  }
+  osdist->nbobjs = nbobjs;
+  osdist->indexes = indexes;
+  osdist->objs = objs;
+  osdist->distances = distances;
+  osdist->forced = force;
+  osdist->type = type;
+  /* insert it at the end of the list */
+  osdist->next = NULL;
+  osdist->prev = topology->last_osdist;
+  if (topology->last_osdist)
+    topology->last_osdist->next = osdist;
+  else
+    topology->first_osdist = osdist;
+  topology->last_osdist = osdist;
+}
+
+/* make sure a user-given distance matrix is sane.
+ * The only check is that no value appears twice in indexes[0..nbobjs-1];
+ * the other arguments are unused.
+ * Returns 0 if the indexes are all different, -1 with errno set to EINVAL otherwise.
+ */
+static int hwloc_distances__check_matrix(hwloc_topology_t __hwloc_restrict topology __hwloc_attribute_unused, hwloc_obj_type_t type __hwloc_attribute_unused,
+					 unsigned nbobjs, unsigned *indexes, hwloc_obj_t *objs __hwloc_attribute_unused, float *distances __hwloc_attribute_unused)
+{
+  unsigned first, second;
+
+  /* compare every pair of indexes once */
+  for(first=0; first<nbobjs; first++) {
+    for(second=first+1; second<nbobjs; second++) {
+      if (indexes[first] != indexes[second])
+        continue;
+      errno = EINVAL;
+      return -1;
+    }
+  }
+  return 0;
+}
+
+/* Parse a distance matrix description (coming from an environment variable)
+ * and store it in the topology as a forced matrix for the given type.
+ *
+ * "none" clears the existing matrices of that type. Otherwise the string
+ * gives the OS indexes, either as a range "first-last" or as a comma-separated
+ * list, followed by a colon and either an explicit list of distances or a
+ * "X*Y" / "X*Y*Z" grouping from which a matrix is generated.
+ *
+ * Invalid strings are ignored: a message is printed on stderr for most of
+ * them (allocation failures are silent) and the topology is not modified.
+ */
+static void hwloc_distances__set_from_string(struct hwloc_topology *topology,
+					     hwloc_obj_type_t type, const char *string)
+{
+  /* the string format is: "index[0],...,index[N-1]:distance[0],...,distance[N*N-1]"
+   * or "index[0],...,index[N-1]:X*Y" or "index[0],...,index[N-1]:X*Y*Z"
+   */
+  const char *tmp = string, *next;
+  unsigned *indexes = NULL;
+  float *distances = NULL;
+  unsigned nbobjs = 0, i, j, x, y, z;
+
+  if (!strcmp(string, "none")) {
+    hwloc_distances_set(topology, type, 0, NULL, NULL, NULL, 1 /* force */);
+    return;
+  }
+
+  if (sscanf(string, "%u-%u:", &i, &j) == 2) {
+    /* range i-j */
+    /* sscanf() counts 2 conversions even if the trailing colon is missing,
+     * so look for the colon explicitly before using what follows it */
+    tmp = strchr(string, ':');
+    if (!tmp) {
+      fprintf(stderr, "Ignoring %s distances from environment variable, missing colon\n",
+              hwloc_obj_type_string(type));
+      return;
+    }
+    tmp++;
+    if (j < i) {
+      fprintf(stderr, "Ignoring %s distances from environment variable, invalid range %u-%u\n",
+              hwloc_obj_type_string(type), i, j);
+      return;
+    }
+    nbobjs = j-i+1;
+    /* make sure the user didn't give a veeeeery large range:
+     * nbobjs may have wrapped to 0, and nbobjs*nbobjs must fit in an unsigned */
+    if (!nbobjs || nbobjs > UINT_MAX / nbobjs) {
+      fprintf(stderr, "Ignoring %s distances from environment variable, range %u-%u is too large\n",
+              hwloc_obj_type_string(type), i, j);
+      return;
+    }
+    indexes = calloc(nbobjs, sizeof(unsigned));
+    distances = calloc(nbobjs*nbobjs, sizeof(float));
+    if (!indexes || !distances)
+      goto out_free;
+    for(j=0; j<nbobjs; j++)
+      indexes[j] = j+i;
+
+  } else {
+    /* explicit list of indexes, count them */
+    while (1) {
+      size_t size = strspn(tmp, "0123456789");
+      if (tmp[size] != ',') {
+        /* last element */
+        tmp += size;
+        nbobjs++;
+        break;
+      }
+      /* another index */
+      tmp += size+1;
+      nbobjs++;
+    }
+
+    if (*tmp != ':') {
+      fprintf(stderr, "Ignoring %s distances from environment variable, missing colon\n",
+              hwloc_obj_type_string(type));
+      return;
+    }
+
+    /* nbobjs*nbobjs must fit in an unsigned for the matrix indexing below */
+    if (nbobjs > UINT_MAX / nbobjs) {
+      fprintf(stderr, "Ignoring %s distances from environment variable, too many indexes (%u)\n",
+              hwloc_obj_type_string(type), nbobjs);
+      return;
+    }
+
+    indexes = calloc(nbobjs, sizeof(unsigned));
+    distances = calloc(nbobjs*nbobjs, sizeof(float));
+    if (!indexes || !distances)
+      goto out_free;
+    tmp = string;
+
+    /* parse indexes, each one is followed by a single separator (',' or the final ':') */
+    for(i=0; i<nbobjs; i++) {
+      indexes[i] = strtoul(tmp, (char **) &next, 0);
+      tmp = next+1;
+    }
+  }
+
+
+  /* parse distances */
+  z=1; /* default if sscanf finds only 2 values below */
+  if (sscanf(tmp, "%u*%u*%u", &x, &y, &z) >= 2) {
+    /* generate the matrix to create x groups of y elements */
+    if (x*y*z != nbobjs) {
+      fprintf(stderr, "Ignoring %s distances from environment variable, invalid grouping (%u*%u*%u=%u instead of %u)\n",
+              hwloc_obj_type_string(type), x, y, z, x*y*z, nbobjs);
+      goto out_free;
+    }
+    /* nbobjs is not 0 here, hence neither y nor z is 0 and the divisions below are safe */
+    for(i=0; i<nbobjs; i++)
+      for(j=0; j<nbobjs; j++)
+        if (i==j)
+          distances[i*nbobjs+j] = 1;
+        else if (i/z == j/z)
+          distances[i*nbobjs+j] = 2;
+        else if (i/z/y == j/z/y)
+          distances[i*nbobjs+j] = 4;
+        else
+          distances[i*nbobjs+j] = 8;
+
+  } else {
+    /* parse a comma separated list of distances */
+    for(i=0; i<nbobjs*nbobjs; i++) {
+      distances[i] = (float) atof(tmp);
+      next = strchr(tmp, ',');
+      if (next) {
+        tmp = next+1;
+      } else if (i!=nbobjs*nbobjs-1) {
+        fprintf(stderr, "Ignoring %s distances from environment variable, not enough values (%u out of %u)\n",
+                hwloc_obj_type_string(type), i+1, nbobjs*nbobjs);
+        goto out_free;
+      }
+    }
+  }
+
+  if (hwloc_distances__check_matrix(topology, type, nbobjs, indexes, NULL, distances) < 0) {
+    fprintf(stderr, "Ignoring invalid %s distances from environment variable\n", hwloc_obj_type_string(type));
+    goto out_free;
+  }
+
+  /* the topology takes ownership of both arrays */
+  hwloc_distances_set(topology, type, nbobjs, indexes, NULL, distances, 1 /* force */);
+  return;
+
+out_free:
+  free(indexes);
+  free(distances);
+}
+
+/* take distances in the environment, store them as is in the topology.
+ * we'll convert them into objects later once the tree is filled.
+ *
+ * For each object type, the variable HWLOC_<type name>_DISTANCES is looked up,
+ * where <type name> is the string returned by hwloc_obj_type_string().
+ */
+void hwloc_distances_set_from_env(struct hwloc_topology *topology)
+{
+  hwloc_obj_type_t type;
+  for(type = HWLOC_OBJ_SYSTEM; type < HWLOC_OBJ_TYPE_MAX; type++) {
+    const char *env;
+    char envname[64];
+    snprintf(envname, sizeof(envname), "HWLOC_%s_DISTANCES", hwloc_obj_type_string(type));
+    env = getenv(envname);
+    if (env) {
+      /* NOTE(review): the localeswitch macros presumably force a locale in which
+       * the parser reads decimal values consistently - confirm against their definition */
+      hwloc_localeswitch_declare;
+      hwloc_localeswitch_init();
+      hwloc_distances__set_from_string(topology, type, env);
+      hwloc_localeswitch_fini();
+    }
+  }
+}
+
+/* The actual set() function exported to the user
+ *
+ * take the given distances, store them as is in the topology.
+ * we'll convert them into objects later once the tree is filled.
+ *
+ * The input arrays are copied, the caller keeps ownership of them.
+ * Passing nbobjs == 0 with NULL indexes and distances clears the existing
+ * matrices for this type.
+ *
+ * Returns 0 on success, -1 with errno set to EINVAL if the arguments are
+ * inconsistent or an index appears twice, and -1 with errno set to ENOMEM
+ * if the copies cannot be allocated.
+ */
+int hwloc_topology_set_distance_matrix(hwloc_topology_t __hwloc_restrict topology, hwloc_obj_type_t type,
+				       unsigned nbobjs, unsigned *indexes, float *distances)
+{
+  unsigned *_indexes;
+  float *_distances;
+  size_t indexes_size, distances_size;
+
+  if (!nbobjs && !indexes && !distances) {
+    hwloc_distances_set(topology, type, 0, NULL, NULL, NULL, 1 /* force */);
+    return 0;
+  }
+
+  if (!nbobjs || !indexes || !distances) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (hwloc_distances__check_matrix(topology, type, nbobjs, indexes, NULL, distances) < 0)
+    return -1;
+
+  /* compute sizes in size_t so that nbobjs*nbobjs does not wrap in unsigned arithmetic */
+  indexes_size = (size_t) nbobjs * sizeof(*_indexes);
+  distances_size = (size_t) nbobjs * nbobjs * sizeof(*_distances);
+
+  /* copy the input arrays and give them to the topology */
+  _indexes = malloc(indexes_size);
+  _distances = malloc(distances_size);
+  if (!_indexes || !_distances) {
+    free(_indexes);
+    free(_distances);
+    errno = ENOMEM;
+    return -1;
+  }
+  memcpy(_indexes, indexes, indexes_size);
+  memcpy(_distances, distances, distances_size);
+  hwloc_distances_set(topology, type, nbobjs, _indexes, NULL, _distances, 1 /* force */);
+
+  return 0;
+}
+
+/************************
+ * Restricting distances
+ */
+
+/* called when some objects have been removed because empty/ignored/cgroup/restrict.
+ * Drop the objs array of every OS distances element; the indexes and
+ * distances arrays are kept so that the objects can be looked up again
+ * from the indexes (in hwloc_distances_finalize_os()).
+ */
+void hwloc_distances_restrict_os(struct hwloc_topology *topology)
+{
+  struct hwloc_os_distances_s *cur = topology->first_osdist;
+
+  while (cur != NULL) {
+    free(cur->objs);
+    /* a NULL objs array marks the element as needing a new lookup */
+    cur->objs = NULL;
+    cur = cur->next;
+  }
+}
+
+
+/* cleanup what was created from distances when the topology is restricted.
+ * Without HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES in flags, all OS distances
+ * are destroyed. With it, only the objs arrays are cleared so that they
+ * may be rebuilt from the remaining objects.
+ */
+void hwloc_distances_restrict(struct hwloc_topology *topology, unsigned long flags)
+{
+  if (!(flags & HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES)) {
+    /* not adapting distances, drop everything */
+    hwloc_distances_destroy(topology);
+    return;
+  }
+
+  /* some objects may have been removed, clear the objs arrays so that finalize_os rebuilds them */
+  hwloc_distances_restrict_os(topology);
+}
+
+/**************************************************************
+ * Convert user/env given array of indexes into actual objects
+ */
+
+/* Depth-first search, below and including root, for the first object whose
+ * type and os_index both match. Only the normal children list (first_child /
+ * next_sibling) is followed. Returns NULL if there is no such object.
+ */
+static hwloc_obj_t hwloc_find_obj_by_type_and_os_index(hwloc_obj_t root, hwloc_obj_type_t type, unsigned os_index)
+{
+  hwloc_obj_t cur, match;
+
+  if (root->type == type && root->os_index == os_index)
+    return root;
+
+  for(cur = root->first_child; cur != NULL; cur = cur->next_sibling) {
+    match = hwloc_find_obj_by_type_and_os_index(cur, type, os_index);
+    if (match != NULL)
+      return match;
+  }
+  return NULL;
+}
+
+/* convert distance indexes that were previously stored in the topology
+ * into actual objects if not done already.
+ * it's already done when distances come from backends (this function should not be called then).
+ * it's not done when distances come from the user.
+ *
+ * Indexes for which no object of the right type exists are removed: the
+ * corresponding row and column are squeezed out of the matrix in place,
+ * the index is removed from the indexes array, and nbobjs is decreased.
+ *
+ * returns -1 if the matrix was invalid (no object found at all) or if the
+ * objs array could not be allocated; the caller is expected to remove the
+ * element then. returns 0 otherwise, with osdist->objs set.
+ */
+static int
+hwloc_distances__finalize_os(struct hwloc_topology *topology, struct hwloc_os_distances_s *osdist)
+{
+  unsigned nbobjs = osdist->nbobjs;
+  unsigned *indexes = osdist->indexes;
+  float *distances = osdist->distances;
+  unsigned i, j;
+  hwloc_obj_type_t type = osdist->type;
+  hwloc_obj_t *objs;
+
+  assert(!osdist->objs);
+
+  objs = calloc(nbobjs, sizeof(hwloc_obj_t));
+  if (!objs)
+    /* cannot convert, let the caller remove this distances */
+    return -1;
+
+  /* traverse the topology and look for the relevant objects */
+  for(i=0; i<nbobjs; i++) {
+    hwloc_obj_t obj = hwloc_find_obj_by_type_and_os_index(topology->levels[0][0], type, indexes[i]);
+    if (!obj) {
+
+      /* shift the matrix: OLDPOS addresses the current nbobjs x nbobjs layout,
+       * NEWPOS the (nbobjs-1) x (nbobjs-1) layout without row i and column i */
+#define OLDPOS(i,j) (distances+(i)*nbobjs+(j))
+#define NEWPOS(i,j) (distances+(i)*(nbobjs-1)+(j))
+      if (i>0) {
+        /** no need to move beginning of 0th line */
+        for(j=0; j<i-1; j++)
+          /** move end of jth line + beginning of (j+1)th line */
+          memmove(NEWPOS(j,i), OLDPOS(j,i+1), (nbobjs-1)*sizeof(*distances));
+        /** move end of (i-1)th line */
+        memmove(NEWPOS(i-1,i), OLDPOS(i-1,i+1), (nbobjs-i-1)*sizeof(*distances));
+      }
+      if (i<nbobjs-1) {
+        /** move beginning of (i+1)th line */
+        memmove(NEWPOS(i,0), OLDPOS(i+1,0), i*sizeof(*distances));
+        /** move end of jth line + beginning of (j+1)th line */
+        for(j=i; j<nbobjs-2; j++)
+          memmove(NEWPOS(j,i), OLDPOS(j+1,i+1), (nbobjs-1)*sizeof(*distances));
+        /** move end of (nbobjs-2)th line */
+        memmove(NEWPOS(nbobjs-2,i), OLDPOS(nbobjs-1,i+1), (nbobjs-i-1)*sizeof(*distances));
+      }
+
+      /* shift the indexes array */
+      memmove(indexes+i, indexes+i+1, (nbobjs-i-1)*sizeof(*indexes));
+
+      /* update counters: the same position must be examined again since it
+       * now holds the next index (i is unsigned, the wrap-around at 0 is
+       * undone by the loop increment) */
+      nbobjs--;
+      i--;
+      continue;
+    }
+    objs[i] = obj;
+  }
+
+  osdist->nbobjs = nbobjs;
+  if (!nbobjs) {
+    /* the whole matrix was invalid, let the caller remove this distances */
+    free(objs);
+    return -1;
+  }
+
+  /* setup the objs array */
+  osdist->objs = objs;
+  return 0;
+}
+
+
+/* Make sure every OS distances element of the topology has its objs array.
+ * Elements that already have one are left alone. The others are converted
+ * from their indexes; an element whose conversion fails is unlinked from
+ * the list and freed together with its indexes and distances arrays.
+ */
+void hwloc_distances_finalize_os(struct hwloc_topology *topology)
+{
+  struct hwloc_os_distances_s *cur, *following;
+
+  for(cur = topology->first_osdist; cur != NULL; cur = following) {
+    /* read the link now, cur may be freed below */
+    following = cur->next;
+
+    if (cur->objs)
+      /* nothing to do, switch to the next element */
+      continue;
+
+    if (hwloc_distances__finalize_os(topology, cur) == 0)
+      /* convert ok, switch to the next element */
+      continue;
+
+    /* conversion failed, release the arrays of this element */
+    free(cur->indexes);
+    free(cur->distances);
+
+    /* unlink it from the doubly-linked list */
+    if (cur->prev)
+      cur->prev->next = following;
+    else
+      topology->first_osdist = following;
+    if (following)
+      following->prev = cur->prev;
+    else
+      topology->last_osdist = cur->prev;
+
+    /* and free the element itself */
+    free(cur);
+  }
+}
+
+/***********************************************************
+ * Convert internal distances given by the backend/env/user
+ * into exported logical distances attached to objects
+ */
+
+/* Attach a logically-ordered, normalized copy of an OS distance matrix to
+ * the topology.
+ *
+ * objs[0..nbobjs-1] are the objects the matrix applies to and osmatrix is
+ * the nbobjs*nbobjs matrix in the same order. The matrix is attached to the
+ * object ("root") that covers the cpusets and nodesets of all these objects.
+ * Values are divided by the minimal value of the matrix, and rows/columns
+ * are reordered by logical index relative to the smallest logical index.
+ *
+ * Nothing is attached if no covering object is found, if the root is not
+ * strictly above the objects, if the root does not contain exactly nbobjs
+ * objects at that depth, if the minimal value is 0, or if memory cannot be
+ * allocated.
+ */
+static void
+hwloc_distances__finalize_logical(struct hwloc_topology *topology,
+				  unsigned nbobjs,
+				  hwloc_obj_t *objs, float *osmatrix)
+{
+  unsigned i, j, li, lj, minl;
+  /* start max at the lowest finite float: FLT_MIN is the smallest positive
+   * normalized value and would be wrong for matrices without larger values */
+  float min = FLT_MAX, max = -FLT_MAX;
+  hwloc_obj_t root;
+  float *matrix;
+  struct hwloc_distances_s *dist;
+  struct hwloc_distances_s **newarray;
+  hwloc_cpuset_t cpuset, complete_cpuset;
+  hwloc_nodeset_t nodeset, complete_nodeset;
+  unsigned relative_depth;
+
+  /* find the root */
+  cpuset = hwloc_bitmap_alloc();
+  complete_cpuset = hwloc_bitmap_alloc();
+  nodeset = hwloc_bitmap_alloc();
+  complete_nodeset = hwloc_bitmap_alloc();
+  for(i=0; i<nbobjs; i++) {
+    hwloc_bitmap_or(cpuset, cpuset, objs[i]->cpuset);
+    hwloc_bitmap_or(complete_cpuset, complete_cpuset, objs[i]->complete_cpuset);
+    hwloc_bitmap_or(nodeset, nodeset, objs[i]->nodeset);
+    hwloc_bitmap_or(complete_nodeset, complete_nodeset, objs[i]->complete_nodeset);
+  }
+  /* find the object covering cpuset, we'll take care of the nodeset later */
+  root = hwloc_get_obj_covering_cpuset(topology, cpuset);
+  /* walk up to find a parent that also covers the nodeset and complete sets */
+  while (root &&
+         (!hwloc_bitmap_isincluded(nodeset, root->nodeset)
+          || !hwloc_bitmap_isincluded(complete_nodeset, root->complete_nodeset)
+          || !hwloc_bitmap_isincluded(complete_cpuset, root->complete_cpuset)))
+    root = root->parent;
+  if (!root) {
+    /* should not happen, ignore the distance matrix and report an error. */
+    if (!hwloc_hide_errors()) {
+      char *a, *b;
+      hwloc_bitmap_asprintf(&a, cpuset);
+      hwloc_bitmap_asprintf(&b, nodeset);
+      fprintf(stderr, "****************************************************************************\n");
+      fprintf(stderr, "* hwloc %s has encountered an error when adding a distance matrix to the topology.\n", HWLOC_VERSION);
+      fprintf(stderr, "*\n");
+      fprintf(stderr, "* hwloc_distances__finalize_logical() could not find any object covering\n");
+      fprintf(stderr, "* cpuset %s and nodeset %s\n", a, b);
+      fprintf(stderr, "*\n");
+      fprintf(stderr, "* Please report this error message to the hwloc user's mailing list,\n");
+#ifdef HWLOC_LINUX_SYS
+      fprintf(stderr, "* along with the output from the hwloc-gather-topology script.\n");
+#else
+      fprintf(stderr, "* along with any relevant topology information from your platform.\n");
+#endif
+      fprintf(stderr, "****************************************************************************\n");
+      free(a);
+      free(b);
+    }
+    hwloc_bitmap_free(cpuset);
+    hwloc_bitmap_free(complete_cpuset);
+    hwloc_bitmap_free(nodeset);
+    hwloc_bitmap_free(complete_nodeset);
+    return;
+  }
+  /* ideally, root has the exact cpuset and nodeset.
+   * but ignoring or other things that remove objects may cause the object array to reduce */
+  assert(hwloc_bitmap_isincluded(cpuset, root->cpuset));
+  assert(hwloc_bitmap_isincluded(complete_cpuset, root->complete_cpuset));
+  assert(hwloc_bitmap_isincluded(nodeset, root->nodeset));
+  assert(hwloc_bitmap_isincluded(complete_nodeset, root->complete_nodeset));
+  hwloc_bitmap_free(cpuset);
+  hwloc_bitmap_free(complete_cpuset);
+  hwloc_bitmap_free(nodeset);
+  hwloc_bitmap_free(complete_nodeset);
+  if (root->depth >= objs[0]->depth) {
+    /* strange topology led us to find invalid relative depth, ignore */
+    return;
+  }
+  relative_depth = objs[0]->depth - root->depth; /* this assume that we have distances between objects of the same level */
+
+  if (nbobjs != hwloc_get_nbobjs_inside_cpuset_by_depth(topology, root->cpuset, root->depth + relative_depth))
+    /* the root does not cover the right number of objects, maybe we failed to insert a root (bad intersect or so). */
+    return;
+
+  /* get the logical index offset, it's the min of all logical indexes */
+  minl = UINT_MAX;
+  for(i=0; i<nbobjs; i++)
+    if (minl > objs[i]->logical_index)
+      minl = objs[i]->logical_index;
+
+  /* compute/check min/max values */
+  for(i=0; i<nbobjs; i++)
+    for(j=0; j<nbobjs; j++) {
+      float val = osmatrix[i*nbobjs+j];
+      if (val < min)
+        min = val;
+      if (val > max)
+        max = val;
+    }
+  if (!min) {
+    /* Linux up to 2.6.36 reports ACPI SLIT distances, which should be memory latencies.
+     * Except of SGI IP27 (SGI Origin 200/2000 with MIPS processors) where the distances
+     * are the number of hops between routers.
+     */
+    hwloc_debug("%s", "minimal distance is 0, matrix does not seem to contain latencies, ignoring\n");
+    return;
+  }
+
+  /* allocate everything first, distances_count is only increased once
+   * the new entry is complete so that a failure leaves the count valid */
+  newarray = realloc(root->distances, (root->distances_count + 1) * sizeof(struct hwloc_distances_s *));
+  if (!newarray)
+    return;
+  root->distances = newarray;
+  dist = malloc(sizeof(struct hwloc_distances_s));
+  matrix = malloc(nbobjs*nbobjs*sizeof(float));
+  if (!dist || !matrix) {
+    free(dist);
+    free(matrix);
+    return;
+  }
+
+  /* store the normalized latency matrix in the root object */
+  dist->relative_depth = relative_depth;
+  dist->nbobjs = nbobjs;
+  dist->latency = matrix;
+  dist->latency_base = (float) min;
+#define NORMALIZE_LATENCY(d) ((d)/(min))
+  dist->latency_max = NORMALIZE_LATENCY(max);
+  for(i=0; i<nbobjs; i++) {
+    /* position of this object among the covered objects, in logical order */
+    li = objs[i]->logical_index - minl;
+    matrix[li*nbobjs+li] = NORMALIZE_LATENCY(osmatrix[i*nbobjs+i]);
+    for(j=i+1; j<nbobjs; j++) {
+      lj = objs[j]->logical_index - minl;
+      matrix[li*nbobjs+lj] = NORMALIZE_LATENCY(osmatrix[i*nbobjs+j]);
+      matrix[lj*nbobjs+li] = NORMALIZE_LATENCY(osmatrix[j*nbobjs+i]);
+    }
+  }
+  root->distances[root->distances_count++] = dist;
+}
+
+/* convert internal distances into logically-ordered distances
+ * that can be exposed in the API.
+ *
+ * Elements are skipped when they have no object, when the depth of their
+ * type is unknown or multiple, or when their objs array is not set.
+ */
+void
+hwloc_distances_finalize_logical(struct hwloc_topology *topology)
+{
+  struct hwloc_os_distances_s *cur;
+
+  for(cur = topology->first_osdist; cur != NULL; cur = cur->next) {
+    int depth;
+
+    if (cur->nbobjs == 0)
+      continue;
+
+    depth = hwloc_get_type_depth(topology, cur->type);
+    if (depth != HWLOC_TYPE_DEPTH_UNKNOWN
+        && depth != HWLOC_TYPE_DEPTH_MULTIPLE
+        && cur->objs != NULL) {
+      assert(cur->distances);
+      hwloc_distances__finalize_logical(topology, cur->nbobjs,
+                                        cur->objs,
+                                        cur->distances);
+    }
+  }
+}
+
+/***************************************************
+ * Destroying logical distances attached to objects
+ */
+
+/* Free one distances structure: first its latency matrix, then the structure itself. */
+void hwloc_clear_object_distances_one(struct hwloc_distances_s * distances)
+{
+  void *matrix = distances->latency;
+  free(matrix);
+  free(distances);
+}
+
+/* Destroy every distances structure attached to obj, free the array and reset the count. */
+void hwloc_clear_object_distances(hwloc_obj_t obj)
+{
+  unsigned idx = 0;
+  while (idx < obj->distances_count)
+    hwloc_clear_object_distances_one(obj->distances[idx++]);
+  free(obj->distances);
+  obj->distances_count = 0;
+  obj->distances = NULL;
+}
+
+/******************************************
+ * Grouping objects according to distances
+ */
+
+/* Print, at most once and unless errors are hidden, a banner about bad user-given distances. */
+static void hwloc_report_user_distance_error(const char *msg, int line)
+{
+    static int already_reported = 0;
+    if (already_reported || hwloc_hide_errors())
+        return;
+    fprintf(stderr, "****************************************************************************\n");
+    fprintf(stderr, "* hwloc %s has encountered what looks like an error from user-given distances.\n", HWLOC_VERSION);
+    fprintf(stderr, "*\n");
+    fprintf(stderr, "* %s\n", msg);
+    fprintf(stderr, "* Error occurred in topology.c line %d\n", line);
+    fprintf(stderr, "*\n");
+    fprintf(stderr, "* Please make sure that distances given through the interface or environment\n");
+    fprintf(stderr, "* variables do not contradict any other topology information.\n");
+    fprintf(stderr, "****************************************************************************\n");
+    already_reported = 1;
+}
+
+/* Three-way compare (-1/0/1) of a and b; with a non-zero accuracy, |a-b| < a*accuracy counts as equal. */
+static int hwloc_compare_distances(float a, float b, float accuracy)
+{
+  const int close_enough = accuracy != 0.0 && fabsf(a-b) < a * accuracy;
+  return (close_enough || a == b) ? 0 : (a < b ? -1 : 1);
+}
+
+/* Group objects that are transitively connected by the minimal off-diagonal distance.
+ * groupids[] (nbobjs entries) receives each object's 1-based group id, or 0 if it is ungrouped.
+ * Returns the number of groups created, or 0 when no minimal distance was found or when a
+ * single group would contain all the objects. */
+static unsigned
+hwloc__find_groups_by_min_distance(unsigned nbobjs,
+				   float *_distances,
+				   float accuracy,
+				   unsigned *groupids,
+				   int verbose)
+{
+  float min_distance = FLT_MAX;
+  unsigned groupid = 1; /* id for the next group; 0 in groupids[] means "not grouped" */
+  unsigned i,j,k;
+  unsigned skipped = 0; /* objects that found no partner and were left ungrouped */
+
+#define DISTANCE(i, j) _distances[(i) * nbobjs + (j)]
+  /* DISTANCE stays defined past this function; later code in this file uses it too */
+  memset(groupids, 0, nbobjs*sizeof(*groupids));
+
+  /* find the minimal distance */
+  for(i=0; i<nbobjs; i++)
+    for(j=0; j<nbobjs; j++) /* check the entire matrix, it may not be perfectly symmetric depending on the accuracy */
+      if (i != j && DISTANCE(i, j) < min_distance) /* no accuracy here, we want the real minimal */
+        min_distance = DISTANCE(i, j);
+  hwloc_debug("found minimal distance %f between objects\n", min_distance);
+
+  if (min_distance == FLT_MAX) /* no off-diagonal entry was below FLT_MAX */
+    return 0;
+
+  /* build groups of objects connected with this distance */
+  for(i=0; i<nbobjs; i++) {
+    unsigned size; /* number of objects in the group being built */
+    int firstfound; /* lowest index added by the previous scan, -1 when that scan added nothing */
+
+    /* if already grouped, skip */
+    if (groupids[i])
+      continue;
+
+    /* start a new group */
+    groupids[i] = groupid;
+    size = 1;
+    firstfound = i;
+
+    while (firstfound != -1) {
+      /* we added new objects to the group, the first one was firstfound.
+       * rescan all connections from these new objects (starting at first found) to any other objects,
+       * so as to find new objects minimally-connected by transitivity.
+       */
+      int newfirstfound = -1;
+      for(j=firstfound; j<nbobjs; j++)
+	if (groupids[j] == groupid)
+	  for(k=0; k<nbobjs; k++)
+              if (!groupids[k] && !hwloc_compare_distances(DISTANCE(j, k), min_distance, accuracy)) {
+	      groupids[k] = groupid;
+	      size++;
+	      if (newfirstfound == -1)
+		newfirstfound = k;
+	      if (i == j)
+		hwloc_debug("object %u is minimally connected to %u\n", k, i);
+	      else
+	        hwloc_debug("object %u is minimally connected to %u through %u\n", k, i, j);
+	    }
+      firstfound = newfirstfound;
+    }
+
+    if (size == 1) {
+      /* cancel this useless group, ignore this object and try from the next one */
+      groupids[i] = 0;
+      skipped++;
+      continue;
+    }
+
+    /* valid this group */
+    groupid++;
+    if (verbose)
+      fprintf(stderr, "Found transitive graph with %u objects with minimal distance %f accuracy %f\n",
+	      size, min_distance, accuracy);
+  }
+
+  if (groupid == 2 && !skipped)
+    /* we created a single group containing all objects, ignore it */
+    return 0;
+
+  /* return the last id, since it's also the number of used group ids */
+  return groupid-1;
+}
+
+/* Validate a grouping matrix: 0 if symmetric with a strictly minimal diagonal (within accuracy), else -1. */
+static int
+hwloc__check_grouping_matrix(unsigned nbobjs, float *_distances, float accuracy, int verbose)
+{
+  unsigned row, col;
+  for(row=0; row<nbobjs; row++)
+    for(col=row+1; col<nbobjs; col++) {
+      const float fwd = DISTANCE(row, col), bwd = DISTANCE(col, row);
+      /* both directions of a pair must carry the same distance */
+      if (hwloc_compare_distances(fwd, bwd, accuracy) != 0) {
+        if (verbose)
+          fprintf(stderr, "Distance matrix asymmetric ([%u,%u]=%f != [%u,%u]=%f), aborting\n",
+                  row, col, fwd, col, row, bwd);
+        return -1;
+      }
+      /* the distance to another object must be strictly above the distance to self */
+      if (hwloc_compare_distances(fwd, DISTANCE(row, row), accuracy) <= 0) {
+        if (verbose)
+          fprintf(stderr, "Distance to self not strictly minimal ([%u,%u]=%f <= [%u,%u]=%f), aborting\n",
+                  row, col, fwd, row, row, DISTANCE(row, row));
+        return -1;
+      }
+    }
+  return 0;
+}
+
+/* Try each accuracy in turn until the objects can be grouped by minimal distance, insert one
+ * Group object per group found, then recurse on the averaged group-to-group distance matrix.
+ * Returns early when nbobjs <= 2, when no grouping is found or when an allocation fails. */
+static void
+hwloc__groups_by_distances(struct hwloc_topology *topology,
+			   unsigned nbobjs,
+			   struct hwloc_obj **objs,
+			   float *_distances,
+			   unsigned nbaccuracies, float *accuracies,
+			   int fromuser,
+			   int needcheck,
+			   int verbose)
+{
+  unsigned *groupids = NULL; /* per-object group id, filled by hwloc__find_groups_by_min_distance() */
+  unsigned nbgroups = 0;
+  unsigned i,j;
+
+  if (nbobjs <= 2) { /* nothing is attempted with 2 objects or fewer */
+      return;
+  }
+
+  groupids = malloc(sizeof(unsigned) * nbobjs);
+  if (NULL == groupids) {
+      return;
+  }
+
+  for(i=0; i<nbaccuracies; i++) {
+    if (verbose)
+      fprintf(stderr, "Trying to group %u %s objects according to physical distances with accuracy %f\n",
+	      nbobjs, hwloc_obj_type_string(objs[0]->type), accuracies[i]);
+    if (needcheck && hwloc__check_grouping_matrix(nbobjs, _distances, accuracies[i], verbose) < 0)
+      continue; /* matrix rejected at this accuracy, try the next one */
+    nbgroups = hwloc__find_groups_by_min_distance(nbobjs, _distances, accuracies[i], groupids, verbose);
+    if (nbgroups)
+      break; /* keep the first accuracy that yields groups */
+  }
+  if (!nbgroups)
+    goto outter_free;
+
+  /* For convenience, put these declarations inside a block.  It's a
+     crying shame we can't use C99 syntax here, and have to do a bunch
+     of mallocs. :-( */
+  {
+      hwloc_obj_t *groupobjs = NULL; /* object obtained when inserting each group */
+      unsigned *groupsizes = NULL; /* number of member objects in each group */
+      float *groupdistances = NULL; /* nbgroups x nbgroups matrix of averaged distances */
+      unsigned failed = 0; /* number of groups whose insertion returned NULL */
+
+      groupobjs = malloc(sizeof(hwloc_obj_t) * nbgroups);
+      groupsizes = malloc(sizeof(unsigned) * nbgroups);
+      groupdistances = malloc(sizeof(float) * nbgroups * nbgroups);
+      if (NULL == groupobjs || NULL == groupsizes || NULL == groupdistances) {
+          goto inner_free;
+      }
+      /* create new Group objects and record their size */
+      memset(&(groupsizes[0]), 0, sizeof(groupsizes[0]) * nbgroups);
+      for(i=0; i<nbgroups; i++) {
+          /* create the Group object */
+          hwloc_obj_t group_obj, res_obj;
+          group_obj = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, -1);
+          group_obj->cpuset = hwloc_bitmap_alloc();
+          group_obj->attr->group.depth = topology->next_group_depth;
+          for (j=0; j<nbobjs; j++)
+	    if (groupids[j] == i+1) { /* group ids are 1-based, group indexes 0-based */
+	      /* assemble the group sets */
+	      hwloc_obj_add_other_obj_sets(group_obj, objs[j]);
+              groupsizes[i]++;
+            }
+          hwloc_debug_1arg_bitmap("adding Group object with %u objects and cpuset %s\n",
+                                  groupsizes[i], group_obj->cpuset);
+          res_obj = hwloc__insert_object_by_cpuset(topology, group_obj,
+						   fromuser ? hwloc_report_user_distance_error : hwloc_report_os_error);
+	  /* res_obj may be NULL on failure to insert. */
+	  if (!res_obj)
+	    failed++;
+	  /* or it may be different from groupobjs if we got groups from XML import before grouping */
+          groupobjs[i] = res_obj;
+      }
+
+      if (failed)
+	/* don't try to group above if we got a NULL group here, just keep this incomplete level */
+	goto inner_free;
+
+      /* factorize distances */
+      memset(&(groupdistances[0]), 0, sizeof(groupdistances[0]) * nbgroups * nbgroups);
+#undef DISTANCE
+#define DISTANCE(i, j) _distances[(i) * nbobjs + (j)]
+#define GROUP_DISTANCE(i, j) groupdistances[(i) * nbgroups + (j)]
+      for(i=0; i<nbobjs; i++) /* first sum the object distances for every pair of groups... */
+	if (groupids[i])
+	  for(j=0; j<nbobjs; j++)
+	    if (groupids[j])
+                GROUP_DISTANCE(groupids[i]-1, groupids[j]-1) += DISTANCE(i, j);
+      for(i=0; i<nbgroups; i++) /* ...then divide each sum by the number of object pairs it covers */
+          for(j=0; j<nbgroups; j++) {
+              unsigned groupsize = groupsizes[i]*groupsizes[j];
+              float groupsizef = (float) groupsize;
+              GROUP_DISTANCE(i, j) /= groupsizef;
+          }
+#ifdef HWLOC_DEBUG
+      hwloc_debug("%s", "generated new distance matrix between groups:\n");
+      hwloc_debug("%s", "  index");
+      for(j=0; j<nbgroups; j++)
+	hwloc_debug(" % 5d", (int) j); /* print index because os_index is -1 for Groups */
+      hwloc_debug("%s", "\n");
+      for(i=0; i<nbgroups; i++) {
+	hwloc_debug("  % 5d", (int) i);
+	for(j=0; j<nbgroups; j++)
+	  hwloc_debug(" %2.3f", GROUP_DISTANCE(i, j));
+	hwloc_debug("%s", "\n");
+      }
+#endif
+
+      topology->next_group_depth++; /* groups created by the recursive call get the next depth */
+      hwloc__groups_by_distances(topology, nbgroups, groupobjs, (float*) groupdistances, nbaccuracies, accuracies, fromuser, 0 /* no need to check generated matrix */, verbose);
+
+  inner_free:
+      /* Safely free everything */
+      if (NULL != groupobjs) {
+          free(groupobjs);
+      }
+      if (NULL != groupsizes) {
+          free(groupsizes);
+      }
+      if (NULL != groupdistances) {
+          free(groupdistances);
+      }
+  }
+
+ outter_free:
+  if (NULL != groupids) {
+      free(groupids);
+  }
+}
+
+void
+hwloc_group_by_distances(struct hwloc_topology *topology)
+{
+  unsigned nbobjs;
+  struct hwloc_os_distances_s * osdist;
+  const char *env;
+  float accuracies[5] = { 0.0f, 0.01f, 0.02f, 0.05f, 0.1f }; /* tried in this order */
+  unsigned nbaccuracies = 5;
+  hwloc_obj_t group_obj;
+  int verbose = 0;
+  unsigned i;
+  hwloc_localeswitch_declare;
+#ifdef HWLOC_DEBUG
+  unsigned j;
+#endif
+  /* Read the HWLOC_GROUPING* environment variables, then group the objects of every OS distance matrix. */
+  env = getenv("HWLOC_GROUPING");
+  if (env && !atoi(env)) /* a value that atoi() reads as 0 disables grouping */
+    return;
+  /* backward compat with v1.2 */
+  if (getenv("HWLOC_IGNORE_DISTANCES"))
+    return;
+
+  hwloc_localeswitch_init();
+  env = getenv("HWLOC_GROUPING_ACCURACY");
+  if (!env) {
+    /* only use 0.0 */
+    nbaccuracies = 1;
+  } else if (strcmp(env, "try")) {
+    /* use the given value */
+    nbaccuracies = 1;
+    accuracies[0] = (float) atof(env);
+  } /* otherwise try all values */
+  hwloc_localeswitch_fini();
+
+#ifdef HWLOC_DEBUG
+  verbose = 1;
+#else
+  env = getenv("HWLOC_GROUPING_VERBOSE");
+  if (env)
+    verbose = atoi(env);
+#endif
+
+  for(osdist = topology->first_osdist; osdist; osdist = osdist->next) {
+
+    nbobjs = osdist->nbobjs;
+    if (!nbobjs)
+      continue;
+
+    if (osdist->objs) {
+      /* if we have objs, we must have distances as well,
+       * thanks to hwloc_convert_distances_indexes_into_objects()
+       */
+      assert(osdist->distances);
+
+#ifdef HWLOC_DEBUG
+      hwloc_debug("%s", "trying to group objects using distance matrix:\n");
+      hwloc_debug("%s", "  index");
+      for(j=0; j<nbobjs; j++)
+	hwloc_debug(" % 5d", (int) osdist->objs[j]->os_index);
+      hwloc_debug("%s", "\n");
+      for(i=0; i<nbobjs; i++) {
+	hwloc_debug("  % 5d", (int) osdist->objs[i]->os_index);
+	for(j=0; j<nbobjs; j++)
+	  hwloc_debug(" %2.3f", osdist->distances[i*nbobjs + j]);
+	hwloc_debug("%s", "\n");
+      }
+#endif
+
+      hwloc__groups_by_distances(topology, nbobjs,
+				 osdist->objs,
+				 osdist->distances,
+				 nbaccuracies, accuracies,
+				 osdist->indexes != NULL, /* fromuser; presumably indexes are only set for user-given distances - TODO confirm */
+				 1 /* check the first matrix */,
+				 verbose);
+
+      /* add a final group object covering everybody so that the distance matrix can be stored somewhere.
+       * this group will be merged into a regular object if the matrix isn't strangely incomplete
+       */
+      group_obj = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, -1);
+      group_obj->attr->group.depth = (unsigned) -1;
+      group_obj->cpuset = hwloc_bitmap_alloc();
+      for(i=0; i<nbobjs; i++) {
+	/* assemble the group sets */
+	hwloc_obj_add_other_obj_sets(group_obj, osdist->objs[i]);
+      }
+      hwloc_debug_1arg_bitmap("adding Group object (as root of distance matrix with %u objects) with cpuset %s\n",
+			      nbobjs, group_obj->cpuset);
+      hwloc__insert_object_by_cpuset(topology, group_obj,
+				     osdist->indexes != NULL ? hwloc_report_user_distance_error : hwloc_report_os_error);
+    }
+  }
+}
diff --git a/ext/hwloc/hwloc/dolib.c b/ext/hwloc/hwloc/dolib.c
new file mode 100644
index 0000000..d5eff58
--- /dev/null
+++ b/ext/hwloc/hwloc/dolib.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009 inria.  All rights reserved.
+ * Copyright © 2009, 2012 Université Bordeaux
+ * See COPYING in top-level directory.
+ */
+
+/* Wrapper to avoid msys' tendency to turn / into \ and : into ;  */
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Run "<prog>" /machine:<arch> /def:<def> /name:libhwloc-<current-age> /out:<lib>.
+ * Arguments: prog arch def version lib, with version given as current:revision:age. */
+int main(int argc, char *argv[]) {
+  char *prog, *arch, *def, *version, *lib;
+  char s[1024];
+  char name[32]; /* large enough for "libhwloc-" followed by any int */
+  int current, age, revision, len;
+
+  if (argc != 6) {
+    fprintf(stderr,"bad number of arguments\n");
+    exit(EXIT_FAILURE);
+  }
+  prog = argv[1]; arch = argv[2]; def = argv[3]; version = argv[4]; lib = argv[5];
+
+  if (sscanf(version, "%d:%d:%d", &current, &revision, &age) != 3)
+    exit(EXIT_FAILURE);
+
+  snprintf(name, sizeof(name), "libhwloc-%d", current - age);
+  printf("using soname %s\n", name);
+  /* never hand a truncated command line to system() */
+  len = snprintf(s, sizeof(s), "\"%s\" /machine:%s /def:%s /name:%s /out:%s",
+      prog, arch, def, name, lib);
+  if (len < 0 || (size_t) len >= sizeof(s)) {
+    fprintf(stderr, "command line too long\n");
+    exit(EXIT_FAILURE);
+  }
+  if (system(s)) {
+    fprintf(stderr, "%s failed\n", s);
+    exit(EXIT_FAILURE);
+  }
+  exit(EXIT_SUCCESS);
+}
diff --git a/ext/hwloc/hwloc/misc.c b/ext/hwloc/hwloc/misc.c
new file mode 100644
index 0000000..3da6687
--- /dev/null
+++ b/ext/hwloc/hwloc/misc.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2010 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <private/private.h>
+#include <private/misc.h>
+
+#include <stdarg.h>
+#ifdef HAVE_SYS_UTSNAME_H
+#include <sys/utsname.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+#include <ctype.h>
+
+#ifdef HAVE_PROGRAM_INVOCATION_NAME
+#include <errno.h>
+extern char *program_invocation_name;
+#endif
+#ifdef HAVE___PROGNAME
+extern char *__progname;
+#endif
+
+/* snprintf() wrapper meant to return the room the whole output needs, even where vsnprintf()
+ * only reports what it wrote (or -1). str may be NULL when size is 0.
+ * Returns -1 if a temporary buffer cannot be allocated. */
+int hwloc_snprintf(char *str, size_t size, const char *format, ...)
+{
+  static char bin;
+  va_list args;
+  char *scratch = NULL;
+  size_t scratch_len;
+  int written;
+
+  /* Some systems crash on str == NULL */
+  if (size == 0) {
+    size = 1;
+    str = &bin;
+  }
+
+  va_start(args, format);
+  written = vsnprintf(str, size, format, args);
+  va_end(args);
+  if (written >= 0 && (size_t) written != size-1)
+    return written;
+
+  /* size-1 or -1 may come from a vsnprintf() reporting the written data rather than the
+   * required room: retry in ever larger temporary buffers until the result is unambiguous. */
+  for (scratch_len = size * 2; ; scratch_len *= 2) {
+    free(scratch);
+    scratch = malloc(scratch_len);
+    if (scratch == NULL)
+      return -1;
+    va_start(args, format);
+    errno = 0;
+    written = vsnprintf(scratch, scratch_len, format, args);
+    va_end(args);
+    if ((size_t) written != scratch_len-1 && !(written < 0 && (!errno || errno == ERANGE)))
+      break;
+  }
+
+  if (written >= 0 && size) {
+    /* copy back what fits in the caller's buffer, always NUL-terminated */
+    size_t room = size;
+    if (room > (size_t) written+1)
+      room = written+1;
+    memcpy(str, scratch, room-1);
+    str[room-1] = 0;
+  }
+  free(scratch);
+  return written;
+}
+
+/* Case-insensitively compare the part of haystack before its first ':' (or its end) with needle.
+ * Returns 0 when that whole part matches the beginning of needle and is at least n characters
+ * long, 1 otherwise. */
+int hwloc_namecoloncmp(const char *haystack, const char *needle, size_t n)
+{
+  size_t matched = 0;
+
+  for (; *haystack && *haystack != ':'; haystack++, needle++, matched++) {
+    /* tolower() is only defined for unsigned char values (and EOF): never pass a plain char */
+    if (tolower((unsigned char) *haystack) != tolower((unsigned char) *needle))
+      return 1;
+  }
+  return matched < n;
+}
+
+/* Add OSName, OSRelease, OSVersion, HostName and Architecture infos to topology->levels[0][0],
+ * using cached_uname when given and uname() otherwise. Empty without HAVE_UNAME. */
+void hwloc_add_uname_info(struct hwloc_topology *topology __hwloc_attribute_unused,
+			  void *cached_uname __hwloc_attribute_unused)
+{
+#ifdef HAVE_UNAME
+  struct utsname local_uts;
+  struct utsname *uts = &local_uts;
+
+  /* an OSName info already exists: don't annotate twice */
+  if (hwloc_obj_get_info_by_name(topology->levels[0][0], "OSName"))
+    return;
+
+  if (cached_uname)
+    uts = (struct utsname *) cached_uname;
+  else if (uname(uts) < 0)
+    return;
+
+  if (uts->sysname[0] != '\0')
+    hwloc_obj_add_info(topology->levels[0][0], "OSName", uts->sysname);
+  if (uts->release[0] != '\0')
+    hwloc_obj_add_info(topology->levels[0][0], "OSRelease", uts->release);
+  if (uts->version[0] != '\0')
+    hwloc_obj_add_info(topology->levels[0][0], "OSVersion", uts->version);
+  if (uts->nodename[0] != '\0')
+    hwloc_obj_add_info(topology->levels[0][0], "HostName", uts->nodename);
+  if (uts->machine[0] != '\0')
+    hwloc_obj_add_info(topology->levels[0][0], "Architecture", uts->machine);
+#endif /* HAVE_UNAME */
+}
+
+/* Return a strdup()'ed copy of the running program's name without its directory part,
+ * or NULL when that name is not available. */
+char *
+hwloc_progname(struct hwloc_topology *topology __hwloc_attribute_unused)
+{
+#if HAVE_DECL_GETMODULEFILENAME
+  char path[256], *sep;
+  unsigned len = GetModuleFileName(NULL, path, sizeof(path));
+
+  /* 0 is a failure, and a completely filled buffer is treated as one too */
+  if (!len || len == sizeof(path))
+    return NULL;
+  /* keep what follows the last backslash, or everything if there is none */
+  sep = strrchr(path, '\\');
+  return strdup(sep ? sep+1 : path);
+#else /* !HAVE_GETMODULEFILENAME */
+  const char *path, *sep;
+
+#if HAVE_DECL_GETPROGNAME
+  path = getprogname(); /* FreeBSD, NetBSD, some Solaris */
+#elif HAVE_DECL_GETEXECNAME
+  path = getexecname(); /* Solaris */
+#elif defined HAVE_PROGRAM_INVOCATION_NAME
+  path = program_invocation_name; /* Glibc. BGQ CNK. */
+  /* could use program_invocation_short_name directly, but we have the code to remove the path below anyway */
+#elif defined HAVE___PROGNAME
+  path = __progname; /* fallback for most unix, used for OpenBSD */
+#else
+  /* TODO: _NSGetExecutablePath(path, &size) on Darwin */
+  /* TODO: AIX, HPUX, OSF */
+  path = NULL;
+#endif
+
+  if (!path)
+    return NULL;
+  /* keep what follows the last slash, or everything if there is none */
+  sep = strrchr(path, '/');
+  return strdup(sep ? sep+1 : path);
+#endif /* !HAVE_GETMODULEFILENAME */
+}
diff --git a/ext/hwloc/hwloc/pci-common.c b/ext/hwloc/hwloc/pci-common.c
new file mode 100644
index 0000000..1000ca1
--- /dev/null
+++ b/ext/hwloc/hwloc/pci-common.c
@@ -0,0 +1,482 @@
+/*
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <hwloc/plugins.h>
+#include <private/debug.h>
+
+#ifdef HWLOC_DEBUG
+/* Debug traversal callback: print one PCI object, indented by two spaces per ancestor.
+ * cbdata is unused. */
+static void
+hwloc_pci_traverse_print_cb(void * cbdata __hwloc_attribute_unused,
+			    struct hwloc_obj *pcidev)
+{
+  char busid[14];
+  hwloc_obj_t parent;
+
+  /* indent */
+  for (parent = pcidev->parent; parent; parent = parent->parent)
+    hwloc_debug("%s", "  ");
+
+  snprintf(busid, sizeof(busid), "%04x:%02x:%02x.%01x",
+           pcidev->attr->pcidev.domain, pcidev->attr->pcidev.bus, pcidev->attr->pcidev.dev, pcidev->attr->pcidev.func);
+
+  if (pcidev->type == HWLOC_OBJ_BRIDGE) {
+    if (pcidev->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
+      hwloc_debug("%s", "HostBridge");
+    else
+      /* busid is a string: it needs its own %s ahead of the vendor:device pair */
+      hwloc_debug("%s Bridge [%04x:%04x]", busid,
+		  pcidev->attr->pcidev.vendor_id, pcidev->attr->pcidev.device_id);
+    hwloc_debug(" to %04x:[%02x:%02x]\n",
+		pcidev->attr->bridge.downstream.pci.domain, pcidev->attr->bridge.downstream.pci.secondary_bus, pcidev->attr->bridge.downstream.pci.subordinate_bus);
+  } else
+    hwloc_debug("%s Device [%04x:%04x (%04x:%04x) rev=%02x class=%04x]\n", busid,
+		pcidev->attr->pcidev.vendor_id, pcidev->attr->pcidev.device_id,
+		pcidev->attr->pcidev.subvendor_id, pcidev->attr->pcidev.subdevice_id,
+		pcidev->attr->pcidev.revision, pcidev->attr->pcidev.class_id);
+}
+#endif /* HWLOC_DEBUG */
+
+/* Traversal callback: tell the backends about every PCI object that is not a bridge.
+ * cbdata is the struct hwloc_backend handed to the notification. */
+static void
+hwloc_pci_traverse_lookuposdevices_cb(void * cbdata,
+				      struct hwloc_obj *pcidev)
+{
+  struct hwloc_backend *origin = cbdata;
+
+  if (pcidev->type != HWLOC_OBJ_BRIDGE)
+    hwloc_backends_notify_new_object(origin, pcidev);
+}
+
+/* Walk the I/O children below root: call cb on each child, then recurse into it if it is a bridge. */
+static void
+hwloc_pci__traverse(void * cbdata, struct hwloc_obj *root,
+		    void (*cb)(void * cbdata, struct hwloc_obj *))
+{
+  struct hwloc_obj *node;
+  for (node = root->io_first_child; node; node = node->next_sibling) {
+    cb(cbdata, node);
+    if (node->type == HWLOC_OBJ_BRIDGE)
+      hwloc_pci__traverse(cbdata, node, cb);
+  }
+}
+
+/* Start a traversal below root: simply forwards cbdata, root and cb to hwloc_pci__traverse(). */
+static void hwloc_pci_traverse(void * cbdata, struct hwloc_obj *root,
+                               void (*cb)(void * cbdata, struct hwloc_obj *))
+{
+  hwloc_pci__traverse(cbdata, root, cb);
+}
+
+enum hwloc_pci_busid_comparison_e { /* result of hwloc_pci_compare_busids(a, b) */
+  HWLOC_PCI_BUSID_LOWER, /* a's domain, bus, dev or func is below b's */
+  HWLOC_PCI_BUSID_HIGHER, /* a's domain, bus, dev or func is above b's */
+  HWLOC_PCI_BUSID_INCLUDED, /* b is a bridge whose secondary..subordinate bus range holds a's bus */
+  HWLOC_PCI_BUSID_SUPERSET /* a is a bridge whose secondary..subordinate bus range holds b's bus */
+};
+
+/* Order two PCI objects by bus id: domain first, then bus, then device, then function.
+ * When one of them is a bridge whose [secondary_bus, subordinate_bus] range holds the other's
+ * bus, the answer is SUPERSET (a is that bridge) or INCLUDED (b is that bridge) instead.
+ * Objects with identical bus ids are not expected: the function then terminates the process. */
+static enum hwloc_pci_busid_comparison_e
+hwloc_pci_compare_busids(struct hwloc_obj *a, struct hwloc_obj *b)
+{
+  /* bridges given to this function are asserted to have a PCI upstream side */
+  if (a->type == HWLOC_OBJ_BRIDGE)
+    assert(a->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI);
+  if (b->type == HWLOC_OBJ_BRIDGE)
+    assert(b->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI);
+
+  if (a->attr->pcidev.domain != b->attr->pcidev.domain)
+    return a->attr->pcidev.domain < b->attr->pcidev.domain
+      ? HWLOC_PCI_BUSID_LOWER : HWLOC_PCI_BUSID_HIGHER;
+
+  /* bridge containment is tested before the plain bus comparison */
+  if (a->type == HWLOC_OBJ_BRIDGE
+      && b->attr->pcidev.bus >= a->attr->bridge.downstream.pci.secondary_bus
+      && b->attr->pcidev.bus <= a->attr->bridge.downstream.pci.subordinate_bus)
+    return HWLOC_PCI_BUSID_SUPERSET;
+  if (b->type == HWLOC_OBJ_BRIDGE
+      && a->attr->pcidev.bus >= b->attr->bridge.downstream.pci.secondary_bus
+      && a->attr->pcidev.bus <= b->attr->bridge.downstream.pci.subordinate_bus)
+    return HWLOC_PCI_BUSID_INCLUDED;
+
+  if (a->attr->pcidev.bus != b->attr->pcidev.bus)
+    return a->attr->pcidev.bus < b->attr->pcidev.bus
+      ? HWLOC_PCI_BUSID_LOWER : HWLOC_PCI_BUSID_HIGHER;
+  if (a->attr->pcidev.dev != b->attr->pcidev.dev)
+    return a->attr->pcidev.dev < b->attr->pcidev.dev
+      ? HWLOC_PCI_BUSID_LOWER : HWLOC_PCI_BUSID_HIGHER;
+  if (a->attr->pcidev.func != b->attr->pcidev.func)
+    return a->attr->pcidev.func < b->attr->pcidev.func
+      ? HWLOC_PCI_BUSID_LOWER : HWLOC_PCI_BUSID_HIGHER;
+
+  /* Should never reach here.  Abort on both debug builds and
+     non-debug builds */
+  assert(0);
+  fprintf(stderr, "Bad assertion in hwloc %s:%d (aborting)\n", __FILE__, __LINE__);
+  exit(1);
+}
+
+static void
+hwloc_pci_add_object(struct hwloc_obj *root, struct hwloc_obj *new)
+{
+  struct hwloc_obj **curp, **childp;
+  /* Insert new among the I/O children of root in bus id order, or below a child bridge that includes it. */
+  curp = &root->io_first_child; /* curp: the link that will point to new once its place is found */
+  while (*curp) {
+    enum hwloc_pci_busid_comparison_e comp = hwloc_pci_compare_busids(new, *curp);
+    switch (comp) {
+    case HWLOC_PCI_BUSID_HIGHER:
+      /* go further */
+      curp = &(*curp)->next_sibling;
+      continue;
+    case HWLOC_PCI_BUSID_INCLUDED:
+      /* insert new below current bridge */
+      hwloc_pci_add_object(*curp, new);
+      return;
+    case HWLOC_PCI_BUSID_LOWER:
+    case HWLOC_PCI_BUSID_SUPERSET: {
+      /* insert new before current */
+      new->next_sibling = *curp;
+      *curp = new;
+      new->parent = root;
+      if (new->type == HWLOC_OBJ_BRIDGE) {
+	/* look at remaining siblings and move some below new */
+	childp = &new->io_first_child; /* childp: where the next object moved below new gets linked */
+	curp = &new->next_sibling;
+	while (*curp) {
+	  if (hwloc_pci_compare_busids(new, *curp) == HWLOC_PCI_BUSID_LOWER) {
+	    /* this sibling remains under root, after new */
+	    curp = &(*curp)->next_sibling;
+	    /* even if the list is sorted by busid, we can't break because the current bridge creates a bus that may be higher. some object may have to go there */
+	  } else {
+	    /* this sibling goes under new */
+	    *childp = *curp;
+	    *curp = (*curp)->next_sibling; /* unlink it from root's list */
+	    (*childp)->parent = new;
+	    (*childp)->next_sibling = NULL;
+	    childp = &(*childp)->next_sibling;
+	  }
+	}
+      }
+      return;
+    }
+    }
+  }
+  /* add to the end of the list if higher than everybody */
+  new->parent = root;
+  new->next_sibling = NULL;
+  *curp = new;
+}
+
+/* Quirk: if parent is the 2nd of two NUMA nodes in the first package of a "Xeon" machine, warn
+ * (unless errors are hidden) and return the next package's first child; else return parent. */
+static struct hwloc_obj *
+hwloc_pci_fixup_hostbridge_parent(struct hwloc_topology *topology __hwloc_attribute_unused,
+				  struct hwloc_obj *hostbridge,
+				  struct hwloc_obj *parent)
+{
+  const char *model;
+  /* Xeon E5v3 in cluster-on-die mode only have PCI on the first NUMA node of each package.
+   * but many dual-processor host report the second PCI hierarchy on 2nd NUMA of first package. */
+  if (!(parent->depth >= 2
+        && parent->type == HWLOC_OBJ_NUMANODE
+        && parent->sibling_rank == 1 && parent->parent->arity == 2
+        && parent->parent->type == HWLOC_OBJ_PACKAGE
+        && parent->parent->sibling_rank == 0 && parent->parent->parent->arity == 2))
+    return parent;
+  model = hwloc_obj_get_info_by_name(parent->parent, "CPUModel");
+  if (!model || !strstr(model, "Xeon"))
+    return parent;
+  if (!hwloc_hide_errors()) {
+    fprintf(stderr, "****************************************************************************\n");
+    fprintf(stderr, "* hwloc %s has encountered an incorrect PCI locality information.\n", HWLOC_VERSION);
+    fprintf(stderr, "* PCI bus %04x:%02x is supposedly close to 2nd NUMA node of 1st package,\n",
+            hostbridge->io_first_child->attr->pcidev.domain, hostbridge->io_first_child->attr->pcidev.bus);
+    fprintf(stderr, "* however hwloc believes this is impossible on this architecture.\n");
+    fprintf(stderr, "* Therefore the PCI bus will be moved to 1st NUMA node of 2nd package.\n");
+    fprintf(stderr, "*\n");
+    fprintf(stderr, "* If you feel this fixup is wrong, disable it by setting in your environment\n");
+    fprintf(stderr, "* HWLOC_PCI_%04x_%02x_LOCALCPUS= (empty value), and report the problem\n",
+            hostbridge->io_first_child->attr->pcidev.domain, hostbridge->io_first_child->attr->pcidev.bus);
+    fprintf(stderr, "* to the hwloc's user mailing list together with the XML output of lstopo.\n");
+    fprintf(stderr, "*\n");
+    fprintf(stderr, "* You may silence this message by setting HWLOC_HIDE_ERRORS=1 in your environment.\n");
+    fprintf(stderr, "****************************************************************************\n");
+  }
+  return parent->parent->next_sibling->first_child;
+}
+
+static struct hwloc_obj *
+hwloc_pci_find_hostbridge_parent(struct hwloc_topology *topology, struct hwloc_backend *backend,
+				 struct hwloc_obj *hostbridge)
+{
+  hwloc_bitmap_t cpuset = hwloc_bitmap_alloc(); /* locality of the hostbridge, freed before returning */
+  hwloc_obj_t group_obj, parent;
+  const char *env;
+  int err;
+  /* Find the object under which hostbridge is to be attached: a Group matching its cpuset, what that Group got merged into, or the root. */
+  /* override the cpuset with the environment if given */
+  int forced = 0; /* set when the environment variable exists, even with an empty value */
+  char envname[256];
+  snprintf(envname, sizeof(envname), "HWLOC_PCI_%04x_%02x_LOCALCPUS",
+	   hostbridge->io_first_child->attr->pcidev.domain, hostbridge->io_first_child->attr->pcidev.bus);
+  env = getenv(envname);
+  if (env)
+    /* if env exists but is empty, don't let quirks change what the OS reports */
+    forced = 1;
+  if (env && *env) {
+    /* force the hostbridge cpuset */
+    hwloc_debug("Overriding localcpus using %s in the environment\n", envname);
+    hwloc_bitmap_sscanf(cpuset, env);
+  } else {
+    /* get the hostbridge cpuset by asking the OS backend.
+     * it's not a PCI device, so we use its first child locality info.
+     */
+    err = hwloc_backends_get_obj_cpuset(backend, hostbridge->io_first_child, cpuset);
+    if (err < 0)
+      /* if we got nothing, assume the hostbridge is attached to the top of hierarchy */
+      hwloc_bitmap_copy(cpuset, hwloc_topology_get_topology_cpuset(topology));
+  }
+
+  hwloc_debug_bitmap("Attaching hostbridge to cpuset %s\n", cpuset);
+
+  /* restrict to the existing complete cpuset to avoid errors later */
+  hwloc_bitmap_and(cpuset, cpuset, hwloc_topology_get_complete_cpuset(topology));
+
+  /* if the remaining cpuset is empty, take the root */
+  if (hwloc_bitmap_iszero(cpuset))
+    hwloc_bitmap_copy(cpuset, hwloc_topology_get_complete_cpuset(topology));
+
+  group_obj = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, -1);
+  if (group_obj) {
+    group_obj->complete_cpuset = hwloc_bitmap_dup(cpuset);
+    hwloc_bitmap_and(cpuset, cpuset, hwloc_topology_get_topology_cpuset(topology));
+    group_obj->cpuset = hwloc_bitmap_dup(cpuset);
+    group_obj->attr->group.depth = (unsigned) -1;
+    parent = hwloc__insert_object_by_cpuset(topology, group_obj, hwloc_report_os_error);
+    if (parent == group_obj) {
+      /* group inserted without being merged, setup its sets */
+      hwloc_obj_add_children_sets(group_obj);
+    } else if (!parent) {
+      /* Failed to insert the parent, maybe a conflicting cpuset, attach to the root object instead */
+      parent = hwloc_get_root_obj(topology);
+    } else {
+      /* Got merged. This object has the right cpuset, but it could be a cache or so,
+       * go up as long as the (complete)cpuset is the same.
+       */
+      while (parent->parent) {
+	if (parent->complete_cpuset && parent->parent->complete_cpuset) {
+	  if (!hwloc_bitmap_isequal(parent->complete_cpuset, parent->parent->complete_cpuset))
+	    break;
+	} else {
+	  if (!hwloc_bitmap_isequal(parent->cpuset, parent->parent->cpuset))
+	    break;
+	}
+	parent = parent->parent;
+      }
+
+      if (!forced) /* the quirk is skipped whenever the environment variable is set */
+	parent = hwloc_pci_fixup_hostbridge_parent(topology, hostbridge, parent);
+    }
+  } else {
+    /* Failed to create the Group, attach to the root object instead */
+    parent = hwloc_get_root_obj(topology);
+  }
+
+  hwloc_bitmap_free(cpuset);
+
+  return parent;
+}
+
+/*
+ * Insert a list of discovered PCI objects into the topology.
+ *
+ * first_obj is the head of a list linked through next_sibling.
+ * The objects are first sorted into a tree below a temporary stack object,
+ * OS devices are looked up for them, and the top-level objects are then
+ * grouped below newly created host bridge objects (one per run of consecutive
+ * top-level objects sharing the same domain and bus), each of which is
+ * attached to the topology.
+ *
+ * Returns 0 if the list was empty, 1 otherwise.
+ */
+int
+hwloc_insert_pci_device_list(struct hwloc_backend *backend,
+			     struct hwloc_obj *first_obj)
+{
+  struct hwloc_topology *topology = backend->topology;
+  struct hwloc_obj fakeparent;
+  struct hwloc_obj *obj;
+  unsigned current_hostbridge;
+
+  if (!first_obj)
+    /* found nothing, exit */
+    return 0;
+
+  /* first, organise object as tree under a fake parent object
+   * (only the parent and io_first_child fields of fakeparent are initialized here)
+   */
+  fakeparent.parent = NULL;
+  fakeparent.io_first_child = NULL;
+  while (first_obj) {
+    /* unlink the head before handing it over, since insertion may rewrite its sibling pointer */
+    obj = first_obj;
+    first_obj = obj->next_sibling;
+    hwloc_pci_add_object(&fakeparent, obj);
+  }
+
+#ifdef HWLOC_DEBUG
+  hwloc_debug("%s", "\nPCI hierarchy under fake parent:\n");
+  hwloc_pci_traverse(NULL, &fakeparent, hwloc_pci_traverse_print_cb);
+  hwloc_debug("%s", "\n");
+#endif
+
+  /* walk the hierarchy, and lookup OS devices */
+  hwloc_pci_traverse(backend, &fakeparent, hwloc_pci_traverse_lookuposdevices_cb);
+
+  /*
+   * fakeparent lists all objects connected to any upstream bus in the machine.
+   * We now create one real hostbridge object per upstream bus.
+   * It's not actually a PCI device so we have to create it.
+   */
+  current_hostbridge = 0;
+  while (fakeparent.io_first_child) {
+    /* start a new host bridge */
+    struct hwloc_obj *hostbridge = hwloc_alloc_setup_object(HWLOC_OBJ_BRIDGE, current_hostbridge++);
+    /* dstnextp: where to link the next child in the hostbridge list;
+     * srcnextp: always the head of the remaining fakeparent list */
+    struct hwloc_obj **dstnextp = &hostbridge->io_first_child;
+    struct hwloc_obj **srcnextp = &fakeparent.io_first_child;
+    struct hwloc_obj *child = *srcnextp;
+    struct hwloc_obj *parent;
+    /* the first remaining child defines the domain/bus of this hostbridge */
+    unsigned short current_domain = child->attr->pcidev.domain;
+    unsigned char current_bus = child->attr->pcidev.bus;
+    unsigned char current_subordinate = current_bus;
+
+    hwloc_debug("Starting new PCI hostbridge %04x:%02x\n", current_domain, current_bus);
+
+  next_child:
+    /* remove next child from fakeparent */
+    *srcnextp = child->next_sibling;
+    /* append it to hostbridge */
+    *dstnextp = child;
+    child->parent = hostbridge;
+    child->next_sibling = NULL;
+    dstnextp = &child->next_sibling;
+
+    /* compute hostbridge secondary/subordinate buses:
+     * the subordinate bus is the highest one reported by any child bridge */
+    if (child->type == HWLOC_OBJ_BRIDGE
+	&& child->attr->bridge.downstream.pci.subordinate_bus > current_subordinate)
+      current_subordinate = child->attr->bridge.downstream.pci.subordinate_bus;
+
+    /* use next child if it has the same domains/bus */
+    child = *srcnextp;
+    if (child
+	&& child->attr->pcidev.domain == current_domain
+	&& child->attr->pcidev.bus == current_bus)
+      goto next_child;
+
+    /* finish setting up this hostbridge */
+    hostbridge->attr->bridge.upstream_type = HWLOC_OBJ_BRIDGE_HOST;
+    hostbridge->attr->bridge.downstream_type = HWLOC_OBJ_BRIDGE_PCI;
+    hostbridge->attr->bridge.downstream.pci.domain = current_domain;
+    hostbridge->attr->bridge.downstream.pci.secondary_bus = current_bus;
+    hostbridge->attr->bridge.downstream.pci.subordinate_bus = current_subordinate;
+    hwloc_debug("New PCI hostbridge %04x:[%02x-%02x]\n",
+		current_domain, current_bus, current_subordinate);
+
+    /* attach the hostbridge where it belongs */
+    parent = hwloc_pci_find_hostbridge_parent(topology, backend, hostbridge);
+    hwloc_insert_object_by_parent(topology, parent, hostbridge);
+  }
+
+  return 1;
+}
+
+#define HWLOC_PCI_STATUS 0x06
+#define HWLOC_PCI_STATUS_CAP_LIST 0x10
+#define HWLOC_PCI_CAPABILITY_LIST 0x34
+#define HWLOC_PCI_CAP_LIST_ID 0
+#define HWLOC_PCI_CAP_LIST_NEXT 1
+
+/*
+ * Search the capability list of a PCI config space for capability ID cap.
+ *
+ * config points to the config space bytes; offsets up to 255 may be read.
+ * Returns the offset of the matching capability, or 0 if the device has no
+ * capability list, the capability is absent, or the list is malformed
+ * (loops back on itself or contains an ID of 0xff).
+ */
+unsigned
+hwloc_pci_find_cap(const unsigned char *config, unsigned cap)
+{
+  unsigned char visited[256] = { 0 };
+  unsigned char pos; /* unsigned char keeps every offset within the 256-byte config space */
+
+  if ((config[HWLOC_PCI_STATUS] & HWLOC_PCI_STATUS_CAP_LIST) == 0)
+    return 0;
+
+  /* the two low bits of capability pointers are masked out */
+  pos = config[HWLOC_PCI_CAPABILITY_LIST] & ~3;
+
+  /* stop on a null next-pointer or on an offset we already walked through */
+  while (pos != 0 && !visited[pos]) {
+    unsigned char capid = config[pos + HWLOC_PCI_CAP_LIST_ID];
+
+    visited[pos] = 1;
+
+    if (capid == cap)
+      return pos;
+    if (capid == 0xff) /* an ID of 0xff ends the walk */
+      break;
+
+    pos = config[pos + HWLOC_PCI_CAP_LIST_NEXT] & ~3;
+  }
+
+  return 0;
+}
+
+#define HWLOC_PCI_EXP_LNKSTA 0x12
+#define HWLOC_PCI_EXP_LNKSTA_SPEED 0x000f
+#define HWLOC_PCI_EXP_LNKSTA_WIDTH 0x03f0
+
+/*
+ * Compute the current PCIe link bandwidth from the Link Status register.
+ *
+ * config points to the PCI config space bytes and offset is the offset of
+ * the PCI Express capability inside it. The resulting single-direction
+ * bandwidth, in GB/s, is stored in *linkspeed. Always returns 0.
+ */
+int
+hwloc_pci_find_linkspeed(const unsigned char *config,
+                         unsigned offset, float *linkspeed)
+{
+  const unsigned char *reg = &config[offset + HWLOC_PCI_EXP_LNKSTA];
+  unsigned linksta, speed, width;
+  float lanespeed;
+
+  /* Link Status is a 16-bit little-endian register: assemble it byte by byte
+   * so that the result does not depend on the host endianness and no byte
+   * beyond the register is read.
+   */
+  linksta = (unsigned) reg[0] | ((unsigned) reg[1] << 8);
+  speed = linksta & HWLOC_PCI_EXP_LNKSTA_SPEED; /* PCIe generation */
+  width = (linksta & HWLOC_PCI_EXP_LNKSTA_WIDTH) >> 4; /* how many lanes */
+
+  /* PCIe Gen1 = 2.5GT/s signal-rate per lane with 8/10 encoding    = 0.25GB/s data-rate per lane
+   * PCIe Gen2 = 5  GT/s signal-rate per lane with 8/10 encoding    = 0.5 GB/s data-rate per lane
+   * PCIe Gen3 = 8  GT/s signal-rate per lane with 128/130 encoding = 1   GB/s data-rate per lane
+   * PCIe Gen4 and Gen5 keep the 128/130 encoding and double the signal-rate each time.
+   * NOTE(review): later generations are assumed to keep doubling the
+   * signal-rate, with a 242/256 efficiency -- confirm against the PCIe spec.
+   */
+  if (speed <= 2)
+    lanespeed = 2.5 * speed * 0.8; /* Gbit/s per lane */
+  else if (speed <= 5)
+    lanespeed = 8.0 * (1U << (speed - 3)) * 128/130; /* Gbit/s per lane */
+  else
+    lanespeed = 8.0 * (1U << (speed - 3)) * 242/256; /* Gbit/s per lane */
+
+  *linkspeed = lanespeed * width / 8; /* GB/s */
+  return 0;
+}
+
+#define HWLOC_PCI_HEADER_TYPE 0x0e
+#define HWLOC_PCI_HEADER_TYPE_BRIDGE 1
+#define HWLOC_PCI_CLASS_BRIDGE_PCI 0x0604
+#define HWLOC_PCI_PRIMARY_BUS 0x18
+#define HWLOC_PCI_SECONDARY_BUS 0x19
+#define HWLOC_PCI_SUBORDINATE_BUS 0x1a
+
+/*
+ * Turn a PCI device object into a PCI-to-PCI bridge object when its config
+ * space says so.
+ *
+ * The object is converted only if its class is "PCI bridge" and its header
+ * type (multi-function bit masked out) is the bridge header type. The
+ * downstream domain and secondary/subordinate buses are then filled from the
+ * device attributes and the config space. Always returns 0.
+ */
+int
+hwloc_pci_prepare_bridge(hwloc_obj_t obj,
+                         const unsigned char *config)
+{
+  struct hwloc_pcidev_attr_s *pci = &obj->attr->pcidev;
+  struct hwloc_bridge_attr_s *bridge;
+  unsigned char hdrtype = config[HWLOC_PCI_HEADER_TYPE] & 0x7f;
+
+  /* not a PCI-to-PCI bridge: leave the object untouched */
+  if (pci->class_id != HWLOC_PCI_CLASS_BRIDGE_PCI
+      || hdrtype != HWLOC_PCI_HEADER_TYPE_BRIDGE)
+    return 0;
+
+  bridge = &obj->attr->bridge;
+
+  /* a mismatching primary bus is only reported, the device location wins */
+  if (pci->bus != config[HWLOC_PCI_PRIMARY_BUS])
+    hwloc_debug("  %04x:%02x:%02x.%01x bridge with (ignored) invalid PCI_PRIMARY_BUS %02x\n",
+                pci->domain, pci->bus, pci->dev, pci->func, config[HWLOC_PCI_PRIMARY_BUS]);
+
+  obj->type = HWLOC_OBJ_BRIDGE;
+  bridge->upstream_type = HWLOC_OBJ_BRIDGE_PCI;
+  bridge->downstream_type = HWLOC_OBJ_BRIDGE_PCI;
+  bridge->downstream.pci.domain = pci->domain;
+  bridge->downstream.pci.secondary_bus = config[HWLOC_PCI_SECONDARY_BUS];
+  bridge->downstream.pci.subordinate_bus = config[HWLOC_PCI_SUBORDINATE_BUS];
+
+  return 0;
+}
diff --git a/ext/hwloc/hwloc/topology-bgq.cb b/ext/hwloc/hwloc/topology-bgq.cb
new file mode 100644
index 0000000..3998f31
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-bgq.cb
@@ -0,0 +1,246 @@
+/*
+ * Copyright © 2013-2015 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+
+#include <stdlib.h>
+#include <pthread.h>
+#include <sys/utsname.h>
+#include <spi/include/kernel/location.h>
+#include <spi/include/kernel/process.h>
+
+/*
+ * Discovery callback of the BGQ backend.
+ *
+ * If no other backend filled the root object yet, build the fixed node
+ * description: one NUMA node, one package, one shared L2, 17 cores with
+ * their L1d/L1i caches and 4 PUs per core. The root object is then tagged
+ * with the backend name (and uname info when running on this system).
+ * Always returns 1.
+ */
+static int
+hwloc_look_bgq(struct hwloc_backend *backend)
+{
+  struct hwloc_topology *topology = backend->topology;
+
+  if (!topology->levels[0][0]->cpuset) {
+    /* Nobody created objects yet, setup everything */
+    hwloc_obj_t root = topology->levels[0][0];
+    hwloc_bitmap_t cpus;
+    hwloc_obj_t obj;
+    const char *threadmodel;
+    unsigned core;
+
+#define HWLOC_BGQ_CORES 17 /* spare core ignored for now */
+
+    hwloc_alloc_obj_cpusets(root);
+    /* the 17th core is reserved for the OS: mark its 4 PUs as disallowed */
+    hwloc_bitmap_clr_range(root->allowed_cpuset, (HWLOC_BGQ_CORES-1)*4, HWLOC_BGQ_CORES*4-1);
+
+    threadmodel = getenv("BG_THREADMODEL");
+    if (!(threadmodel && atoi(threadmodel) == 2)) {
+      /* process cannot use cores/threads outside of its Kernel_ThreadMask() */
+      uint64_t bgmask = Kernel_ThreadMask(Kernel_MyTcoord());
+      unsigned bit;
+      /* the mask is reversed: bit number "bit" describes PU 63-bit */
+      for (bit = 0; bit < 64; bit++) {
+        if ((bgmask >> bit) & 1)
+          continue;
+        hwloc_bitmap_clr(root->allowed_cpuset, 63-bit);
+      }
+    }
+
+    /* a single memory bank covering all PUs */
+    obj = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, 0);
+    cpus = hwloc_bitmap_alloc();
+    hwloc_bitmap_set_range(cpus, 0, HWLOC_BGQ_CORES*4-1);
+    obj->cpuset = cpus;
+    obj->nodeset = hwloc_bitmap_alloc();
+    hwloc_bitmap_set(obj->nodeset, 0);
+    obj->memory.local_memory = 16ULL*1024*1024*1024ULL;
+    hwloc_insert_object_by_cpuset(topology, obj);
+
+    /* package */
+    obj = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, 0);
+    cpus = hwloc_bitmap_alloc();
+    hwloc_bitmap_set_range(cpus, 0, HWLOC_BGQ_CORES*4-1);
+    obj->cpuset = cpus;
+    hwloc_obj_add_info(obj, "CPUModel", "IBM PowerPC A2");
+    hwloc_insert_object_by_cpuset(topology, obj);
+
+    /* shared L2, same cpuset as the package */
+    obj = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
+    obj->cpuset = hwloc_bitmap_dup(cpus);
+    obj->attr->cache.depth = 2;
+    obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
+    obj->attr->cache.size = 32*1024*1024;
+    obj->attr->cache.linesize = 128;
+    obj->attr->cache.associativity = 16;
+    hwloc_insert_object_by_cpuset(topology, obj);
+
+    /* Cores, each with its own pair of L1 caches */
+    for (core = 0; core < HWLOC_BGQ_CORES; core++) {
+      unsigned firstpu = core*4;
+
+      /* Core */
+      obj = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, core);
+      cpus = hwloc_bitmap_alloc();
+      hwloc_bitmap_set_range(cpus, firstpu, firstpu+3);
+      obj->cpuset = cpus;
+      hwloc_insert_object_by_cpuset(topology, obj);
+
+      /* L1d */
+      obj = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
+      obj->cpuset = hwloc_bitmap_dup(cpus);
+      obj->attr->cache.depth = 1;
+      obj->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
+      obj->attr->cache.size = 16*1024;
+      obj->attr->cache.linesize = 64;
+      obj->attr->cache.associativity = 8;
+      hwloc_insert_object_by_cpuset(topology, obj);
+
+      /* L1i */
+      obj = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
+      obj->cpuset = hwloc_bitmap_dup(cpus);
+      obj->attr->cache.depth = 1;
+      obj->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
+      obj->attr->cache.size = 16*1024;
+      obj->attr->cache.linesize = 64;
+      obj->attr->cache.associativity = 4;
+      hwloc_insert_object_by_cpuset(topology, obj);
+      /* there's also a L1p "prefetch cache" of 4kB with 128B lines */
+    }
+
+    /* PUs */
+    hwloc_setup_pu_level(topology, HWLOC_BGQ_CORES*4);
+  }
+
+  /* Add BGQ specific information */
+
+  hwloc_obj_add_info(topology->levels[0][0], "Backend", "BGQ");
+  if (topology->is_thissystem)
+    hwloc_add_uname_info(topology, NULL);
+  return 1;
+}
+
+/*
+ * Retrieve the PU a thread is bound to.
+ *
+ * Fails with ENOSYS when the topology was built for another process, or with
+ * the error reported by pthread_getaffinity_np(). On success hwloc_set is
+ * reduced to the first PU (among the first 64) found in the thread affinity
+ * mask; it is left untouched if none is set. Returns 0 on success, -1 on error.
+ */
+static int
+hwloc_bgq_get_thread_cpubind(hwloc_topology_t topology, pthread_t thread, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+  cpu_set_t bg_set;
+  unsigned pu = 0;
+  int err;
+
+  if (topology->pid) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  err = pthread_getaffinity_np(thread, sizeof(bg_set), &bg_set);
+  if (err != 0) {
+    errno = err;
+    return -1;
+  }
+
+  /* the binding cannot contain multiple PUs: look for the first one only */
+  while (pu < 64 && !CPU_ISSET(pu, &bg_set))
+    pu++;
+  if (pu < 64)
+    hwloc_bitmap_only(hwloc_set, pu);
+
+  return 0;
+}
+
+/*
+ * Retrieve the PU the calling thread runs on, as reported by
+ * Kernel_ProcessorID(). Fails with ENOSYS when the topology was built for
+ * another process. Returns 0 on success, -1 on error.
+ */
+static int
+hwloc_bgq_get_thisthread_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+  if (!topology->pid) {
+    hwloc_bitmap_only(hwloc_set, Kernel_ProcessorID());
+    return 0;
+  }
+
+  errno = ENOSYS;
+  return -1;
+}
+
+/*
+ * Bind a thread to the first PU of hwloc_set.
+ *
+ * Fails with ENOSYS when the topology was built for another process, or when
+ * HWLOC_CPUBIND_STRICT is requested and the set does not contain exactly one
+ * PU. Errors from pthread_setaffinity_np() are reported through errno.
+ * Returns 0 on success, -1 on error.
+ */
+static int
+hwloc_bgq_set_thread_cpubind(hwloc_topology_t topology, pthread_t thread, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+  cpu_set_t bg_set;
+  unsigned pu;
+  int err;
+
+  if (topology->pid) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  /* the binding cannot contain multiple PUs.
+   * keep the first PU only, and error out if STRICT.
+   */
+  if (hwloc_bitmap_weight(hwloc_set) != 1 && (flags & HWLOC_CPUBIND_STRICT)) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  pu = hwloc_bitmap_first(hwloc_set);
+  CPU_ZERO(&bg_set);
+  CPU_SET(pu, &bg_set);
+
+  err = pthread_setaffinity_np(thread, sizeof(bg_set), &bg_set);
+  if (err == 0)
+    return 0;
+
+  errno = err;
+  return -1;
+}
+
+/* Bind the calling thread: same as binding pthread_self() explicitly. */
+static int
+hwloc_bgq_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+  pthread_t self = pthread_self();
+
+  return hwloc_bgq_set_thread_cpubind(topology, self, hwloc_set, flags);
+}
+
+/*
+ * Register the BGQ thread binding callbacks in hooks.
+ * The support argument is not modified here.
+ */
+void
+hwloc_set_bgq_hooks(struct hwloc_binding_hooks *hooks __hwloc_attribute_unused,
+                    struct hwloc_topology_support *support __hwloc_attribute_unused)
+{
+  /* getters */
+  hooks->get_thisthread_cpubind = hwloc_bgq_get_thisthread_cpubind;
+  hooks->get_thread_cpubind = hwloc_bgq_get_thread_cpubind;
+  /* setters */
+  hooks->set_thisthread_cpubind = hwloc_bgq_set_thisthread_cpubind;
+  hooks->set_thread_cpubind = hwloc_bgq_set_thread_cpubind;
+  /* threads cannot be bound to more than one PU, so get_last_cpu_location == get_cpubind */
+  hooks->get_thisthread_last_cpu_location = hwloc_bgq_get_thisthread_cpubind;
+  /* hooks->get_thread_last_cpu_location = hwloc_bgq_get_thread_cpubind; */
+}
+
+/*
+ * Instantiate the BGQ discovery backend.
+ *
+ * Unless HWLOC_FORCE_BGQ is set to a non-zero integer in the environment,
+ * the backend is only enabled when uname() reports sysname "CNK" and
+ * machine "BGQ". Returns the new backend, or NULL if the platform check
+ * or the allocation failed.
+ */
+static struct hwloc_backend *
+hwloc_bgq_component_instantiate(struct hwloc_disc_component *component,
+                                const void *_data1 __hwloc_attribute_unused,
+                                const void *_data2 __hwloc_attribute_unused,
+                                const void *_data3 __hwloc_attribute_unused)
+{
+  struct utsname utsname;
+  struct hwloc_backend *backend;
+  const char *env;
+  int err;
+
+  env = getenv("HWLOC_FORCE_BGQ");
+  if (!env || !atoi(env)) {
+    err = uname(&utsname);
+    if (err) {
+      /* utsname contents are undefined when uname() fails: do not read them */
+      fprintf(stderr, "*** Failed to query uname to check for a BGQ compute node\n");
+      fprintf(stderr, "*** The BGQ backend is only enabled on compute nodes by default (sysname=CNK machine=BGQ)\n");
+      fprintf(stderr, "*** Set HWLOC_FORCE_BGQ=1 in the environment to enforce the BGQ backend anyway.\n");
+      return NULL;
+    }
+    if (strcmp(utsname.sysname, "CNK") || strcmp(utsname.machine, "BGQ")) {
+      fprintf(stderr, "*** Found unexpected uname sysname `%s' machine `%s'\n", utsname.sysname, utsname.machine);
+      fprintf(stderr, "*** The BGQ backend is only enabled on compute nodes by default (sysname=CNK machine=BGQ)\n");
+      fprintf(stderr, "*** Set HWLOC_FORCE_BGQ=1 in the environment to enforce the BGQ backend anyway.\n");
+      return NULL;
+    }
+  }
+
+  backend = hwloc_backend_alloc(component);
+  if (!backend)
+    return NULL;
+  backend->discover = hwloc_look_bgq;
+  return backend;
+}
+
+/* Discovery component description of the BGQ backend (positional initializer). */
+static struct hwloc_disc_component hwloc_bgq_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL, /* component type */
+  "bgq", /* component name */
+  ~0, /* presumably the mask of component types to exclude (all of them) -- TODO confirm */
+  hwloc_bgq_component_instantiate, /* backend constructor */
+  50, /* presumably the component priority -- TODO confirm */
+  NULL
+};
+
+/* Generic component wrapper exposing the BGQ discovery component. */
+const struct hwloc_component hwloc_bgq_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL, /* no init/finalize callbacks */
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_bgq_disc_component
+};
diff --git a/ext/hwloc/hwloc/topology-darwin.cb b/ext/hwloc/hwloc/topology-darwin.cb
new file mode 100644
index 0000000..1062a1d
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-darwin.cb
@@ -0,0 +1,307 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2013 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* Detect topology change: registering for power management changes and check
+ * if for example hw.activecpu changed */
+
+/* Apparently, Darwin people do not _want_ to provide binding functions.  */
+
+#include <private/autogen/config.h>
+
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+
+/*
+ * Discovery callback of the Darwin backend.
+ *
+ * Does nothing (and returns 0) if another backend already filled the root
+ * object. Otherwise queries sysctl for the number of processors, packages,
+ * cores and the cache/memory hierarchy, inserts the corresponding objects
+ * and the PU level, and tags the root object with the backend name.
+ * Returns -1 if the number of processors cannot be read, 1 on success.
+ */
+static int
+hwloc_look_darwin(struct hwloc_backend *backend)
+{
+  struct hwloc_topology *topology = backend->topology;
+  int64_t _nprocs;
+  unsigned nprocs;
+  int64_t _npackages;
+  unsigned i, j, cpu;
+  struct hwloc_obj *obj;
+  size_t size;
+  int64_t l1dcachesize, l1icachesize;
+  int64_t cacheways[2];
+  int64_t l2cachesize;
+  int64_t cachelinesize;
+  int64_t memsize;
+  char cpumodel[64];
+
+  if (topology->levels[0][0]->cpuset)
+    /* somebody discovered things */
+    return 0;
+
+  hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+
+  if (hwloc_get_sysctlbyname("hw.ncpu", &_nprocs) || _nprocs <= 0)
+    return -1;
+  nprocs = _nprocs;
+  topology->support.discovery->pu = 1;
+
+  hwloc_debug("%u procs\n", nprocs);
+
+  /* an empty model string means "unknown" below */
+  size = sizeof(cpumodel);
+  if (sysctlbyname("machdep.cpu.brand_string", cpumodel, &size, NULL, 0))
+    cpumodel[0] = '\0';
+
+  if (!hwloc_get_sysctlbyname("hw.packages", &_npackages) && _npackages > 0) {
+    unsigned npackages = _npackages;
+    int64_t _cores_per_package;
+    int64_t _logical_per_package;
+    unsigned logical_per_package;
+
+    hwloc_debug("%u packages\n", npackages);
+
+    if (!hwloc_get_sysctlbyname("machdep.cpu.logical_per_package", &_logical_per_package) && _logical_per_package > 0)
+      logical_per_package = _logical_per_package;
+    else
+      /* Assume the trivia.  */
+      logical_per_package = nprocs / npackages;
+
+    hwloc_debug("%u threads per package\n", logical_per_package);
+
+
+    /* only create packages when the processors split evenly among them,
+     * otherwise attach the model name to the root object instead */
+    if (nprocs == npackages * logical_per_package)
+      for (i = 0; i < npackages; i++) {
+        obj = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, i);
+        obj->cpuset = hwloc_bitmap_alloc();
+        for (cpu = i*logical_per_package; cpu < (i+1)*logical_per_package; cpu++)
+          hwloc_bitmap_set(obj->cpuset, cpu);
+
+        hwloc_debug_1arg_bitmap("package %u has cpuset %s\n",
+                   i, obj->cpuset);
+
+        if (cpumodel[0] != '\0')
+          hwloc_obj_add_info(obj, "CPUModel", cpumodel);
+        hwloc_insert_object_by_cpuset(topology, obj);
+      }
+    else
+      if (cpumodel[0] != '\0')
+        hwloc_obj_add_info(topology->levels[0][0], "CPUModel", cpumodel);
+
+    if (!hwloc_get_sysctlbyname("machdep.cpu.cores_per_package", &_cores_per_package) && _cores_per_package > 0) {
+      unsigned cores_per_package = _cores_per_package;
+      hwloc_debug("%u cores per package\n", cores_per_package);
+
+      /* only create cores when the threads split evenly among them */
+      if (!(logical_per_package % cores_per_package))
+        for (i = 0; i < npackages * cores_per_package; i++) {
+          obj = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, i);
+          obj->cpuset = hwloc_bitmap_alloc();
+          for (cpu = i*(logical_per_package/cores_per_package);
+               cpu < (i+1)*(logical_per_package/cores_per_package);
+               cpu++)
+            hwloc_bitmap_set(obj->cpuset, cpu);
+
+          hwloc_debug_1arg_bitmap("core %u has cpuset %s\n",
+                     i, obj->cpuset);
+          hwloc_insert_object_by_cpuset(topology, obj);
+        }
+    }
+  } else
+    if (cpumodel[0] != '\0')
+      hwloc_obj_add_info(topology->levels[0][0], "CPUModel", cpumodel);
+
+  /* every value that cannot be read falls back to 0 */
+  if (hwloc_get_sysctlbyname("hw.l1dcachesize", &l1dcachesize))
+    l1dcachesize = 0;
+
+  if (hwloc_get_sysctlbyname("hw.l1icachesize", &l1icachesize))
+    l1icachesize = 0;
+
+  if (hwloc_get_sysctlbyname("hw.l2cachesize", &l2cachesize))
+    l2cachesize = 0;
+
+  /* an associativity of 0xff is stored as -1 */
+  if (hwloc_get_sysctlbyname("machdep.cpu.cache.L1_associativity", &cacheways[0]))
+    cacheways[0] = 0;
+  else if (cacheways[0] == 0xff)
+    cacheways[0] = -1;
+
+  if (hwloc_get_sysctlbyname("machdep.cpu.cache.L2_associativity", &cacheways[1]))
+    cacheways[1] = 0;
+  else if (cacheways[1] == 0xff)
+    cacheways[1] = -1;
+
+  if (hwloc_get_sysctlbyname("hw.cachelinesize", &cachelinesize))
+    cachelinesize = 0;
+
+  if (hwloc_get_sysctlbyname("hw.memsize", &memsize))
+    memsize = 0;
+
+  /* first query only retrieves the size of hw.cacheconfig */
+  if (!sysctlbyname("hw.cacheconfig", NULL, &size, NULL, 0)) {
+    /* number of entries if they are 32bit wide, i.e. an upper bound */
+    unsigned n = size / sizeof(uint32_t);
+    uint64_t *cacheconfig = NULL;
+    uint64_t *cachesize = NULL;
+    uint32_t *cacheconfig32 = NULL;
+
+    cacheconfig = malloc(sizeof(uint64_t) * n);
+    if (NULL == cacheconfig) {
+        goto out;
+    }
+    cachesize = malloc(sizeof(uint64_t) * n);
+    if (NULL == cachesize) {
+        goto out;
+    }
+    cacheconfig32 = malloc(sizeof(uint32_t) * n);
+    if (NULL == cacheconfig32) {
+        goto out;
+    }
+
+    if ((!sysctlbyname("hw.cacheconfig", cacheconfig, &size, NULL, 0))) {
+      /* Yeech. Darwin seemingly has changed from 32bit to 64bit integers for
+       * cacheconfig, with apparently no way for detection. Assume the machine
+       * won't have more than 4 billion cpus */
+      if (cacheconfig[0] > 0xFFFFFFFFUL) {
+        memcpy(cacheconfig32, cacheconfig, size);
+        for (i = 0 ; i < size / sizeof(uint32_t); i++)
+          cacheconfig[i] = cacheconfig32[i];
+      }
+
+      memset(cachesize, 0, sizeof(uint64_t) * n);
+      size = sizeof(uint64_t) * n;
+      if (sysctlbyname("hw.cachesize", cachesize, &size, NULL, 0)) {
+        /* hw.cachesize is unavailable: fall back to the individual values */
+        if (n > 0)
+          cachesize[0] = memsize;
+        if (n > 1)
+          cachesize[1] = l1dcachesize;
+        if (n > 2)
+          cachesize[2] = l2cachesize;
+      }
+
+      hwloc_debug("%s", "caches");
+      for (i = 0; i < n && cacheconfig[i]; i++)
+        hwloc_debug(" %"PRIu64"(%"PRIu64"kB)", cacheconfig[i], cachesize[i] / 1024);
+
+      /* Now we know how many caches there are */
+      n = i;
+      hwloc_debug("\n%u cache levels\n", n - 1);
+
+      /* For each cache level (0 is memory) */
+      for (i = 0; i < n; i++) {
+        /* cacheconfig tells us how many cpus share it, let's iterate on each cache */
+        for (j = 0; j < (nprocs / cacheconfig[i]); j++) {
+          obj = hwloc_alloc_setup_object(i?HWLOC_OBJ_CACHE:HWLOC_OBJ_NUMANODE, j);
+          if (!i) {
+            obj->nodeset = hwloc_bitmap_alloc();
+            hwloc_bitmap_set(obj->nodeset, j);
+          }
+          obj->cpuset = hwloc_bitmap_alloc();
+          for (cpu = j*cacheconfig[i];
+               cpu < ((j+1)*cacheconfig[i]);
+               cpu++)
+            hwloc_bitmap_set(obj->cpuset, cpu);
+
+          if (i == 1 && l1icachesize) {
+            /* FIXME assuming that L1i and L1d are shared the same way. Darwin
+             * does not yet provide a way to know.  */
+            hwloc_obj_t l1i = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, j);
+            l1i->cpuset = hwloc_bitmap_dup(obj->cpuset);
+            hwloc_debug_1arg_bitmap("L1icache %u has cpuset %s\n",
+                j, l1i->cpuset);
+            l1i->attr->cache.depth = i;
+            l1i->attr->cache.size = l1icachesize;
+            l1i->attr->cache.linesize = cachelinesize;
+            l1i->attr->cache.associativity = 0;
+            l1i->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
+
+            hwloc_insert_object_by_cpuset(topology, l1i);
+          }
+          if (i) {
+            hwloc_debug_2args_bitmap("L%ucache %u has cpuset %s\n",
+                i, j, obj->cpuset);
+            obj->attr->cache.depth = i;
+            obj->attr->cache.size = cachesize[i];
+            obj->attr->cache.linesize = cachelinesize;
+            /* associativity is only known for the levels stored in cacheways[] */
+            if (i <= sizeof(cacheways) / sizeof(cacheways[0]))
+              obj->attr->cache.associativity = cacheways[i-1];
+            else
+              obj->attr->cache.associativity = 0;
+            /* L1 is a data cache only when a separate L1i was reported */
+            if (i == 1 && l1icachesize)
+              obj->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
+            else
+              obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
+          } else {
+            hwloc_debug_1arg_bitmap("node %u has cpuset %s\n",
+                j, obj->cpuset);
+            obj->memory.local_memory = cachesize[i];
+            /* only advertise page types when their array could be allocated */
+            obj->memory.page_types = malloc(2*sizeof(*obj->memory.page_types));
+            if (obj->memory.page_types) {
+              obj->memory.page_types_len = 2;
+              memset(obj->memory.page_types, 0, 2*sizeof(*obj->memory.page_types));
+              obj->memory.page_types[0].size = hwloc_getpagesize();
+#ifdef HAVE__SC_LARGE_PAGESIZE
+              obj->memory.page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
+#endif
+            } else {
+              obj->memory.page_types_len = 0;
+            }
+          }
+
+          hwloc_insert_object_by_cpuset(topology, obj);
+        }
+      }
+    }
+  out:
+    /* free(NULL) is a no-op, so partially failed allocations are handled too */
+    free(cacheconfig);
+    free(cachesize);
+    free(cacheconfig32);
+  }
+
+
+  /* add PU objects */
+  hwloc_setup_pu_level(topology, nprocs);
+
+  hwloc_obj_add_info(topology->levels[0][0], "Backend", "Darwin");
+  if (topology->is_thissystem)
+    hwloc_add_uname_info(topology, NULL);
+  return 1;
+}
+
+/* No binding hook is registered on Darwin: both arguments are left untouched. */
+void
+hwloc_set_darwin_hooks(struct hwloc_binding_hooks *hooks __hwloc_attribute_unused,
+		       struct hwloc_topology_support *support __hwloc_attribute_unused)
+{
+}
+
+/*
+ * Instantiate the Darwin discovery backend.
+ * Returns the new backend with its discover callback set, or NULL if the
+ * allocation failed.
+ */
+static struct hwloc_backend *
+hwloc_darwin_component_instantiate(struct hwloc_disc_component *component,
+                                   const void *_data1 __hwloc_attribute_unused,
+                                   const void *_data2 __hwloc_attribute_unused,
+                                   const void *_data3 __hwloc_attribute_unused)
+{
+  struct hwloc_backend *backend = hwloc_backend_alloc(component);
+
+  if (backend)
+    backend->discover = hwloc_look_darwin;
+  return backend;
+}
+
+/* Discovery component description of the Darwin backend (positional initializer). */
+static struct hwloc_disc_component hwloc_darwin_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_CPU, /* component type */
+  "darwin", /* component name */
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL, /* presumably the component types to exclude -- TODO confirm */
+  hwloc_darwin_component_instantiate, /* backend constructor */
+  50, /* presumably the component priority -- TODO confirm */
+  NULL
+};
+
+/* Generic component wrapper exposing the Darwin discovery component. */
+const struct hwloc_component hwloc_darwin_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL, /* no init/finalize callbacks */
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_darwin_disc_component
+};
diff --git a/ext/hwloc/hwloc/topology-fake.c b/ext/hwloc/hwloc/topology-fake.c
new file mode 100644
index 0000000..e3e22a0
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-fake.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright © 2012-2014 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+
+#include <stdlib.h>
+
+/*
+ * Instantiate callback of the fake component: never creates a backend.
+ * Only prints a trace when HWLOC_DEBUG_FAKE_COMPONENT is set in the
+ * environment. Always returns NULL.
+ */
+static struct hwloc_backend *
+hwloc_fake_component_instantiate(struct hwloc_disc_component *component __hwloc_attribute_unused,
+                                 const void *_data1 __hwloc_attribute_unused,
+                                 const void *_data2 __hwloc_attribute_unused,
+                                 const void *_data3 __hwloc_attribute_unused)
+{
+  const char *trace = getenv("HWLOC_DEBUG_FAKE_COMPONENT");
+
+  if (trace != NULL)
+    printf("fake component instantiated\n");
+
+  return NULL;
+}
+
+/* Discovery component description of the fake component (positional initializer). */
+static struct hwloc_disc_component hwloc_fake_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_MISC, /* so that it's always enabled when using the OS discovery */
+  "fake", /* component name */
+  0, /* nothing to exclude */
+  hwloc_fake_component_instantiate, /* instantiate callback, never returns a backend */
+  100, /* make sure it's loaded before anything conflicting excludes it */
+  NULL
+};
+
+/*
+ * Init callback of the fake component.
+ * Rejects any non-zero flags and a failed plugin namespace check with -1.
+ * Otherwise prints a trace when HWLOC_DEBUG_FAKE_COMPONENT is set in the
+ * environment and returns 0.
+ */
+static int
+hwloc_fake_component_init(unsigned long flags)
+{
+  /* the namespace check is only performed when flags is 0 */
+  if (flags != 0 || hwloc_plugin_check_namespace("fake", "hwloc_backend_alloc") < 0)
+    return -1;
+
+  if (getenv("HWLOC_DEBUG_FAKE_COMPONENT") != NULL)
+    printf("fake component initialized\n");
+
+  return 0;
+}
+
+/*
+ * Finalize callback of the fake component.
+ * Does nothing for non-zero flags; otherwise prints a trace when
+ * HWLOC_DEBUG_FAKE_COMPONENT is set in the environment.
+ */
+static void
+hwloc_fake_component_finalize(unsigned long flags)
+{
+  if (flags == 0 && getenv("HWLOC_DEBUG_FAKE_COMPONENT") != NULL)
+    printf("fake component finalized\n");
+}
+
+/* Exported declaration of the component symbol defined right below. */
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_fake_component; /* never linked statically in the core */
+
+/* Generic component wrapper exposing the fake discovery component. */
+const struct hwloc_component hwloc_fake_component = {
+  HWLOC_COMPONENT_ABI,
+  hwloc_fake_component_init, hwloc_fake_component_finalize, /* init/finalize callbacks */
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_fake_disc_component
+};
diff --git a/ext/hwloc/hwloc/topology-freebsd.cb b/ext/hwloc/hwloc/topology-freebsd.cb
new file mode 100644
index 0000000..d8d4c54
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-freebsd.cb
@@ -0,0 +1,255 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2010, 2012 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+
+#include <sys/types.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <sys/param.h>
+#include <pthread.h>
+#ifdef HAVE_PTHREAD_NP_H
+#include <pthread_np.h>
+#endif
+#ifdef HAVE_SYS_CPUSET_H
+#include <sys/cpuset.h>
+#endif
+#ifdef HAVE_SYS_SYSCTL_H
+#include <sys/sysctl.h>
+#endif
+
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+
+#if defined(HAVE_SYS_CPUSET_H) && defined(HAVE_CPUSET_SETAFFINITY)
+/* Convert a FreeBSD cpuset into a hwloc bitmap (the bitmap is cleared first). */
+static void
+hwloc_freebsd_bsd2hwloc(hwloc_bitmap_t hwloc_cpuset, const cpuset_t *cset)
+{
+  unsigned idx = 0;
+
+  hwloc_bitmap_zero(hwloc_cpuset);
+  while (idx < CPU_SETSIZE) {
+    if (CPU_ISSET(idx, cset))
+      hwloc_bitmap_set(hwloc_cpuset, idx);
+    idx++;
+  }
+}
+
+/*
+ * Convert a hwloc bitmap into a FreeBSD cpuset (the cpuset is cleared first).
+ * Only the first CPU_SETSIZE indexes of the bitmap are considered.
+ */
+static void
+hwloc_freebsd_hwloc2bsd(hwloc_const_bitmap_t hwloc_cpuset, cpuset_t *cset)
+{
+  unsigned idx = 0;
+
+  CPU_ZERO(cset);
+  while (idx < CPU_SETSIZE) {
+    if (hwloc_bitmap_isset(hwloc_cpuset, idx))
+      CPU_SET(idx, cset);
+    idx++;
+  }
+}
+
+/*
+ * Common helper for the set_*_cpubind hooks: convert the hwloc bitmap and
+ * apply it with cpuset_setaffinity() to the entity selected by
+ * level/which/id. Returns 0 on success, -1 on error.
+ */
+static int
+hwloc_freebsd_set_sth_affinity(hwloc_topology_t topology __hwloc_attribute_unused, cpulevel_t level, cpuwhich_t which, id_t id, hwloc_const_bitmap_t hwloc_cpuset, int flags __hwloc_attribute_unused)
+{
+  cpuset_t bsdset;
+
+  hwloc_freebsd_hwloc2bsd(hwloc_cpuset, &bsdset);
+
+  return cpuset_setaffinity(level, which, id, sizeof(bsdset), &bsdset) ? -1 : 0;
+}
+
+/*
+ * Common helper for the get_*_cpubind hooks: query the affinity of the
+ * entity selected by level/which/id with cpuset_getaffinity() and convert
+ * it into the hwloc bitmap. Returns 0 on success, -1 on error (the bitmap
+ * is then left untouched).
+ */
+static int
+hwloc_freebsd_get_sth_affinity(hwloc_topology_t topology __hwloc_attribute_unused, cpulevel_t level, cpuwhich_t which, id_t id, hwloc_bitmap_t hwloc_cpuset, int flags __hwloc_attribute_unused)
+{
+  cpuset_t bsdset;
+
+  if (cpuset_getaffinity(level, which, id, sizeof(bsdset), &bsdset) != 0)
+    return -1;
+
+  hwloc_freebsd_bsd2hwloc(hwloc_cpuset, &bsdset);
+
+  return 0;
+}
+
+/* Bind the current process: CPU_WHICH_PID with an id of -1. */
+static int
+hwloc_freebsd_set_thisproc_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_cpuset, int flags)
+{
+  int rc = hwloc_freebsd_set_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, hwloc_cpuset, flags);
+
+  return rc;
+}
+
+/* Get the binding of the current process: CPU_WHICH_PID with an id of -1. */
+static int
+hwloc_freebsd_get_thisproc_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_cpuset, int flags)
+{
+  int rc = hwloc_freebsd_get_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, hwloc_cpuset, flags);
+
+  return rc;
+}
+
+/* Bind the current thread: CPU_WHICH_TID with an id of -1. */
+static int
+hwloc_freebsd_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_cpuset, int flags)
+{
+  int rc = hwloc_freebsd_set_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, hwloc_cpuset, flags);
+
+  return rc;
+}
+
+/* Get the binding of the current thread: CPU_WHICH_TID with an id of -1. */
+static int
+hwloc_freebsd_get_thisthread_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_cpuset, int flags)
+{
+  int rc = hwloc_freebsd_get_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, hwloc_cpuset, flags);
+
+  return rc;
+}
+
+/* Bind the process identified by pid (CPU_WHICH_PID). */
+static int
+hwloc_freebsd_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_bitmap_t hwloc_cpuset, int flags)
+{
+  int rc = hwloc_freebsd_set_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_PID, pid, hwloc_cpuset, flags);
+
+  return rc;
+}
+
+/* Get the binding of the process identified by pid (CPU_WHICH_PID). */
+static int
+hwloc_freebsd_get_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_t hwloc_cpuset, int flags)
+{
+  int rc = hwloc_freebsd_get_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_PID, pid, hwloc_cpuset, flags);
+
+  return rc;
+}
+
+#ifdef hwloc_thread_t
+
+#if HAVE_DECL_PTHREAD_SETAFFINITY_NP
+#pragma weak pthread_setaffinity_np
+/*
+ * Bind the thread tid with pthread_setaffinity_np().
+ * Fails with ENOSYS when that (weak) symbol is not available, or with the
+ * error code it returns. Returns 0 on success, -1 on error.
+ */
+static int
+hwloc_freebsd_set_thread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_thread_t tid, hwloc_const_bitmap_t hwloc_cpuset, int flags __hwloc_attribute_unused)
+{
+  cpuset_t bsdset;
+  int rc;
+
+  /* the symbol is declared weak: it is null when the function is missing */
+  if (!pthread_setaffinity_np) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  hwloc_freebsd_hwloc2bsd(hwloc_cpuset, &bsdset);
+
+  rc = pthread_setaffinity_np(tid, sizeof(bsdset), &bsdset);
+  if (rc == 0)
+    return 0;
+
+  errno = rc;
+  return -1;
+}
+#endif
+
+#if HAVE_DECL_PTHREAD_GETAFFINITY_NP
+#pragma weak pthread_getaffinity_np
+/*
+ * Get the binding of the thread tid with pthread_getaffinity_np().
+ * Fails with ENOSYS when that (weak) symbol is not available, or with the
+ * error code it returns. Returns 0 on success, -1 on error.
+ */
+static int
+hwloc_freebsd_get_thread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_thread_t tid, hwloc_bitmap_t hwloc_cpuset, int flags __hwloc_attribute_unused)
+{
+  cpuset_t bsdset;
+  int rc;
+
+  /* the symbol is declared weak: it is null when the function is missing */
+  if (!pthread_getaffinity_np) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  rc = pthread_getaffinity_np(tid, sizeof(bsdset), &bsdset);
+  if (rc != 0) {
+    errno = rc;
+    return -1;
+  }
+
+  hwloc_freebsd_bsd2hwloc(hwloc_cpuset, &bsdset);
+
+  return 0;
+}
+#endif
+#endif
+#endif
+
+#if (defined HAVE_SYSCTL) && (defined HAVE_SYS_SYSCTL_H)
+/*
+ * Store the amount of physical memory reported by sysctl(CTL_HW, HW_PHYSMEM)
+ * as the local memory of the root object. The root object is left untouched
+ * when the sysctl fails.
+ */
+static void
+hwloc_freebsd_node_meminfo_info(struct hwloc_topology *topology)
+{
+       int mib[2] = { CTL_HW, HW_PHYSMEM };
+       unsigned long physmem = 0;
+       size_t len = sizeof(physmem);
+
+       /* do not publish an indeterminate value if the query fails */
+       if (sysctl(mib, 2, &physmem, &len, NULL, 0) != 0)
+               return;
+
+       topology->levels[0][0]->memory.local_memory = physmem;
+       /* we don't know anything about NUMA nodes in this backend.
+        * let another backend or the core move that memory to the right NUMA node */
+}
+#endif
+
+/*
+ * Discovery callback of the FreeBSD backend.
+ *
+ * Creates the PU level from the fallback processor count if no other
+ * backend filled the root object yet, then adds the memory size (when
+ * sysctl is available), the backend name and, on this system, the uname
+ * info to the root object. Always returns 1.
+ */
+static int
+hwloc_look_freebsd(struct hwloc_backend *backend)
+{
+  struct hwloc_topology *topology = backend->topology;
+  /* queried unconditionally, before looking at the root object */
+  unsigned nbprocs = hwloc_fallback_nbprocessors(topology);
+  int already_discovered = (topology->levels[0][0]->cpuset != NULL);
+
+  if (!already_discovered) {
+    /* Nobody (even the x86 backend) created objects yet, setup basic objects */
+    hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+    hwloc_setup_pu_level(topology, nbprocs);
+  }
+
+  /* Add FreeBSD specific information */
+#if (defined HAVE_SYSCTL) && (defined HAVE_SYS_SYSCTL_H)
+  hwloc_freebsd_node_meminfo_info(topology);
+#endif
+  hwloc_obj_add_info(topology->levels[0][0], "Backend", "FreeBSD");
+
+  if (topology->is_thissystem)
+    hwloc_add_uname_info(topology, NULL);
+
+  return 1;
+}
+
+/*
+ * Register the FreeBSD binding callbacks in hooks.
+ * Process and current-thread hooks need cpuset_setaffinity() support; the
+ * hooks for arbitrary threads additionally need hwloc_thread_t and the
+ * corresponding pthread_*affinity_np() declarations.
+ * The support argument is not modified here.
+ */
+void
+hwloc_set_freebsd_hooks(struct hwloc_binding_hooks *hooks __hwloc_attribute_unused,
+                        struct hwloc_topology_support *support __hwloc_attribute_unused)
+{
+#if defined(HAVE_SYS_CPUSET_H) && defined(HAVE_CPUSET_SETAFFINITY)
+  /* current process */
+  hooks->get_thisproc_cpubind = hwloc_freebsd_get_thisproc_cpubind;
+  hooks->set_thisproc_cpubind = hwloc_freebsd_set_thisproc_cpubind;
+  /* current thread */
+  hooks->get_thisthread_cpubind = hwloc_freebsd_get_thisthread_cpubind;
+  hooks->set_thisthread_cpubind = hwloc_freebsd_set_thisthread_cpubind;
+  /* any process */
+  hooks->get_proc_cpubind = hwloc_freebsd_get_proc_cpubind;
+  hooks->set_proc_cpubind = hwloc_freebsd_set_proc_cpubind;
+#ifdef hwloc_thread_t
+  /* any thread */
+#if HAVE_DECL_PTHREAD_GETAFFINITY_NP
+  hooks->get_thread_cpubind = hwloc_freebsd_get_thread_cpubind;
+#endif
+#if HAVE_DECL_PTHREAD_SETAFFINITY_NP
+  hooks->set_thread_cpubind = hwloc_freebsd_set_thread_cpubind;
+#endif
+#endif
+#endif
+  /* TODO: get_last_cpu_location: find out ki_lastcpu */
+}
+
+/*
+ * Instantiate the FreeBSD discovery backend.
+ * Returns the new backend with its discover callback set, or NULL if the
+ * allocation failed.
+ */
+static struct hwloc_backend *
+hwloc_freebsd_component_instantiate(struct hwloc_disc_component *component,
+                                    const void *_data1 __hwloc_attribute_unused,
+                                    const void *_data2 __hwloc_attribute_unused,
+                                    const void *_data3 __hwloc_attribute_unused)
+{
+  struct hwloc_backend *backend = hwloc_backend_alloc(component);
+
+  if (backend)
+    backend->discover = hwloc_look_freebsd;
+  return backend;
+}
+
+/* Discovery component description of the FreeBSD backend (positional initializer). */
+static struct hwloc_disc_component hwloc_freebsd_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_CPU, /* component type */
+  "freebsd", /* component name */
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL, /* presumably the component types to exclude -- TODO confirm */
+  hwloc_freebsd_component_instantiate, /* backend constructor */
+  50, /* presumably the component priority -- TODO confirm */
+  NULL
+};
+
+/* Generic component wrapper exposing the FreeBSD discovery component. */
+const struct hwloc_component hwloc_freebsd_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL, /* no init/finalize callbacks */
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_freebsd_disc_component
+};
diff --git a/ext/hwloc/hwloc/topology-linux.c b/ext/hwloc/hwloc/topology-linux.c
new file mode 100644
index 0000000..82423ff
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-linux.c
@@ -0,0 +1,5133 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2013 Université Bordeaux
+ * Copyright © 2009-2014 Cisco Systems, Inc.  All rights reserved.
+ * Copyright © 2015 Intel, Inc.  All rights reserved.
+ * Copyright © 2010 IBM
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <hwloc/linux.h>
+#include <private/misc.h>
+#include <private/private.h>
+#include <private/misc.h>
+#include <private/debug.h>
+
+#include <limits.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <assert.h>
+#ifdef HAVE_DIRENT_H
+#include <dirent.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_LIBUDEV_H
+#include <libudev.h>
+#endif
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sched.h>
+#include <pthread.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#if defined HWLOC_HAVE_SET_MEMPOLICY || defined HWLOC_HAVE_MBIND
+#define migratepages migrate_pages /* workaround broken migratepages prototype in numaif.h before libnuma 2.0.2 */
+#include <numaif.h>
+#endif
+
+/* Private state attached to the Linux discovery backend. */
+struct hwloc_linux_backend_data_s {
+  int root_fd; /* The file descriptor for the file system root, used when browsing, e.g., Linux' sysfs and procfs. */
+  int is_real_fsroot; /* Boolean saying whether root_fd points to the real filesystem root of the system */
+#ifdef HAVE_LIBUDEV_H
+  struct udev *udev; /* Global udev context */
+#endif
+
+  struct utsname utsname; /* fields contain \0 when unknown */
+
+  int deprecated_classlinks_model; /* -2 if never tried, -1 if unknown, 0 if new (device contains class/name), 1 if old (device contains class:name) */
+  int mic_need_directlookup; /* presumably -1 if not tried yet (that value was missing from the original comment -- TODO confirm), 0 if not needed, 1 if needed */
+  unsigned mic_directlookup_id_max; /* -1 if not tried yet (stored as (unsigned) -1 since the field is unsigned), 0 if none to lookup, maxid+1 otherwise */
+};
+
+
+
+/***************************
+ * Misc Abstraction layers *
+ ***************************/
+
+#if !(defined HWLOC_HAVE_SCHED_SETAFFINITY) && (defined HWLOC_HAVE__SYSCALL3)
+/* libc doesn't have support for sched_setaffinity, build system call
+ * ourselves:
+ * - when the kernel headers do not provide __NR_sched_{set,get}affinity,
+ *   fall back to the per-architecture syscall numbers hard-coded below;
+ * - on architectures missing from the list, define the function as a macro
+ *   that fails with ENOSYS instead, which also disables the _syscall3()
+ *   stub because of the #ifndef around it. */
+#    include <linux/unistd.h>
+#    ifndef __NR_sched_setaffinity
+#       ifdef __i386__
+#         define __NR_sched_setaffinity 241
+#       elif defined(__x86_64__)
+#         define __NR_sched_setaffinity 203
+#       elif defined(__ia64__)
+#         define __NR_sched_setaffinity 1231
+#       elif defined(__hppa__)
+#         define __NR_sched_setaffinity 211
+#       elif defined(__alpha__)
+#         define __NR_sched_setaffinity 395
+#       elif defined(__s390__)
+#         define __NR_sched_setaffinity 239
+#       elif defined(__sparc__)
+#         define __NR_sched_setaffinity 261
+#       elif defined(__m68k__)
+#         define __NR_sched_setaffinity 311
+#       elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
+#         define __NR_sched_setaffinity 222
+#       elif defined(__arm__)
+#         define __NR_sched_setaffinity 241
+#       elif defined(__cris__)
+#         define __NR_sched_setaffinity 241
+/*#       elif defined(__mips__)
+  #         define __NR_sched_setaffinity TODO (32/64/nabi) */
+#       else
+#         warning "don't know the syscall number for sched_setaffinity on this architecture, will not support binding"
+#         define sched_setaffinity(pid, lg, mask) (errno = ENOSYS, -1)
+#       endif
+#    endif
+#    ifndef sched_setaffinity
+       _syscall3(int, sched_setaffinity, pid_t, pid, unsigned int, lg, const void *, mask)
+#    endif
+#    ifndef __NR_sched_getaffinity
+#       ifdef __i386__
+#         define __NR_sched_getaffinity 242
+#       elif defined(__x86_64__)
+#         define __NR_sched_getaffinity 204
+#       elif defined(__ia64__)
+#         define __NR_sched_getaffinity 1232
+#       elif defined(__hppa__)
+#         define __NR_sched_getaffinity 212
+#       elif defined(__alpha__)
+#         define __NR_sched_getaffinity 396
+#       elif defined(__s390__)
+#         define __NR_sched_getaffinity 240
+#       elif defined(__sparc__)
+#         define __NR_sched_getaffinity 260
+#       elif defined(__m68k__)
+#         define __NR_sched_getaffinity 312
+#       elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
+#         define __NR_sched_getaffinity 223
+#       elif defined(__arm__)
+#         define __NR_sched_getaffinity 242
+#       elif defined(__cris__)
+#         define __NR_sched_getaffinity 242
+/*#       elif defined(__mips__)
+  #         define __NR_sched_getaffinity TODO (32/64/nabi) */
+#       else
+#         warning "don't know the syscall number for sched_getaffinity on this architecture, will not support getting binding"
+#         define sched_getaffinity(pid, lg, mask) (errno = ENOSYS, -1)
+#       endif
+#    endif
+#    ifndef sched_getaffinity
+       _syscall3(int, sched_getaffinity, pid_t, pid, unsigned int, lg, void *, mask)
+#    endif
+#endif
+
+/* Added for ntohl() */
+#include <arpa/inet.h>
+
+#ifdef HAVE_OPENAT
+/* Use our own filesystem functions if we have openat */
+
+/* Validate the root descriptor and make path usable with the *at() calls.
+ * Returns a pointer inside path, past all of its leading '/' characters,
+ * or NULL with errno set to EBADF when fsroot_fd is negative. */
+static const char *
+hwloc_checkat(const char *path, int fsroot_fd)
+{
+  const char *cursor = path;
+
+  if (fsroot_fd < 0) {
+    errno = EBADF;
+    return NULL;
+  }
+
+  /* Drop every leading slash so that the remainder is relative to fsroot_fd. */
+  while (*cursor == '/')
+    cursor++;
+
+  return cursor;
+}
+
+/* Open path read-only, resolved relative to the fsroot_fd directory.
+ * Returns the new file descriptor, or -1 when hwloc_checkat() rejects the
+ * arguments or openat() fails. */
+static int
+hwloc_openat(const char *path, int fsroot_fd)
+{
+  const char *relative_path = hwloc_checkat(path, fsroot_fd);
+
+  return relative_path ? openat (fsroot_fd, relative_path, O_RDONLY) : -1;
+}
+
+/* fopen()-like helper resolving path relative to the fsroot_fd directory.
+ * Only the "r" mode is supported; any other mode fails with ENOTSUP.
+ * Returns the stream, or NULL on error with errno set. */
+static FILE *
+hwloc_fopenat(const char *path, const char *mode, int fsroot_fd)
+{
+  FILE *file;
+  int fd;
+
+  if (strcmp(mode, "r")) {
+    errno = ENOTSUP;
+    return NULL;
+  }
+
+  fd = hwloc_openat (path, fsroot_fd);
+  if (fd == -1)
+    return NULL;
+
+  file = fdopen(fd, mode);
+  if (!file) {
+    /* fdopen() does not take ownership of fd when it fails:
+     * close it ourselves, and keep the errno reported by fdopen() */
+    int saved_errno = errno;
+    close(fd);
+    errno = saved_errno;
+  }
+
+  return file;
+}
+
+/* access()-like check of path, resolved relative to the fsroot_fd directory.
+ * Returns what faccessat() returns (called with flags 0), or -1 when
+ * hwloc_checkat() rejects the arguments. */
+static int
+hwloc_accessat(const char *path, int mode, int fsroot_fd)
+{
+  const char *relative_path = hwloc_checkat(path, fsroot_fd);
+
+  return relative_path ? faccessat(fsroot_fd, relative_path, mode, 0) : -1;
+}
+
+/* stat()-like query of path, resolved relative to the fsroot_fd directory.
+ * flags is handed to fstatat() unchanged.
+ * Returns what fstatat() returns, or -1 when hwloc_checkat() rejects the
+ * arguments. */
+static int
+hwloc_fstatat(const char *path, struct stat *st, int flags, int fsroot_fd)
+{
+  const char *relative_path = hwloc_checkat(path, fsroot_fd);
+
+  return relative_path ? fstatat(fsroot_fd, relative_path, st, flags) : -1;
+}
+
+/* opendir()-like helper resolving path relative to the fsroot_fd directory.
+ * Returns the directory stream, or NULL on error with errno set. */
+static DIR*
+hwloc_opendirat(const char *path, int fsroot_fd)
+{
+  DIR *dir;
+  int dir_fd;
+  const char *relative_path;
+
+  relative_path = hwloc_checkat(path, fsroot_fd);
+  if (!relative_path)
+    return NULL;
+
+  dir_fd = openat(fsroot_fd, relative_path, O_RDONLY | O_DIRECTORY);
+  if (dir_fd < 0)
+    return NULL;
+
+  dir = fdopendir(dir_fd);
+  if (!dir) {
+    /* fdopendir() only owns dir_fd on success:
+     * close it ourselves, and keep the errno reported by fdopendir() */
+    int saved_errno = errno;
+    close(dir_fd);
+    errno = saved_errno;
+  }
+
+  return dir;
+}
+
+#endif /* HAVE_OPENAT */
+
+/* Static inline wrapper around open(): goes through hwloc_openat() relative
+   to the root descriptor d when HAVE_OPENAT is defined, and through a plain
+   read-only open() otherwise, while preserving compiler parameter checking */
+static __hwloc_inline int
+hwloc_open(const char *p, int d __hwloc_attribute_unused)
+{
+  int fd;
+
+#ifdef HAVE_OPENAT
+  fd = hwloc_openat(p, d);
+#else
+  fd = open(p, O_RDONLY);
+#endif
+  return fd;
+}
+
+/* Static inline wrapper around fopen(): goes through hwloc_fopenat() relative
+   to the root descriptor d when HAVE_OPENAT is defined, and through a plain
+   fopen() otherwise */
+static __hwloc_inline FILE *
+hwloc_fopen(const char *p, const char *m, int d __hwloc_attribute_unused)
+{
+  FILE *stream;
+
+#ifdef HAVE_OPENAT
+  stream = hwloc_fopenat(p, m, d);
+#else
+  stream = fopen(p, m);
+#endif
+  return stream;
+}
+
+/* Static inline wrapper around access(): goes through hwloc_accessat()
+   relative to the root descriptor d when HAVE_OPENAT is defined, and through
+   a plain access() otherwise, while preserving compiler parameter checking */
+static __hwloc_inline int
+hwloc_access(const char *p, int m, int d __hwloc_attribute_unused)
+{
+  int status;
+
+#ifdef HAVE_OPENAT
+  status = hwloc_accessat(p, m, d);
+#else
+  status = access(p, m);
+#endif
+  return status;
+}
+
+/* Static inline wrapper around stat(): goes through hwloc_fstatat() with no
+   flag, relative to the root descriptor d, when HAVE_OPENAT is defined, and
+   through a plain stat() otherwise */
+static __hwloc_inline int
+hwloc_stat(const char *p, struct stat *st, int d __hwloc_attribute_unused)
+{
+  int status;
+
+#ifdef HAVE_OPENAT
+  status = hwloc_fstatat(p, st, 0, d);
+#else
+  status = stat(p, st);
+#endif
+  return status;
+}
+
+/* Static inline wrapper around lstat(): goes through hwloc_fstatat() with
+   AT_SYMLINK_NOFOLLOW, relative to the root descriptor d, when HAVE_OPENAT is
+   defined, and through a plain lstat() otherwise */
+static __hwloc_inline int
+hwloc_lstat(const char *p, struct stat *st, int d __hwloc_attribute_unused)
+{
+  int status;
+
+#ifdef HAVE_OPENAT
+  status = hwloc_fstatat(p, st, AT_SYMLINK_NOFOLLOW, d);
+#else
+  status = lstat(p, st);
+#endif
+  return status;
+}
+
+/* Static inline wrapper around opendir(): goes through hwloc_opendirat()
+   relative to the root descriptor d when HAVE_OPENAT is defined, and through
+   a plain opendir() otherwise, while preserving compiler parameter checking */
+static __hwloc_inline DIR *
+hwloc_opendir(const char *p, int d __hwloc_attribute_unused)
+{
+  DIR *dir;
+
+#ifdef HAVE_OPENAT
+  dir = hwloc_opendirat(p, d);
+#else
+  dir = opendir(p);
+#endif
+  return dir;
+}
+
+
+/*****************************
+ ******* CpuBind Hooks *******
+ *****************************/
+
+/* Bind the thread identified by tid to the CPUs listed in hwloc_set.
+ * Three implementations are selected at build time: dynamically-sized
+ * cpu_set_t, fixed-size cpu_set_t, or a single unsigned long mask passed to
+ * the hand-made syscall stub. Without any of them the call fails with ENOSYS.
+ * Returns 0 on success, -1 on error with errno set (EINVAL when hwloc_set has
+ * no last bit, ENOMEM when the temporary CPU set cannot be allocated).
+ */
+int
+hwloc_linux_set_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_const_bitmap_t hwloc_set __hwloc_attribute_unused)
+{
+  /* The resulting binding is always strict */
+
+#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
+  cpu_set_t *plinux_set;
+  unsigned cpu;
+  int last;
+  size_t setsize;
+  int err;
+
+  last = hwloc_bitmap_last(hwloc_set);
+  if (last == -1) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  setsize = CPU_ALLOC_SIZE(last+1);
+  plinux_set = CPU_ALLOC(last+1);
+  if (!plinux_set) {
+    /* CPU_ALLOC() failed, do not dereference NULL in CPU_ZERO_S() below */
+    errno = ENOMEM;
+    return -1;
+  }
+
+  CPU_ZERO_S(setsize, plinux_set);
+  hwloc_bitmap_foreach_begin(cpu, hwloc_set)
+    CPU_SET_S(cpu, setsize, plinux_set);
+  hwloc_bitmap_foreach_end();
+
+  err = sched_setaffinity(tid, setsize, plinux_set);
+
+  CPU_FREE(plinux_set);
+  return err;
+#elif defined(HWLOC_HAVE_CPU_SET)
+  cpu_set_t linux_set;
+  unsigned cpu;
+
+  CPU_ZERO(&linux_set);
+  hwloc_bitmap_foreach_begin(cpu, hwloc_set)
+    CPU_SET(cpu, &linux_set);
+  hwloc_bitmap_foreach_end();
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+  return sched_setaffinity(tid, &linux_set);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+  return sched_setaffinity(tid, sizeof(linux_set), &linux_set);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+#elif defined(HWLOC_HAVE__SYSCALL3)
+  /* only the first unsigned long of the bitmap can be passed this way */
+  unsigned long mask = hwloc_bitmap_to_ulong(hwloc_set);
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+  return sched_setaffinity(tid, (void*) &mask);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+  return sched_setaffinity(tid, sizeof(mask), (void*) &mask);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+#else /* !_SYSCALL3 */
+  errno = ENOSYS;
+  return -1;
+#endif /* !_SYSCALL3 */
+}
+
+#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
+/* Parse a list such as "0-3,8,10-11" from file into set.
+ * set is cleared first. Each item is a decimal number or a "first-last"
+ * range; items are separated by ',' and the list ends at EOF, at a newline,
+ * or as soon as no number can be read.
+ * Returns 0 on success, -1 with errno set to EINVAL on a malformed list
+ * (set then keeps whatever was parsed before the error).
+ */
+static int
+hwloc_linux_parse_cpuset_file(FILE *file, hwloc_bitmap_t set)
+{
+  unsigned long first, last;
+  int sep;
+
+  /* start from an empty set */
+  hwloc_bitmap_zero(set);
+
+  for (;;) {
+    if (fscanf(file, "%lu", &first) != 1)
+      break;
+
+    /* a single number is a range of one element */
+    last = first;
+    sep = fgetc(file);
+
+    if (sep == '-') {
+      /* a range: the upper bound is mandatory */
+      if (fscanf(file, "%lu", &last) != 1) {
+        errno = EINVAL;
+        return -1;
+      }
+      sep = fgetc(file);
+    }
+
+    /* only EOF, EOL, or a comma may follow an item */
+    if (sep != EOF && sep != '\n' && sep != ',') {
+      errno = EINVAL;
+      return -1;
+    }
+
+    hwloc_bitmap_set_range(set, first, last);
+
+    /* anything but a comma ends the list */
+    if (sep != ',')
+      break;
+  }
+
+  return 0;
+}
+
+/*
+ * On some kernels, sched_getaffinity requires the output size to be larger
+ * than the kernel cpu_set size (defined by CONFIG_NR_CPUS).
+ * Try sched_getaffinity on ourself until we find a nr_cpus value that makes
+ * the kernel happy.
+ *
+ * The first guess is the larger of what the topology's complete_cpuset and
+ * /sys/devices/system/cpu/possible report (or 1 when neither helps); it is
+ * then doubled until sched_getaffinity() succeeds. The result is cached in a
+ * function-local static, so later calls return immediately.
+ */
+static int
+hwloc_linux_find_kernel_nr_cpus(hwloc_topology_t topology)
+{
+  static int cached_nr_cpus = -1;
+  int candidate = cached_nr_cpus;
+  FILE *fp;
+
+  /* reuse the value computed by an earlier call */
+  if (candidate != -1)
+    return candidate;
+
+  /* first guess: a size that may contain the whole topology */
+  if (topology->levels[0][0]->complete_cpuset)
+    candidate = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset) + 1;
+  /* the topology isn't ready yet (complete_cpuset is missing (-1) or empty (0)): start from scratch */
+  if (candidate <= 0)
+    candidate = 1;
+
+  /* second guess: the highest possible CPU announced by the kernel */
+  fp = fopen("/sys/devices/system/cpu/possible", "r");
+  if (fp) {
+    hwloc_bitmap_t possible_cpus = hwloc_bitmap_alloc();
+
+    if (!hwloc_linux_parse_cpuset_file(fp, possible_cpus)) {
+      int highest = hwloc_bitmap_last(possible_cpus);
+
+      hwloc_debug_bitmap("possible CPUs are %s\n", possible_cpus);
+
+      if (highest >= candidate)
+        candidate = highest + 1;
+    }
+    hwloc_bitmap_free(possible_cpus);
+    fclose(fp);
+  }
+
+  /* grow the set until the kernel accepts its size */
+  for (;;) {
+    size_t setsize = CPU_ALLOC_SIZE(candidate);
+    cpu_set_t *set = CPU_ALLOC(candidate);
+    int err = sched_getaffinity(0, setsize, set); /* always works, unless setsize is too small */
+
+    CPU_FREE(set);
+    candidate = setsize * 8; /* that's the value that was actually tested */
+    if (!err) {
+      /* found it, remember it for next time */
+      cached_nr_cpus = candidate;
+      return candidate;
+    }
+    candidate *= 2;
+  }
+}
+#endif
+
+/* Store in hwloc_set the CPUs that the thread identified by tid is bound to.
+ * Three implementations are selected at build time: dynamically-sized
+ * cpu_set_t, fixed-size cpu_set_t, or a single unsigned long mask read through
+ * the hand-made syscall stub. Without any of them the call fails with ENOSYS.
+ * Returns 0 on success, -1 on error with errno set (ENOMEM when the temporary
+ * CPU set cannot be allocated, otherwise whatever sched_getaffinity() set).
+ */
+int
+hwloc_linux_get_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_bitmap_t hwloc_set __hwloc_attribute_unused)
+{
+  int err __hwloc_attribute_unused;
+
+#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
+  cpu_set_t *plinux_set;
+  unsigned cpu;
+  int last;
+  size_t setsize;
+  int kernel_nr_cpus;
+
+  /* find the kernel nr_cpus so as to use a large enough cpu_set size */
+  kernel_nr_cpus = hwloc_linux_find_kernel_nr_cpus(topology);
+  setsize = CPU_ALLOC_SIZE(kernel_nr_cpus);
+  plinux_set = CPU_ALLOC(kernel_nr_cpus);
+  if (!plinux_set) {
+    /* CPU_ALLOC() failed, do not hand a NULL buffer to the kernel */
+    errno = ENOMEM;
+    return -1;
+  }
+
+  err = sched_getaffinity(tid, setsize, plinux_set);
+
+  if (err < 0) {
+    CPU_FREE(plinux_set);
+    return -1;
+  }
+
+  last = -1;
+  if (topology->levels[0][0]->complete_cpuset)
+    last = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset);
+  if (last == -1)
+    /* round the maximal support number, the topology isn't ready yet (complete_cpuset is missing or empty)*/
+    last = kernel_nr_cpus-1;
+
+  /* only report CPUs up to the last one that was selected above */
+  hwloc_bitmap_zero(hwloc_set);
+  for(cpu=0; cpu<=(unsigned) last; cpu++)
+    if (CPU_ISSET_S(cpu, setsize, plinux_set))
+      hwloc_bitmap_set(hwloc_set, cpu);
+
+  CPU_FREE(plinux_set);
+#elif defined(HWLOC_HAVE_CPU_SET)
+  cpu_set_t linux_set;
+  unsigned cpu;
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+  err = sched_getaffinity(tid, &linux_set);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+  err = sched_getaffinity(tid, sizeof(linux_set), &linux_set);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+  if (err < 0)
+    return -1;
+
+  hwloc_bitmap_zero(hwloc_set);
+  for(cpu=0; cpu<CPU_SETSIZE; cpu++)
+    if (CPU_ISSET(cpu, &linux_set))
+      hwloc_bitmap_set(hwloc_set, cpu);
+#elif defined(HWLOC_HAVE__SYSCALL3)
+  unsigned long mask;
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+  err = sched_getaffinity(tid, (void*) &mask);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+  err = sched_getaffinity(tid, sizeof(mask), (void*) &mask);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+  if (err < 0)
+    return -1;
+
+  hwloc_bitmap_from_ulong(hwloc_set, mask);
+#else /* !_SYSCALL3 */
+  errno = ENOSYS;
+  return -1;
+#endif /* !_SYSCALL3 */
+
+  return 0;
+}
+
+/* Get the array of tids of a process from the task directory in /proc.
+ * taskdir is rewound and read entirely; every entry except "." and ".." is
+ * converted with atoi().
+ * On success, returns 0 and stores the number of tids in *nr_tidsp and a
+ * malloc'ed array in *tidsp (the caller must free() it).
+ * On allocation failure, returns -1 with errno set to ENOMEM and leaves both
+ * outputs untouched.
+ */
+static int
+hwloc_linux_get_proc_tids(DIR *taskdir, unsigned *nr_tidsp, pid_t ** tidsp)
+{
+  struct dirent *dirent;
+  unsigned nr_tids = 0;
+  unsigned max_tids = 32;
+  pid_t *tids;
+  struct stat sb;
+
+  /* take the number of links as a good estimate for the number of tids,
+   * unless there is none: malloc(0) is allowed to return NULL, which would
+   * be mistaken for an allocation failure below */
+  if (fstat(dirfd(taskdir), &sb) == 0 && sb.st_nlink > 0)
+    max_tids = sb.st_nlink;
+
+  tids = malloc(max_tids*sizeof(pid_t));
+  if (!tids) {
+    errno = ENOMEM;
+    return -1;
+  }
+
+  rewinddir(taskdir);
+
+  while ((dirent = readdir(taskdir)) != NULL) {
+    /* ignore the dot entries before growing, they never need a slot */
+    if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
+      continue;
+    if (nr_tids == max_tids) {
+      pid_t *newtids;
+      max_tids += 8;
+      newtids = realloc(tids, max_tids*sizeof(pid_t));
+      if (!newtids) {
+        free(tids);
+        errno = ENOMEM;
+        return -1;
+      }
+      tids = newtids;
+    }
+    tids[nr_tids++] = atoi(dirent->d_name);
+  }
+
+  *nr_tidsp = nr_tids;
+  *tidsp = tids;
+  return 0;
+}
+
+/* Per-tid callbacks: invoked by hwloc_linux_foreach_proc_tid() once per
+ * thread, with the tid, the caller-provided data pointer, and the index of
+ * that tid in the current thread list. A negative return value is counted as
+ * a failure for that thread, and errno is then recorded by the caller. */
+typedef int (*hwloc_linux_foreach_proc_tid_cb_t)(hwloc_topology_t topology, pid_t tid, void *data, int idx);
+
+/* Apply cb to every thread of process pid (of the current process if pid is 0).
+ * The thread list is read from /proc/<pid>/task (or /proc/self/task), the
+ * callback is applied to each tid, then the list is read again: if it changed
+ * in the meantime, or if the callback failed for some threads only, the whole
+ * pass is retried with the new list. After more than 10 retries the function
+ * gives up with errno set to EAGAIN.
+ * Returns 0 when the callback succeeded for all threads of a stable list,
+ * -1 otherwise with errno set (EINVAL if the task directory does not exist,
+ * the last callback errno if the callback failed for all threads).
+ */
+static int
+hwloc_linux_foreach_proc_tid(hwloc_topology_t topology,
+			     pid_t pid, hwloc_linux_foreach_proc_tid_cb_t cb,
+			     void *data)
+{
+  char taskdir_path[128];
+  DIR *taskdir;
+  pid_t *tids, *newtids;
+  /* NOTE(review): failed_errno is unsigned while errno is an int */
+  unsigned i, nr, newnr, failed = 0, failed_errno = 0;
+  unsigned retrynr = 0;
+  int err;
+
+  if (pid)
+    snprintf(taskdir_path, sizeof(taskdir_path), "/proc/%u/task", (unsigned) pid);
+  else
+    snprintf(taskdir_path, sizeof(taskdir_path), "/proc/self/task");
+
+  taskdir = opendir(taskdir_path);
+  if (!taskdir) {
+    /* a missing task directory is reported as EINVAL */
+    if (errno == ENOENT)
+      errno = EINVAL;
+    err = -1;
+    goto out;
+  }
+
+  /* read the current list of threads */
+  err = hwloc_linux_get_proc_tids(taskdir, &nr, &tids);
+  if (err < 0)
+    goto out_with_dir;
+
+ retry:
+  /* apply the callback to all threads */
+  failed=0;
+  for(i=0; i<nr; i++) {
+    err = cb(topology, tids[i], data, i);
+    if (err < 0) {
+      failed++;
+      failed_errno = errno;
+    }
+  }
+
+  /* re-read the list of thread */
+  err = hwloc_linux_get_proc_tids(taskdir, &newnr, &newtids);
+  if (err < 0)
+    goto out_with_tids;
+  /* retry if the list changed in the meantime, or we failed for *some* threads only.
+   * if we're really unlucky, all threads changed but we got the same set of tids. no way to support this.
+   */
+  if (newnr != nr || memcmp(newtids, tids, nr*sizeof(pid_t)) || (failed && failed != nr)) {
+    /* the new list replaces the old one, so out_with_tids frees the right array */
+    free(tids);
+    tids = newtids;
+    nr = newnr;
+    if (++retrynr > 10) {
+      /* we tried 10 times, it didn't work, the application is probably creating/destroying many threads, stop trying */
+      errno = EAGAIN;
+      err = -1;
+      goto out_with_tids;
+    }
+    goto retry;
+  } else {
+    free(newtids);
+  }
+
+  /* if all threads failed, return the last errno. */
+  if (failed) {
+    err = -1;
+    errno = failed_errno;
+    goto out_with_tids;
+  }
+
+  err = 0;
+ out_with_tids:
+  free(tids);
+ out_with_dir:
+  closedir(taskdir);
+ out:
+  return err;
+}
+
+/* Per-tid callback used when binding a whole process.
+ * data is the hwloc_bitmap_t that every thread gets bound to;
+ * the thread index is not needed. */
+static int
+hwloc_linux_foreach_proc_tid_set_cpubind_cb(hwloc_topology_t topology, pid_t tid, void *data, int idx __hwloc_attribute_unused)
+{
+  hwloc_bitmap_t wanted = (hwloc_bitmap_t) data;
+
+  return hwloc_linux_set_tid_cpubind(topology, tid, wanted);
+}
+
+/* Bind all threads of process pid to hwloc_set by running the set_cpubind
+ * callback over its thread list. flags is ignored.
+ * Returns what hwloc_linux_foreach_proc_tid() returns. */
+static int
+hwloc_linux_set_pid_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+  /* the callback only reads the bitmap, the const is dropped for the generic data pointer */
+  void *cb_data = (void*) hwloc_set;
+
+  return hwloc_linux_foreach_proc_tid(topology, pid,
+				      hwloc_linux_foreach_proc_tid_set_cpubind_cb,
+				      cb_data);
+}
+
+/* Per-tid proc_get_cpubind callback data, callback function and caller */
+struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s {
+  hwloc_bitmap_t cpuset; /* result for the whole process, accumulated across threads */
+  hwloc_bitmap_t tidset; /* scratch bitmap receiving the binding of the thread being visited */
+  int flags; /* binding flags given by the caller, only HWLOC_CPUBIND_STRICT is looked at by the callback */
+};
+
+/* Per-tid callback used when querying the binding of a whole process.
+ * Reads the binding of tid into data->tidset and merges it into data->cpuset:
+ * - with HWLOC_CPUBIND_STRICT, the first thread defines the result and any
+ *   other thread with a different binding makes the callback fail with EXDEV;
+ * - otherwise the result is the union of all thread bindings.
+ * Returns 0 on success, -1 on error.
+ */
+static int
+hwloc_linux_foreach_proc_tid_get_cpubind_cb(hwloc_topology_t topology, pid_t tid, void *_data, int idx)
+{
+  struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s *data = _data;
+  const int strict = data->flags & HWLOC_CPUBIND_STRICT;
+
+  if (hwloc_linux_get_tid_cpubind(topology, tid, data->tidset))
+    return -1;
+
+  if (idx == 0) {
+    /* first thread: start from an empty result */
+    hwloc_bitmap_zero(data->cpuset);
+    if (strict) {
+      /* its binding becomes the reference for the other threads */
+      hwloc_bitmap_copy(data->cpuset, data->tidset);
+      return 0;
+    }
+  }
+
+  if (!strict) {
+    /* just accumulate the thread bindings */
+    hwloc_bitmap_or(data->cpuset, data->cpuset, data->tidset);
+    return 0;
+  }
+
+  /* STRICT and not the first thread: its binding must match the reference */
+  if (!hwloc_bitmap_isequal(data->cpuset, data->tidset)) {
+    errno = EXDEV;
+    return -1;
+  }
+  return 0;
+}
+
+/* Store in hwloc_set the binding of process pid, computed from the binding
+ * of each of its threads (see the get_cpubind callback for how flags changes
+ * the merge).
+ * Returns 0 on success, -1 on error with errno set (ENOMEM when the scratch
+ * bitmap cannot be allocated).
+ */
+static int
+hwloc_linux_get_pid_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
+{
+  struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s data;
+  hwloc_bitmap_t tidset = hwloc_bitmap_alloc();
+  int ret;
+
+  if (!tidset) {
+    /* the callback writes into tidset, it must not be NULL */
+    errno = ENOMEM;
+    return -1;
+  }
+
+  data.cpuset = hwloc_set;
+  data.tidset = tidset;
+  data.flags = flags;
+  ret = hwloc_linux_foreach_proc_tid(topology, pid,
+				     hwloc_linux_foreach_proc_tid_get_cpubind_cb,
+				     (void*) &data);
+  hwloc_bitmap_free(tidset);
+  return ret;
+}
+
+/* Bind process pid (the topology's pid when pid is 0) to hwloc_set.
+ * With HWLOC_CPUBIND_THREAD, pid is used as a thread id and only that thread
+ * is bound; otherwise all threads of the process are bound. */
+static int
+hwloc_linux_set_proc_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+  pid_t target = pid ? pid : topology->pid;
+
+  if (!(flags & HWLOC_CPUBIND_THREAD))
+    return hwloc_linux_set_pid_cpubind(topology, target, hwloc_set, flags);
+
+  return hwloc_linux_set_tid_cpubind(topology, target, hwloc_set);
+}
+
+/* Query the binding of process pid (the topology's pid when pid is 0).
+ * With HWLOC_CPUBIND_THREAD, pid is used as a thread id and only that thread
+ * is queried; otherwise the bindings of all threads are merged. */
+static int
+hwloc_linux_get_proc_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
+{
+  pid_t target = pid ? pid : topology->pid;
+
+  if (!(flags & HWLOC_CPUBIND_THREAD))
+    return hwloc_linux_get_pid_cpubind(topology, target, hwloc_set, flags);
+
+  return hwloc_linux_get_tid_cpubind(topology, target, hwloc_set);
+}
+
+/* Bind all threads of the process designated by topology->pid to hwloc_set. */
+static int
+hwloc_linux_set_thisproc_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+  pid_t target = topology->pid;
+
+  return hwloc_linux_set_pid_cpubind(topology, target, hwloc_set, flags);
+}
+
+/* Query the merged binding of all threads of the process designated by
+ * topology->pid. */
+static int
+hwloc_linux_get_thisproc_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags)
+{
+  pid_t target = topology->pid;
+
+  return hwloc_linux_get_pid_cpubind(topology, target, hwloc_set, flags);
+}
+
+/* Bind the calling thread (tid 0) to hwloc_set.
+ * Fails with ENOSYS when topology->pid is non-zero. flags is ignored. */
+static int
+hwloc_linux_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+  if (!topology->pid)
+    return hwloc_linux_set_tid_cpubind(topology, 0, hwloc_set);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+/* Query the binding of the calling thread (tid 0).
+ * Fails with ENOSYS when topology->pid is non-zero. flags is ignored. */
+static int
+hwloc_linux_get_thisthread_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+  if (!topology->pid)
+    return hwloc_linux_get_tid_cpubind(topology, 0, hwloc_set);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+#if HAVE_DECL_PTHREAD_SETAFFINITY_NP
+#pragma weak pthread_setaffinity_np
+#pragma weak pthread_self
+
+/* Bind the pthread tid to the CPUs listed in hwloc_set.
+ * pthread_self and pthread_setaffinity_np are weak symbols: when they are not
+ * available at runtime the call fails with ENOSYS. The calling thread is
+ * handled through hwloc_linux_set_tid_cpubind() instead of
+ * pthread_setaffinity_np().
+ * Returns 0 on success, -1 on error with errno set (ENOSYS when topology->pid
+ * is non-zero, EINVAL when hwloc_set has no last bit, ENOMEM when the
+ * temporary CPU set cannot be allocated, or the error code returned by
+ * pthread_setaffinity_np()). flags is ignored.
+ */
+static int
+hwloc_linux_set_thread_cpubind(hwloc_topology_t topology, pthread_t tid, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+  int err;
+
+  if (topology->pid) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  if (!pthread_self) {
+    /* ?! Application uses set_thread_cpubind, but doesn't link against libpthread ?! */
+    errno = ENOSYS;
+    return -1;
+  }
+  if (tid == pthread_self())
+    return hwloc_linux_set_tid_cpubind(topology, 0, hwloc_set);
+
+  if (!pthread_setaffinity_np) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
+  /* Use a separate block so that we can define specific variable
+     types here */
+  {
+     cpu_set_t *plinux_set;
+     unsigned cpu;
+     int last;
+     size_t setsize;
+
+     last = hwloc_bitmap_last(hwloc_set);
+     if (last == -1) {
+       errno = EINVAL;
+       return -1;
+     }
+
+     setsize = CPU_ALLOC_SIZE(last+1);
+     plinux_set = CPU_ALLOC(last+1);
+     if (!plinux_set) {
+       /* CPU_ALLOC() failed, do not dereference NULL in CPU_ZERO_S() below */
+       errno = ENOMEM;
+       return -1;
+     }
+
+     CPU_ZERO_S(setsize, plinux_set);
+     hwloc_bitmap_foreach_begin(cpu, hwloc_set)
+         CPU_SET_S(cpu, setsize, plinux_set);
+     hwloc_bitmap_foreach_end();
+
+     err = pthread_setaffinity_np(tid, setsize, plinux_set);
+
+     CPU_FREE(plinux_set);
+  }
+#elif defined(HWLOC_HAVE_CPU_SET)
+  /* Use a separate block so that we can define specific variable
+     types here */
+  {
+     cpu_set_t linux_set;
+     unsigned cpu;
+
+     CPU_ZERO(&linux_set);
+     hwloc_bitmap_foreach_begin(cpu, hwloc_set)
+         CPU_SET(cpu, &linux_set);
+     hwloc_bitmap_foreach_end();
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+     err = pthread_setaffinity_np(tid, &linux_set);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+     err = pthread_setaffinity_np(tid, sizeof(linux_set), &linux_set);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+  }
+#else /* CPU_SET */
+  /* Use a separate block so that we can define specific variable
+     types here */
+  {
+      unsigned long mask = hwloc_bitmap_to_ulong(hwloc_set);
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+      err = pthread_setaffinity_np(tid, (void*) &mask);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+      err = pthread_setaffinity_np(tid, sizeof(mask), (void*) &mask);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+  }
+#endif /* CPU_SET */
+
+  /* pthread functions return the error code instead of setting errno */
+  if (err) {
+    errno = err;
+    return -1;
+  }
+  return 0;
+}
+#endif /* HAVE_DECL_PTHREAD_SETAFFINITY_NP */
+
+#if HAVE_DECL_PTHREAD_GETAFFINITY_NP
+#pragma weak pthread_getaffinity_np
+#pragma weak pthread_self
+
+/* Store in hwloc_set the CPUs that the pthread tid is bound to.
+ * pthread_self and pthread_getaffinity_np are weak symbols: when they are not
+ * available at runtime the call fails with ENOSYS. The calling thread is
+ * handled through hwloc_linux_get_tid_cpubind() instead of
+ * pthread_getaffinity_np().
+ * Returns 0 on success, -1 on error with errno set (ENOSYS when topology->pid
+ * is non-zero, ENOMEM when the temporary CPU set cannot be allocated, or the
+ * error code returned by pthread_getaffinity_np()). flags is ignored.
+ */
+static int
+hwloc_linux_get_thread_cpubind(hwloc_topology_t topology, pthread_t tid, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+  int err;
+
+  if (topology->pid) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  if (!pthread_self) {
+    /* ?! Application uses get_thread_cpubind, but doesn't link against libpthread ?! */
+    errno = ENOSYS;
+    return -1;
+  }
+  if (tid == pthread_self())
+    return hwloc_linux_get_tid_cpubind(topology, 0, hwloc_set);
+
+  if (!pthread_getaffinity_np) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
+  /* Use a separate block so that we can define specific variable
+     types here */
+  {
+     cpu_set_t *plinux_set;
+     unsigned cpu;
+     int last;
+     size_t setsize;
+
+     /* the set is sized after the topology here, the topology must be ready */
+     last = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset);
+     assert (last != -1);
+
+     setsize = CPU_ALLOC_SIZE(last+1);
+     plinux_set = CPU_ALLOC(last+1);
+     if (!plinux_set) {
+       /* CPU_ALLOC() failed, do not hand a NULL buffer to pthread_getaffinity_np() */
+       errno = ENOMEM;
+       return -1;
+     }
+
+     err = pthread_getaffinity_np(tid, setsize, plinux_set);
+     if (err) {
+        CPU_FREE(plinux_set);
+        errno = err;
+        return -1;
+     }
+
+     hwloc_bitmap_zero(hwloc_set);
+     for(cpu=0; cpu<=(unsigned) last; cpu++)
+       if (CPU_ISSET_S(cpu, setsize, plinux_set))
+	 hwloc_bitmap_set(hwloc_set, cpu);
+
+     CPU_FREE(plinux_set);
+  }
+#elif defined(HWLOC_HAVE_CPU_SET)
+  /* Use a separate block so that we can define specific variable
+     types here */
+  {
+     cpu_set_t linux_set;
+     unsigned cpu;
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+     err = pthread_getaffinity_np(tid, &linux_set);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+     err = pthread_getaffinity_np(tid, sizeof(linux_set), &linux_set);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+     if (err) {
+        errno = err;
+        return -1;
+     }
+
+     hwloc_bitmap_zero(hwloc_set);
+     for(cpu=0; cpu<CPU_SETSIZE; cpu++)
+       if (CPU_ISSET(cpu, &linux_set))
+	 hwloc_bitmap_set(hwloc_set, cpu);
+  }
+#else /* CPU_SET */
+  /* Use a separate block so that we can define specific variable
+     types here */
+  {
+      unsigned long mask;
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+      err = pthread_getaffinity_np(tid, (void*) &mask);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+      err = pthread_getaffinity_np(tid, sizeof(mask), (void*) &mask);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+      if (err) {
+        errno = err;
+        return -1;
+      }
+
+     hwloc_bitmap_from_ulong(hwloc_set, mask);
+  }
+#endif /* CPU_SET */
+
+  return 0;
+}
+#endif /* HAVE_DECL_PTHREAD_GETAFFINITY_NP */
+
+/* Store in set the single CPU where thread tid last ran, as reported by the
+ * 39th field of /proc/<tid>/stat. A zero tid means the calling thread (which
+ * requires SYS_gettid).
+ * Returns 0 on success, -1 with errno set to ENOSYS when the file cannot be
+ * read or does not have the expected layout.
+ */
+int
+hwloc_linux_get_tid_last_cpu_location(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid, hwloc_bitmap_t set)
+{
+  /* read /proc/pid/stat.
+   * its second field contains the command name between parentheses,
+   * and the command itself may contain parentheses,
+   * so read the whole line and find the last closing parenthesis to find the third field.
+   */
+  char buf[1024] = "";
+  char name[64];
+  char *tmp;
+  FILE *file;
+  int i;
+
+  if (!tid) {
+#ifdef SYS_gettid
+    tid = syscall(SYS_gettid);
+#else
+    errno = ENOSYS;
+    return -1;
+#endif
+  }
+
+  snprintf(name, sizeof(name), "/proc/%lu/stat", (unsigned long) tid);
+  file = fopen(name, "r");
+  if (!file) {
+    errno = ENOSYS;
+    return -1;
+  }
+  tmp = fgets(buf, sizeof(buf), file);
+  fclose(file);
+  if (!tmp) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  tmp = strrchr(buf, ')');
+  if (!tmp || tmp[1] == '\0') {
+    /* no command name, or nothing after it: skipping two characters below
+     * would step over the end of the string */
+    errno = ENOSYS;
+    return -1;
+  }
+  /* skip ') ' to find the actual third field */
+  tmp += 2;
+
+  /* skip 36 fields (the 3rd to the 38th) */
+  for(i=0; i<36; i++) {
+    tmp = strchr(tmp, ' ');
+    if (!tmp) {
+      errno = ENOSYS;
+      return -1;
+    }
+    /* skip the ' ' itself */
+    tmp++;
+  }
+
+  /* read the last cpu in the 39th field now */
+  if (sscanf(tmp, "%d ", &i) != 1) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  hwloc_bitmap_only(set, i);
+  return 0;
+}
+
+/* Per-tid proc_get_last_cpu_location callback data, callback function and caller */
+struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s {
+  hwloc_bitmap_t cpuset; /* result for the whole process, union of the per-thread locations */
+  hwloc_bitmap_t tidset; /* scratch bitmap receiving the location of the thread being visited */
+};
+
+/* Per-tid callback used when querying where a whole process last ran.
+ * Reads the last CPU of tid into data->tidset and ORs it into data->cpuset,
+ * which is emptied when visiting the first thread (idx 0).
+ * Returns 0 on success, -1 when the location of tid cannot be read. */
+static int
+hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb(hwloc_topology_t topology, pid_t tid, void *_data, int idx)
+{
+  struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s *data = _data;
+
+  if (hwloc_linux_get_tid_last_cpu_location(topology, tid, data->tidset) != 0)
+    return -1;
+
+  /* first thread: start from an empty result */
+  if (idx == 0)
+    hwloc_bitmap_zero(data->cpuset);
+
+  hwloc_bitmap_or(data->cpuset, data->cpuset, data->tidset);
+  return 0;
+}
+
+/* Store in hwloc_set the union of the CPUs where each thread of process pid
+ * last ran. flags is ignored.
+ * Returns 0 on success, -1 on error with errno set (ENOMEM when the scratch
+ * bitmap cannot be allocated).
+ */
+static int
+hwloc_linux_get_pid_last_cpu_location(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+  struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s data;
+  hwloc_bitmap_t tidset = hwloc_bitmap_alloc();
+  int ret;
+
+  if (!tidset) {
+    /* the callback writes into tidset, it must not be NULL */
+    errno = ENOMEM;
+    return -1;
+  }
+
+  data.cpuset = hwloc_set;
+  data.tidset = tidset;
+  ret = hwloc_linux_foreach_proc_tid(topology, pid,
+				     hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb,
+				     &data);
+  hwloc_bitmap_free(tidset);
+  return ret;
+}
+
+/* Query where process pid (the topology's pid when pid is 0) last ran.
+ * With HWLOC_CPUBIND_THREAD, pid is used as a thread id and only that thread
+ * is queried; otherwise the locations of all threads are merged. */
+static int
+hwloc_linux_get_proc_last_cpu_location(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
+{
+  pid_t target = pid ? pid : topology->pid;
+
+  if (!(flags & HWLOC_CPUBIND_THREAD))
+    return hwloc_linux_get_pid_last_cpu_location(topology, target, hwloc_set, flags);
+
+  return hwloc_linux_get_tid_last_cpu_location(topology, target, hwloc_set);
+}
+
+/* Query where the threads of the process designated by topology->pid last
+ * ran. */
+static int
+hwloc_linux_get_thisproc_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags)
+{
+  pid_t target = topology->pid;
+
+  return hwloc_linux_get_pid_last_cpu_location(topology, target, hwloc_set, flags);
+}
+
+/* Query where the calling thread (tid 0) last ran.
+ * Fails with ENOSYS when topology->pid is non-zero. flags is ignored. */
+static int
+hwloc_linux_get_thisthread_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+  if (!topology->pid)
+    return hwloc_linux_get_tid_last_cpu_location(topology, 0, hwloc_set);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+
+
+/***************************
+ ****** Membind hooks ******
+ ***************************/
+
+#if defined HWLOC_HAVE_SET_MEMPOLICY || defined HWLOC_HAVE_MBIND
+/* Translate a hwloc memory binding policy into the Linux MPOL_* value.
+ * DEFAULT and FIRSTTOUCH map to MPOL_DEFAULT, BIND maps to MPOL_BIND when
+ * HWLOC_MEMBIND_STRICT is set in flags and to MPOL_PREFERRED otherwise,
+ * INTERLEAVE maps to MPOL_INTERLEAVE.
+ * Returns 0 and fills *linuxpolicy on success; returns -1 with errno set to
+ * ENOSYS, leaving *linuxpolicy untouched, for any other policy.
+ */
+static int
+hwloc_linux_membind_policy_from_hwloc(int *linuxpolicy, hwloc_membind_policy_t policy, int flags)
+{
+  int translated;
+
+  if (policy == HWLOC_MEMBIND_DEFAULT || policy == HWLOC_MEMBIND_FIRSTTOUCH) {
+    translated = MPOL_DEFAULT;
+  } else if (policy == HWLOC_MEMBIND_BIND) {
+    translated = (flags & HWLOC_MEMBIND_STRICT) ? MPOL_BIND : MPOL_PREFERRED;
+  } else if (policy == HWLOC_MEMBIND_INTERLEAVE) {
+    translated = MPOL_INTERLEAVE;
+  } else {
+    /* TODO: next-touch when (if?) patch applied upstream */
+    errno = ENOSYS;
+    return -1;
+  }
+
+  *linuxpolicy = translated;
+  return 0;
+}
+
+/*
+ * Convert a hwloc nodeset into a freshly allocated array of unsigned longs
+ * suitable for the Linux memory-policy syscalls.
+ * On success returns 0, stores the array in *linuxmaskp (the caller frees it)
+ * and the number of bits it covers, a multiple of HWLOC_BITS_PER_LONG, in
+ * *max_os_index_p.  On allocation failure returns -1 with errno = ENOMEM.
+ */
+static int
+hwloc_linux_membind_mask_from_nodeset(hwloc_topology_t topology __hwloc_attribute_unused,
+				      hwloc_const_nodeset_t nodeset,
+				      unsigned *max_os_index_p, unsigned long **linuxmaskp)
+{
+  hwloc_nodeset_t replacement = NULL;
+  unsigned long *mask;
+  unsigned nbits;
+  size_t nlongs;
+  unsigned idx;
+
+  /* a full nodeset is swapped for a temporary set built with hwloc_bitmap_only(..., 0) */
+  if (hwloc_bitmap_isfull(nodeset)) {
+    replacement = hwloc_bitmap_alloc();
+    hwloc_bitmap_only(replacement, 0);
+    nodeset = replacement;
+  }
+
+  nbits = hwloc_bitmap_last(nodeset);
+  if (nbits == (unsigned) -1)
+    nbits = 0;
+  /* last index -> count (+1), then round up to a whole number of longs */
+  nbits = (nbits + 1 + HWLOC_BITS_PER_LONG - 1) & ~(HWLOC_BITS_PER_LONG - 1);
+  nlongs = nbits/HWLOC_BITS_PER_LONG;
+
+  mask = calloc(nlongs, sizeof(long));
+  if (!mask) {
+    hwloc_bitmap_free(replacement);
+    errno = ENOMEM;
+    return -1;
+  }
+
+  for (idx = 0; idx < nlongs; idx++)
+    mask[idx] = hwloc_bitmap_to_ith_ulong(nodeset, idx);
+
+  if (replacement)
+    hwloc_bitmap_free(replacement);
+
+  *linuxmaskp = mask;
+  *max_os_index_p = nbits;
+  return 0;
+}
+
+/*
+ * Rebuild a hwloc nodeset from a Linux node mask of max_os_index bits.
+ * The nodeset is cleared first, then filled one unsigned long at a time.
+ */
+static void
+hwloc_linux_membind_mask_to_nodeset(hwloc_topology_t topology __hwloc_attribute_unused,
+				    hwloc_nodeset_t nodeset,
+				    unsigned max_os_index, const unsigned long *linuxmask)
+{
+  unsigned word = 0;
+
+#ifdef HWLOC_DEBUG
+  /* max_os_index comes from hwloc_linux_find_kernel_max_numnodes() so it's a multiple of HWLOC_BITS_PER_LONG */
+  assert(!(max_os_index%HWLOC_BITS_PER_LONG));
+#endif
+
+  hwloc_bitmap_zero(nodeset);
+  while (word < max_os_index/HWLOC_BITS_PER_LONG) {
+    hwloc_bitmap_set_ith_ulong(nodeset, word, linuxmask[word]);
+    word++;
+  }
+}
+#endif /* HWLOC_HAVE_SET_MEMPOLICY || HWLOC_HAVE_MBIND */
+
+#ifdef HWLOC_HAVE_MBIND
+/*
+ * Bind the memory range [addr, addr+len) with mbind().
+ * The start address is rounded down to a page boundary and len is grown by
+ * the same amount.  Returns 0 on success and a negative value on failure.
+ */
+static int
+hwloc_linux_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  unsigned long *nodemask;
+  unsigned nodemask_bits; /* highest os_index + 1 */
+  unsigned mbind_flags = 0;
+  size_t misalign;
+  int linuxpolicy;
+  int rc;
+
+  /* align the range on the page containing addr */
+  misalign = (uintptr_t) addr & (sysconf(_SC_PAGESIZE)-1);
+  addr = (char*) addr - misalign;
+  len += misalign;
+
+  rc = hwloc_linux_membind_policy_from_hwloc(&linuxpolicy, policy, flags);
+  if (rc < 0)
+    return rc;
+
+  if (linuxpolicy == MPOL_DEFAULT)
+    /* Some Linux kernels don't like being passed a set */
+    return mbind((void *) addr, len, linuxpolicy, NULL, 0, 0);
+
+  rc = hwloc_linux_membind_mask_from_nodeset(topology, nodeset, &nodemask_bits, &nodemask);
+  if (rc < 0)
+    return -1;
+
+  if (flags & HWLOC_MEMBIND_MIGRATE) {
+#ifdef MPOL_MF_MOVE
+    mbind_flags = MPOL_MF_MOVE;
+    if (flags & HWLOC_MEMBIND_STRICT)
+      mbind_flags |= MPOL_MF_STRICT;
+#else
+    /* migration unavailable: only an error when the caller asked for strictness */
+    if (flags & HWLOC_MEMBIND_STRICT) {
+      errno = ENOSYS;
+      free(nodemask);
+      return -1;
+    }
+#endif
+  }
+
+  rc = mbind((void *) addr, len, linuxpolicy, nodemask, nodemask_bits+1, mbind_flags);
+  free(nodemask);
+  return rc < 0 ? -1 : 0;
+}
+
+/*
+ * Allocate len bytes with hwloc_alloc_mmap() and bind them with
+ * hwloc_linux_set_area_membind().
+ * Returns the buffer, or NULL when the mapping fails or when binding fails
+ * and HWLOC_MEMBIND_STRICT was requested in flags.  A binding failure without
+ * STRICT still returns the (unbound) buffer.
+ */
+static void *
+hwloc_linux_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  void *buffer;
+  int err;
+
+  buffer = hwloc_alloc_mmap(topology, len);
+  if (buffer == MAP_FAILED)
+    return NULL;
+
+  err = hwloc_linux_set_area_membind(topology, buffer, len, nodeset, policy, flags);
+  /* STRICT is a bit of flags, not of policy (policy is an enumeration value) */
+  if (err < 0 && (flags & HWLOC_MEMBIND_STRICT)) {
+    munmap(buffer, len);
+    return NULL;
+  }
+
+  return buffer;
+}
+#endif /* HWLOC_HAVE_MBIND */
+
+#ifdef HWLOC_HAVE_SET_MEMPOLICY
+/*
+ * Set the memory policy of the calling thread with set_mempolicy().
+ * When HWLOC_MEMBIND_MIGRATE is set, existing pages are first moved to the
+ * target nodes with migrate_pages(); a migration failure is only fatal when
+ * HWLOC_MEMBIND_STRICT is also set.  Without HWLOC_HAVE_MIGRATE_PAGES a
+ * migration request fails with ENOSYS.
+ * Returns 0 on success and a negative value on failure.
+ */
+static int
+hwloc_linux_set_thisthread_membind(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  unsigned max_os_index; /* highest os_index + 1 */
+  unsigned long *linuxmask;
+  int linuxpolicy;
+  int err;
+
+  err = hwloc_linux_membind_policy_from_hwloc(&linuxpolicy, policy, flags);
+  if (err < 0)
+    return err;
+
+  if (linuxpolicy == MPOL_DEFAULT)
+    /* Some Linux kernels don't like being passed a set */
+    return set_mempolicy(linuxpolicy, NULL, 0);
+
+  err = hwloc_linux_membind_mask_from_nodeset(topology, nodeset, &max_os_index, &linuxmask);
+  if (err < 0)
+    goto out;
+
+  if (flags & HWLOC_MEMBIND_MIGRATE) {
+#ifdef HWLOC_HAVE_MIGRATE_PAGES
+    size_t masksize = max_os_index/HWLOC_BITS_PER_LONG * sizeof(long);
+    unsigned long *fullmask = malloc(masksize);
+    if (fullmask) {
+      /* the source set must contain every node: set all 8 bits of each byte
+       * (0xf would only set the low nibble and leave half of the nodes out) */
+      memset(fullmask, 0xff, masksize);
+      err = migrate_pages(0, max_os_index+1, fullmask, linuxmask);
+      free(fullmask);
+    } else
+      err = -1;
+    if (err < 0 && (flags & HWLOC_MEMBIND_STRICT))
+      goto out_with_mask;
+#else
+    errno = ENOSYS;
+    goto out_with_mask;
+#endif
+  }
+
+  err = set_mempolicy(linuxpolicy, linuxmask, max_os_index+1);
+  if (err < 0)
+    goto out_with_mask;
+
+  free(linuxmask);
+  return 0;
+
+ out_with_mask:
+  free(linuxmask);
+ out:
+  return -1;
+}
+
+/*
+ * On some kernels, get_mempolicy requires the output size to be larger
+ * than the kernel MAX_NUMNODES (defined by CONFIG_NODES_SHIFT).
+ * Probe get_mempolicy on ourself with a doubling mask size until the call
+ * stops failing with EINVAL, and remember the result in a function-local
+ * static so later calls return immediately.
+ */
+static int
+hwloc_linux_find_kernel_max_numnodes(hwloc_topology_t topology __hwloc_attribute_unused)
+{
+  static int max_numnodes = -1;
+  int linuxpolicy;
+
+  if (max_numnodes == -1) {
+    /* start with a single ulong, it's the minimal and it's enough for most machines */
+    for (max_numnodes = HWLOC_BITS_PER_LONG; ; max_numnodes *= 2) {
+      unsigned long *probe = malloc(max_numnodes / HWLOC_BITS_PER_LONG * sizeof(long));
+      int err = get_mempolicy(&linuxpolicy, probe, max_numnodes, 0, 0);
+      free(probe);
+      if (!err || errno != EINVAL)
+	/* found it */
+	break;
+    }
+  }
+
+  return max_numnodes;
+}
+
+/*
+ * Translate a Linux MPOL_* policy into the hwloc policy reported to users.
+ * MPOL_PREFERRED and MPOL_BIND both map to HWLOC_MEMBIND_BIND.
+ * Returns 0 and fills *policy, or -1 with errno = EINVAL for an unknown value.
+ */
+static int
+hwloc_linux_membind_policy_to_hwloc(int linuxpolicy, hwloc_membind_policy_t *policy)
+{
+  if (linuxpolicy == MPOL_DEFAULT) {
+    *policy = HWLOC_MEMBIND_FIRSTTOUCH;
+    return 0;
+  }
+
+  if (linuxpolicy == MPOL_PREFERRED || linuxpolicy == MPOL_BIND) {
+    *policy = HWLOC_MEMBIND_BIND;
+    return 0;
+  }
+
+  if (linuxpolicy == MPOL_INTERLEAVE) {
+    *policy = HWLOC_MEMBIND_INTERLEAVE;
+    return 0;
+  }
+
+  errno = EINVAL;
+  return -1;
+}
+
+/*
+ * Query the calling thread's memory policy with get_mempolicy() and report
+ * it as a hwloc nodeset and policy.  MPOL_DEFAULT is reported with the
+ * topology's whole nodeset.  Returns 0 on success, -1 on failure.
+ */
+static int
+hwloc_linux_get_thisthread_membind(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
+{
+  unsigned long *kernelmask;
+  unsigned nbits;
+  int linuxpolicy;
+  int status = -1;
+
+  nbits = hwloc_linux_find_kernel_max_numnodes(topology);
+
+  kernelmask = malloc(nbits/HWLOC_BITS_PER_LONG * sizeof(long));
+  if (!kernelmask) {
+    errno = ENOMEM;
+    return -1;
+  }
+
+  if (get_mempolicy(&linuxpolicy, kernelmask, nbits, 0, 0) >= 0) {
+    if (linuxpolicy != MPOL_DEFAULT)
+      hwloc_linux_membind_mask_to_nodeset(topology, nodeset, nbits, kernelmask);
+    else
+      hwloc_bitmap_copy(nodeset, hwloc_topology_get_topology_nodeset(topology));
+
+    if (hwloc_linux_membind_policy_to_hwloc(linuxpolicy, policy) >= 0)
+      status = 0;
+  }
+
+  free(kernelmask);
+  return status;
+}
+
+/*
+ * Report the memory binding of the range [addr, addr+len) by querying
+ * get_mempolicy(MPOL_F_ADDR) once per page.
+ * - *policy is HWLOC_MEMBIND_MIXED when pages disagree, otherwise the hwloc
+ *   equivalent of the common Linux policy.
+ * - nodeset is the union of the per-page masks, or the topology's whole
+ *   nodeset as soon as one page uses MPOL_DEFAULT.
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+hwloc_linux_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
+{
+  unsigned max_os_index;
+  unsigned long *linuxmask, *globallinuxmask;
+  int linuxpolicy, globallinuxpolicy = MPOL_DEFAULT;
+  int mixed = 0;
+  int full = 0;
+  int first = 1;
+  int pagesize = hwloc_getpagesize();
+  char *tmpaddr;
+  int err;
+  unsigned i;
+
+  max_os_index = hwloc_linux_find_kernel_max_numnodes(topology);
+
+  linuxmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
+  if (!linuxmask) {
+    errno = ENOMEM;
+    goto out;
+  }
+  globallinuxmask = calloc(max_os_index/HWLOC_BITS_PER_LONG, sizeof(long));
+  if (!globallinuxmask) {
+    errno = ENOMEM;
+    goto out_with_masks;
+  }
+
+  /* walk the range page by page, starting at the page containing addr */
+  for(tmpaddr = (char *)((unsigned long)addr & ~(pagesize-1));
+      tmpaddr < (char *)addr + len;
+      tmpaddr += pagesize) {
+    err = get_mempolicy(&linuxpolicy, linuxmask, max_os_index, tmpaddr, MPOL_F_ADDR);
+    if (err < 0)
+      goto out_with_masks;
+
+    /* use the first found policy. if we find a different one later, set mixed to 1 */
+    if (first)
+      globallinuxpolicy = linuxpolicy;
+    else if (globallinuxpolicy != linuxpolicy)
+      mixed = 1;
+
+    /* aggregate masks, and set full to 1 if we ever find DEFAULT */
+    if (full || linuxpolicy == MPOL_DEFAULT) {
+      full = 1;
+    } else {
+      for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
+        globallinuxmask[i] |= linuxmask[i];
+    }
+
+    first = 0;
+  }
+
+  if (mixed) {
+    *policy = HWLOC_MEMBIND_MIXED;
+  } else {
+    /* convert the policy shared by all pages; unlike linuxpolicy this is
+     * also initialized when the loop above did not run at all */
+    err = hwloc_linux_membind_policy_to_hwloc(globallinuxpolicy, policy);
+    if (err < 0)
+      goto out_with_masks;
+  }
+
+  if (full) {
+    hwloc_bitmap_copy(nodeset, hwloc_topology_get_topology_nodeset(topology));
+  } else {
+    hwloc_linux_membind_mask_to_nodeset(topology, nodeset, max_os_index, globallinuxmask);
+  }
+
+  free(globallinuxmask);
+  free(linuxmask);
+  return 0;
+
+ out_with_masks:
+  free(globallinuxmask);
+  free(linuxmask);
+ out:
+  return -1;
+}
+
+#endif /* HWLOC_HAVE_SET_MEMPOLICY */
+
+/*
+ * Install the Linux implementations of the binding hooks and advertise the
+ * memory-binding features that were compiled in.
+ */
+void
+hwloc_set_linuxfs_hooks(struct hwloc_binding_hooks *hooks,
+			struct hwloc_topology_support *support __hwloc_attribute_unused)
+{
+  /* CPU binding: setters */
+  hooks->set_thisproc_cpubind = hwloc_linux_set_thisproc_cpubind;
+  hooks->set_thisthread_cpubind = hwloc_linux_set_thisthread_cpubind;
+  hooks->set_proc_cpubind = hwloc_linux_set_proc_cpubind;
+#if HAVE_DECL_PTHREAD_SETAFFINITY_NP
+  hooks->set_thread_cpubind = hwloc_linux_set_thread_cpubind;
+#endif /* HAVE_DECL_PTHREAD_SETAFFINITY_NP */
+
+  /* CPU binding: getters */
+  hooks->get_thisproc_cpubind = hwloc_linux_get_thisproc_cpubind;
+  hooks->get_thisthread_cpubind = hwloc_linux_get_thisthread_cpubind;
+  hooks->get_proc_cpubind = hwloc_linux_get_proc_cpubind;
+#if HAVE_DECL_PTHREAD_GETAFFINITY_NP
+  hooks->get_thread_cpubind = hwloc_linux_get_thread_cpubind;
+#endif /* HAVE_DECL_PTHREAD_GETAFFINITY_NP */
+
+  /* last CPU location */
+  hooks->get_thisproc_last_cpu_location = hwloc_linux_get_thisproc_last_cpu_location;
+  hooks->get_thisthread_last_cpu_location = hwloc_linux_get_thisthread_last_cpu_location;
+  hooks->get_proc_last_cpu_location = hwloc_linux_get_proc_last_cpu_location;
+
+  /* memory binding through set_mempolicy/get_mempolicy */
+#ifdef HWLOC_HAVE_SET_MEMPOLICY
+  hooks->get_thisthread_membind = hwloc_linux_get_thisthread_membind;
+  hooks->set_thisthread_membind = hwloc_linux_set_thisthread_membind;
+  hooks->get_area_membind = hwloc_linux_get_area_membind;
+#endif /* HWLOC_HAVE_SET_MEMPOLICY */
+
+  /* memory binding through mbind, plus the mmap-based allocator */
+#ifdef HWLOC_HAVE_MBIND
+  hooks->alloc = hwloc_alloc_mmap;
+  hooks->alloc_membind = hwloc_linux_alloc_membind;
+  hooks->free_membind = hwloc_free_mmap;
+  hooks->set_area_membind = hwloc_linux_set_area_membind;
+  support->membind->interleave_membind = 1;
+  support->membind->bind_membind = 1;
+  support->membind->firsttouch_membind = 1;
+#endif /* HWLOC_HAVE_MBIND */
+
+#if (defined HWLOC_HAVE_MIGRATE_PAGES) || ((defined HWLOC_HAVE_MBIND) && (defined MPOL_MF_MOVE))
+  support->membind->migrate_membind = 1;
+#endif
+}
+
+
+
+/*******************************************
+ *** Misc Helpers for Topology Discovery ***
+ *******************************************/
+
+/* One entry of the cpuinfo array; the fields are filled by or after
+ * hwloc_linux_parse_cpuinfo as noted below. */
+struct hwloc_linux_cpuinfo_proc {
+  /* processor number, set during hwloc_linux_parse_cpuinfo */
+  unsigned long Pproc;
+  /* core and package numbers, set during hwloc_linux_parse_cpuinfo, or -1 if unknown
+   * (NOTE(review): the P prefix presumably means "physical" -- confirm) */
+  long Pcore, Ppkg;
+  /* set later, or -1 if unknown
+   * (NOTE(review): the L prefix presumably means "logical" -- confirm) */
+  long Lcore, Lpkg;
+
+  /* custom info, set during hwloc_linux_parse_cpuinfo:
+   * infos is an array and infos_count its number of entries */
+  struct hwloc_obj_info_s *infos;
+  unsigned infos_count;
+};
+
+/*
+ * Read a decimal unsigned value from the first line of the file at mappath
+ * (opened through hwloc_fopen with fsroot_fd).
+ * Returns 0 and stores the value on success; on failure returns -1 and
+ * stores (unsigned) -1 in *value.
+ */
+static int
+hwloc_parse_sysfs_unsigned(const char *mappath, unsigned *value, int fsroot_fd)
+{
+  char digits[11];
+  int status = -1;
+  FILE *stream = hwloc_fopen(mappath, "r", fsroot_fd);
+
+  if (!stream) {
+    *value = -1;
+    return -1;
+  }
+
+  if (fgets(digits, sizeof(digits), stream)) {
+    *value = strtoul(digits, NULL, 10);
+    status = 0;
+  } else {
+    *value = -1;
+  }
+
+  fclose(stream);
+  return status;
+}
+
+
+/* kernel cpumaps are composed of an array of 32bits cpumasks */
+#define KERNEL_CPU_MASK_BITS 32
+#define KERNEL_CPU_MAP_LEN (KERNEL_CPU_MASK_BITS/4+2)
+
+/*
+ * Parse a kernel cpumap (comma-separated hexadecimal masks of
+ * KERNEL_CPU_MASK_BITS bits each, most significant first) from file into set.
+ * set is always cleared first.
+ * Returns 0 on success and -1 if the temporary array cannot be allocated.
+ */
+int
+hwloc_linux_parse_cpumap_file(FILE *file, hwloc_bitmap_t set)
+{
+  unsigned long *maps;
+  unsigned long map;
+  int nr_maps = 0;
+  static int nr_maps_allocated = 8; /* only compute the power-of-two above the kernel cpumask size once */
+  int i;
+
+  /* reset to zero first */
+  hwloc_bitmap_zero(set);
+
+  maps = malloc(nr_maps_allocated * sizeof(*maps));
+  if (!maps)
+    return -1;
+
+  /* parse the whole mask */
+  while (fscanf(file, "%lx,", &map) == 1) /* read one kernel cpu mask and the ending comma */
+    {
+      if (nr_maps == nr_maps_allocated) {
+	/* grow through a temporary so that the old array is not lost on failure */
+	unsigned long *grown = realloc(maps, 2 * nr_maps_allocated * sizeof(*maps));
+	if (!grown) {
+	  free(maps);
+	  return -1;
+	}
+	maps = grown;
+	nr_maps_allocated *= 2;
+      }
+
+      if (!map && !nr_maps)
+	/* ignore the first map if it's empty */
+	continue;
+
+      /* masks come most significant first: keep the array least significant first */
+      memmove(&maps[1], &maps[0], nr_maps*sizeof(*maps));
+      maps[0] = map;
+      nr_maps++;
+    }
+
+  /* convert into a set */
+#if KERNEL_CPU_MASK_BITS == HWLOC_BITS_PER_LONG
+  for(i=0; i<nr_maps; i++)
+    hwloc_bitmap_set_ith_ulong(set, i, maps[i]);
+#else
+  /* two kernel masks fit in one unsigned long */
+  for(i=0; i<(nr_maps+1)/2; i++) {
+    unsigned long mask;
+    mask = maps[2*i];
+    if (2*i+1<nr_maps)
+      mask |= maps[2*i+1] << KERNEL_CPU_MASK_BITS;
+    hwloc_bitmap_set_ith_ulong(set, i, mask);
+  }
+#endif
+
+  free(maps);
+
+  return 0;
+}
+
+/*
+ * Open the cpumap file at mappath (through hwloc_fopen with fsroot_fd) and
+ * parse it into a newly allocated bitmap.
+ * Returns the bitmap, or NULL if the file cannot be opened, the bitmap
+ * cannot be allocated, or parsing reports an error.
+ */
+static hwloc_bitmap_t
+hwloc_parse_cpumap(const char *mappath, int fsroot_fd)
+{
+  hwloc_bitmap_t set;
+  FILE * file;
+
+  file = hwloc_fopen(mappath, "r", fsroot_fd);
+  if (!file)
+    return NULL;
+
+  set = hwloc_bitmap_alloc();
+  if (set && hwloc_linux_parse_cpumap_file(file, set) < 0) {
+    /* do not hand a half-built set back to the caller */
+    hwloc_bitmap_free(set);
+    set = NULL;
+  }
+
+  fclose(file);
+  return set;
+}
+
+/*
+ * Duplicate a mount path taken from /proc/mounts, decoding the octal escapes
+ * \040 (space), \011 (tab), \012 (newline) and \134 (backslash).
+ * A backslash that does not start one of these sequences is copied as is.
+ * length must be at least strlen(escapedpath); length+1 bytes are allocated.
+ * Returns a malloc'ed string that the caller frees, or NULL if the
+ * allocation fails.
+ */
+static char *
+hwloc_strdup_mntpath(const char *escapedpath, size_t length)
+{
+  char *path = malloc(length+1);
+  const char *src = escapedpath, *tmp;
+  char *dst;
+
+  if (!path)
+    return NULL;
+  dst = path;
+
+  while ((tmp = strchr(src, '\\')) != NULL) {
+    /* copy everything before the backslash */
+    strncpy(dst, src, tmp-src);
+    dst += tmp-src;
+    if (!strncmp(tmp+1, "040", 3)) {
+      *dst = ' ';
+      src = tmp+4;
+    } else if (!strncmp(tmp+1, "011", 3)) {
+      *dst = '\t';
+      src = tmp+4;
+    } else if (!strncmp(tmp+1, "012", 3)) {
+      *dst = '\n';
+      src = tmp+4;
+    } else if (!strncmp(tmp+1, "134", 3)) {
+      *dst = '\\';
+      src = tmp+4;
+    } else {
+      /* not a known escape: keep the backslash and only skip that one
+       * character, so that we never step over the terminating NUL */
+      *dst = '\\';
+      src = tmp+1;
+    }
+    dst++;
+  }
+
+  strcpy(dst, src);
+
+  return path;
+}
+
+/*
+ * Scan /proc/mounts (opened through hwloc_fopen with fsroot_fd) for the first
+ * cpuset mount point, or the first cgroup mount point carrying the "cpuset"
+ * option.
+ * Both outputs are set to NULL first.  On a match exactly one of them receives
+ * a string built by hwloc_strdup_mntpath():
+ *  - *cpuset_mntpnt for a "cpuset" filesystem, or a cgroup mounted with "noprefix";
+ *  - *cgroup_mntpnt for any other cgroup mount with the "cpuset" option.
+ * The scan stops at the first match.
+ */
+static void
+hwloc_find_linux_cpuset_mntpnt(char **cgroup_mntpnt, char **cpuset_mntpnt, int fsroot_fd)
+{
+#define PROC_MOUNT_LINE_LEN 512
+  char line[PROC_MOUNT_LINE_LEN];
+  FILE *fd;
+
+  *cgroup_mntpnt = NULL;
+  *cpuset_mntpnt = NULL;
+
+  /* ideally we should use setmntent, getmntent, hasmntopt and endmntent,
+   * but they do not support fsroot_fd.
+   */
+
+  fd = hwloc_fopen("/proc/mounts", "r", fsroot_fd);
+  if (!fd)
+    return;
+
+  /* each line is tokenised in place: separators are overwritten with '\0' */
+  while (fgets(line, sizeof(line), fd)) {
+    char *path;
+    char *type;
+    char *tmp;
+
+    /* remove the ending " 0 0\n" that the kernel always adds */
+    tmp = line + strlen(line) - 5;
+    if (tmp < line || strcmp(tmp, " 0 0\n"))
+      /* only a warning: the line is still parsed below */
+      fprintf(stderr, "Unexpected end of /proc/mounts line `%s'\n", line);
+    else
+      *tmp = '\0';
+
+    /* path is after first field and a space */
+    tmp = strchr(line, ' ');
+    if (!tmp)
+      continue;
+    path = tmp+1;
+
+    /* type is after path, which may not contain spaces since the kernel escaped them to \040
+     * (see the manpage of getmntent) */
+    tmp = strchr(path, ' ');
+    if (!tmp)
+      continue;
+    type = tmp+1;
+    /* mark the end of path to ease upcoming strdup */
+    *tmp = '\0';
+
+    /* type-path is strlen(path)+1, an upper bound for the unescaped path length */
+    if (!strncmp(type, "cpuset ", 7)) {
+      /* found a cpuset mntpnt */
+      hwloc_debug("Found cpuset mount point on %s\n", path);
+      *cpuset_mntpnt = hwloc_strdup_mntpath(path, type-path);
+      break;
+
+    } else if (!strncmp(type, "cgroup ", 7)) {
+      /* found a cgroup mntpnt */
+      char *opt, *opts;
+      int cpuset_opt = 0;
+      int noprefix_opt = 0;
+
+      /* find options */
+      tmp = strchr(type, ' ');
+      if (!tmp)
+	continue;
+      opts = tmp+1;
+
+      /* look at options */
+      while ((opt = strsep(&opts, ",")) != NULL) {
+	if (!strcmp(opt, "cpuset"))
+	  cpuset_opt = 1;
+	else if (!strcmp(opt, "noprefix"))
+	  noprefix_opt = 1;
+      }
+      /* a cgroup mount without the cpuset option is of no interest, keep scanning */
+      if (!cpuset_opt)
+	continue;
+
+      if (noprefix_opt) {
+	hwloc_debug("Found cgroup emulating a cpuset mount point on %s\n", path);
+	*cpuset_mntpnt = hwloc_strdup_mntpath(path, type-path);
+      } else {
+	hwloc_debug("Found cgroup/cpuset mount point on %s\n", path);
+	*cgroup_mntpnt = hwloc_strdup_mntpath(path, type-path);
+      }
+      break;
+    }
+  }
+
+  fclose(fd);
+}
+
+/*
+ * Linux cpusets may be managed directly or through cgroup.
+ * If cgroup is used, tasks get a /proc/pid/cgroup which may contain a
+ * single line %d:cpuset:<name>. If cpuset are used they get /proc/pid/cpuset
+ * containing <name>.
+ *
+ * Return a strdup'ed copy of <name> for the given pid (pid 0 reads the
+ * /proc/self files), trying the cgroup file first and the cpuset file second.
+ * Returns NULL when neither file yields a name.
+ */
+static char *
+hwloc_read_linux_cpuset_name(int fsroot_fd, hwloc_pid_t pid)
+{
+#define CPUSET_NAME_LEN 128
+  char cpuset_name[CPUSET_NAME_LEN];
+  FILE *fd;
+  char *tmp;
+
+  /* check whether a cgroup-cpuset is enabled */
+  if (!pid)
+    fd = hwloc_fopen("/proc/self/cgroup", "r", fsroot_fd);
+  else {
+    /* the initializer only sizes the buffer; snprintf overwrites it */
+    char path[] = "/proc/XXXXXXXXXX/cgroup";
+    snprintf(path, sizeof(path), "/proc/%d/cgroup", pid);
+    fd = hwloc_fopen(path, "r", fsroot_fd);
+  }
+  if (fd) {
+    /* find a cpuset line */
+#define CGROUP_LINE_LEN 256
+    char line[CGROUP_LINE_LEN];
+    while (fgets(line, sizeof(line), fd)) {
+      /* the text after the first colon must start with "cpuset:" */
+      char *end, *colon = strchr(line, ':');
+      if (!colon)
+	continue;
+      if (strncmp(colon, ":cpuset:", 8))
+	continue;
+
+      /* found a cgroup-cpuset line, return the name */
+      fclose(fd);
+      end = strchr(colon, '\n');
+      if (end)
+	*end = '\0';
+      hwloc_debug("Found cgroup-cpuset %s\n", colon+8);
+      return strdup(colon+8);
+    }
+    fclose(fd);
+  }
+
+  /* check whether a cpuset is enabled */
+  if (!pid)
+    fd = hwloc_fopen("/proc/self/cpuset", "r", fsroot_fd);
+  else {
+    /* same buffer-sizing trick as above */
+    char path[] = "/proc/XXXXXXXXXX/cpuset";
+    snprintf(path, sizeof(path), "/proc/%d/cpuset", pid);
+    fd = hwloc_fopen(path, "r", fsroot_fd);
+  }
+  if (!fd) {
+    /* found nothing */
+    hwloc_debug("%s", "No cgroup or cpuset found\n");
+    return NULL;
+  }
+
+  /* found a cpuset, return the name */
+  tmp = fgets(cpuset_name, sizeof(cpuset_name), fd);
+  fclose(fd);
+  if (!tmp)
+    return NULL;
+  /* strip the trailing newline, if any */
+  tmp = strchr(cpuset_name, '\n');
+  if (tmp)
+    *tmp = '\0';
+  hwloc_debug("Found cpuset %s\n", cpuset_name);
+  return strdup(cpuset_name);
+}
+
+/*
+ * Then, the cpuset description is available from either the cgroup or
+ * the cpuset filesystem (usually mounted in / or /dev) where there
+ * are cgroup<name>/cpuset.{cpus,mems} or cpuset<name>/{cpus,mems} files.
+ *
+ * Read the first line of the attr_name file of cpuset_name, looking under
+ * cgroup_mntpnt if it is non-NULL and under cpuset_mntpnt otherwise.
+ * Returns a heap-allocated string without its trailing newline (the caller
+ * frees it), or NULL if the file cannot be opened or read.
+ */
+static char *
+hwloc_read_linux_cpuset_mask(const char *cgroup_mntpnt, const char *cpuset_mntpnt, const char *cpuset_name, const char *attr_name, int fsroot_fd)
+{
+#define CPUSET_FILENAME_LEN 256
+  char cpuset_filename[CPUSET_FILENAME_LEN];
+  FILE *fd;
+  char *info = NULL, *tmp;
+  ssize_t ssize;
+  size_t size = 0;
+
+  if (cgroup_mntpnt) {
+    /* try to read the cpuset from cgroup */
+    snprintf(cpuset_filename, CPUSET_FILENAME_LEN, "%s%s/cpuset.%s", cgroup_mntpnt, cpuset_name, attr_name);
+    hwloc_debug("Trying to read cgroup file <%s>\n", cpuset_filename);
+    fd = hwloc_fopen(cpuset_filename, "r", fsroot_fd);
+    if (fd)
+      goto gotfile;
+  } else if (cpuset_mntpnt) {
+    /* try to read the cpuset directly */
+    snprintf(cpuset_filename, CPUSET_FILENAME_LEN, "%s%s/%s", cpuset_mntpnt, cpuset_name, attr_name);
+    hwloc_debug("Trying to read cpuset file <%s>\n", cpuset_filename);
+    fd = hwloc_fopen(cpuset_filename, "r", fsroot_fd);
+    if (fd)
+      goto gotfile;
+  }
+
+  /* found no cpuset description, ignore it */
+  hwloc_debug("Couldn't find cpuset <%s> description, ignoring\n", cpuset_name);
+  return NULL;
+
+gotfile:
+  ssize = getline(&info, &size, fd);
+  fclose(fd);
+  if (ssize < 0) {
+    /* getline may have allocated a buffer even though it failed:
+     * release it instead of returning it as if it held a line */
+    free(info);
+    return NULL;
+  }
+  if (!info)
+    return NULL;
+
+  tmp = strchr(info, '\n');
+  if (tmp)
+    *tmp = '\0';
+
+  return info;
+}
+
+/*
+ * Restrict admin_enabled_cpus_set to the indexes listed in the attr_name
+ * file of the given cpuset.
+ * The file content is read with hwloc_read_linux_cpuset_mask() and parsed as
+ * a comma-separated list of "N" or "N-M" segments; every index that falls
+ * between two consecutive segments, or after the last one, is cleared from
+ * the set.  Nothing is changed when the file cannot be read.
+ * NOTE(review): segments are assumed to be listed in increasing order -- confirm.
+ */
+static void
+hwloc_admin_disable_set_from_cpuset(struct hwloc_linux_backend_data_s *data,
+				    const char *cgroup_mntpnt, const char *cpuset_mntpnt, const char *cpuset_name,
+				    const char *attr_name,
+				    hwloc_bitmap_t admin_enabled_cpus_set)
+{
+  char *cpuset_mask;
+  char *current, *comma, *tmp;
+  int prevlast, nextfirst, nextlast; /* beginning/end of enabled-segments */
+  hwloc_bitmap_t tmpset;
+
+  cpuset_mask = hwloc_read_linux_cpuset_mask(cgroup_mntpnt, cpuset_mntpnt, cpuset_name,
+					     attr_name, data->root_fd);
+  if (!cpuset_mask)
+    return;
+
+  hwloc_debug("found cpuset %s: %s\n", attr_name, cpuset_mask);
+
+  current = cpuset_mask;
+  /* -1 so that the first gap, if any, starts at index 0 */
+  prevlast = -1;
+
+  while (1) {
+    /* save a pointer to the next comma and erase it to simplify things */
+    comma = strchr(current, ',');
+    if (comma)
+      *comma = '\0';
+
+    /* find current enabled-segment bounds */
+    nextfirst = strtoul(current, &tmp, 0);
+    if (*tmp == '-')
+      nextlast = strtoul(tmp+1, NULL, 0);
+    else
+      nextlast = nextfirst;
+    /* clear the gap between the previous segment and this one, if not empty */
+    if (prevlast+1 <= nextfirst-1) {
+      hwloc_debug("%s [%d:%d] excluded by cpuset\n", attr_name, prevlast+1, nextfirst-1);
+      hwloc_bitmap_clr_range(admin_enabled_cpus_set, prevlast+1, nextfirst-1);
+    }
+
+    /* switch to next enabled-segment */
+    prevlast = nextlast;
+    if (!comma)
+      break;
+    current = comma+1;
+  }
+
+  /* NOTE(review): this message reuses nextfirst-1 as the upper bound although
+   * everything above prevlast is removed below -- looks like a copy/paste leftover */
+  hwloc_debug("%s [%d:%d] excluded by cpuset\n", attr_name, prevlast+1, nextfirst-1);
+  /* no easy way to clear until the infinity */
+  tmpset = hwloc_bitmap_alloc();
+  hwloc_bitmap_set_range(tmpset, 0, prevlast);
+  hwloc_bitmap_and(admin_enabled_cpus_set, admin_enabled_cpus_set, tmpset);
+  hwloc_bitmap_free(tmpset);
+
+  free(cpuset_mask);
+}
+
+/*
+ * Parse a meminfo file (opened through hwloc_fopen with data->root_fd).
+ * prefixlength characters are skipped at the start of every line before
+ * matching.  Values found:
+ *  - "MemTotal"       -> *local_memory, converted from kB to bytes;
+ *  - "Hugepagesize"   -> *meminfo_hugepages_size, converted from kB to bytes;
+ *  - "HugePages_Free" -> *meminfo_hugepages_count.
+ * When onlytotal is non-zero, parsing stops after MemTotal and the hugepage
+ * outputs are not touched.  Either hugepage output may be NULL, in which
+ * case the corresponding value is ignored.
+ * Outputs are left unchanged when the file cannot be opened or a field is
+ * missing.
+ */
+static void
+hwloc_parse_meminfo_info(struct hwloc_linux_backend_data_s *data,
+			 const char *path,
+			 int prefixlength,
+			 uint64_t *local_memory,
+			 uint64_t *meminfo_hugepages_count,
+			 uint64_t *meminfo_hugepages_size,
+			 int onlytotal)
+{
+  char string[64];
+  FILE *fd;
+
+  fd = hwloc_fopen(path, "r", data->root_fd);
+  if (!fd)
+    return;
+
+  while (fgets(string, sizeof(string), fd) && *string != '\0')
+    {
+      unsigned long long number;
+      /* lines shorter than the prefix cannot hold a field */
+      if (strlen(string) < (size_t) prefixlength)
+        continue;
+      if (sscanf(string+prefixlength, "MemTotal: %llu kB", (unsigned long long *) &number) == 1) {
+	*local_memory = number << 10;
+	if (onlytotal)
+	  break;
+      }
+      else if (!onlytotal) {
+	if (sscanf(string+prefixlength, "Hugepagesize: %llu", (unsigned long long *) &number) == 1) {
+	  /* some callers pass NULL because their file has no such line */
+	  if (meminfo_hugepages_size)
+	    *meminfo_hugepages_size = number << 10;
+	} else if (sscanf(string+prefixlength, "HugePages_Free: %llu", (unsigned long long *) &number) == 1) {
+          /* these are free hugepages, not the total amount of huge pages */
+	  if (meminfo_hugepages_count)
+	    *meminfo_hugepages_count = number;
+	}
+      }
+    }
+
+  fclose(fd);
+}
+
+#define SYSFS_NUMA_NODE_PATH_LEN 128
+
+/*
+ * Fill memory->page_types[1..] from the "hugepages-<size>kB" subdirectories
+ * of dirpath: the size comes from the directory name and the count from its
+ * nr_hugepages file.  The bytes covered by each huge page type are subtracted
+ * from *remaining_local_memory.
+ * memory->page_types_len is read as the capacity of memory->page_types on
+ * entry (the callers in this file set it to the allocated count -- assumed
+ * for any other caller) and updated to the number of entries actually used.
+ * Nothing is changed when dirpath cannot be opened.
+ */
+static void
+hwloc_parse_hugepages_info(struct hwloc_linux_backend_data_s *data,
+			   const char *dirpath,
+			   struct hwloc_obj_memory_s *memory,
+			   uint64_t *remaining_local_memory)
+{
+  DIR *dir;
+  struct dirent *dirent;
+  unsigned long index_ = 1; /* entry 0 is reserved for normal pages */
+  FILE *hpfd;
+  char line[64];
+  char path[SYSFS_NUMA_NODE_PATH_LEN];
+
+  dir = hwloc_opendir(dirpath, data->root_fd);
+  if (!dir)
+    return;
+
+  while ((dirent = readdir(dir)) != NULL) {
+    if (strncmp(dirent->d_name, "hugepages-", 10))
+      continue;
+    /* never write past the entries that were allocated */
+    if (index_ >= memory->page_types_len)
+      break;
+    memory->page_types[index_].size = strtoul(dirent->d_name+10, NULL, 0) * 1024ULL;
+    /* bounded: dirpath and d_name together may exceed the buffer */
+    snprintf(path, sizeof(path), "%s/%s/nr_hugepages", dirpath, dirent->d_name);
+    hpfd = hwloc_fopen(path, "r", data->root_fd);
+    if (hpfd) {
+      if (fgets(line, sizeof(line), hpfd)) {
+        /* these are the actual total amount of huge pages */
+        memory->page_types[index_].count = strtoull(line, NULL, 0);
+        *remaining_local_memory -= memory->page_types[index_].count * memory->page_types[index_].size;
+        index_++;
+      }
+      fclose(hpfd);
+    }
+  }
+  closedir(dir);
+  memory->page_types_len = index_;
+}
+
+/*
+ * Fill the machine-wide memory attributes from /proc/meminfo and, when
+ * present, /sys/kernel/mm/hugepages.
+ * memory->local_memory always receives MemTotal when it can be read.
+ * The page_types array is only built when the page size is known, i.e. when
+ * the topology is this system or HWLOC_DEBUG_PAGESIZE is set; entry 0
+ * describes normal pages and the following entries huge pages.
+ */
+static void
+hwloc_get_procfs_meminfo_info(struct hwloc_topology *topology,
+			      struct hwloc_linux_backend_data_s *data,
+			      struct hwloc_obj_memory_s *memory)
+{
+  /* both start at 0 so that a missing meminfo line cannot leave garbage */
+  uint64_t meminfo_hugepages_count = 0, meminfo_hugepages_size = 0;
+  struct stat st;
+  int has_sysfs_hugepages = 0;
+  const char *pagesize_env = getenv("HWLOC_DEBUG_PAGESIZE");
+  int types = 2;
+  int err;
+
+  err = hwloc_stat("/sys/kernel/mm/hugepages", &st, data->root_fd);
+  if (!err) {
+    /* one entry for normal pages plus one per subdirectory (link count minus 2) */
+    types = 1 + st.st_nlink-2;
+    has_sysfs_hugepages = 1;
+  }
+
+  if (topology->is_thissystem || pagesize_env) {
+    /* we cannot report any page_type info unless we have the page size.
+     * we'll take it either from the system if local, or from the debug env variable
+     */
+    memory->page_types = calloc(types, sizeof(*memory->page_types));
+    /* on allocation failure, fall back to reporting the total memory only */
+    memory->page_types_len = memory->page_types ? types : 0;
+  }
+
+  if (topology->is_thissystem && memory->page_types) {
+    /* Get the page and hugepage sizes from sysconf */
+#ifdef HAVE__SC_LARGE_PAGESIZE
+    if (types > 1)
+      memory->page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
+#endif
+    memory->page_types[0].size = hwloc_getpagesize(); /* might be overwritten later by /proc/meminfo or sysfs */
+  }
+
+  hwloc_parse_meminfo_info(data, "/proc/meminfo", 0 /* no prefix */,
+			   &memory->local_memory,
+			   &meminfo_hugepages_count, &meminfo_hugepages_size,
+			   memory->page_types == NULL);
+
+  if (memory->page_types) {
+    uint64_t remaining_local_memory = memory->local_memory;
+    if (has_sysfs_hugepages) {
+      /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
+      hwloc_parse_hugepages_info(data, "/sys/kernel/mm/hugepages", memory, &remaining_local_memory);
+    } else {
+      /* use what we found in meminfo */
+      if (meminfo_hugepages_size) {
+        memory->page_types[1].size = meminfo_hugepages_size;
+        memory->page_types[1].count = meminfo_hugepages_count;
+        remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
+      } else {
+        memory->page_types_len = 1;
+      }
+    }
+
+    if (pagesize_env) {
+      /* We cannot get the pagesize if not thissystem, use the env-given one to experience the code during make check */
+      memory->page_types[0].size = strtoull(pagesize_env, NULL, 10);
+      /* If failed, use 4kB */
+      if (!memory->page_types[0].size)
+	memory->page_types[0].size = 4096;
+    }
+    assert(memory->page_types[0].size); /* from sysconf if local or from the env */
+    /* memory->page_types[1].size from sysconf if local, or from /proc/meminfo, or from sysfs,
+     * may be 0 if no hugepage support in the kernel */
+
+    memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
+  }
+}
+
+/*
+ * Fill the memory attributes of NUMA node `node` from
+ * <syspath>/node<node>/meminfo and, when present,
+ * <syspath>/node<node>/hugepages.
+ * memory->local_memory receives the node's MemTotal when it can be read.
+ * The page_types array is only built when the topology is this system;
+ * entry 0 describes normal pages and the following entries huge pages.
+ */
+static void
+hwloc_sysfs_node_meminfo_info(struct hwloc_topology *topology,
+			      struct hwloc_linux_backend_data_s *data,
+			      const char *syspath, int node,
+			      struct hwloc_obj_memory_s *memory)
+{
+  char path[SYSFS_NUMA_NODE_PATH_LEN];
+  char meminfopath[SYSFS_NUMA_NODE_PATH_LEN];
+  uint64_t meminfo_hugepages_count = 0;
+  uint64_t meminfo_hugepages_size = 0;
+  struct stat st;
+  int has_sysfs_hugepages = 0;
+  int types = 2;
+  int err;
+
+  /* bounded: syspath comes from the caller and may not fit */
+  snprintf(path, sizeof(path), "%s/node%d/hugepages", syspath, node);
+  err = hwloc_stat(path, &st, data->root_fd);
+  if (!err) {
+    /* one entry for normal pages plus one per subdirectory (link count minus 2) */
+    types = 1 + st.st_nlink-2;
+    has_sysfs_hugepages = 1;
+  }
+
+  if (topology->is_thissystem) {
+    memory->page_types = calloc(types, sizeof(*memory->page_types));
+    /* on allocation failure, fall back to reporting the total memory only */
+    memory->page_types_len = memory->page_types ? types : 0;
+  }
+
+  snprintf(meminfopath, sizeof(meminfopath), "%s/node%d/meminfo", syspath, node);
+  hwloc_parse_meminfo_info(data, meminfopath,
+			   snprintf(NULL, 0, "Node %d ", node),
+			   &memory->local_memory,
+			   &meminfo_hugepages_count, NULL /* no hugepage size in node-specific meminfo */,
+			   memory->page_types == NULL);
+
+  if (memory->page_types) {
+    uint64_t remaining_local_memory = memory->local_memory;
+    if (has_sysfs_hugepages) {
+      /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
+      hwloc_parse_hugepages_info(data, path, memory, &remaining_local_memory);
+    } else {
+      /* get hugepage size from machine-specific meminfo since there is no size in node-specific meminfo,
+       * hwloc_get_procfs_meminfo_info must have been called earlier */
+      struct hwloc_obj_memory_s *rootmem = &topology->levels[0][0]->memory;
+      /* only read the root's hugepage entry if it has one */
+      if (rootmem->page_types && rootmem->page_types_len > 1)
+        meminfo_hugepages_size = rootmem->page_types[1].size;
+      /* use what we found in meminfo */
+      if (meminfo_hugepages_size) {
+        memory->page_types[1].count = meminfo_hugepages_count;
+        memory->page_types[1].size = meminfo_hugepages_size;
+        remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
+      } else {
+        memory->page_types_len = 1;
+      }
+    }
+    /* update what's remaining as normal pages */
+    memory->page_types[0].size = hwloc_getpagesize();
+    memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
+  }
+}
+
+/*
+ * Read up to nbnodes distances from the first line of distancepath (opened
+ * through hwloc_fopen with fsroot_fd) into distances[], converted to float.
+ * Values are separated by a single character.  Parsing stops at the first
+ * token that is not a number, at the end of the line, or once nbnodes values
+ * were stored; remaining entries of distances[] are left untouched.
+ */
+static void
+hwloc_parse_node_distance(const char *distancepath, unsigned nbnodes, float *distances, int fsroot_fd)
+{
+  char string[4096]; /* enough for hundreds of nodes */
+  char *tmp, *next;
+  FILE * fd;
+
+  fd = hwloc_fopen(distancepath, "r", fsroot_fd);
+  if (!fd)
+    return;
+
+  if (!fgets(string, sizeof(string), fd)) {
+    fclose(fd);
+    return;
+  }
+
+  /* the loop condition also protects against nbnodes == 0 */
+  for (tmp = string; nbnodes; nbnodes--) {
+    unsigned distance = strtoul(tmp, &next, 0);
+    if (next == tmp)
+      break;
+    *distances = (float) distance;
+    distances++;
+    /* do not step over the terminating NUL when there is no trailing separator */
+    if (*next == '\0')
+      break;
+    tmp = next+1;
+  }
+
+  fclose(fd);
+}
+
+/*
+ * Read one DMI attribute file and attach its first line to obj.
+ * dmi_name is written into path at offset pathlen to form the file name; the
+ * first line of that file, without its newline, is added to obj under the
+ * key hwloc_name.  Nothing is added when the file is missing or empty.
+ */
+static void
+hwloc__get_dmi_id_one_info(struct hwloc_linux_backend_data_s *data,
+			   hwloc_obj_t obj,
+			   char *path, unsigned pathlen,
+			   const char *dmi_name, const char *hwloc_name)
+{
+  char value[64];
+  char *got, *newline;
+  FILE *stream;
+
+  strcpy(path+pathlen, dmi_name);
+  stream = hwloc_fopen(path, "r", data->root_fd);
+  if (!stream)
+    return;
+
+  value[0] = '\0';
+  got = fgets(value, sizeof(value), stream);
+  fclose (stream);
+
+  if (!got || value[0] == '\0')
+    return;
+
+  newline = strchr(value, '\n');
+  if (newline)
+    *newline = '\0';
+  hwloc_debug("found %s '%s'\n", hwloc_name, value);
+  hwloc_obj_add_info(obj, hwloc_name, value);
+}
+
+/*
+ * Attach the DMI identification strings to obj.
+ * The attribute files are looked up under /sys/devices/virtual/dmi/id, or
+ * under /sys/class/dmi/id when the former cannot be opened; nothing is done
+ * when neither directory exists.
+ */
+static void
+hwloc__get_dmi_id_info(struct hwloc_linux_backend_data_s *data, hwloc_obj_t obj)
+{
+  /* sysfs file name -> info key, in the order they are added */
+  static const struct {
+    const char *file;
+    const char *key;
+  } dmi_attrs[] = {
+    { "product_name", "DMIProductName" },
+    { "product_version", "DMIProductVersion" },
+    { "product_serial", "DMIProductSerial" },
+    { "product_uuid", "DMIProductUUID" },
+    { "board_vendor", "DMIBoardVendor" },
+    { "board_name", "DMIBoardName" },
+    { "board_version", "DMIBoardVersion" },
+    { "board_serial", "DMIBoardSerial" },
+    { "board_asset_tag", "DMIBoardAssetTag" },
+    { "chassis_vendor", "DMIChassisVendor" },
+    { "chassis_type", "DMIChassisType" },
+    { "chassis_version", "DMIChassisVersion" },
+    { "chassis_serial", "DMIChassisSerial" },
+    { "chassis_asset_tag", "DMIChassisAssetTag" },
+    { "bios_vendor", "DMIBIOSVendor" },
+    { "bios_version", "DMIBIOSVersion" },
+    { "bios_date", "DMIBIOSDate" },
+    { "sys_vendor", "DMISysVendor" }
+  };
+  char path[128];
+  unsigned pathlen;
+  unsigned a;
+  DIR *dir;
+
+  strcpy(path, "/sys/devices/virtual/dmi/id");
+  dir = hwloc_opendir(path, data->root_fd);
+  if (!dir) {
+    strcpy(path, "/sys/class/dmi/id");
+    dir = hwloc_opendir(path, data->root_fd);
+    if (!dir)
+      return;
+  }
+  closedir(dir);
+
+  /* append the separator; each attribute name is written right after it */
+  pathlen = strlen(path);
+  path[pathlen++] = '/';
+
+  for (a = 0; a < sizeof(dmi_attrs)/sizeof(dmi_attrs[0]); a++)
+    hwloc__get_dmi_id_one_info(data, obj, path, pathlen, dmi_attrs[a].file, dmi_attrs[a].key);
+}
+
+/* Leading bytes of a raw DMI entry read from /sys/firmware/dmi/entries/17-<N>/raw.
+ * The structure is filled by a single fread() from the beginning of that file,
+ * which is why every field is made of unsigned chars only.
+ * NOTE(review): the layout presumably mirrors the SMBIOS "Memory Device"
+ * (type 17) structure -- confirm against the SMBIOS specification.
+ */
+struct hwloc_firmware_dmi_mem_device_header {
+  unsigned char type;
+  unsigned char length; /* the strings are read starting at this offset in the raw file */
+  unsigned char handle[2];
+  unsigned char phy_mem_handle[2];
+  unsigned char mem_err_handle[2];
+  unsigned char tot_width[2];
+  unsigned char dat_width[2];
+  unsigned char size[2];
+  unsigned char ff;
+  unsigned char dev_set;
+  unsigned char dev_loc_str_num; /* number (counted from 1) of the string reported as "DeviceLocation" */
+  unsigned char bank_loc_str_num; /* number of the string reported as "BankLocation" */
+  unsigned char mem_type;
+  unsigned char type_detail[2];
+  unsigned char speed[2];
+  unsigned char manuf_str_num; /* number of the string reported as "Vendor" */
+  unsigned char serial_str_num; /* number of the string reported as "SerialNumber" */
+  unsigned char asset_tag_str_num; /* number of the string reported as "AssetTag" */
+  unsigned char part_num_str_num; /* number of the string reported as "PartNumber" */
+  /* don't include the following fields since we don't need them,
+   * some old implementations may miss them.
+   */
+};
+
+/* Tell whether a DMI string carries actual content.
+ * Returns 0 for an empty string or a string made only of spaces
+ * (at least Dell use this for empty memory slots), 1 otherwise.
+ */
+static int check_dmi_entry(const char *buffer)
+{
+  const char *cur = buffer;
+
+  /* skip leading spaces; an empty or all-space string ends right here */
+  while (*cur == ' ')
+    cur++;
+
+  return *cur != '\0';
+}
+
+/* Parse the string table of one raw DMI memory device entry and, when it
+ * describes an actual device, add a Misc object for it below the root object.
+ *
+ * topology: topology where the Misc object is inserted.
+ * idx:      index given to the new Misc object.
+ * path:     name of the raw file, only used in the error message.
+ * fd:       raw file, already opened by the caller (and closed by the caller).
+ * header:   header already read from the beginning of the file.
+ *
+ * The strings start at offset header->length in the file. They are numbered
+ * from 1 and matched against the string numbers stored in the header.
+ * The walk stops at the first empty string or at the first string whose
+ * number is not one of the six we look for.
+ * Location strings alone are not enough to create an object: if no vendor,
+ * serial number, asset tag or part number was found, the slot is assumed
+ * to be empty and everything is dropped.
+ */
+static void
+hwloc__get_firmware_dmi_memory_info_one(struct hwloc_topology *topology,
+					unsigned idx, const char *path, FILE *fd,
+					struct hwloc_firmware_dmi_mem_device_header *header)
+{
+  unsigned slen;
+  char buffer[256]; /* enough for memory device strings, or at least for each of them */
+  unsigned foff; /* offset in raw file */
+  unsigned boff; /* offset in buffer read from raw file */
+  unsigned i; /* number of the string being looked at, starting at 1 */
+  struct hwloc_obj_info_s *infos = NULL;
+  unsigned infos_count = 0;
+  hwloc_obj_t misc;
+  int foundinfo = 0;
+
+  hwloc__add_info(&infos, &infos_count, "Type", "MemoryModule");
+
+  /* start after the header */
+  foff = header->length;
+  i = 1;
+  while (1) {
+    /* read one buffer */
+    if (fseek(fd, foff, SEEK_SET) < 0)
+      break;
+    if (!fgets(buffer, sizeof(buffer), fd))
+      break;
+    /* read string at the beginning of the buffer */
+    boff = 0;
+    while (1) {
+      /* stop on empty string */
+      if (!buffer[boff])
+        goto done;
+      /* stop if this string goes to the end of the buffer,
+       * it may have been truncated, it will be read again from its first byte
+       */
+      slen = strlen(buffer+boff);
+      if (boff + slen+1 == sizeof(buffer))
+        break;
+      /* string didn't get truncated, should be OK */
+      if (i == header->manuf_str_num) {
+	if (check_dmi_entry(buffer+boff)) {
+	  hwloc__add_info(&infos, &infos_count, "Vendor", buffer+boff);
+	  foundinfo = 1;
+	}
+      }	else if (i == header->serial_str_num) {
+	if (check_dmi_entry(buffer+boff)) {
+	  hwloc__add_info(&infos, &infos_count, "SerialNumber", buffer+boff);
+	  foundinfo = 1;
+	}
+      } else if (i == header->asset_tag_str_num) {
+	if (check_dmi_entry(buffer+boff)) {
+	  hwloc__add_info(&infos, &infos_count, "AssetTag", buffer+boff);
+	  foundinfo = 1;
+	}
+      } else if (i == header->part_num_str_num) {
+	if (check_dmi_entry(buffer+boff)) {
+	  hwloc__add_info(&infos, &infos_count, "PartNumber", buffer+boff);
+	  foundinfo = 1;
+	}
+      } else if (i == header->dev_loc_str_num) {
+	if (check_dmi_entry(buffer+boff)) {
+	  hwloc__add_info(&infos, &infos_count, "DeviceLocation", buffer+boff);
+	  /* only a location, not an actual info about the device */
+	}
+      } else if (i == header->bank_loc_str_num) {
+	if (check_dmi_entry(buffer+boff)) {
+	  hwloc__add_info(&infos, &infos_count, "BankLocation", buffer+boff);
+	  /* only a location, not an actual info about the device */
+	}
+      } else {
+	goto done;
+      }
+      /* next string in buffer */
+      boff += slen+1;
+      i++;
+    }
+    /* couldn't read a single full string from that buffer, we're screwed */
+    if (!boff) {
+      fprintf(stderr, "hwloc could not read a DMI firmware entry #%u in %s\n",
+	      i, path);
+      break;
+    }
+    /* reread buffer after previous string */
+    foff += boff;
+  }
+
+done:
+  if (!foundinfo) {
+    /* found no actual info about the device. if there's only location info, the slot may be empty */
+    goto out_with_infos;
+  }
+
+  misc = hwloc_alloc_setup_object(HWLOC_OBJ_MISC, idx);
+  if (!misc)
+    goto out_with_infos;
+
+  /* the object now owns the info attributes gathered above */
+  hwloc__move_infos(&misc->infos, &misc->infos_count, &infos, &infos_count);
+  /* FIXME: find a way to identify the corresponding NUMA node and attach these objects there.
+   * but it means we need to parse DeviceLocation=DIMM_B4 but these vary significantly
+   * with the vendor, and it's hard to be 100% sure 'B' is second socket.
+   * Examples at http://sourceforge.net/p/edac-utils/code/HEAD/tree/trunk/src/etc/labels.db
+   * or https://github.com/grondo/edac-utils/blob/master/src/etc/labels.db
+   */
+  hwloc_insert_object_by_parent(topology, hwloc_get_root_obj(topology), misc);
+  return;
+
+ out_with_infos:
+  hwloc__free_infos(infos, infos_count);
+}
+
+/* Look at the DMI memory device entries exported in
+ * /sys/firmware/dmi/entries/17-<i>/raw, for i = 0, 1, 2, ...
+ * and pass each of them to hwloc__get_firmware_dmi_memory_info_one().
+ *
+ * The walk ends at the first entry that cannot be opened, whose header
+ * cannot be read entirely, or whose length is smaller than the header
+ * we need. The file is closed on every path.
+ */
+static void
+hwloc__get_firmware_dmi_memory_info(struct hwloc_topology *topology,
+				    struct hwloc_linux_backend_data_s *data)
+{
+  char path[128];
+  unsigned i;
+
+  for(i=0; ; i++) {
+    FILE *fd;
+    struct hwloc_firmware_dmi_mem_device_header header;
+    int err;
+
+    snprintf(path, sizeof(path), "/sys/firmware/dmi/entries/17-%u/raw", i);
+    fd = hwloc_fopen(path, "r", data->root_fd);
+    if (!fd)
+      break;
+
+    err = fread(&header, sizeof(header), 1, fd);
+    if (err != 1) {
+      /* short read, don't leak the file before giving up */
+      fclose(fd);
+      break;
+    }
+    if (header.length < sizeof(header)) {
+      /* invalid, or too old entry/spec that doesn't contain what we need */
+      fclose(fd);
+      break;
+    }
+
+    hwloc__get_firmware_dmi_memory_info_one(topology, i, path, fd, &header);
+
+    fclose(fd);
+  }
+}
+
+
+/***********************************
+ ****** Device tree Discovery ******
+ ***********************************/
+
+/* Load the content of file "p/p1" into a newly allocated buffer.
+ * The buffer size is the file size reported by fstat(), and the content
+ * is fetched with a single read().
+ * On success the buffer is returned (release it with free()) and the number
+ * of bytes read is stored in *bytes_read when bytes_read != NULL.
+ * On failure NULL is returned and *bytes_read is left untouched.  */
+static void *
+hwloc_read_raw(const char *p, const char *p1, size_t *bytes_read, int root_fd)
+{
+  char fname[256];
+  struct stat st;
+  char *buf;
+  ssize_t nread;
+  int fd;
+
+  snprintf(fname, sizeof(fname), "%s/%s", p, p1);
+
+  fd = hwloc_open(fname, root_fd);
+  if (fd == -1)
+    return NULL;
+
+  if (fstat(fd, &st) != 0) {
+    close(fd);
+    return NULL;
+  }
+
+  buf = (char *) malloc(st.st_size);
+  if (buf != NULL) {
+    nread = read(fd, buf, st.st_size);
+    if (nread == -1) {
+      /* nothing usable, drop the buffer */
+      free(buf);
+      buf = NULL;
+    } else if (bytes_read != NULL) {
+      *bytes_read = nread;
+    }
+  }
+
+  close(fd);
+  return buf;
+}
+
+/* Reads the entire file "p/p1" and returns it as a 0-terminated string.
+ * Returned pointer can be freed by using free().
+ * Returns NULL if the file could not be read or if memory is missing.
+ * If the content does not end with a 0 byte (this includes an empty
+ * content), the buffer is extended by one byte to store the terminator,
+ * so that callers can always use the result as a C string.  */
+static char *
+hwloc_read_str(const char *p, const char *p1, int root_fd)
+{
+  size_t cb = 0;
+  char *ret = hwloc_read_raw(p, p1, &cb, root_fd);
+  char *grown;
+
+  if (NULL == ret)
+    return NULL;
+
+  /* already terminated, nothing to do */
+  if ((0 < cb) && (0 == ret[cb-1]))
+    return ret;
+
+  /* keep the original pointer until realloc() is known to have succeeded,
+   * otherwise the buffer would be leaked and NULL dereferenced */
+  grown = realloc(ret, cb + 1);
+  if (NULL == grown) {
+    free(ret);
+    return NULL;
+  }
+  grown[cb] = 0;
+  return grown;
+}
+
+/* Read file "p/p1", which must contain exactly one 32bit big-endian value.
+ * On success the converted value is stored in *buf and sizeof(*buf) is returned.
+ * Otherwise *buf is left untouched, errno is set to EINVAL and -1 is returned. */
+static ssize_t
+hwloc_read_unit32be(const char *p, const char *p1, uint32_t *buf, int root_fd)
+{
+  size_t len = 0;
+  uint32_t *raw = hwloc_read_raw(p, p1, &len, root_fd);
+  ssize_t status;
+
+  if (len == sizeof(*buf)) {
+    *buf = htonl(*raw);
+    status = sizeof(*buf);
+  } else {
+    /* raw is either NULL or contains useless things */
+    errno = EINVAL;
+    status = -1;
+  }
+
+  free(raw);
+  return status;
+}
+
+/* Growable array of the "cpu" and "cache" nodes found under the device-tree
+ * cpus directory, filled by add_device_tree_cpus_node().
+ */
+typedef struct {
+  unsigned int n, allocated; /* number of entries in use in p[], and number of entries allocated */
+  struct {
+    hwloc_bitmap_t cpuset; /* private copy of the cpuset of a "cpu" node, NULL for a "cache" node */
+    uint32_t phandle; /* value of the node "phandle" property (or "ibm,phandle"/"linux,phandle"), (uint32_t)-1 if none was read */
+    uint32_t l2_cache; /* value of the node "next-level-cache" (or "l2-cache") property, (uint32_t)-1 if none was read */
+    char *name; /* private copy of the node directory entry name */
+  } *p;
+} device_tree_cpus_t;
+
+/* Append one device-tree node to the cpus array, growing the array when full
+ * (64 entries at first, then doubled each time).
+ *
+ * cpuset is duplicated when not NULL (NULL is used for cache nodes),
+ * and name is duplicated as well, so the caller keeps ownership of both.
+ * If the array cannot be grown, it is left unchanged and the node is ignored.
+ */
+static void
+add_device_tree_cpus_node(device_tree_cpus_t *cpus, hwloc_bitmap_t cpuset,
+    uint32_t l2_cache, uint32_t phandle, const char *name)
+{
+  if (cpus->n == cpus->allocated) {
+    unsigned int allocated = cpus->allocated ? 2 * cpus->allocated : 64;
+    /* don't overwrite cpus->p before knowing whether realloc() succeeded,
+     * otherwise the existing entries would be leaked and NULL dereferenced below */
+    void *tmp = realloc(cpus->p, allocated * sizeof(cpus->p[0]));
+    if (!tmp)
+      return; /* failed to grow, ignore this entry */
+    cpus->p = tmp;
+    cpus->allocated = allocated;
+  }
+  cpus->p[cpus->n].phandle = phandle;
+  cpus->p[cpus->n].cpuset = (NULL == cpuset)?NULL:hwloc_bitmap_dup(cpuset);
+  cpus->p[cpus->n].l2_cache = l2_cache;
+  cpus->p[cpus->n].name = strdup(name);
+  ++cpus->n;
+}
+
+/* Find which CPUs sit below the cache identified by phandle.
+ *
+ * Every entry of cpus whose l2_cache equals phandle is considered:
+ * - an entry with a cpuset is a CPU, its cpuset is OR'ed into cpuset;
+ * - an entry without cpuset is a nested cache: *level is incremented and
+ *   the search goes on recursively with the phandle of that entry.
+ * Returns 0 if at least one CPU was found, -1 otherwise (also when level
+ * or cpuset is NULL, or when phandle is (uint32_t)-1).
+ */
+static int
+look_powerpc_device_tree_discover_cache(device_tree_cpus_t *cpus,
+    uint32_t phandle, unsigned int *level, hwloc_bitmap_t cpuset)
+{
+  int status = -1;
+  unsigned int idx;
+
+  if (!level || !cpuset || (uint32_t) -1 == phandle)
+    return -1;
+
+  for (idx = 0; idx < cpus->n; idx++) {
+    if (cpus->p[idx].l2_cache == phandle) {
+      if (NULL == cpus->p[idx].cpuset) {
+        /* nested cache, go one level further */
+        ++(*level);
+        if (!look_powerpc_device_tree_discover_cache(cpus, cpus->p[idx].phandle,
+                                                     level, cpuset))
+          status = 0;
+      } else {
+        /* actual CPU using this cache */
+        hwloc_bitmap_or(cpuset, cpuset, cpus->p[idx].cpuset);
+        status = 0;
+      }
+    }
+  }
+
+  return status;
+}
+
+/* Build one cache object from device-tree values and insert it by cpuset.
+ *
+ * Nothing is added when cache_size is 0.
+ * The associativity is size / (sets * line size); it is left at 0 when
+ * the line size is 0 or when the number of sets is 0 or 1.
+ * cpuset is duplicated, the caller keeps ownership of it.
+ */
+static void
+try__add_cache_from_device_tree_cpu(struct hwloc_topology *topology,
+				    unsigned int level, hwloc_obj_cache_type_t type,
+				    uint32_t cache_line_size, uint32_t cache_size, uint32_t cache_sets,
+				    hwloc_bitmap_t cpuset)
+{
+  struct hwloc_obj *cache;
+
+  if (!cache_size)
+    return;
+
+  /* a single set is likely wrong, make it unknown */
+  if (1 == cache_sets)
+    cache_sets = 0;
+
+  cache = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
+  cache->attr->cache.type = type;
+  cache->attr->cache.depth = level;
+  cache->attr->cache.size = cache_size;
+  cache->attr->cache.linesize = cache_line_size;
+  cache->attr->cache.associativity =
+    (cache_sets && cache_line_size) ? cache_size / (cache_sets * cache_line_size) : 0;
+  cache->cpuset = hwloc_bitmap_dup(cpuset);
+
+  hwloc_debug_2args_bitmap("cache (%s) depth %d has cpuset %s\n",
+			   type == HWLOC_OBJ_CACHE_UNIFIED ? "unified" : (type == HWLOC_OBJ_CACHE_DATA ? "data" : "instruction"),
+			   level, cache->cpuset);
+  hwloc_insert_object_by_cpuset(topology, cache);
+}
+
+/* Read the cache properties of device-tree node directory cpu and add
+ * the corresponding cache object(s) at the given level for cpuset.
+ *
+ * Properties read for the data side (same names with "i-" for instructions):
+ *   d-cache-line-size  in bytes
+ *   d-cache-size       in bytes
+ *   d-cache-sets
+ * Properties that are ignored: d-cache-block-size, d-tlb-sets,
+ * d-tlb-size (always 0 on power6), and the i-tlb-* ones.
+ * The file cache-unified only exists if data and instruction caches are
+ * unified; in that case a single unified cache is added from the "d-" values.
+ * A property that cannot be read keeps the value 0.
+ */
+static void
+try_add_cache_from_device_tree_cpu(struct hwloc_topology *topology,
+				   struct hwloc_linux_backend_data_s *data,
+				   const char *cpu, unsigned int level, hwloc_bitmap_t cpuset)
+{
+  uint32_t data_line = 0, data_size = 0, data_sets = 0;
+  uint32_t insn_line = 0, insn_size = 0, insn_sets = 0;
+  char marker[1024];
+  struct stat st;
+  int is_unified;
+  int root_fd = data->root_fd;
+
+  snprintf(marker, sizeof(marker), "%s/cache-unified", cpu);
+  is_unified = (0 == hwloc_stat(marker, &st, root_fd));
+
+  hwloc_read_unit32be(cpu, "d-cache-line-size", &data_line, root_fd);
+  hwloc_read_unit32be(cpu, "d-cache-size", &data_size, root_fd);
+  hwloc_read_unit32be(cpu, "d-cache-sets", &data_sets, root_fd);
+  hwloc_read_unit32be(cpu, "i-cache-line-size", &insn_line, root_fd);
+  hwloc_read_unit32be(cpu, "i-cache-size", &insn_size, root_fd);
+  hwloc_read_unit32be(cpu, "i-cache-sets", &insn_sets, root_fd);
+
+  if (is_unified) {
+    try__add_cache_from_device_tree_cpu(topology, level, HWLOC_OBJ_CACHE_UNIFIED,
+					data_line, data_size, data_sets, cpuset);
+  } else {
+    /* split caches: instruction first, then data */
+    try__add_cache_from_device_tree_cpu(topology, level, HWLOC_OBJ_CACHE_INSTRUCTION,
+					insn_line, insn_size, insn_sets, cpuset);
+    try__add_cache_from_device_tree_cpu(topology, level, HWLOC_OBJ_CACHE_DATA,
+					data_line, data_size, data_sets, cpuset);
+  }
+}
+
+/*
+ * Discovers L1/L2/L3 cache information on IBM PowerPC systems for old kernels (RHEL5.*)
+ * which provide NUMA nodes information without any details
+ *
+ * The nodes found in /proc/device-tree/cpus are scanned once:
+ * - "cpu" nodes get a Core object and their L1 cache(s), and are recorded
+ *   with their cpuset;
+ * - "cache" nodes are recorded without cpuset.
+ * Then, for each recorded cache node, the CPUs below it and its level are
+ * computed by following the next-level-cache links, and the cache is added.
+ *
+ * Nothing is done if the directory cannot be opened or if the machine name
+ * does not start with "ppc". The directory is closed on every path.
+ */
+static void
+look_powerpc_device_tree(struct hwloc_topology *topology,
+			 struct hwloc_linux_backend_data_s *data)
+{
+  device_tree_cpus_t cpus;
+  const char ofroot[] = "/proc/device-tree/cpus";
+  unsigned int i;
+  int root_fd = data->root_fd;
+  DIR *dt = hwloc_opendir(ofroot, root_fd);
+  struct dirent *dirent;
+
+  if (NULL == dt)
+    return;
+
+  /* only works for Power so far, and not useful on ARM */
+  if (strncmp(data->utsname.machine, "ppc", 3)) {
+    /* the directory was already opened above, don't leak it */
+    closedir(dt);
+    return;
+  }
+
+  cpus.n = 0;
+  cpus.p = NULL;
+  cpus.allocated = 0;
+
+  while (NULL != (dirent = readdir(dt))) {
+    char cpu[256];
+    char *device_type;
+    /* these keep the value -1 when the corresponding property cannot be read */
+    uint32_t reg = -1, l2_cache = -1, phandle = -1;
+
+    if ('.' == dirent->d_name[0])
+      continue;
+
+    snprintf(cpu, sizeof(cpu), "%s/%s", ofroot, dirent->d_name);
+
+    device_type = hwloc_read_str(cpu, "device_type", root_fd);
+    if (NULL == device_type)
+      continue;
+
+    hwloc_read_unit32be(cpu, "reg", &reg, root_fd);
+    /* try the possible property names one after the other */
+    if (hwloc_read_unit32be(cpu, "next-level-cache", &l2_cache, root_fd) == -1)
+      hwloc_read_unit32be(cpu, "l2-cache", &l2_cache, root_fd);
+    if (hwloc_read_unit32be(cpu, "phandle", &phandle, root_fd) == -1)
+      if (hwloc_read_unit32be(cpu, "ibm,phandle", &phandle, root_fd) == -1)
+        hwloc_read_unit32be(cpu, "linux,phandle", &phandle, root_fd);
+
+    if (0 == strcmp(device_type, "cache")) {
+      add_device_tree_cpus_node(&cpus, NULL, l2_cache, phandle, dirent->d_name);
+    }
+    else if (0 == strcmp(device_type, "cpu")) {
+      /* Found CPU */
+      hwloc_bitmap_t cpuset = NULL;
+      size_t cb = 0;
+      uint32_t *threads = hwloc_read_raw(cpu, "ibm,ppc-interrupt-server#s", &cb, root_fd);
+      uint32_t nthreads = cb / sizeof(threads[0]);
+
+      if (NULL != threads) {
+        /* one big-endian index per thread, only keep those known in the complete cpuset */
+        cpuset = hwloc_bitmap_alloc();
+        for (i = 0; i < nthreads; ++i) {
+          if (hwloc_bitmap_isset(topology->levels[0][0]->complete_cpuset, ntohl(threads[i])))
+            hwloc_bitmap_set(cpuset, ntohl(threads[i]));
+        }
+        free(threads);
+      } else if ((unsigned int)-1 != reg) {
+        /* Doesn't work on ARM because cpu "reg" do not start at 0.
+	 * We know the first cpu "reg" is the lowest. The others are likely
+	 * in order assuming the device-tree shows objects in order.
+	 */
+        cpuset = hwloc_bitmap_alloc();
+        hwloc_bitmap_set(cpuset, reg);
+      }
+
+      if (NULL == cpuset) {
+        hwloc_debug("%s has no \"reg\" property, skipping\n", cpu);
+      } else {
+        struct hwloc_obj *core = NULL;
+        add_device_tree_cpus_node(&cpus, cpuset, l2_cache, phandle, dirent->d_name);
+
+        /* Add core */
+        core = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, reg);
+        core->cpuset = hwloc_bitmap_dup(cpuset);
+        hwloc_insert_object_by_cpuset(topology, core);
+
+        /* Add L1 cache */
+        try_add_cache_from_device_tree_cpu(topology, data, cpu, 1, cpuset);
+
+        hwloc_bitmap_free(cpuset);
+      }
+    }
+    free(device_type);
+  }
+  closedir(dt);
+
+  /* No cores and L2 cache were found, exiting */
+  if (0 == cpus.n) {
+    hwloc_debug("No cores and L2 cache were found in %s, exiting\n", ofroot);
+    return;
+  }
+
+#ifdef HWLOC_DEBUG
+  for (i = 0; i < cpus.n; ++i) {
+    hwloc_debug("%i: %s  ibm,phandle=%08X l2_cache=%08X ",
+      i, cpus.p[i].name, cpus.p[i].phandle, cpus.p[i].l2_cache);
+    if (NULL == cpus.p[i].cpuset) {
+      hwloc_debug("%s\n", "no cpuset");
+    } else {
+      hwloc_debug_bitmap("cpuset %s\n", cpus.p[i].cpuset);
+    }
+  }
+#endif
+
+  /* Scan L2/L3/... caches */
+  for (i = 0; i < cpus.n; ++i) {
+    unsigned int level = 2;
+    hwloc_bitmap_t cpuset;
+    /* Skip real CPUs */
+    if (NULL != cpus.p[i].cpuset)
+      continue;
+
+    /* Calculate cache level and CPU mask */
+    cpuset = hwloc_bitmap_alloc();
+    if (0 == look_powerpc_device_tree_discover_cache(&cpus,
+          cpus.p[i].phandle, &level, cpuset)) {
+      char cpu[256];
+      snprintf(cpu, sizeof(cpu), "%s/%s", ofroot, cpus.p[i].name);
+      try_add_cache_from_device_tree_cpu(topology, data, cpu, level, cpuset);
+    }
+    hwloc_bitmap_free(cpuset);
+  }
+
+  /* Do cleanup */
+  for (i = 0; i < cpus.n; ++i) {
+    hwloc_bitmap_free(cpus.p[i].cpuset);
+    free(cpus.p[i].name);
+  }
+  free(cpus.p);
+}
+
+
+
+/**************************************
+ ****** Sysfs Topology Discovery ******
+ **************************************/
+
+/* Discover NUMA nodes from the "node<N>" entries of sysfs directory path.
+ *
+ * Returns -1 if path cannot be opened, 0 otherwise.
+ * *found is set to 0 when at most one node entry exists or when the
+ * temporary arrays cannot be allocated; otherwise it is set to the number
+ * of node entries minus those that could not be read or inserted.
+ *
+ * When every node was inserted, the distance matrix is read from the
+ * node<N>/distance files and given to hwloc_distances_set().
+ */
+static int
+look_sysfsnode(struct hwloc_topology *topology,
+	       struct hwloc_linux_backend_data_s *data,
+	       const char *path, unsigned *found)
+{
+  unsigned osnode;
+  unsigned nbnodes = 0;
+  DIR *dir;
+  struct dirent *dirent;
+  hwloc_bitmap_t nodeset;
+
+  *found = 0;
+
+  /* Get the list of nodes first */
+  dir = hwloc_opendir(path, data->root_fd);
+  if (dir)
+    {
+      nodeset = hwloc_bitmap_alloc();
+      while ((dirent = readdir(dir)) != NULL)
+	{
+	  /* only look at entries whose name starts with "node" */
+	  if (strncmp(dirent->d_name, "node", 4))
+	    continue;
+	  osnode = strtoul(dirent->d_name+4, NULL, 0);
+	  hwloc_bitmap_set(nodeset, osnode);
+	  nbnodes++;
+	}
+      closedir(dir);
+    }
+  else
+    return -1;
+
+  /* nothing to do for zero or a single node, *found stays 0 */
+  if (nbnodes <= 1)
+    {
+      hwloc_bitmap_free(nodeset);
+      return 0;
+    }
+
+  /* For convenience, put these declarations inside a block. */
+
+  {
+      hwloc_obj_t * nodes = calloc(nbnodes, sizeof(hwloc_obj_t));
+      unsigned *indexes = calloc(nbnodes, sizeof(unsigned));
+      float * distances;
+      int failednodes = 0;
+      unsigned index_;
+
+      if (NULL == nodes || NULL == indexes) {
+          free(nodes);
+          free(indexes);
+          hwloc_bitmap_free(nodeset);
+          nbnodes = 0;
+          goto out;
+      }
+
+      /* Unsparsify node indexes.
+       * We'll need them later because Linux groups sparse distances
+       * and keeps them in order in the sysfs distance files.
+       * It'll simplify things in the meantime.
+       */
+      index_ = 0;
+      hwloc_bitmap_foreach_begin (osnode, nodeset) {
+	indexes[index_] = osnode;
+	index_++;
+      } hwloc_bitmap_foreach_end();
+      hwloc_bitmap_free(nodeset);
+
+#ifdef HWLOC_DEBUG
+      hwloc_debug("%s", "NUMA indexes: ");
+      for (index_ = 0; index_ < nbnodes; index_++) {
+	hwloc_debug(" %u", indexes[index_]);
+      }
+      hwloc_debug("%s", "\n");
+#endif
+
+      /* Create NUMA objects */
+      for (index_ = 0; index_ < nbnodes; index_++) {
+          char nodepath[SYSFS_NUMA_NODE_PATH_LEN];
+          hwloc_bitmap_t cpuset;
+          hwloc_obj_t node, res_obj;
+
+	  osnode = indexes[index_];
+
+          sprintf(nodepath, "%s/node%u/cpumap", path, osnode);
+          cpuset = hwloc_parse_cpumap(nodepath, data->root_fd);
+          if (!cpuset) {
+	    /* This NUMA object won't be inserted, we'll ignore distances */
+	    failednodes++;
+	    continue;
+	  }
+
+          node = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, osnode);
+          node->cpuset = cpuset;
+          node->nodeset = hwloc_bitmap_alloc();
+          hwloc_bitmap_set(node->nodeset, osnode);
+
+          hwloc_sysfs_node_meminfo_info(topology, data, path, osnode, &node->memory);
+
+          hwloc_debug_1arg_bitmap("os node %u has cpuset %s\n",
+                                  osnode, node->cpuset);
+          res_obj = hwloc_insert_object_by_cpuset(topology, node);
+	  if (node == res_obj) {
+	    nodes[index_] = node;
+	  } else {
+	    /* We got merged somehow, could be a buggy BIOS reporting wrong NUMA node cpuset.
+	     * This object disappeared, we'll ignore distances */
+	    failednodes++;
+	  }
+      }
+
+      if (failednodes) {
+	/* failed to read/create some nodes, don't bother reading/fixing
+	 * a distance matrix that would likely be wrong anyway.
+	 */
+	nbnodes -= failednodes;
+	distances = NULL;
+      } else {
+	distances = calloc(nbnodes*nbnodes, sizeof(float));
+      }
+
+      /* no distance matrix (failed nodes or failed allocation):
+       * release the temporary arrays and only report the number of nodes */
+      if (NULL == distances) {
+          free(nodes);
+          free(indexes);
+          goto out;
+      }
+
+      /* Get actual distances now */
+      for (index_ = 0; index_ < nbnodes; index_++) {
+          char nodepath[SYSFS_NUMA_NODE_PATH_LEN];
+
+	  osnode = indexes[index_];
+
+	  /* Linux nodeX/distance file contains distance from X to other localities (from ACPI SLIT table or so),
+	   * store them in slots X*N...X*N+N-1 */
+          sprintf(nodepath, "%s/node%u/distance", path, osnode);
+          hwloc_parse_node_distance(nodepath, nbnodes, distances+index_*nbnodes, data->root_fd);
+      }
+
+      /* NOTE(review): indexes, nodes and distances are not freed after this call,
+       * presumably hwloc_distances_set() takes ownership of them -- confirm */
+      hwloc_distances_set(topology, HWLOC_OBJ_NUMANODE, nbnodes, indexes, nodes, distances, 0 /* OS cannot force */);
+  }
+
+ out:
+  *found = nbnodes;
+  return 0;
+}
+
+/* Look at Linux' /sys/devices/system/cpu/cpu%d/topology/ */
+static int
+look_sysfscpu(struct hwloc_topology *topology,
+	      struct hwloc_linux_backend_data_s *data,
+	      const char *path,
+	      struct hwloc_linux_cpuinfo_proc * cpuinfo_Lprocs, unsigned cpuinfo_numprocs)
+{
+  hwloc_bitmap_t cpuset; /* Set of cpus for which we have topology information */
+  hwloc_bitmap_t unknownset; /* Set of cpus to clear */
+#define CPU_TOPOLOGY_STR_LEN 128
+  char str[CPU_TOPOLOGY_STR_LEN];
+  DIR *dir;
+  int i,j;
+  FILE *fd;
+  unsigned caches_added, merge_buggy_core_siblings;
+  hwloc_obj_t packages = NULL; /* temporary list of packages before actual insert in the tree */
+
+  /* fill the cpuset of interesting cpus */
+  dir = hwloc_opendir(path, data->root_fd);
+  if (!dir)
+    return -1;
+  else {
+    struct dirent *dirent;
+    cpuset = hwloc_bitmap_alloc();
+    unknownset = hwloc_bitmap_alloc();
+
+    while ((dirent = readdir(dir)) != NULL) {
+      unsigned long cpu;
+      char online[2];
+
+      if (strncmp(dirent->d_name, "cpu", 3))
+	continue;
+      cpu = strtoul(dirent->d_name+3, NULL, 0);
+
+      /* Maybe we don't have topology information but at least it exists */
+      hwloc_bitmap_set(topology->levels[0][0]->complete_cpuset, cpu);
+
+      /* check whether this processor is online */
+      sprintf(str, "%s/cpu%lu/online", path, cpu);
+      fd = hwloc_fopen(str, "r", data->root_fd);
+      if (fd) {
+	if (fgets(online, sizeof(online), fd)) {
+	  if (!atoi(online)) {
+	    fclose(fd);
+	    hwloc_debug("os proc %lu is offline\n", cpu);
+	    hwloc_bitmap_clr(topology->levels[0][0]->allowed_cpuset, cpu);
+	    hwloc_bitmap_set(unknownset, cpu);
+	    continue;
+	  }
+	}
+	fclose(fd);
+      }
+
+      /* check whether the kernel exports topology information for this cpu */
+      sprintf(str, "%s/cpu%lu/topology", path, cpu);
+      if (hwloc_access(str, X_OK, data->root_fd) < 0 && errno == ENOENT) {
+	hwloc_debug("os proc %lu has no accessible %s/cpu%lu/topology\n",
+		   cpu, path, cpu);
+	hwloc_bitmap_clr(topology->levels[0][0]->allowed_cpuset, cpu);
+	hwloc_bitmap_set(unknownset, cpu);
+	continue;
+      }
+
+      hwloc_bitmap_set(cpuset, cpu);
+    }
+    closedir(dir);
+  }
+
+  topology->support.discovery->pu = 1;
+  hwloc_debug_1arg_bitmap("found %d cpu topologies, cpuset %s\n",
+	     hwloc_bitmap_weight(cpuset), cpuset);
+
+  merge_buggy_core_siblings = (!strcmp(data->utsname.machine, "x86_64"))
+			   || (data->utsname.machine[0] == 'i' && !strcmp(data->utsname.machine+2, "86"));
+  caches_added = 0;
+  hwloc_bitmap_foreach_begin(i, cpuset)
+    {
+      hwloc_bitmap_t packageset, coreset, bookset, threadset, savedcoreset;
+      unsigned mypackageid, mycoreid, mybookid;
+      int threadwithcoreid = 0;
+
+      /* look at the package */
+      mypackageid = 0; /* shut-up the compiler */
+      sprintf(str, "%s/cpu%d/topology/physical_package_id", path, i);
+      hwloc_parse_sysfs_unsigned(str, &mypackageid, data->root_fd);
+
+      sprintf(str, "%s/cpu%d/topology/core_siblings", path, i);
+      packageset = hwloc_parse_cpumap(str, data->root_fd);
+      if (packageset) {
+       hwloc_bitmap_andnot(packageset, packageset, unknownset);
+       if (hwloc_bitmap_first(packageset) == i) {
+        /* first cpu in this package, add the package */
+	struct hwloc_obj *package;
+
+	if (merge_buggy_core_siblings) {
+	  /* check for another package with same physical_package_id */
+	  hwloc_obj_t curpackage = packages;
+	  while (curpackage) {
+	    if (curpackage->os_index == mypackageid) {
+	      /* found another package with same physical_package_id but different core_siblings.
+	       * looks like a buggy kernel on Intel Xeon E5 v3 processor with two rings.
+	       * merge these core_siblings to extend the existing first package object.
+	       */
+	      static int reported = 0;
+	      if (!reported && !hwloc_hide_errors()) {
+		char *a, *b;
+		hwloc_bitmap_asprintf(&a, curpackage->cpuset);
+		hwloc_bitmap_asprintf(&b, packageset);
+		fprintf(stderr, "****************************************************************************\n");
+		fprintf(stderr, "* hwloc %s has detected buggy sysfs package information: Two packages have\n", HWLOC_VERSION);
+		fprintf(stderr, "* the same physical package id %u but different core_siblings %s and %s\n",
+			mypackageid, a, b);
+		fprintf(stderr, "* hwloc is merging these packages into a single one assuming your Linux kernel\n");
+		fprintf(stderr, "* does not support this processor correctly.\n");
+		fprintf(stderr, "* You may hide this warning by setting HWLOC_HIDE_ERRORS=1 in the environment.\n");
+	        fprintf(stderr, "*\n");
+		fprintf(stderr, "* If hwloc does not report the right number of packages,\n");
+		fprintf(stderr, "* please report this error message to the hwloc user's mailing list,\n");
+		fprintf(stderr, "* along with the output+tarball generated by the hwloc-gather-topology script.\n");
+		fprintf(stderr, "****************************************************************************\n");
+		reported = 1;
+		free(a);
+		free(b);
+	      }
+	      hwloc_bitmap_or(curpackage->cpuset, curpackage->cpuset, packageset);
+	      goto package_done;
+	    }
+	    curpackage = curpackage->next_cousin;
+	  }
+	}
+
+	/* no package with same physical_package_id, create a new one */
+	package = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, mypackageid);
+	package->cpuset = packageset;
+	hwloc_debug_1arg_bitmap("os package %u has cpuset %s\n",
+				mypackageid, packageset);
+	/* add cpuinfo */
+	if (cpuinfo_Lprocs) {
+	  for(j=0; j<(int) cpuinfo_numprocs; j++)
+	    if ((int) cpuinfo_Lprocs[j].Pproc == i) {
+	      hwloc__move_infos(&package->infos, &package->infos_count,
+				&cpuinfo_Lprocs[j].infos, &cpuinfo_Lprocs[j].infos_count);
+	    }
+	}
+	/* insert in a temporary list in case we have to modify the cpuset by merging other core_siblings later.
+	 * we'll actually insert the tree at the end of the entire sysfs cpu loop.
+	 */
+	package->next_cousin = packages;
+	packages = package;
+
+	packageset = NULL; /* don't free it */
+       }
+      }
+package_done:
+      hwloc_bitmap_free(packageset);
+
+      /* look at the core */
+      mycoreid = 0; /* shut-up the compiler */
+      sprintf(str, "%s/cpu%d/topology/core_id", path, i);
+      hwloc_parse_sysfs_unsigned(str, &mycoreid, data->root_fd);
+
+      sprintf(str, "%s/cpu%d/topology/thread_siblings", path, i);
+      coreset = hwloc_parse_cpumap(str, data->root_fd);
+      savedcoreset = coreset; /* store it for later work-arounds */
+      if (coreset) {
+       hwloc_bitmap_andnot(coreset, coreset, unknownset);
+       if (hwloc_bitmap_weight(coreset) > 1) {
+	/* check if this is hyper-threading or different coreids */
+	unsigned siblingid, siblingcoreid;
+	hwloc_bitmap_t set = hwloc_bitmap_dup(coreset);
+	hwloc_bitmap_clr(set, i);
+	siblingid = hwloc_bitmap_first(set);
+	siblingcoreid = mycoreid;
+	sprintf(str, "%s/cpu%d/topology/core_id", path, siblingid);
+	hwloc_parse_sysfs_unsigned(str, &siblingcoreid, data->root_fd);
+	threadwithcoreid = (siblingcoreid != mycoreid);
+	hwloc_bitmap_free(set);
+       }
+       if (hwloc_bitmap_first(coreset) == i || threadwithcoreid) {
+	/* regular core */
+        struct hwloc_obj *core = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, mycoreid);
+	if (threadwithcoreid) {
+	  /* amd multicore compute-unit, create one core per thread */
+	  core->cpuset = hwloc_bitmap_alloc();
+	  hwloc_bitmap_set(core->cpuset, i);
+	} else {
+	  core->cpuset = coreset;
+	}
+        hwloc_debug_1arg_bitmap("os core %u has cpuset %s\n",
+                     mycoreid, coreset);
+        hwloc_insert_object_by_cpuset(topology, core);
+        coreset = NULL; /* don't free it */
+       }
+      }
+
+      /* look at the books */
+      mybookid = 0; /* shut-up the compiler */
+      sprintf(str, "%s/cpu%d/topology/book_id", path, i);
+      if (hwloc_parse_sysfs_unsigned(str, &mybookid, data->root_fd) == 0) {
+        sprintf(str, "%s/cpu%d/topology/book_siblings", path, i);
+        bookset = hwloc_parse_cpumap(str, data->root_fd);
+	if (bookset) {
+	 hwloc_bitmap_andnot(bookset, bookset, unknownset);
+         if (bookset && hwloc_bitmap_first(bookset) == i) {
+          struct hwloc_obj *book = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, mybookid);
+          book->cpuset = bookset;
+          hwloc_debug_1arg_bitmap("os book %u has cpuset %s\n",
+                       mybookid, bookset);
+          hwloc_obj_add_info(book, "Type", "Book");
+          hwloc_insert_object_by_cpuset(topology, book);
+          bookset = NULL; /* don't free it */
+	 }
+        }
+      }
+
+      {
+      /* look at the thread */
+      struct hwloc_obj *thread = hwloc_alloc_setup_object(HWLOC_OBJ_PU, i);
+      threadset = hwloc_bitmap_alloc();
+      hwloc_bitmap_only(threadset, i);
+      thread->cpuset = threadset;
+      hwloc_debug_1arg_bitmap("thread %d has cpuset %s\n",
+		 i, threadset);
+      hwloc_insert_object_by_cpuset(topology, thread);
+      }
+
+      /* look at the caches */
+      for(j=0; j<10; j++) {
+#define SHARED_CPU_MAP_STRLEN 128
+	char mappath[SHARED_CPU_MAP_STRLEN];
+	char str2[20]; /* enough for a level number (one digit) or a type (Data/Instruction/Unified) */
+	hwloc_bitmap_t cacheset;
+	unsigned long kB = 0;
+	unsigned linesize = 0;
+	unsigned sets = 0, lines_per_tag = 1;
+	int depth; /* 0 for L1, .... */
+	hwloc_obj_cache_type_t type = HWLOC_OBJ_CACHE_UNIFIED; /* default */
+
+	/* get the cache level depth */
+	sprintf(mappath, "%s/cpu%d/cache/index%d/level", path, i, j);
+	fd = hwloc_fopen(mappath, "r", data->root_fd);
+	if (fd) {
+	  char *res = fgets(str2,sizeof(str2), fd);
+	  fclose(fd);
+	  if (res)
+	    depth = strtoul(str2, NULL, 10)-1;
+	  else
+	    continue;
+	} else
+	  continue;
+
+	/* cache type */
+	sprintf(mappath, "%s/cpu%d/cache/index%d/type", path, i, j);
+	fd = hwloc_fopen(mappath, "r", data->root_fd);
+	if (fd) {
+	  if (fgets(str2, sizeof(str2), fd)) {
+	    fclose(fd);
+	    if (!strncmp(str2, "Data", 4))
+	      type = HWLOC_OBJ_CACHE_DATA;
+	    else if (!strncmp(str2, "Unified", 7))
+	      type = HWLOC_OBJ_CACHE_UNIFIED;
+	    else if (!strncmp(str2, "Instruction", 11))
+	      type = HWLOC_OBJ_CACHE_INSTRUCTION;
+	    else
+	      continue;
+	  } else {
+	    fclose(fd);
+	    continue;
+	  }
+	} else
+	  continue;
+
+	/* get the cache size */
+	sprintf(mappath, "%s/cpu%d/cache/index%d/size", path, i, j);
+	fd = hwloc_fopen(mappath, "r", data->root_fd);
+	if (fd) {
+	  if (fgets(str2,sizeof(str2), fd))
+	    kB = atol(str2); /* in kB */
+	  fclose(fd);
+	}
+
+	/* get the line size */
+	sprintf(mappath, "%s/cpu%d/cache/index%d/coherency_line_size", path, i, j);
+	fd = hwloc_fopen(mappath, "r", data->root_fd);
+	if (fd) {
+	  if (fgets(str2,sizeof(str2), fd))
+	    linesize = atol(str2); /* in bytes */
+	  fclose(fd);
+	}
+
+	/* get the number of sets and lines per tag.
+	 * don't take the associativity directly in "ways_of_associativity" because
+	 * some archs (ia64, ppc) put 0 there when fully-associative, while others (x86) put something like -1 there.
+	 */
+	sprintf(mappath, "%s/cpu%d/cache/index%d/number_of_sets", path, i, j);
+	fd = hwloc_fopen(mappath, "r", data->root_fd);
+	if (fd) {
+	  if (fgets(str2,sizeof(str2), fd))
+	    sets = atol(str2);
+	  fclose(fd);
+	}
+	sprintf(mappath, "%s/cpu%d/cache/index%d/physical_line_partition", path, i, j);
+	fd = hwloc_fopen(mappath, "r", data->root_fd);
+	if (fd) {
+	  if (fgets(str2,sizeof(str2), fd))
+	    lines_per_tag = atol(str2);
+	  fclose(fd);
+	}
+
+	sprintf(mappath, "%s/cpu%d/cache/index%d/shared_cpu_map", path, i, j);
+	cacheset = hwloc_parse_cpumap(mappath, data->root_fd);
+        if (cacheset) {
+	  hwloc_bitmap_andnot(cacheset, cacheset, unknownset);
+          if (hwloc_bitmap_weight(cacheset) < 1) {
+            /* mask is wrong (useful for many itaniums) */
+            if (savedcoreset)
+              /* assume it's a core-specific cache */
+              hwloc_bitmap_copy(cacheset, savedcoreset);
+            else
+              /* assumes it's not shared */
+              hwloc_bitmap_only(cacheset, i);
+          }
+
+          if (hwloc_bitmap_first(cacheset) == i) {
+            /* first cpu in this cache, add the cache */
+            struct hwloc_obj *cache = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
+            cache->attr->cache.size = kB << 10;
+            cache->attr->cache.depth = depth+1;
+            cache->attr->cache.linesize = linesize;
+	    cache->attr->cache.type = type;
+	    if (!linesize || !lines_per_tag || !sets)
+	      cache->attr->cache.associativity = 0; /* unknown */
+	    else if (sets == 1)
+	      cache->attr->cache.associativity = 0; /* likely wrong, make it unknown */
+	    else
+	      cache->attr->cache.associativity = (kB << 10) / linesize / lines_per_tag / sets;
+            cache->cpuset = cacheset;
+            hwloc_debug_1arg_bitmap("cache depth %d has cpuset %s\n",
+                       depth, cacheset);
+            hwloc_insert_object_by_cpuset(topology, cache);
+            cacheset = NULL; /* don't free it */
+            ++caches_added;
+          }
+        }
+        hwloc_bitmap_free(cacheset);
+      }
+      hwloc_bitmap_free(coreset);
+    }
+  hwloc_bitmap_foreach_end();
+
+  /* actually insert in the tree now that package cpusets have been fixed-up */
+  while (packages) {
+    hwloc_obj_t next = packages->next_cousin;
+    packages->next_cousin = NULL;
+    hwloc_insert_object_by_cpuset(topology, packages);
+    packages = next;
+  }
+
+  if (0 == caches_added)
+    look_powerpc_device_tree(topology, data);
+
+  hwloc_bitmap_free(cpuset);
+  hwloc_bitmap_free(unknownset);
+
+  return 0;
+}
+
+
+
+/****************************************
+ ****** cpuinfo Topology Discovery ******
+ ****************************************/
+
+/* x86 flavour of the per-line cpuinfo callback.
+ * Maps a known "prefix" of a cpuinfo line to a hwloc info key and appends
+ * (key, value) to the given info array. Unknown prefixes are ignored.
+ * Always returns 0; is_global is not used.
+ */
+static int
+hwloc_linux_parse_cpuinfo_x86(const char *prefix, const char *value,
+                              struct hwloc_obj_info_s **infos, unsigned *infos_count,
+                              int is_global __hwloc_attribute_unused)
+{
+  /* { cpuinfo prefix, hwloc info name } */
+  static const char * const x86_fields[][2] = {
+    { "vendor_id",  "CPUVendor" },
+    { "model name", "CPUModel" },
+    { "model",      "CPUModelNumber" },
+    { "cpu family", "CPUFamilyNumber" },
+    { "stepping",   "CPUStepping" }
+  };
+  unsigned k;
+
+  for (k = 0; k < sizeof(x86_fields)/sizeof(x86_fields[0]); k++) {
+    if (strcmp(x86_fields[k][0], prefix) == 0) {
+      hwloc__add_info(infos, infos_count, x86_fields[k][1], value);
+      break;
+    }
+  }
+  return 0;
+}
+
+/* ia64 flavour of the per-line cpuinfo callback.
+ * Picks the hwloc info name matching the cpuinfo prefix, if any, and
+ * appends (name, value) to the info array. Always returns 0.
+ */
+static int
+hwloc_linux_parse_cpuinfo_ia64(const char *prefix, const char *value,
+                               struct hwloc_obj_info_s **infos, unsigned *infos_count,
+                               int is_global __hwloc_attribute_unused)
+{
+  const char *infoname = NULL;
+
+  if (strcmp("vendor", prefix) == 0)
+    infoname = "CPUVendor";
+  else if (strcmp("model name", prefix) == 0)
+    infoname = "CPUModel";
+  else if (strcmp("model", prefix) == 0)
+    infoname = "CPUModelNumber";
+  else if (strcmp("family", prefix) == 0)
+    infoname = "CPUFamilyNumber";
+
+  /* unknown lines are silently dropped */
+  if (infoname)
+    hwloc__add_info(infos, infos_count, infoname, value);
+  return 0;
+}
+
+/* ARM flavour of the per-line cpuinfo callback.
+ * Looks the cpuinfo prefix up in a small table and appends the matching
+ * (info name, value) pair to the info array. Unknown prefixes are ignored.
+ * Always returns 0; is_global is not used.
+ */
+static int
+hwloc_linux_parse_cpuinfo_arm(const char *prefix, const char *value,
+                              struct hwloc_obj_info_s **infos, unsigned *infos_count,
+                              int is_global __hwloc_attribute_unused)
+{
+  /* { cpuinfo prefix, hwloc info name } */
+  static const char * const arm_fields[][2] = {
+    { "Processor",        "CPUModel" },        /* old kernels with one Processor header */
+    { "model name",       "CPUModel" },        /* new kernels with one model name per core */
+    { "CPU implementer",  "CPUImplementer" },
+    { "CPU architecture", "CPUArchitecture" },
+    { "CPU variant",      "CPUVariant" },
+    { "CPU part",         "CPUPart" },
+    { "CPU revision",     "CPURevision" },
+    { "Hardware",         "HardwareName" },
+    { "Revision",         "HardwareRevision" },
+    { "Serial",           "HardwareSerial" }
+  };
+  unsigned k;
+
+  for (k = 0; k < sizeof(arm_fields)/sizeof(arm_fields[0]); k++) {
+    if (strcmp(arm_fields[k][0], prefix) == 0) {
+      hwloc__add_info(infos, infos_count, arm_fields[k][1], value);
+      break;
+    }
+  }
+  return 0;
+}
+
+/* PowerPC flavour of the per-line cpuinfo callback.
+ * Known prefixes are turned into info attributes; "Board"/"Machine" replace
+ * the value of the "PlatformModel" slot instead of appending a new entry.
+ * A revision line is stored as "PlatformRevision" when is_global is non-zero
+ * and as "CPURevision" otherwise. Always returns 0.
+ */
+static int
+hwloc_linux_parse_cpuinfo_ppc(const char *prefix, const char *value,
+                              struct hwloc_obj_info_s **infos, unsigned *infos_count,
+                              int is_global)
+{
+  /* common fields */
+  if (strcmp("cpu", prefix) == 0) {
+    hwloc__add_info(infos, infos_count, "CPUModel", value);
+    return 0;
+  }
+  if (strcmp("platform", prefix) == 0) {
+    hwloc__add_info(infos, infos_count, "PlatformName", value);
+    return 0;
+  }
+  if (strcmp("model", prefix) == 0) {
+    hwloc__add_info(infos, infos_count, "PlatformModel", value);
+    return 0;
+  }
+
+  /* platform-specific fields */
+  if (strcasecmp("vendor", prefix) == 0) {
+    hwloc__add_info(infos, infos_count, "PlatformVendor", value);
+    return 0;
+  }
+  if (strcmp("Board ID", prefix) == 0) {
+    hwloc__add_info(infos, infos_count, "PlatformBoardID", value);
+    return 0;
+  }
+  if (strcmp("Board", prefix) == 0 || strcasecmp("Machine", prefix) == 0) {
+    /* machine and board are similar to (and often more precise than) model above,
+     * so they overwrite the PlatformModel slot.
+     */
+    char **slot = hwloc__find_info_slot(infos, infos_count, "PlatformModel");
+    if (*slot)
+      free(*slot);
+    *slot = strdup(value);
+    return 0;
+  }
+  if (strcasecmp("Revision", prefix) == 0 || strcmp("Hardware rev", prefix) == 0) {
+    const char *revname = is_global ? "PlatformRevision" : "CPURevision";
+    hwloc__add_info(infos, infos_count, revname, value);
+    return 0;
+  }
+  if (strcmp("SVR", prefix) == 0) {
+    hwloc__add_info(infos, infos_count, "SystemVersionRegister", value);
+    return 0;
+  }
+  if (strcmp("PVR", prefix) == 0) {
+    hwloc__add_info(infos, infos_count, "ProcessorVersionRegister", value);
+    return 0;
+  }
+
+  /* don't match 'board*' because there's also "board l2" on some platforms */
+  return 0;
+}
+
+/*
+ * avr32: "chip type\t:"			=> OK
+ * blackfin: "model name\t:"			=> OK
+ * h8300: "CPU:"				=> OK
+ * m68k: "CPU:"					=> OK
+ * mips: "cpu model\t\t:"			=> OK
+ * openrisc: "CPU:"				=> OK
+ * sparc: "cpu\t\t:"				=> OK
+ * tile: "model name\t:"			=> OK
+ * unicore32: "Processor\t:"			=> OK
+ * alpha: "cpu\t\t\t: Alpha" + "cpu model\t\t:"	=> "cpu" overwritten by "cpu model", no processor indexes
+ * cris: "cpu\t\t:" + "cpu model\t:"		=> only "cpu"
+ * frv: "CPU-Core:" + "CPU:"			=> only "CPU"
+ * mn10300: "cpu core   :" + "model name :"	=> only "model name"
+ * parisc: "cpu family\t:" + "cpu\t\t:"		=> only "cpu"
+ *
+ * not supported because of conflicts with other arch minor lines:
+ * m32r: "cpu family\t:"			=> KO (adding "cpu family" would break "blackfin")
+ * microblaze: "CPU-Family:"			=> KO
+ * sh: "cpu family\t:" + "cpu type\t:"		=> KO
+ * xtensa: "model\t\t:"				=> KO
+ */
+/* Architecture-independent per-line cpuinfo callback.
+ * Any of the recognized model-like prefixes overwrites the "CPUModel" slot;
+ * everything else is ignored. Always returns 0.
+ */
+static int
+hwloc_linux_parse_cpuinfo_generic(const char *prefix, const char *value,
+                                  struct hwloc_obj_info_s **infos, unsigned *infos_count,
+                                  int is_global __hwloc_attribute_unused)
+{
+  char **slot;
+  int is_model = strcmp("model name", prefix) == 0
+              || strcmp("Processor", prefix) == 0
+              || strcmp("chip type", prefix) == 0
+              || strcmp("cpu model", prefix) == 0
+              || strcasecmp("cpu", prefix) == 0;
+
+  if (!is_model)
+    return 0;
+
+  /* keep the last one, assume it's more precise than the first one.
+   * we should have the Architecture keypair for basic information anyway.
+   */
+  slot = hwloc__find_info_slot(infos, infos_count, "CPUModel");
+  if (*slot)
+    free(*slot);
+  *slot = strdup(value);
+  return 0;
+}
+
+/* Parse a /proc/cpuinfo-like file.
+ *
+ * Each "processor" line starts a new entry in a dynamically grown array;
+ * "physical id" and "core id" lines fill the package/core numbers of the
+ * current entry. Every other "prefix : value" line is handed to an
+ * architecture-specific callback (selected once from data->utsname.machine)
+ * which stores it either in the current processor's infos or, when no
+ * processor entry is active, in the caller's global infos.
+ * An empty line ends the current processor entry.
+ *
+ * On success, *Lprocs_p receives the array (NULL if no processor line was
+ * seen) and the number of entries is returned.
+ * On failure (file cannot be opened, allocation failure, malformed or
+ * too large number) everything allocated here is released, *Lprocs_p is
+ * left untouched and -1 is returned.
+ */
+static int
+hwloc_linux_parse_cpuinfo(struct hwloc_linux_backend_data_s *data,
+                          const char *path,
+                          struct hwloc_linux_cpuinfo_proc ** Lprocs_p,
+                          struct hwloc_obj_info_s **global_infos, unsigned *global_infos_count)
+{
+  FILE *fd;
+  char *str = NULL;
+  char *endptr;
+  unsigned len;
+  unsigned allocated_Lprocs = 0;
+  struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
+  unsigned numprocs = 0;
+  unsigned k;
+  int curproc = -1;
+  int (*parse_cpuinfo_func)(const char *, const char *, struct hwloc_obj_info_s **, unsigned *, int) = NULL;
+
+  if (!(fd=hwloc_fopen(path,"r", data->root_fd)))
+    {
+      hwloc_debug("could not open %s\n", path);
+      return -1;
+    }
+
+#      define PROCESSOR	"processor"
+#      define PACKAGEID "physical id" /* the longest one */
+#      define COREID "core id"
+  len = 128; /* vendor/model can be very long */
+  str = malloc(len);
+  if (!str)
+    goto err;
+  hwloc_debug("\n\n * Topology extraction from %s *\n\n", path);
+  while (fgets(str,len,fd)!=NULL) {
+    unsigned long Ppkg, Pcore, Pproc;
+    char *end, *dot, *prefix, *value;
+    int noend = 0;
+
+    /* remove the ending \n; remember when the line was longer than the buffer */
+    end = strchr(str, '\n');
+    if (end)
+      *end = 0;
+    else
+      noend = 1;
+    /* if empty line, skip and reset curproc */
+    if (!*str) {
+      curproc = -1;
+      continue;
+    }
+    /* skip lines with no colon */
+    dot = strchr(str, ':');
+    if (!dot)
+      continue;
+    /* skip lines not starting with a letter */
+    if ((*str > 'z' || *str < 'a')
+        && (*str > 'Z' || *str < 'A'))
+      continue;
+
+    /* mark the end of the prefix.
+     * str[0] is a letter here, so the backward scan cannot go before str.
+     */
+    prefix = str;
+    end = dot;
+    while (end[-1] == ' ' || end[-1] == '\t') end--; /* need a strrspn() */
+    *end = 0;
+    /* find beginning of value, its end is already marked */
+    value = dot+1 + strspn(dot+1, " \t");
+
+    /* defines for parsing numbers */
+#   define getprocnb_begin(field, var)					\
+    if (!strcmp(field,prefix)) {					\
+      var = strtoul(value,&endptr,0);					\
+      if (endptr==value) {						\
+	hwloc_debug("no number in "field" field of %s\n", path);	\
+	goto err;							\
+      } else if (var==ULONG_MAX) {					\
+	hwloc_debug("too big "field" number in %s\n", path); 		\
+	goto err;							\
+      }									\
+      hwloc_debug(field " %lu\n", var)
+#   define getprocnb_end()						\
+    }
+    /* actually parse numbers */
+    getprocnb_begin(PROCESSOR, Pproc);
+    if (numprocs == allocated_Lprocs) {
+      /* grow the array: 8 entries first, then double.
+       * Keep the old pointer until realloc succeeded so that the error
+       * path can still release it.
+       */
+      unsigned newcount = allocated_Lprocs ? 2*allocated_Lprocs : 8;
+      struct hwloc_linux_cpuinfo_proc *tmp = realloc(Lprocs, newcount * sizeof(*Lprocs));
+      if (!tmp)
+        goto err;
+      Lprocs = tmp;
+      allocated_Lprocs = newcount;
+    }
+    curproc = numprocs++;
+    Lprocs[curproc].Pproc = Pproc;
+    Lprocs[curproc].Pcore = -1;
+    Lprocs[curproc].Ppkg = -1;
+    Lprocs[curproc].Lcore = -1;
+    Lprocs[curproc].Lpkg = -1;
+    Lprocs[curproc].infos = NULL;
+    Lprocs[curproc].infos_count = 0;
+    getprocnb_end() else
+    getprocnb_begin(PACKAGEID, Ppkg);
+    /* ignore a package number that does not belong to any processor entry */
+    if (curproc >= 0)
+      Lprocs[curproc].Ppkg = Ppkg;
+    getprocnb_end() else
+    getprocnb_begin(COREID, Pcore);
+    /* ignore a core number that does not belong to any processor entry */
+    if (curproc >= 0)
+      Lprocs[curproc].Pcore = Pcore;
+    getprocnb_end() else {
+
+      /* architecture specific or default routine for parsing cpumodel */
+      if (!parse_cpuinfo_func) {
+        parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_generic;
+        if (*data->utsname.machine) {
+          /* x86_32 x86_64 k1om => x86 */
+          if (!strcmp(data->utsname.machine, "x86_64")
+              || (data->utsname.machine[0] == 'i' && !strcmp(data->utsname.machine+2, "86"))
+              || !strcmp(data->utsname.machine, "k1om"))
+            parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_x86;
+          /* ia64 */
+          else if (!strcmp(data->utsname.machine, "ia64"))
+            parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_ia64;
+          /* arm */
+          else if (!strncmp(data->utsname.machine, "arm", 3))
+            parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_arm;
+          /* powerpc */
+          else if (!strncmp(data->utsname.machine, "ppc", 3)
+                   || !strncmp(data->utsname.machine, "power", 5))
+            parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_ppc;
+        }
+      }
+      /* we can't assume that we already got a processor index line:
+       * alpha/frv/h8300/m68k/microblaze/sparc have no processor lines at all, only a global entry.
+       * tile has a global section with model name before the list of processor lines.
+       */
+      parse_cpuinfo_func(prefix, value,
+                         curproc >= 0 ? &Lprocs[curproc].infos : global_infos,
+                         curproc >= 0 ? &Lprocs[curproc].infos_count : global_infos_count,
+                         curproc < 0);
+    }
+
+    if (noend) {
+      /* the line did not fit in the buffer, drop its remainder */
+      if (fscanf(fd,"%*[^\n]") == EOF)
+        break;
+      getc(fd);
+    }
+  }
+  fclose(fd);
+  free(str);
+
+  *Lprocs_p = Lprocs;
+  return numprocs;
+
+ err:
+  fclose(fd);
+  free(str);
+  /* release what the per-processor callbacks already stored */
+  for (k = 0; k < numprocs; k++)
+    hwloc__free_infos(Lprocs[k].infos, Lprocs[k].infos_count);
+  free(Lprocs);
+  return -1;
+}
+
+/* Release what hwloc_linux_parse_cpuinfo() produced: the infos of each of the
+ * numprocs entries, the Lprocs array itself (when non-NULL), and the global infos.
+ */
+static void
+hwloc_linux_free_cpuinfo(struct hwloc_linux_cpuinfo_proc * Lprocs, unsigned numprocs,
+                         struct hwloc_obj_info_s *global_infos, unsigned global_infos_count)
+{
+  unsigned idx = 0;
+
+  if (Lprocs != NULL) {
+    while (idx < numprocs) {
+      hwloc__free_infos(Lprocs[idx].infos, Lprocs[idx].infos_count);
+      idx++;
+    }
+    free(Lprocs);
+  }
+
+  hwloc__free_infos(global_infos, global_infos_count);
+}
+
+/* Build the topology from a /proc/cpuinfo-like file only.
+ *
+ * Parses the file, moves the global infos to the root object, inserts one PU
+ * object per processor entry, then one Package object per distinct package
+ * number and one Core object per distinct (package, core) pair. Packages
+ * (resp. cores) are only created when every processor entry reported a
+ * package (resp. core) number.
+ *
+ * Returns 0 on success, -1 when no processor was found or when a temporary
+ * array could not be allocated (nothing has been inserted in that case).
+ */
+static int
+look_cpuinfo(struct hwloc_topology *topology,
+             struct hwloc_linux_backend_data_s *data,
+             const char *path)
+{
+  struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
+  struct hwloc_obj_info_s *global_infos = NULL;
+  unsigned global_infos_count = 0;
+  /* P for physical/OS index, L for logical (e.g. in the order we get them, not in the final hwloc logical order) */
+  unsigned *Lcore_to_Pcore;
+  unsigned *Lcore_to_Ppkg; /* needed because Lcore is equivalent to Pcore+Ppkg, not to Pcore alone */
+  unsigned *Lpkg_to_Ppkg;
+  int _numprocs;
+  unsigned numprocs;
+  unsigned numpkgs=0;
+  unsigned numcores=0;
+  unsigned long Lproc;
+  unsigned missingpkg;
+  unsigned missingcore;
+  unsigned i,j;
+
+  /* parse the entire cpuinfo first, fill the Lprocs array and numprocs */
+  _numprocs = hwloc_linux_parse_cpuinfo(data, path, &Lprocs, &global_infos, &global_infos_count);
+
+
+  /* setup root info */
+  hwloc__move_infos(&hwloc_get_root_obj(topology)->infos, &hwloc_get_root_obj(topology)->infos_count,
+                    &global_infos, &global_infos_count);
+
+
+  if (_numprocs <= 0)
+    /* found no processor */
+    return -1;
+  numprocs = _numprocs;
+
+  /* initialize misc arrays, there can be at most numprocs entries */
+  Lcore_to_Pcore = malloc(numprocs * sizeof(*Lcore_to_Pcore));
+  Lcore_to_Ppkg = malloc(numprocs * sizeof(*Lcore_to_Ppkg));
+  Lpkg_to_Ppkg = malloc(numprocs * sizeof(*Lpkg_to_Ppkg));
+  if (!Lcore_to_Pcore || !Lcore_to_Ppkg || !Lpkg_to_Ppkg) {
+    /* out of memory before anything was inserted: let the caller fall back */
+    free(Lcore_to_Pcore);
+    free(Lcore_to_Ppkg);
+    free(Lpkg_to_Ppkg);
+    hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);
+    return -1;
+  }
+  for (i = 0; i < numprocs; i++) {
+    Lcore_to_Pcore[i] = -1;
+    Lcore_to_Ppkg[i] = -1;
+    Lpkg_to_Ppkg[i] = -1;
+  }
+
+  /* create PU objects */
+  for(Lproc=0; Lproc<numprocs; Lproc++) {
+    unsigned long Pproc = Lprocs[Lproc].Pproc;
+    hwloc_obj_t obj = hwloc_alloc_setup_object(HWLOC_OBJ_PU, Pproc);
+    obj->cpuset = hwloc_bitmap_alloc();
+    hwloc_bitmap_only(obj->cpuset, Pproc);
+    hwloc_debug_2args_bitmap("cpu %lu (os %lu) has cpuset %s\n",
+                             Lproc, Pproc, obj->cpuset);
+    hwloc_insert_object_by_cpuset(topology, obj);
+  }
+
+  topology->support.discovery->pu = 1;
+
+  hwloc_debug("%s", "\n * Topology summary *\n");
+  hwloc_debug("%u processors)\n", numprocs);
+
+  /* fill Lprocs[].Lpkg and Lpkg_to_Ppkg */
+  for(Lproc=0; Lproc<numprocs; Lproc++) {
+    long Ppkg = Lprocs[Lproc].Ppkg;
+    if (Ppkg != -1) {
+      unsigned long Pproc = Lprocs[Lproc].Pproc;
+      /* look for an already known package with the same physical number */
+      for (i=0; i<numpkgs; i++)
+        if ((unsigned) Ppkg == Lpkg_to_Ppkg[i])
+          break;
+      Lprocs[Lproc].Lpkg = i;
+      hwloc_debug("%lu on package %u (%lx)\n", Pproc, i, Ppkg);
+      if (i==numpkgs) {
+        /* first time we see this package */
+        Lpkg_to_Ppkg[numpkgs] = Ppkg;
+        numpkgs++;
+      }
+    }
+  }
+  /* Some buggy Linuxes don't provide numbers for processor 0, which makes us
+   * provide bogus information. We should rather drop it. */
+  missingpkg=0;
+  for(j=0; j<numprocs; j++)
+    if (Lprocs[j].Ppkg == -1) {
+      missingpkg=1;
+      break;
+    }
+  /* create package objects */
+  hwloc_debug("%u pkgs%s\n", numpkgs, missingpkg ? ", but some missing package" : "");
+  if (!missingpkg && numpkgs>0) {
+    for (i = 0; i < numpkgs; i++) {
+      struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, Lpkg_to_Ppkg[i]);
+      int doneinfos = 0;
+      obj->cpuset = hwloc_bitmap_alloc();
+      for(j=0; j<numprocs; j++)
+        if ((unsigned) Lprocs[j].Lpkg == i) {
+          hwloc_bitmap_set(obj->cpuset, Lprocs[j].Pproc);
+          if (!doneinfos) {
+            /* the package inherits the infos of its first processor */
+            hwloc__move_infos(&obj->infos, &obj->infos_count, &Lprocs[j].infos, &Lprocs[j].infos_count);
+            doneinfos = 1;
+          }
+        }
+      hwloc_debug_1arg_bitmap("package %d has cpuset %s\n", i, obj->cpuset);
+      hwloc_insert_object_by_cpuset(topology, obj);
+    }
+    hwloc_debug("%s", "\n");
+  }
+
+  /* fill Lprocs[].Lcore, Lcore_to_Ppkg and Lcore_to_Pcore */
+  for(Lproc=0; Lproc<numprocs; Lproc++) {
+    long Pcore = Lprocs[Lproc].Pcore;
+    if (Pcore != -1) {
+      /* a core is identified by both its core number and its package number */
+      for (i=0; i<numcores; i++)
+        if ((unsigned) Pcore == Lcore_to_Pcore[i] && (unsigned) Lprocs[Lproc].Ppkg == Lcore_to_Ppkg[i])
+          break;
+      Lprocs[Lproc].Lcore = i;
+      if (i==numcores) {
+        Lcore_to_Ppkg[numcores] = Lprocs[Lproc].Ppkg;
+        Lcore_to_Pcore[numcores] = Pcore;
+        numcores++;
+      }
+    }
+  }
+  /* Some buggy Linuxes don't provide numbers for processor 0, which makes us
+   * provide bogus information. We should rather drop it. */
+  missingcore=0;
+  for(j=0; j<numprocs; j++)
+    if (Lprocs[j].Pcore == -1) {
+      missingcore=1;
+      break;
+    }
+  /* create Core objects */
+  hwloc_debug("%u cores%s\n", numcores, missingcore ? ", but some missing core" : "");
+  if (!missingcore && numcores>0) {
+    for (i = 0; i < numcores; i++) {
+      struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, Lcore_to_Pcore[i]);
+      obj->cpuset = hwloc_bitmap_alloc();
+      for(j=0; j<numprocs; j++)
+        if ((unsigned) Lprocs[j].Lcore == i)
+          hwloc_bitmap_set(obj->cpuset, Lprocs[j].Pproc);
+      hwloc_debug_1arg_bitmap("Core %d has cpuset %s\n", i, obj->cpuset);
+      hwloc_insert_object_by_cpuset(topology, obj);
+    }
+    hwloc_debug("%s", "\n");
+  }
+
+  free(Lcore_to_Pcore);
+  free(Lcore_to_Ppkg);
+  free(Lpkg_to_Ppkg);
+
+  hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);
+
+  look_powerpc_device_tree(topology, data);
+  return 0;
+}
+
+
+
+/*************************************
+ ****** Main Topology Discovery ******
+ *************************************/
+
+/* Read the first line of /proc/elog and, when it looks like "Card <sn>:...",
+ * attach <sn> to the root object as the "MICSerialNumber" info.
+ * Does nothing when the file is missing or the line has another shape.
+ */
+static void
+hwloc__linux_get_mic_sn(struct hwloc_topology *topology, struct hwloc_linux_backend_data_s *data)
+{
+  char line[64];
+  char *serial, *colon;
+  FILE *file = hwloc_fopen("/proc/elog", "r", data->root_fd);
+
+  if (!file)
+    return;
+
+  if (fgets(line, sizeof(line), file) && !strncmp(line, "Card ", 5)) {
+    /* the serial number sits between "Card " and the first colon */
+    serial = line + 5;
+    colon = strchr(serial, ':');
+    if (colon) {
+      *colon = '\0';
+      hwloc_obj_add_info(hwloc_get_root_obj(topology), "MICSerialNumber", serial);
+    }
+  }
+
+  fclose(file);
+}
+
+/* Last-resort PU level setup used when cpuinfo/sysfs discovery failed. */
+static void
+hwloc_linux_fallback_pu_level(struct hwloc_topology *topology)
+{
+  if (!topology->is_thissystem) {
+    /* fsys-root but not this system, no way, assume there's just 1
+     * processor :/ */
+    hwloc_setup_pu_level(topology, 1);
+    return;
+  }
+
+  /* running on the discovered machine: ask for the fallback processor count */
+  hwloc_setup_pu_level(topology, hwloc_fallback_nbprocessors(topology));
+}
+
+/* Fill data->utsname.
+ *
+ * The structure is zeroed, then filled by uname() when the topology describes
+ * this system, then individual fields are overridden by the optional
+ * /proc/hwloc-nofile-info file (lines "OSName: ", "OSRelease: ", "OSVersion: ",
+ * "HostName: " and "Architecture: ").
+ * When the HWLOC_DUMP_NOFILE_INFO environment variable names a file, the
+ * non-empty fields are written there in the same line format.
+ */
+static void
+hwloc_gather_system_info(struct hwloc_topology *topology,
+                         struct hwloc_linux_backend_data_s *data)
+{
+  FILE *file;
+  char line[128]; /* enough for utsname fields */
+  const char *env;
+
+  /* initialize to something sane */
+  memset(&data->utsname, 0, sizeof(data->utsname));
+
+  /* read thissystem info */
+  if (topology->is_thissystem)
+    uname(&data->utsname);
+
+  /* overwrite with optional /proc/hwloc-nofile-info */
+  file = hwloc_fopen("/proc/hwloc-nofile-info", "r", data->root_fd);
+  if (file) {
+    while (fgets(line, sizeof(line), file)) {
+      char *eol = strchr(line, '\n');
+      char *dst = NULL;      /* utsname field selected by the line key */
+      size_t dstlen = 0;     /* size of that field */
+      size_t keylen = 0;     /* length of the "Key: " prefix to skip */
+
+      if (!strncmp("OSName: ", line, 8)) {
+        dst = data->utsname.sysname;
+        dstlen = sizeof(data->utsname.sysname);
+        keylen = 8;
+      } else if (!strncmp("OSRelease: ", line, 11)) {
+        dst = data->utsname.release;
+        dstlen = sizeof(data->utsname.release);
+        keylen = 11;
+      } else if (!strncmp("OSVersion: ", line, 11)) {
+        dst = data->utsname.version;
+        dstlen = sizeof(data->utsname.version);
+        keylen = 11;
+      } else if (!strncmp("HostName: ", line, 10)) {
+        dst = data->utsname.nodename;
+        dstlen = sizeof(data->utsname.nodename);
+        keylen = 10;
+      } else if (!strncmp("Architecture: ", line, 14)) {
+        dst = data->utsname.machine;
+        dstlen = sizeof(data->utsname.machine);
+        keylen = 14;
+      }
+
+      if (!dst) {
+        hwloc_debug("ignored /proc/hwloc-nofile-info line %s\n", line);
+        /* ignored */
+        continue;
+      }
+
+      /* drop the end of line, then copy with forced termination */
+      if (eol)
+        *eol = '\0';
+      strncpy(dst, line+keylen, dstlen);
+      dst[dstlen-1] = '\0';
+    }
+    fclose(file);
+  }
+
+  env = getenv("HWLOC_DUMP_NOFILE_INFO");
+  if (!env || !*env)
+    return;
+
+  file = fopen(env, "w");
+  if (!file)
+    return;
+
+  if (*data->utsname.sysname)
+    fprintf(file, "OSName: %s\n", data->utsname.sysname);
+  if (*data->utsname.release)
+    fprintf(file, "OSRelease: %s\n", data->utsname.release);
+  if (*data->utsname.version)
+    fprintf(file, "OSVersion: %s\n", data->utsname.version);
+  if (*data->utsname.nodename)
+    fprintf(file, "HostName: %s\n", data->utsname.nodename);
+  if (*data->utsname.machine)
+    fprintf(file, "Architecture: %s\n", data->utsname.machine);
+  fclose(file);
+}
+
+/* Main Linux discovery callback.
+ *
+ * Returns 0 without doing anything when the root object already has a cpuset
+ * (another backend discovered things), and 1 after running the discovery.
+ * The steps run in this order: system (uname) info, cpuset/cgroup
+ * restrictions, machine memory info, NUMA nodes, CPUs (from cpuinfo only, or
+ * from sysfs with cpuinfo as a source of extra infos), DMI info, then root
+ * object info attributes.
+ */
+static int
+hwloc_look_linuxfs(struct hwloc_backend *backend)
+{
+  struct hwloc_topology *topology = backend->topology;
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  /* NOTE(review): nbnodes is only written through look_sysfsnode() below;
+   * presumably that function always sets it, even on failure - confirm. */
+  unsigned nbnodes;
+  char *cpuset_mntpnt, *cgroup_mntpnt, *cpuset_name = NULL;
+  int err;
+
+  if (topology->levels[0][0]->cpuset)
+    /* somebody discovered things */
+    return 0;
+
+  hwloc_gather_system_info(topology, data);
+
+  hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+
+  /* Gather the list of admin-disabled cpus and mems */
+  hwloc_find_linux_cpuset_mntpnt(&cgroup_mntpnt, &cpuset_mntpnt, data->root_fd);
+  if (cgroup_mntpnt || cpuset_mntpnt) {
+    cpuset_name = hwloc_read_linux_cpuset_name(data->root_fd, topology->pid);
+    if (cpuset_name) {
+      hwloc_admin_disable_set_from_cpuset(data, cgroup_mntpnt, cpuset_mntpnt, cpuset_name, "cpus", topology->levels[0][0]->allowed_cpuset);
+      hwloc_admin_disable_set_from_cpuset(data, cgroup_mntpnt, cpuset_mntpnt, cpuset_name, "mems", topology->levels[0][0]->allowed_nodeset);
+    }
+    free(cgroup_mntpnt);
+    free(cpuset_mntpnt);
+  }
+
+    /* Get the machine memory attributes */
+    hwloc_get_procfs_meminfo_info(topology, data, &topology->levels[0][0]->memory);
+
+    /* Gather NUMA information. Must be after hwloc_get_procfs_meminfo_info so that the hugepage size is known */
+    /* try the bus path first, then the devices/system path */
+    if (look_sysfsnode(topology, data, "/sys/bus/node/devices", &nbnodes) < 0)
+      look_sysfsnode(topology, data, "/sys/devices/system/node", &nbnodes);
+
+    /* if we found some numa nodes, the machine object has no local memory */
+    if (nbnodes) {
+      unsigned i;
+      topology->levels[0][0]->memory.local_memory = 0;
+      if (topology->levels[0][0]->memory.page_types)
+        for(i=0; i<topology->levels[0][0]->memory.page_types_len; i++)
+          topology->levels[0][0]->memory.page_types[i].count = 0;
+    }
+
+    /* Gather the list of cpus now */
+    /* cpuinfo-only mode is used when HWLOC_LINUX_USE_CPUINFO is set in the
+     * environment or when none of the four sysfs sibling files is readable */
+    if (getenv("HWLOC_LINUX_USE_CPUINFO")
+	|| (hwloc_access("/sys/devices/system/cpu/cpu0/topology/core_siblings", R_OK, data->root_fd) < 0
+	    && hwloc_access("/sys/devices/system/cpu/cpu0/topology/thread_siblings", R_OK, data->root_fd) < 0
+	    && hwloc_access("/sys/bus/cpu/devices/cpu0/topology/thread_siblings", R_OK, data->root_fd) < 0
+	    && hwloc_access("/sys/bus/cpu/devices/cpu0/topology/core_siblings", R_OK, data->root_fd) < 0)) {
+	/* revert to reading cpuinfo only if /sys/.../topology unavailable (before 2.6.16)
+	 * or not containing anything interesting */
+      err = look_cpuinfo(topology, data, "/proc/cpuinfo");
+      if (err < 0)
+	hwloc_linux_fallback_pu_level(topology);
+
+    } else {
+      /* sysfs mode: cpuinfo is still parsed so that its per-processor and
+       * global infos can be passed along */
+      struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
+      struct hwloc_obj_info_s *global_infos = NULL;
+      unsigned global_infos_count = 0;
+      int numprocs = hwloc_linux_parse_cpuinfo(data, "/proc/cpuinfo", &Lprocs, &global_infos, &global_infos_count);
+      if (numprocs <= 0)
+	Lprocs = NULL;
+      if (look_sysfscpu(topology, data, "/sys/bus/cpu/devices", Lprocs, numprocs) < 0)
+        if (look_sysfscpu(topology, data, "/sys/devices/system/cpu", Lprocs, numprocs) < 0)
+	  /* sysfs but we failed to read cpu topology, fallback */
+	  hwloc_linux_fallback_pu_level(topology);
+      hwloc__move_infos(&hwloc_get_root_obj(topology)->infos, &hwloc_get_root_obj(topology)->infos_count,
+			&global_infos, &global_infos_count);
+      hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);
+    }
+
+  /* Gather DMI info */
+  hwloc__get_dmi_id_info(data, topology->levels[0][0]);
+  /* firmware memory info is only gathered when one of the I/O flags is set */
+  if (hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO))
+    hwloc__get_firmware_dmi_memory_info(topology, data);
+
+  hwloc_obj_add_info(topology->levels[0][0], "Backend", "Linux");
+  if (cpuset_name) {
+    /* cpuset_name was kept alive until now only for this attribute */
+    hwloc_obj_add_info(topology->levels[0][0], "LinuxCgroup", cpuset_name);
+    free(cpuset_name);
+  }
+
+  hwloc__linux_get_mic_sn(topology, data);
+
+  /* data->utsname was filled with real uname or \0, we can safely pass it */
+  hwloc_add_uname_info(topology, &data->utsname);
+
+  return 1;
+}
+
+
+
+/****************************************
+ ***** Linux PCI backend callbacks ******
+ ****************************************
+ * Do not support changing the fsroot (use sysfs)
+ */
+
+/* Create an OS device object of the given type and name and attach it
+ * below pcidev. Returns the new object.
+ */
+static hwloc_obj_t
+hwloc_linux_add_os_device(struct hwloc_backend *backend, struct hwloc_obj *pcidev, hwloc_obj_osdev_type_t type, const char *name)
+{
+  struct hwloc_obj *osdev = hwloc_alloc_setup_object(HWLOC_OBJ_OS_DEVICE, -1);
+
+  osdev->attr->osdev.type = type;
+  osdev->logical_index = -1;
+  osdev->name = strdup(name);
+
+  /* insert_object_by_parent() doesn't merge during insert, so osdev is still valid afterwards */
+  hwloc_insert_object_by_parent(backend->topology, pcidev, osdev);
+
+  return osdev;
+}
+
+typedef void (*hwloc_linux_class_fillinfos_t)(struct hwloc_backend *backend, struct hwloc_obj *osdev, const char *osdevpath);
+
+/* cannot be used in fsroot-aware code, would have to move to a per-topology variable */
+
+/* Detect which sysfs class-link layout is in use by probing network devices.
+ *
+ * data->deprecated_classlinks_model is set to:
+ *   0 when some <netdev>/device/net/<netdev> exists,
+ *   1 when some <netdev>/device/net:<netdev> exists,
+ *  -1 when /sys/class/net cannot be opened or no device settles the question.
+ * The loopback device "lo" is not probed.
+ */
+static void
+hwloc_linux_check_deprecated_classlinks_model(struct hwloc_linux_backend_data_s *data)
+{
+  int root_fd = data->root_fd;
+  DIR *dir;
+  struct dirent *entry;
+  char path[128];
+  struct stat st;
+
+  data->deprecated_classlinks_model = -1;
+
+  dir = hwloc_opendir("/sys/class/net", root_fd);
+  if (!dir)
+    return;
+
+  for (entry = readdir(dir); entry != NULL; entry = readdir(dir)) {
+    const char *netdev = entry->d_name;
+
+    if (!strcmp(netdev, ".") || !strcmp(netdev, "..") || !strcmp(netdev, "lo"))
+      continue;
+
+    /* <device>/net/<name> */
+    snprintf(path, sizeof(path), "/sys/class/net/%s/device/net/%s", netdev, netdev);
+    if (!hwloc_stat(path, &st, root_fd)) {
+      data->deprecated_classlinks_model = 0;
+      break;
+    }
+
+    /* <device>/net:<name> */
+    snprintf(path, sizeof(path), "/sys/class/net/%s/device/net:%s", netdev, netdev);
+    if (!hwloc_stat(path, &st, root_fd)) {
+      data->deprecated_classlinks_model = 1;
+      break;
+    }
+  }
+
+  closedir(dir);
+}
+
+/* class objects that are immediately below pci devices:
+ * look for objects of the given classname below a sysfs (pcidev) directory
+ *
+ * For each entry found, an OS device of the given type is added below pcidev
+ * and, when fillinfo is non-NULL, fillinfo is called with the sysfs path of
+ * that entry. Two layouts are tried: <device>/<class>/<name> first, then
+ * <device>/<class>:<name>; data->deprecated_classlinks_model remembers which
+ * one was seen so that the other is not tried again.
+ * Returns the number of OS devices added (0 when nothing could be read).
+ */
+static int
+hwloc_linux_class_readdir(struct hwloc_backend *backend,
+			  struct hwloc_obj *pcidev, const char *devicepath,
+			  hwloc_obj_osdev_type_t type, const char *classname,
+			  hwloc_linux_class_fillinfos_t fillinfo)
+{
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  int root_fd = data->root_fd;
+  size_t classnamelen = strlen(classname);
+  char path[256];
+  DIR *dir;
+  struct dirent *dirent;
+  hwloc_obj_t obj;
+  int res = 0, err;
+
+  /* -2 means the layout has not been probed yet */
+  if (data->deprecated_classlinks_model == -2)
+    hwloc_linux_check_deprecated_classlinks_model(data);
+
+  if (data->deprecated_classlinks_model != 1) {
+    /* modern sysfs: <device>/<class>/<name> */
+    struct stat st;
+    snprintf(path, sizeof(path), "%s/%s", devicepath, classname);
+
+    /* some very old kernels (2.6.9/RHEL4) have a <device>/<class> symlink without any way to find <name>.
+     * make sure <device>/<class> is a directory to avoid this case.
+     */
+    err = hwloc_lstat(path, &st, root_fd);
+    if (err < 0 || !S_ISDIR(st.st_mode))
+      goto trydeprecated;
+
+    dir = hwloc_opendir(path, root_fd);
+    if (dir) {
+      /* the modern layout is confirmed, never try the deprecated one again */
+      data->deprecated_classlinks_model = 0;
+      while ((dirent = readdir(dir)) != NULL) {
+	if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
+	  continue;
+	obj = hwloc_linux_add_os_device(backend, pcidev, type, dirent->d_name);
+	if (fillinfo) {
+	  /* path is reused here; the directory is already open so this is safe */
+	  snprintf(path, sizeof(path), "%s/%s/%s", devicepath, classname, dirent->d_name);
+	  fillinfo(backend, obj, path);
+	}
+	res++;
+      }
+      closedir(dir);
+      return res;
+    }
+  }
+
+trydeprecated:
+  if (data->deprecated_classlinks_model != 0) {
+    /* deprecated sysfs: <device>/<class>:<name> */
+    dir = hwloc_opendir(devicepath, root_fd);
+    if (dir) {
+      while ((dirent = readdir(dir)) != NULL) {
+	/* only keep entries named "<classname>:<something>" */
+	if (strncmp(dirent->d_name, classname, classnamelen) || dirent->d_name[classnamelen] != ':')
+	  continue;
+	data->deprecated_classlinks_model = 1;
+	/* the device name is what follows the colon */
+	obj = hwloc_linux_add_os_device(backend, pcidev, type, dirent->d_name + classnamelen+1);
+	if (fillinfo) {
+	  snprintf(path, sizeof(path), "%s/%s", devicepath, dirent->d_name);
+	  fillinfo(backend, obj, path);
+	}
+	res++;
+      }
+      closedir(dir);
+      return res;
+    }
+  }
+
+  return 0;
+}
+
+/*
+ * look for net objects below a pcidev in sysfs.
+ * This callback annotates the net OS device obj from its sysfs directory osdevpath:
+ * "Address" is the first line of <osdevpath>/address, and "Port" is dev_id+1
+ * when <osdevpath>/device/infiniband exists. Files that cannot be opened are skipped.
+ */
+static void
+hwloc_linux_net_class_fillinfos(struct hwloc_backend *backend,
+                                struct hwloc_obj *obj, const char *osdevpath)
+{
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  int root_fd = data->root_fd;
+  FILE *fd;
+  struct stat st;
+  char path[256];
+  snprintf(path, sizeof(path), "%s/address", osdevpath);
+  fd = hwloc_fopen(path, "r", root_fd);
+  if (fd) {
+    char address[128];
+    if (fgets(address, sizeof(address), fd)) {
+      address[strcspn(address, "\n")] = '\0'; /* drop the trailing newline, if any */
+      hwloc_obj_add_info(obj, "Address", address);
+    }
+    fclose(fd);
+  }
+  snprintf(path, sizeof(path), "%s/device/infiniband", osdevpath);
+  if (!hwloc_stat(path, &st, root_fd)) {
+    snprintf(path, sizeof(path), "%s/dev_id", osdevpath);
+    fd = hwloc_fopen(path, "r", root_fd);
+    if (fd) {
+      char hexid[16];
+      if (fgets(hexid, sizeof(hexid), fd)) {
+        char *eoid;
+        unsigned long port = strtoul(hexid, &eoid, 0); /* base 0: accepts the 0x prefix */
+        if (eoid != hexid) { /* at least one digit was parsed */
+          char portstr[16];
+          snprintf(portstr, sizeof(portstr), "%lu", port+1); /* port is unsigned long, so %lu and not %ld */
+          hwloc_obj_add_info(obj, "Port", portstr);
+        }
+      }
+      fclose(fd);
+    }
+  }
+}
+
+static int
+hwloc_linux_lookup_net_class(struct hwloc_backend *backend, struct hwloc_obj *pcidev, const char *pcidevpath)
+{
+  /* scan the "net" class below pcidevpath; the callback annotates each new network OS device */
+  return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_NETWORK, "net", hwloc_linux_net_class_fillinfos);
+}
+
+/*
+ * look for infiniband objects below a pcidev in sysfs.
+ * Annotates obj with NodeGUID, SysImageGUID and Port<i>State/LID/LMC/GID<j> infos read below osdevpath.
+ * sysfs content is external input: malformed GUIDs and GIDs are skipped instead of asserted on.
+ */
+static void
+hwloc_linux_infiniband_class_fillinfos(struct hwloc_backend *backend,
+                                       struct hwloc_obj *obj, const char *osdevpath)
+{
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  int root_fd = data->root_fd;
+  FILE *fd;
+  char path[256];
+  unsigned i,j;
+
+  snprintf(path, sizeof(path), "%s/node_guid", osdevpath);
+  fd = hwloc_fopen(path, "r", root_fd);
+  if (fd) {
+    char guidvalue[20];
+    if (fgets(guidvalue, sizeof(guidvalue), fd)) {
+      size_t len = strspn(guidvalue, "0123456789abcdefx:");
+      if (len == 19) { /* a complete xxxx:xxxx:xxxx:xxxx GUID */
+        guidvalue[len] = '\0';
+        hwloc_obj_add_info(obj, "NodeGUID", guidvalue);
+      }
+    }
+    fclose(fd);
+  }
+
+  snprintf(path, sizeof(path), "%s/sys_image_guid", osdevpath);
+  fd = hwloc_fopen(path, "r", root_fd);
+  if (fd) {
+    char guidvalue[20];
+    if (fgets(guidvalue, sizeof(guidvalue), fd)) {
+      size_t len = strspn(guidvalue, "0123456789abcdefx:");
+      if (len == 19) { /* a complete xxxx:xxxx:xxxx:xxxx GUID */
+        guidvalue[len] = '\0';
+        hwloc_obj_add_info(obj, "SysImageGUID", guidvalue);
+      }
+    }
+    fclose(fd);
+  }
+
+  for(i=1; ; i++) { /* ports are probed from 1 until a state file is missing */
+    snprintf(path, sizeof(path), "%s/ports/%u/state", osdevpath, i);
+    fd = hwloc_fopen(path, "r", root_fd);
+    if (fd) {
+      char statevalue[2];
+      if (fgets(statevalue, sizeof(statevalue), fd)) {
+        char statename[32];
+        statevalue[1] = '\0'; /* only keep the first byte/digit */
+        snprintf(statename, sizeof(statename), "Port%uState", i);
+        hwloc_obj_add_info(obj, statename, statevalue);
+      }
+      fclose(fd);
+    } else {
+      /* no such port */
+      break;
+    }
+
+    snprintf(path, sizeof(path), "%s/ports/%u/lid", osdevpath, i);
+    fd = hwloc_fopen(path, "r", root_fd);
+    if (fd) {
+      char lidvalue[11];
+      if (fgets(lidvalue, sizeof(lidvalue), fd)) {
+        char lidname[32];
+        size_t len;
+        len = strspn(lidvalue, "0123456789abcdefx");
+        lidvalue[len] = '\0';
+        snprintf(lidname, sizeof(lidname), "Port%uLID", i);
+        hwloc_obj_add_info(obj, lidname, lidvalue);
+      }
+      fclose(fd);
+    }
+
+    snprintf(path, sizeof(path), "%s/ports/%u/lid_mask_count", osdevpath, i);
+    fd = hwloc_fopen(path, "r", root_fd);
+    if (fd) {
+      char lidvalue[11];
+      if (fgets(lidvalue, sizeof(lidvalue), fd)) {
+        char lidname[32];
+        size_t len;
+        len = strspn(lidvalue, "0123456789");
+        lidvalue[len] = '\0';
+        snprintf(lidname, sizeof(lidname), "Port%uLMC", i);
+        hwloc_obj_add_info(obj, lidname, lidvalue);
+      }
+      fclose(fd);
+    }
+
+    for(j=0; ; j++) { /* GIDs are probed from 0 until a gid file is missing */
+      snprintf(path, sizeof(path), "%s/ports/%u/gids/%u", osdevpath, i, j);
+      fd = hwloc_fopen(path, "r", root_fd);
+      if (fd) {
+        char gidvalue[40];
+        if (fgets(gidvalue, sizeof(gidvalue), fd)) {
+          char gidname[32];
+          size_t len = strspn(gidvalue, "0123456789abcdefx:");
+          gidvalue[len] = '\0';
+          /* only keep complete (39 characters) and initialized GIDs; checking len first keeps gidvalue+20 inside the string */
+          if (len == 39 && strncmp(gidvalue+20, "0000:0000:0000:0000", 19)) {
+            snprintf(gidname, sizeof(gidname), "Port%uGID%u", i, j);
+            hwloc_obj_add_info(obj, gidname, gidvalue);
+          }
+        }
+        fclose(fd);
+      } else {
+        /* no such gid */
+        break;
+      }
+    }
+  }
+}
+
+static int
+hwloc_linux_lookup_openfabrics_class(struct hwloc_backend *backend, struct hwloc_obj *pcidev, const char *pcidevpath)
+{
+  /* scan the "infiniband" class below pcidevpath; the callback annotates each new OpenFabrics OS device */
+  return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_OPENFABRICS, "infiniband", hwloc_linux_infiniband_class_fillinfos);
+}
+
+/* look for dma objects below a pcidev in sysfs */
+static int
+hwloc_linux_lookup_dma_class(struct hwloc_backend *backend, struct hwloc_obj *pcidev, const char *pcidevpath)
+{
+  /* NULL callback: nothing is added beyond what hwloc_linux_class_readdir() creates itself */
+  return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_DMA, "dma", NULL);
+}
+
+/* look for drm objects below a pcidev in sysfs */
+static int
+hwloc_linux_lookup_drm_class(struct hwloc_backend *backend, struct hwloc_obj *pcidev, const char *pcidevpath)
+{
+  /* Design notes:
+   * - we could look at the "graphics" class too, but it doesn't help for proprietary drivers either.
+   * - GPU devices (even with a proprietary driver) seem to have a boot_vga field in their
+   *   PCI device directory (since 2.6.30), so we could create an OS device for each PCI device
+   *   with such a field. boot_vga is actually created when class >> 8 == VGA (it contains 1 for
+   *   the boot vga device), so it's trivial anyway.
+   */
+
+  return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_GPU, "drm", NULL);
+}
+
+/*
+ * look for block objects below a pcidev in sysfs.
+ * Annotates obj with LinuxDeviceID (from <osdevpath>/dev), then Vendor, Model, Revision, SerialNumber and Type
+ * from udev (libudev when enabled, else /run/udev/data). udev strings have arbitrary length, hence bounded copies.
+ */
+static void
+hwloc_linux_block_class_fillinfos(struct hwloc_backend *backend,
+                                  struct hwloc_obj *obj, const char *osdevpath)
+{
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  int root_fd = data->root_fd;
+  FILE *fd;
+  char path[256];
+  char line[128];
+  char vendor[64] = "";
+  char model[64] = "";
+  char serial[64] = "";
+  char revision[64] = "";
+  char blocktype[64] = "";
+  unsigned major_id, minor_id;
+  char *tmp;
+
+  snprintf(path, sizeof(path), "%s/dev", osdevpath);
+  fd = hwloc_fopen(path, "r", root_fd);
+  if (!fd)
+    return;
+
+  if (NULL == fgets(line, sizeof(line), fd)) {
+    fclose(fd);
+    return;
+  }
+  fclose(fd);
+
+  if (sscanf(line, "%u:%u", &major_id, &minor_id) != 2)
+    return;
+  tmp = strchr(line, '\n');
+  if (tmp)
+    *tmp = '\0';
+  hwloc_obj_add_info(obj, "LinuxDeviceID", line);
+
+#ifdef HAVE_LIBUDEV_H
+  if (data->udev) {
+    struct udev_device *dev;
+    const char *prop;
+    dev = udev_device_new_from_subsystem_sysname(data->udev, "block", obj->name);
+    if (!dev)
+      return;
+    prop = udev_device_get_property_value(dev, "ID_VENDOR");
+    if (prop)
+      snprintf(vendor, sizeof(vendor), "%s", prop); /* truncates instead of overflowing */
+    prop = udev_device_get_property_value(dev, "ID_MODEL");
+    if (prop)
+      snprintf(model, sizeof(model), "%s", prop);
+    prop = udev_device_get_property_value(dev, "ID_REVISION");
+    if (prop)
+      snprintf(revision, sizeof(revision), "%s", prop);
+    prop = udev_device_get_property_value(dev, "ID_SERIAL_SHORT");
+    if (prop)
+      snprintf(serial, sizeof(serial), "%s", prop);
+    prop = udev_device_get_property_value(dev, "ID_TYPE");
+    if (prop)
+      snprintf(blocktype, sizeof(blocktype), "%s", prop);
+    udev_device_unref(dev);
+  } else
+    /* fallback to reading files, works with any fsroot */
+#endif
+ {
+  snprintf(path, sizeof(path), "/run/udev/data/b%u:%u", major_id, minor_id);
+  fd = hwloc_fopen(path, "r", root_fd);
+  if (!fd)
+    return;
+
+  while (NULL != fgets(line, sizeof(line), fd)) { /* a value may be longer than the 64-byte buffers: copy bounded */
+    tmp = strchr(line, '\n');
+    if (tmp)
+      *tmp = '\0';
+    if (!strncmp(line, "E:ID_VENDOR=", strlen("E:ID_VENDOR="))) {
+      snprintf(vendor, sizeof(vendor), "%s", line+strlen("E:ID_VENDOR="));
+    } else if (!strncmp(line, "E:ID_MODEL=", strlen("E:ID_MODEL="))) {
+      snprintf(model, sizeof(model), "%s", line+strlen("E:ID_MODEL="));
+    } else if (!strncmp(line, "E:ID_REVISION=", strlen("E:ID_REVISION="))) {
+      snprintf(revision, sizeof(revision), "%s", line+strlen("E:ID_REVISION="));
+    } else if (!strncmp(line, "E:ID_SERIAL_SHORT=", strlen("E:ID_SERIAL_SHORT="))) {
+      snprintf(serial, sizeof(serial), "%s", line+strlen("E:ID_SERIAL_SHORT="));
+    } else if (!strncmp(line, "E:ID_TYPE=", strlen("E:ID_TYPE="))) {
+      snprintf(blocktype, sizeof(blocktype), "%s", line+strlen("E:ID_TYPE="));
+    }
+  }
+  fclose(fd);
+ }
+
+  /* clear fake "ATA" vendor name */
+  if (!strcasecmp(vendor, "ATA"))
+    *vendor = '\0';
+  /* overwrite vendor name from model when possible */
+  if (!*vendor) {
+    if (!strncasecmp(model, "wd", 2))
+      strcpy(vendor, "Western Digital");
+    else if (!strncasecmp(model, "st", 2))
+      strcpy(vendor, "Seagate");
+    else if (!strncasecmp(model, "samsung", 7))
+      strcpy(vendor, "Samsung");
+    else if (!strncasecmp(model, "sandisk", 7))
+      strcpy(vendor, "SanDisk");
+    else if (!strncasecmp(model, "toshiba", 7))
+      strcpy(vendor, "Toshiba");
+  }
+
+  if (*vendor)
+    hwloc_obj_add_info(obj, "Vendor", vendor);
+  if (*model)
+    hwloc_obj_add_info(obj, "Model", model);
+  if (*revision)
+    hwloc_obj_add_info(obj, "Revision", revision);
+  if (*serial)
+    hwloc_obj_add_info(obj, "SerialNumber", serial);
+
+  if (!strcmp(blocktype, "disk"))
+    hwloc_obj_add_info(obj, "Type", "Disk");
+  else if (!strcmp(blocktype, "tape"))
+    hwloc_obj_add_info(obj, "Type", "Tape");
+  else if (!strcmp(blocktype, "cd") || !strcmp(blocktype, "floppy") || !strcmp(blocktype, "optical"))
+    hwloc_obj_add_info(obj, "Type", "Removable Media Device");
+  else /* generic, usb mass storage/rbc, usb mass storage/scsi */
+    hwloc_obj_add_info(obj, "Type", "Other");
+}
+
+/* block class objects are in
+ * host%d/target%d:%d:%d/%d:%d:%d:%d/
+ * or
+ * host%d/port-%d:%d/end_device-%d:%d/target%d:%d:%d/%d:%d:%d:%d/
+ * or
+ * ide%d/%d.%d/
+ * below pci devices.
+ * This walks one host%d (or end_device-%d:%d) directory: path holds its name on entry (pathlen characters),
+ * is extended in place for each subdirectory and always restored. Returns the number of OS devices added. */
+static int
+hwloc_linux_lookup_host_block_class(struct hwloc_backend *backend,
+                                    struct hwloc_obj *pcidev, char *path, size_t pathlen)
+{
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  int root_fd = data->root_fd;
+  DIR *hostdir, *portdir, *targetdir;
+  struct dirent *hostdirent, *portdirent, *targetdirent;
+  size_t hostdlen, portdlen, targetdlen;
+  int dummy;
+  int res = 0;
+
+  hostdir = hwloc_opendir(path, root_fd);
+  if (!hostdir)
+    return 0;
+
+  while ((hostdirent = readdir(hostdir)) != NULL) {
+    if (sscanf(hostdirent->d_name, "port-%d:%d", &dummy, &dummy) == 2) {
+      /* found host%d/port-%d:%d */
+      path[pathlen] = '/';
+      strcpy(&path[pathlen+1], hostdirent->d_name);
+      pathlen += hostdlen = 1+strlen(hostdirent->d_name);
+      portdir = hwloc_opendir(path, root_fd);
+      if (portdir) { /* on failure, still fall through to the restore below */
+        while ((portdirent = readdir(portdir)) != NULL) {
+          if (sscanf(portdirent->d_name, "end_device-%d:%d", &dummy, &dummy) == 2) {
+            /* found host%d/port-%d:%d/end_device-%d:%d */
+            path[pathlen] = '/';
+            strcpy(&path[pathlen+1], portdirent->d_name);
+            pathlen += portdlen = 1+strlen(portdirent->d_name);
+            res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
+            /* restore parent path */
+            pathlen -= portdlen;
+            path[pathlen] = '\0';
+          }
+        }
+        closedir(portdir);
+      }
+      /* restore parent path */
+      pathlen -= hostdlen;
+      path[pathlen] = '\0';
+    } else if (sscanf(hostdirent->d_name, "target%d:%d:%d", &dummy, &dummy, &dummy) == 3) {
+      /* found host%d/target%d:%d:%d */
+      path[pathlen] = '/';
+      strcpy(&path[pathlen+1], hostdirent->d_name);
+      pathlen += hostdlen = 1+strlen(hostdirent->d_name);
+      targetdir = hwloc_opendir(path, root_fd);
+      if (targetdir) { /* on failure, still fall through to the restore below */
+        while ((targetdirent = readdir(targetdir)) != NULL) {
+          if (sscanf(targetdirent->d_name, "%d:%d:%d:%d", &dummy, &dummy, &dummy, &dummy) != 4)
+            continue;
+          /* found host%d/target%d:%d:%d/%d:%d:%d:%d */
+          path[pathlen] = '/';
+          strcpy(&path[pathlen+1], targetdirent->d_name);
+          pathlen += targetdlen = 1+strlen(targetdirent->d_name);
+          /* lookup block class for real */
+          res += hwloc_linux_class_readdir(backend, pcidev, path, HWLOC_OBJ_OSDEV_BLOCK, "block", hwloc_linux_block_class_fillinfos);
+          /* restore parent path */
+          pathlen -= targetdlen;
+          path[pathlen] = '\0';
+        }
+        closedir(targetdir);
+      }
+      /* restore parent path */
+      pathlen -= hostdlen;
+      path[pathlen] = '\0';
+    }
+  }
+  closedir(hostdir);
+
+  return res;
+}
+
+/* Look for block OS devices below the ide%d, host%d and ata%d/host%d subdirectories of pcidevpath.
+ * The working path is extended in place for each match and always restored. Returns the number of OS devices added. */
+static int
+hwloc_linux_lookup_block_class(struct hwloc_backend *backend,
+                               struct hwloc_obj *pcidev, const char *pcidevpath)
+{
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  int root_fd = data->root_fd;
+  size_t pathlen, devicedlen, hostdlen;
+  DIR *devicedir, *hostdir;
+  struct dirent *devicedirent, *hostdirent;
+  char path[256];
+  int dummy;
+  int res = 0;
+
+  strcpy(path, pcidevpath);
+  pathlen = strlen(path);
+  devicedir = hwloc_opendir(pcidevpath, root_fd);
+  if (!devicedir)
+    return 0;
+
+  while ((devicedirent = readdir(devicedir)) != NULL) {
+    if (sscanf(devicedirent->d_name, "ide%d", &dummy) == 1) {
+      /* found ide%d */
+      path[pathlen] = '/';
+      strcpy(&path[pathlen+1], devicedirent->d_name);
+      pathlen += devicedlen = 1+strlen(devicedirent->d_name);
+      hostdir = hwloc_opendir(path, root_fd);
+      if (hostdir) { /* on failure, still fall through to the restore below */
+        while ((hostdirent = readdir(hostdir)) != NULL) {
+          if (sscanf(hostdirent->d_name, "%d.%d", &dummy, &dummy) == 2) {
+            /* found ide%d/%d.%d */
+            path[pathlen] = '/';
+            strcpy(&path[pathlen+1], hostdirent->d_name);
+            pathlen += hostdlen = 1+strlen(hostdirent->d_name);
+            /* lookup block class for real */
+            res += hwloc_linux_class_readdir(backend, pcidev, path, HWLOC_OBJ_OSDEV_BLOCK, "block", NULL);
+            /* restore parent path */
+            pathlen -= hostdlen;
+            path[pathlen] = '\0';
+          }
+        }
+        closedir(hostdir);
+      }
+      /* restore parent path */
+      pathlen -= devicedlen;
+      path[pathlen] = '\0';
+    } else if (sscanf(devicedirent->d_name, "host%d", &dummy) == 1) {
+      /* found host%d */
+      path[pathlen] = '/';
+      strcpy(&path[pathlen+1], devicedirent->d_name);
+      pathlen += devicedlen = 1+strlen(devicedirent->d_name);
+      res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
+      /* restore parent path */
+      pathlen -= devicedlen;
+      path[pathlen] = '\0';
+    } else if (sscanf(devicedirent->d_name, "ata%d", &dummy) == 1) {
+      /* found ata%d */
+      path[pathlen] = '/';
+      strcpy(&path[pathlen+1], devicedirent->d_name);
+      pathlen += devicedlen = 1+strlen(devicedirent->d_name);
+      hostdir = hwloc_opendir(path, root_fd);
+      if (hostdir) { /* on failure, still fall through to the restore below */
+        while ((hostdirent = readdir(hostdir)) != NULL) {
+          if (sscanf(hostdirent->d_name, "host%d", &dummy) == 1) {
+            /* found ata%d/host%d */
+            path[pathlen] = '/';
+            strcpy(&path[pathlen+1], hostdirent->d_name);
+            pathlen += hostdlen = 1+strlen(hostdirent->d_name);
+            /* lookup block class for real */
+            res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
+            /* restore parent path */
+            pathlen -= hostdlen;
+            path[pathlen] = '\0';
+          }
+        }
+        closedir(hostdir);
+      }
+      /* restore parent path */
+      pathlen -= devicedlen;
+      path[pathlen] = '\0';
+    }
+  }
+  closedir(devicedir);
+
+  return res;
+}
+
+/*
+ * Annotate MIC coprocessor OS device obj from the sysfs attribute files below osdevpath.
+ * CoProcType is always "MIC". MICFamily, MICSKU and MICSerialNumber are the first line of the
+ * family, sku and serialnumber files. MICActiveCores and MICMemorySize are the active_cores and
+ * memsize files parsed as hexadecimal and printed as decimal. Unreadable files are skipped.
+ */
+static void
+hwloc_linux_mic_class_fillinfos(struct hwloc_backend *backend,
+                                struct hwloc_obj *obj, const char *osdevpath)
+{
+  struct hwloc_linux_backend_data_s *priv = backend->private_data;
+  int root_fd = priv->root_fd;
+  char path[256];
+  FILE *stream;
+
+  hwloc_obj_add_info(obj, "CoProcType", "MIC");
+
+  /* family: first line, newline stripped */
+  snprintf(path, sizeof(path), "%s/family", osdevpath);
+  if ((stream = hwloc_fopen(path, "r", root_fd)) != NULL) {
+    char family[64];
+    if (fgets(family, sizeof(family), stream)) {
+      family[strcspn(family, "\n")] = 0;
+      hwloc_obj_add_info(obj, "MICFamily", family);
+    }
+    fclose(stream);
+  }
+
+  /* sku: first line, newline stripped */
+  snprintf(path, sizeof(path), "%s/sku", osdevpath);
+  if ((stream = hwloc_fopen(path, "r", root_fd)) != NULL) {
+    char sku[64];
+    if (fgets(sku, sizeof(sku), stream)) {
+      sku[strcspn(sku, "\n")] = 0;
+      hwloc_obj_add_info(obj, "MICSKU", sku);
+    }
+    fclose(stream);
+  }
+
+  /* serial number: first line, newline stripped */
+  snprintf(path, sizeof(path), "%s/serialnumber", osdevpath);
+  if ((stream = hwloc_fopen(path, "r", root_fd)) != NULL) {
+    char sn[64];
+    if (fgets(sn, sizeof(sn), stream)) {
+      sn[strcspn(sn, "\n")] = 0;
+      hwloc_obj_add_info(obj, "MICSerialNumber", sn);
+    }
+    fclose(stream);
+  }
+
+  /* active cores: hexadecimal in the file, decimal in the info attribute */
+  snprintf(path, sizeof(path), "%s/active_cores", osdevpath);
+  if ((stream = hwloc_fopen(path, "r", root_fd)) != NULL) {
+    char string[10];
+    if (fgets(string, sizeof(string), stream)) {
+      unsigned long count = strtoul(string, NULL, 16);
+      snprintf(string, sizeof(string), "%lu", count);
+      hwloc_obj_add_info(obj, "MICActiveCores", string);
+    }
+    fclose(stream);
+  }
+
+  /* memory size: hexadecimal in the file, decimal in the info attribute */
+  snprintf(path, sizeof(path), "%s/memsize", osdevpath);
+  if ((stream = hwloc_fopen(path, "r", root_fd)) != NULL) {
+    char string[20];
+    if (fgets(string, sizeof(string), stream)) {
+      unsigned long count = strtoul(string, NULL, 16);
+      snprintf(string, sizeof(string), "%lu", count);
+      hwloc_obj_add_info(obj, "MICMemorySize", string);
+    }
+    fclose(stream);
+  }
+}
+
+static int
+hwloc_linux_lookup_mic_class(struct hwloc_backend *backend, struct hwloc_obj *pcidev, const char *pcidevpath)
+{
+  /* scan the "mic" class below pcidevpath; the callback annotates each new coprocessor OS device */
+  return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_COPROC, "mic", hwloc_linux_mic_class_fillinfos);
+}
+
+/* Direct lookup for old mic drivers that provide no mic/mic%u links below PCI devices: add a coprocessor
+ * OS device for each /sys/class/mic/mic%u that has a pci_<bus>:<dev>.<func> entry matching pcidev. Returns the number added. */
+static int
+hwloc_linux_directlookup_mic_class(struct hwloc_backend *backend,
+                                   struct hwloc_obj *pcidev)
+{
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  int root_fd = data->root_fd;
+  char path[256];
+  struct stat st;
+  hwloc_obj_t obj;
+  unsigned idx;
+  int res = 0;
+
+  if (!data->mic_directlookup_id_max)
+    /* already tried, nothing to do */
+    return 0;
+
+  if (data->mic_directlookup_id_max == (unsigned) -1) {
+    /* never tried, find out the max id */
+    DIR *dir;
+    struct dirent *dirent;
+    /* make sure we never do this lookup again */
+    data->mic_directlookup_id_max = 0;
+    /* read the entire class and find the max id of mic%u dirents */
+    dir = hwloc_opendir("/sys/devices/virtual/mic", root_fd);
+    if (!dir) {
+      dir = hwloc_opendir("/sys/class/mic", root_fd); /* honor the fsroot like every other sysfs access here */
+      if (!dir)
+        return 0;
+    }
+    while ((dirent = readdir(dir)) != NULL) {
+      if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
+        continue;
+      if (sscanf(dirent->d_name, "mic%u", &idx) != 1)
+        continue;
+      if (idx >= data->mic_directlookup_id_max)
+        data->mic_directlookup_id_max = idx+1;
+    }
+    closedir(dir);
+  }
+
+  /* now iterate over the mic ids and see if one matches our pcidev */
+  for(idx=0; idx<data->mic_directlookup_id_max; idx++) {
+    snprintf(path, sizeof(path), "/sys/class/mic/mic%u/pci_%02x:%02x.%02x",
+             idx, pcidev->attr->pcidev.bus,  pcidev->attr->pcidev.dev,  pcidev->attr->pcidev.func);
+    if (hwloc_stat(path, &st, root_fd) < 0)
+      continue;
+    snprintf(path, sizeof(path), "mic%u", idx);
+    obj = hwloc_linux_add_os_device(backend, pcidev, HWLOC_OBJ_OSDEV_COPROC, path);
+    snprintf(path, sizeof(path), "/sys/class/mic/mic%u", idx);
+    hwloc_linux_mic_class_fillinfos(backend, obj, path);
+    res++;
+  }
+
+  return res;
+}
+
+/*
+ * backend callback for inserting objects inside a pci device:
+ * looks below the sysfs directory of PCI device obj for net, openfabrics, dma, drm, block
+ * and mic OS devices, and returns the sum of what the individual lookups report.
+ */
+static int
+hwloc_linux_backend_notify_new_object(struct hwloc_backend *backend, struct hwloc_backend *caller __hwloc_attribute_unused,
+                                      struct hwloc_obj *obj)
+{
+  struct hwloc_linux_backend_data_s *priv = backend->private_data;
+  char pcidevpath[256];
+  int added;
+
+  /* this callback is only used in the libpci backend for now */
+  assert(obj->type == HWLOC_OBJ_PCI_DEVICE);
+
+  snprintf(pcidevpath, sizeof(pcidevpath), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
+           obj->attr->pcidev.domain, obj->attr->pcidev.bus,
+           obj->attr->pcidev.dev, obj->attr->pcidev.func);
+
+  /* the lookups stay separate statements so that they run in this order */
+  added  = hwloc_linux_lookup_net_class(backend, obj, pcidevpath);
+  added += hwloc_linux_lookup_openfabrics_class(backend, obj, pcidevpath);
+  added += hwloc_linux_lookup_dma_class(backend, obj, pcidevpath);
+  added += hwloc_linux_lookup_drm_class(backend, obj, pcidevpath);
+  added += hwloc_linux_lookup_block_class(backend, obj, pcidevpath);
+
+  if (priv->mic_need_directlookup == -1) {
+    struct stat st;
+    /* decided once: hwloc_linux_lookup_mic_class will fail when pcidev sysfs directories
+     * do not have mic/mic%u symlinks to mic devices (old mic driver).
+     * if so, try from the mic class.
+     */
+    priv->mic_need_directlookup =
+      (hwloc_stat("/sys/class/mic/mic0", &st, priv->root_fd) == 0
+       && hwloc_stat("/sys/class/mic/mic0/device/mic/mic0", &st, priv->root_fd) == -1);
+  }
+
+  added += priv->mic_need_directlookup
+    ? hwloc_linux_directlookup_mic_class(backend, obj)
+    : hwloc_linux_lookup_mic_class(backend, obj, pcidevpath);
+  return added;
+}
+
+/*
+ * backend callback for retrieving the location of a pci device:
+ * fills cpuset from the device's sysfs local_cpus file; returns 0 on success, -1 if unreadable or empty
+ */
+static int
+hwloc_linux_backend_get_obj_cpuset(struct hwloc_backend *backend,
+                                   struct hwloc_backend *caller __hwloc_attribute_unused,
+                                   struct hwloc_obj *obj, hwloc_bitmap_t cpuset)
+{
+  struct hwloc_linux_backend_data_s *priv = backend->private_data;
+  char cpumap_path[256];
+  FILE *cpumap;
+  int failed;
+
+  /* this callback is only used in the libpci backend for now */
+  assert(obj->type == HWLOC_OBJ_PCI_DEVICE
+         || (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI));
+
+  snprintf(cpumap_path, sizeof(cpumap_path), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/local_cpus",
+           obj->attr->pcidev.domain, obj->attr->pcidev.bus,
+           obj->attr->pcidev.dev, obj->attr->pcidev.func);
+  cpumap = hwloc_fopen(cpumap_path, "r", priv->root_fd);
+  if (!cpumap)
+    return -1;
+
+  failed = hwloc_linux_parse_cpumap_file(cpumap, cpuset);
+  fclose(cpumap);
+  return (failed || hwloc_bitmap_iszero(cpuset)) ? -1 : 0;
+}
+
+
+
+/*******************************
+ ******* Linux component *******
+ *******************************/
+
+static void
+hwloc_linux_backend_disable(struct hwloc_backend *backend)
+{
+  struct hwloc_linux_backend_data_s *priv = backend->private_data; /* teardown of what instantiate set up */
+#ifdef HAVE_OPENAT
+  close(priv->root_fd); /* the descriptor is only opened when HAVE_OPENAT is defined */
+#endif
+#ifdef HAVE_LIBUDEV_H
+  if (priv->udev != NULL)
+    udev_unref(priv->udev);
+#endif
+  free(priv);
+}
+/* Instantiate the linux backend; _data1 optionally gives the filesystem root to read from. Returns NULL on failure. */
+static struct hwloc_backend *
+hwloc_linux_component_instantiate(struct hwloc_disc_component *component,
+				  const void *_data1,
+				  const void *_data2 __hwloc_attribute_unused,
+				  const void *_data3 __hwloc_attribute_unused)
+{
+  struct hwloc_backend *backend;
+  struct hwloc_linux_backend_data_s *data;
+  const char * fsroot_path = _data1;
+  int flags, root = -1; /* root stays -1 when HAVE_OPENAT is not defined */
+
+  backend = hwloc_backend_alloc(component);
+  if (!backend)
+    goto out;
+
+  data = malloc(sizeof(*data));
+  if (!data) {
+    errno = ENOMEM;
+    goto out_with_backend;
+  }
+
+  backend->private_data = data; /* freed by hwloc_linux_backend_disable() */
+  backend->discover = hwloc_look_linuxfs;
+  backend->get_obj_cpuset = hwloc_linux_backend_get_obj_cpuset;
+  backend->notify_new_object = hwloc_linux_backend_notify_new_object;
+  backend->disable = hwloc_linux_backend_disable;
+
+  /* default values: read the real root filesystem unless a fsroot path was given */
+  data->is_real_fsroot = 1;
+  if (!fsroot_path)
+    fsroot_path = "/";
+
+#ifdef HAVE_OPENAT
+  root = open(fsroot_path, O_RDONLY | O_DIRECTORY);
+  if (root < 0)
+    goto out_with_data;
+
+  if (strcmp(fsroot_path, "/")) { /* any other root means we are not describing this system */
+    backend->is_thissystem = 0;
+    data->is_real_fsroot = 0;
+  }
+
+  /* Since this fd stays open after hwloc returns, mark it as
+     close-on-exec so that children don't inherit it.  Stevens says
+     that we should GETFD before we SETFD, so we do. */
+  flags = fcntl(root, F_GETFD, 0);
+  if (-1 == flags ||
+      -1 == fcntl(root, F_SETFD, FD_CLOEXEC | flags)) {
+      close(root);
+      root = -1;
+      goto out_with_data;
+  }
+#else
+  if (strcmp(fsroot_path, "/")) { /* an alternate fsroot cannot be honored in this build */
+    errno = ENOSYS;
+    goto out_with_data;
+  }
+#endif
+  data->root_fd = root;
+
+#ifdef HAVE_LIBUDEV_H
+  data->udev = NULL;
+  if (data->is_real_fsroot) { /* libudev is only used when reading the real root */
+    data->udev = udev_new();
+  }
+#endif
+
+  data->deprecated_classlinks_model = -2; /* never tried */
+  data->mic_need_directlookup = -1; /* not initialized */
+  data->mic_directlookup_id_max = -1; /* not initialized */
+
+  return backend;
+
+ out_with_data: /* error paths unwind in reverse order of allocation */
+  free(data);
+ out_with_backend:
+  free(backend);
+ out:
+  return NULL;
+}
+
+static struct hwloc_disc_component hwloc_linux_disc_component = { /* positional initializer; field meanings below are presumed -- TODO confirm against struct hwloc_disc_component */
+  HWLOC_DISC_COMPONENT_TYPE_CPU,
+  "linux", /* presumably the component name */
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  hwloc_linux_component_instantiate, /* the backend constructor of this component */
+  50, /* presumably the component priority */
+  NULL
+};
+
+const struct hwloc_component hwloc_linux_component = { /* exported (non-static) descriptor for the linux discovery component */
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL,
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_linux_disc_component /* the static discovery component it wraps */
+};
+
+
+
+
+#ifdef HWLOC_HAVE_LINUXPCI
+
+/***********************************
+ ******* Linux PCI component *******
+ ***********************************/
+
+#define HWLOC_PCI_REVISION_ID 0x08 /* index of the revision byte in the cached PCI config space */
+#define HWLOC_PCI_CAP_ID_EXP 0x10 /* capability id passed to hwloc_pci_find_cap() before reading the link speed */
+#define HWLOC_PCI_CLASS_NOT_DEFINED 0x0000 /* class_id default, kept when the class file cannot be read */
+
+/* Discover PCI devices by scanning /sys/bus/pci/devices/ (below the linux backend fsroot when there is one),
+ * tag them with PCISlot from /sys/bus/pci/slots/, and hand the list to hwloc_insert_pci_device_list().
+ * Returns that call's result, or 0 when the scan is skipped or the devices directory cannot be opened. */
+static int
+hwloc_look_linuxfs_pci(struct hwloc_backend *backend)
+{
+  struct hwloc_topology *topology = backend->topology;
+  struct hwloc_backend *tmpbackend;
+  hwloc_obj_t first_obj = NULL, last_obj = NULL;
+  int root_fd = -1;
+  DIR *dir;
+  struct dirent *dirent;
+  int res = 0;
+
+  if (!(hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO)))
+    return 0;
+
+  if (hwloc_get_next_pcidev(topology, NULL)) {
+    hwloc_debug("%s", "PCI objects already added, ignoring linuxpci backend.\n");
+    return 0;
+  }
+
+  /* hackily find the linux backend to steal its fsroot */
+  for (tmpbackend = topology->backends; tmpbackend; tmpbackend = tmpbackend->next) {
+    if (tmpbackend->component == &hwloc_linux_disc_component) {
+      root_fd = ((struct hwloc_linux_backend_data_s *) tmpbackend->private_data)->root_fd;
+      hwloc_debug("linuxpci backend stole linux backend root_fd %d\n", root_fd);
+      break;
+    }
+  }
+  /* take our own descriptor, either pointing to linux fsroot, or to / if not found */
+  if (root_fd >= 0)
+    root_fd = dup(root_fd);
+  else
+    root_fd = open("/", O_RDONLY | O_DIRECTORY);
+
+  dir = hwloc_opendir("/sys/bus/pci/devices/", root_fd);
+  if (!dir)
+    goto out_with_rootfd;
+
+  while ((dirent = readdir(dir)) != NULL) {
+    unsigned domain, bus, dev, func;
+    hwloc_obj_t obj;
+    struct hwloc_pcidev_attr_s *attr;
+    unsigned os_index;
+    char path[64];
+    char value[16];
+    size_t read;
+    FILE *file;
+
+    if (sscanf(dirent->d_name, "%04x:%02x:%02x.%01x", &domain, &bus, &dev, &func) != 4)
+      continue;
+
+    os_index = (domain << 20) + (bus << 12) + (dev << 4) + func;
+    obj = hwloc_alloc_setup_object(HWLOC_OBJ_PCI_DEVICE, os_index);
+    if (!obj)
+      break;
+    attr = &obj->attr->pcidev;
+
+    attr->domain = domain;
+    attr->bus = bus;
+    attr->dev = dev;
+    attr->func = func;
+
+    /* default (unknown) values */
+    attr->vendor_id = 0;
+    attr->device_id = 0;
+    attr->class_id = HWLOC_PCI_CLASS_NOT_DEFINED;
+    attr->revision = 0;
+    attr->subvendor_id = 0;
+    attr->subdevice_id = 0;
+    attr->linkspeed = 0;
+
+    snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/vendor", dirent->d_name);
+    if ((file = hwloc_fopen(path, "r", root_fd)) != NULL) {
+      read = fread(value, 1, sizeof(value)-1, file);
+      fclose(file);
+      value[read] = '\0'; /* fread() does not terminate the buffer, strtoul() needs a C string */
+      if (read)
+        attr->vendor_id = strtoul(value, NULL, 16);
+    }
+    snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/device", dirent->d_name);
+    if ((file = hwloc_fopen(path, "r", root_fd)) != NULL) {
+      read = fread(value, 1, sizeof(value)-1, file);
+      fclose(file);
+      value[read] = '\0';
+      if (read)
+        attr->device_id = strtoul(value, NULL, 16);
+    }
+    snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/class", dirent->d_name);
+    if ((file = hwloc_fopen(path, "r", root_fd)) != NULL) {
+      read = fread(value, 1, sizeof(value)-1, file);
+      fclose(file);
+      value[read] = '\0';
+      if (read)
+        attr->class_id = strtoul(value, NULL, 16) >> 8;
+    }
+    snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem_vendor", dirent->d_name);
+    if ((file = hwloc_fopen(path, "r", root_fd)) != NULL) {
+      read = fread(value, 1, sizeof(value)-1, file);
+      fclose(file);
+      value[read] = '\0';
+      if (read)
+        attr->subvendor_id = strtoul(value, NULL, 16);
+    }
+    snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem_device", dirent->d_name);
+    if ((file = hwloc_fopen(path, "r", root_fd)) != NULL) {
+      read = fread(value, 1, sizeof(value)-1, file);
+      fclose(file);
+      value[read] = '\0';
+      if (read)
+        attr->subdevice_id = strtoul(value, NULL, 16);
+    }
+
+    snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/config", dirent->d_name);
+    file = hwloc_fopen(path, "r", root_fd);
+    if (file) {
+#define CONFIG_SPACE_CACHESIZE 256
+      unsigned char config_space_cache[CONFIG_SPACE_CACHESIZE];
+      unsigned offset;
+
+      /* initialize the config space in case we fail to read it (missing permissions, etc). */
+      memset(config_space_cache, 0xff, CONFIG_SPACE_CACHESIZE);
+      read = fread(config_space_cache, 1, CONFIG_SPACE_CACHESIZE, file);
+      (void) read; /* we initialized config_space_cache in case we don't read enough, ignore the read length */
+      fclose(file);
+      /* is this a bridge? */
+      hwloc_pci_prepare_bridge(obj, config_space_cache);
+
+      /* get the revision */
+      attr->revision = config_space_cache[HWLOC_PCI_REVISION_ID];
+
+      /* try to get the link speed */
+      offset = hwloc_pci_find_cap(config_space_cache, HWLOC_PCI_CAP_ID_EXP);
+      if (offset > 0 && offset + 20 /* size of PCI express block up to link status */ <= CONFIG_SPACE_CACHESIZE)
+        hwloc_pci_find_linkspeed(config_space_cache, offset, &attr->linkspeed);
+    }
+
+    if (first_obj)
+      last_obj->next_sibling = obj;
+    else
+      first_obj = obj;
+    last_obj = obj;
+  }
+  closedir(dir);
+
+  dir = hwloc_opendir("/sys/bus/pci/slots/", root_fd);
+  if (dir) {
+    while ((dirent = readdir(dir)) != NULL) {
+      char path[64];
+      FILE *file;
+      if (dirent->d_name[0] == '.')
+        continue;
+      snprintf(path, sizeof(path), "/sys/bus/pci/slots/%s/address", dirent->d_name);
+      file = hwloc_fopen(path, "r", root_fd);
+      if (file) {
+        unsigned domain, bus, dev;
+        if (fscanf(file, "%x:%x:%x", &domain, &bus, &dev) == 3) {
+          hwloc_obj_t obj = first_obj;
+          while (obj) {
+            if (obj->attr->pcidev.domain == domain
+                && obj->attr->pcidev.bus == bus
+                && obj->attr->pcidev.dev == dev
+                && obj->attr->pcidev.func == 0) {
+              hwloc_obj_add_info(obj, "PCISlot", dirent->d_name);
+              break;
+            }
+            obj = obj->next_sibling;
+          }
+        }
+        fclose(file);
+      }
+    }
+    closedir(dir);
+  }
+
+  res = hwloc_insert_pci_device_list(backend, first_obj);
+
+ out_with_rootfd:
+  close(root_fd);
+  return res;
+}
+
+/* Instantiate the linuxpci backend; the three data arguments are unused. Returns NULL if allocation fails. */
+static struct hwloc_backend *
+hwloc_linuxpci_component_instantiate(struct hwloc_disc_component *component,
+                                     const void *_data1 __hwloc_attribute_unused,
+                                     const void *_data2 __hwloc_attribute_unused,
+                                     const void *_data3 __hwloc_attribute_unused)
+{
+  /* thissystem may not be fully initialized yet, we'll check flags in discover() */
+  struct hwloc_backend *pcibackend = hwloc_backend_alloc(component);
+
+  if (pcibackend) {
+    pcibackend->flags = HWLOC_BACKEND_FLAG_NEED_LEVELS;
+    pcibackend->discover = hwloc_look_linuxfs_pci;
+  }
+
+  return pcibackend;
+}
+
+static struct hwloc_disc_component hwloc_linuxpci_disc_component = { /* positional initializer; field meanings below are presumed -- TODO confirm against struct hwloc_disc_component */
+  HWLOC_DISC_COMPONENT_TYPE_MISC,
+  "linuxpci", /* presumably the component name */
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  hwloc_linuxpci_component_instantiate, /* the backend constructor of this component */
+  19, /* after pci */
+  NULL
+};
+
+const struct hwloc_component hwloc_linuxpci_component = { /* exported (non-static) descriptor for the linuxpci discovery component */
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL,
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_linuxpci_disc_component /* the static discovery component it wraps */
+};
+
+#endif /* HWLOC_HAVE_LINUXPCI */
diff --git a/ext/hwloc/hwloc/topology-noos.c b/ext/hwloc/hwloc/topology-noos.c
new file mode 100644
index 0000000..a926428
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-noos.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+
+/* Fallback discovery used when no OS-specific backend filled the topology.
+ *
+ * Returns 0 and changes nothing when the root object already has a cpuset
+ * (somebody else discovered things). Otherwise allocates the root cpusets,
+ * creates the PU level from hwloc_fallback_nbprocessors(), adds uname info
+ * when describing this system, and returns 1.
+ */
+static int
+hwloc_look_noos(struct hwloc_backend *backend)
+{
+  struct hwloc_topology *topo = backend->topology;
+
+  if (!topo->levels[0][0]->cpuset) {
+    /* nobody discovered anything yet, build the minimal topology */
+    hwloc_alloc_obj_cpusets(topo->levels[0][0]);
+    hwloc_setup_pu_level(topo, hwloc_fallback_nbprocessors(topo));
+    if (topo->is_thissystem)
+      hwloc_add_uname_info(topo, NULL);
+    return 1;
+  }
+
+  /* somebody discovered things */
+  return 0;
+}
+
+/* Create the backend object of the "no_os" discovery component.
+ *
+ * Returns the backend from hwloc_backend_alloc() with hwloc_look_noos as its
+ * discover callback, or NULL when the allocation fails.
+ * The three opaque data arguments are unused.
+ */
+static struct hwloc_backend *
+hwloc_noos_component_instantiate(struct hwloc_disc_component *component,
+				 const void *_data1 __hwloc_attribute_unused,
+				 const void *_data2 __hwloc_attribute_unused,
+				 const void *_data3 __hwloc_attribute_unused)
+{
+  struct hwloc_backend *new_backend = hwloc_backend_alloc(component);
+
+  if (new_backend)
+    new_backend->discover = hwloc_look_noos;
+  return new_backend;
+}
+
+/* Discovery component descriptor for the "no_os" fallback backend.
+ * NOTE(review): positional initializer; slot meanings come from
+ * struct hwloc_disc_component, declared elsewhere -- confirm there.
+ */
+static struct hwloc_disc_component hwloc_noos_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_CPU,
+  "no_os",
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  hwloc_noos_component_instantiate, /* builds the backend for this component */
+  40, /* lower than native OS component, higher than globals */
+  NULL
+};
+
+/* Exported generic component entry wrapping the no_os discovery component.
+ * NOTE(review): positional initializer; slot meanings come from
+ * struct hwloc_component, declared elsewhere -- confirm there.
+ */
+const struct hwloc_component hwloc_noos_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL,
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_noos_disc_component /* the discovery descriptor defined above */
+};
diff --git a/ext/hwloc/hwloc/topology-opencl.cb b/ext/hwloc/hwloc/topology-opencl.cb
new file mode 100644
index 0000000..85057c7
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-opencl.cb
@@ -0,0 +1,346 @@
+/*
+ * Copyright © 2012-2014 Inria.  All rights reserved.
+ * Copyright © 2013 Université Bordeaux.  All right reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <hwloc/plugins.h>
+
+/* private headers allowed for convenience because this plugin is built within hwloc */
+#include <private/misc.h>
+#include <private/debug.h>
+
+#include <CL/cl_ext.h>
+
+/* Selects which member of the per-device "specific" union is valid.
+ * Only the AMD variant exists in this file.
+ */
+typedef enum hwloc_opencl_device_type_e {
+  HWLOC_OPENCL_DEVICE_AMD
+} hwloc_opencl_device_type_t;
+
+/* Private data of the OpenCL backend: the list of OpenCL devices that were
+ * successfully located on the PCI bus, filled lazily by the first
+ * notify_new_object callback.
+ */
+struct hwloc_opencl_backend_data_s {
+  unsigned nr_devices; /* -1 when unknown yet, first callback will setup */
+  /* one entry per validated device; only the first nr_devices entries are meaningful */
+  struct hwloc_opencl_device_info_s {
+    hwloc_opencl_device_type_t type; /* tells which member of "specific" is filled */
+
+    unsigned platformidx; /* index of the device's platform in the platform list */
+    char platformname[64]; /* CL_PLATFORM_NAME, empty string if unavailable */
+    unsigned platformdeviceidx; /* index of the device within its platform */
+    char devicename[64]; /* board name (AMD) or CL_DEVICE_NAME, may be empty */
+    char devicevendor[64]; /* CL_DEVICE_VENDOR, may be empty */
+    char devicetype[64]; /* "CPU", "GPU", "Accelerator" or "Unknown" */
+
+    unsigned computeunits; /* CL_DEVICE_MAX_COMPUTE_UNITS */
+    unsigned long long globalmemsize; /* CL_DEVICE_GLOBAL_MEM_SIZE divided by 1024 */
+
+    union hwloc_opencl_device_info_u {
+      struct hwloc_opencl_device_info_amd_s {
+        /* PCI location of the device, compared against PCI objects of the topology */
+        unsigned pcidomain, pcibus, pcidev, pcifunc;
+      } amd;
+    } specific;
+  } * devices;
+};
+
+/* Enumerate the OpenCL devices of every platform and record those whose PCI
+ * location is known.
+ *
+ * data->nr_devices is first reset to 0 so that a failure is not retried by
+ * later callbacks. On success data->devices holds one entry per validated
+ * device and data->nr_devices counts them. On failure data->devices is NULL
+ * (or untouched when the failure happens before it is allocated).
+ *
+ * Devices are only validated when CL_DEVICE_TOPOLOGY_AMD is available and
+ * reports a PCIe location; every other device is skipped.
+ */
+static void
+hwloc_opencl_query_devices(struct hwloc_opencl_backend_data_s *data)
+{
+  cl_platform_id *platform_ids = NULL;
+  cl_uint nr_platforms;
+  cl_device_id *device_ids = NULL;
+  cl_uint nr_devices, nr_total_devices, tmp;
+  cl_int clret;
+  unsigned curpfidx, curpfdvidx, i;
+
+  /* mark the number of devices as 0 in case we fail below,
+   * so that we don't try again later.
+   */
+  data->nr_devices = 0;
+
+  /* count platforms, allocate and get them */
+  clret = clGetPlatformIDs(0, NULL, &nr_platforms);
+  if (CL_SUCCESS != clret || !nr_platforms)
+    goto out;
+  hwloc_debug("%u OpenCL platforms\n", nr_platforms);
+  platform_ids = malloc(nr_platforms * sizeof(*platform_ids));
+  if (!platform_ids)
+    goto out;
+  clret = clGetPlatformIDs(nr_platforms, platform_ids, &nr_platforms);
+  if (CL_SUCCESS != clret || !nr_platforms)
+    goto out_with_platform_ids;
+
+  /* how many devices, total? */
+  tmp = 0;
+  for(i=0; i<nr_platforms; i++) {
+    clret = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, 0, NULL, &nr_devices);
+    if (CL_SUCCESS != clret)
+      goto out_with_platform_ids;
+    tmp += nr_devices;
+  }
+  nr_total_devices = tmp;
+  hwloc_debug("%u OpenCL devices total\n", nr_total_devices);
+  if (!nr_total_devices)
+    /* nothing to describe, and avoid implementation-defined malloc(0) below */
+    goto out_with_platform_ids;
+  /* allocate structs */
+  device_ids = malloc(nr_total_devices * sizeof(*device_ids));
+  data->devices = malloc(nr_total_devices * sizeof(*data->devices));
+  if (!data->devices || !device_ids)
+    goto out_with_device_ids;
+  /* actually query device ids */
+  tmp = 0;
+  for(i=0; i<nr_platforms; i++) {
+    clret = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, nr_total_devices - tmp, device_ids + tmp, &nr_devices);
+    if (CL_SUCCESS != clret)
+      goto out_with_device_ids;
+    tmp += nr_devices;
+  }
+
+  /* query individual devices */
+  curpfidx = 0;
+  curpfdvidx = 0;
+  for(i=0; i<nr_total_devices; i++) {
+    /* the slot is only kept if data->nr_devices is incremented at the end of the iteration */
+    struct hwloc_opencl_device_info_s *info = &data->devices[data->nr_devices];
+    cl_platform_id platform_id = 0;
+    /* zero-initialized so that a failing clGetDeviceInfo() cannot leave garbage */
+    cl_device_type type = 0;
+#ifdef CL_DEVICE_TOPOLOGY_AMD
+    cl_device_topology_amd amdtopo;
+#endif
+    cl_ulong globalmemsize = 0;
+    cl_uint computeunits = 0;
+
+    hwloc_debug("Looking device %p\n", device_ids[i]);
+
+    info->platformname[0] = '\0';
+    clret = clGetDeviceInfo(device_ids[i], CL_DEVICE_PLATFORM, sizeof(platform_id), &platform_id, NULL);
+    if (CL_SUCCESS != clret)
+      continue;
+    clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(info->platformname), info->platformname, NULL);
+
+    info->devicename[0] = '\0';
+#ifdef CL_DEVICE_BOARD_NAME_AMD
+    clGetDeviceInfo(device_ids[i], CL_DEVICE_BOARD_NAME_AMD, sizeof(info->devicename), info->devicename, NULL);
+#else
+    clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, sizeof(info->devicename), info->devicename, NULL);
+#endif
+    info->devicevendor[0] = '\0';
+    clGetDeviceInfo(device_ids[i], CL_DEVICE_VENDOR, sizeof(info->devicevendor), info->devicevendor, NULL);
+
+    clGetDeviceInfo(device_ids[i], CL_DEVICE_TYPE, sizeof(type), &type, NULL);
+    switch (type) {
+    case CL_DEVICE_TYPE_CPU: /* FIXME: cannot happen in PCI devices? */
+      strcpy(info->devicetype, "CPU");
+      break;
+    case CL_DEVICE_TYPE_GPU:
+      strcpy(info->devicetype, "GPU");
+      break;
+    case CL_DEVICE_TYPE_ACCELERATOR:
+      strcpy(info->devicetype, "Accelerator");
+      break;
+    default:
+      strcpy(info->devicetype, "Unknown");
+      break;
+    }
+
+    clGetDeviceInfo(device_ids[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(globalmemsize), &globalmemsize, NULL);
+    info->globalmemsize = globalmemsize / 1024;
+
+    clGetDeviceInfo(device_ids[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(computeunits), &computeunits, NULL);
+    info->computeunits = computeunits;
+
+    hwloc_debug("platform %s device %s vendor %s type %s\n", info->platformname, info->devicename, info->devicevendor, info->devicetype);
+
+    /* find our indexes: devices were queried platform by platform, so the
+     * platform cursor only ever moves forward. Never walk past the array.
+     */
+    while (curpfidx < nr_platforms && platform_id != platform_ids[curpfidx]) {
+      curpfidx++;
+      curpfdvidx = 0;
+    }
+    if (curpfidx >= nr_platforms) {
+      /* unexpected platform: the cursor is exhausted, no later device can be indexed either */
+      hwloc_debug("platform of device %p not found, ignoring remaining devices\n", device_ids[i]);
+      break;
+    }
+    info->platformidx = curpfidx;
+    info->platformdeviceidx = curpfdvidx;
+    curpfdvidx++;
+
+    hwloc_debug("This is opencl%dd%d\n", info->platformidx, info->platformdeviceidx);
+
+#ifdef CL_DEVICE_TOPOLOGY_AMD
+    clret = clGetDeviceInfo(device_ids[i], CL_DEVICE_TOPOLOGY_AMD, sizeof(amdtopo), &amdtopo, NULL);
+    if (CL_SUCCESS != clret) {
+      hwloc_debug("no AMD-specific device information: %d\n", clret);
+      continue;
+    }
+    if (CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD != amdtopo.raw.type) {
+      hwloc_debug("not a PCIe device: %u\n", amdtopo.raw.type);
+      continue;
+    }
+
+    info->type = HWLOC_OPENCL_DEVICE_AMD;
+    info->specific.amd.pcidomain = 0;
+    info->specific.amd.pcibus = amdtopo.pcie.bus;
+    info->specific.amd.pcidev = amdtopo.pcie.device;
+    info->specific.amd.pcifunc = amdtopo.pcie.function;
+
+    hwloc_debug("OpenCL device on PCI 0000:%02x:%02x.%u\n", amdtopo.pcie.bus, amdtopo.pcie.device, amdtopo.pcie.function);
+
+    /* validate this device */
+    data->nr_devices++;
+#endif /* CL_DEVICE_TOPOLOGY_AMD */
+  }
+  free(device_ids);
+  free(platform_ids);
+  return;
+
+out_with_device_ids:
+  free(device_ids);
+  free(data->devices);
+  data->devices = NULL;
+out_with_platform_ids:
+  free(platform_ids);
+out:
+  return;
+}
+
+/* Callback invoked when another backend adds an object to the topology.
+ *
+ * Only PCI device objects are considered, and only when I/O discovery is
+ * enabled and the topology describes this system. The OpenCL device list is
+ * queried on the first such call. If a recorded OpenCL device has the same
+ * PCI domain/bus/dev/func as pcidev, an OS device named "opencl<P>d<D>" is
+ * inserted below pcidev with descriptive info attributes.
+ *
+ * Returns 1 when an OS device was inserted, 0 otherwise.
+ */
+static int
+hwloc_opencl_backend_notify_new_object(struct hwloc_backend *backend, struct hwloc_backend *caller __hwloc_attribute_unused,
+				       struct hwloc_obj *pcidev)
+{
+  struct hwloc_opencl_backend_data_s *priv = backend->private_data;
+  struct hwloc_topology *topo = backend->topology;
+  unsigned devidx;
+
+  if (!(hwloc_topology_get_flags(topo) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO)))
+    return 0;
+
+  if (!hwloc_topology_is_thissystem(topo)) {
+    hwloc_debug("%s", "\nno OpenCL detection (not thissystem)\n");
+    return 0;
+  }
+
+  if (pcidev->type != HWLOC_OBJ_PCI_DEVICE)
+    return 0;
+
+  /* first call: look up all devices. On failure nr_devices becomes 0,
+   * so this and all later callbacks do nothing.
+   */
+  if (priv->nr_devices == (unsigned) -1)
+    hwloc_opencl_query_devices(priv);
+
+  if (priv->nr_devices == 0)
+    /* found no devices */
+    return 0;
+
+  /* now the devices array is ready to use */
+  for(devidx=0; devidx<priv->nr_devices; devidx++) {
+    struct hwloc_opencl_device_info_s *dev = &priv->devices[devidx];
+    hwloc_obj_t coproc;
+    char text[64];
+
+    assert(dev->type == HWLOC_OPENCL_DEVICE_AMD);
+    /* skip devices that do not sit at the PCI location of the new object */
+    if (dev->specific.amd.pcidomain != pcidev->attr->pcidev.domain
+	|| dev->specific.amd.pcibus != pcidev->attr->pcidev.bus
+	|| dev->specific.amd.pcidev != pcidev->attr->pcidev.dev
+	|| dev->specific.amd.pcifunc != pcidev->attr->pcidev.func)
+      continue;
+
+    coproc = hwloc_alloc_setup_object(HWLOC_OBJ_OS_DEVICE, -1);
+    snprintf(text, sizeof(text), "opencl%dd%d", dev->platformidx, dev->platformdeviceidx);
+    coproc->name = strdup(text);
+    coproc->depth = (unsigned) HWLOC_TYPE_DEPTH_UNKNOWN;
+    coproc->attr->osdev.type = HWLOC_OBJ_OSDEV_COPROC;
+
+    hwloc_obj_add_info(coproc, "CoProcType", "OpenCL");
+    hwloc_obj_add_info(coproc, "Backend", "OpenCL");
+    hwloc_obj_add_info(coproc, "OpenCLDeviceType", dev->devicetype);
+
+    /* vendor and model strings are optional */
+    if (dev->devicevendor[0] != '\0')
+      hwloc_obj_add_info(coproc, "GPUVendor", dev->devicevendor);
+    if (dev->devicename[0] != '\0')
+      hwloc_obj_add_info(coproc, "GPUModel", dev->devicename);
+
+    snprintf(text, sizeof(text), "%u", dev->platformidx);
+    hwloc_obj_add_info(coproc, "OpenCLPlatformIndex", text);
+    if (dev->platformname[0] != '\0')
+      hwloc_obj_add_info(coproc, "OpenCLPlatformName", dev->platformname);
+
+    snprintf(text, sizeof(text), "%u", dev->platformdeviceidx);
+    hwloc_obj_add_info(coproc, "OpenCLPlatformDeviceIndex", text);
+
+    snprintf(text, sizeof(text), "%u", dev->computeunits);
+    hwloc_obj_add_info(coproc, "OpenCLComputeUnits", text);
+
+    snprintf(text, sizeof(text), "%llu", dev->globalmemsize);
+    hwloc_obj_add_info(coproc, "OpenCLGlobalMemorySize", text);
+
+    hwloc_insert_object_by_parent(topo, pcidev, coproc);
+    return 1;
+  }
+
+  return 0;
+}
+
+/* Backend teardown: release the device array and the private data itself. */
+static void
+hwloc_opencl_backend_disable(struct hwloc_backend *backend)
+{
+  struct hwloc_opencl_backend_data_s *priv = backend->private_data;
+
+  free(priv->devices);
+  free(priv);
+}
+
+/* Create the backend object of the "opencl" discovery component.
+ *
+ * Allocates the backend and its private data, marks the device list as not
+ * queried yet, and installs the disable and notify_new_object callbacks.
+ * Returns NULL when either allocation fails (the backend is freed in that case).
+ * The three opaque data arguments are unused.
+ */
+static struct hwloc_backend *
+hwloc_opencl_component_instantiate(struct hwloc_disc_component *component,
+				   const void *_data1 __hwloc_attribute_unused,
+				   const void *_data2 __hwloc_attribute_unused,
+				   const void *_data3 __hwloc_attribute_unused)
+{
+  struct hwloc_backend *new_backend;
+  struct hwloc_opencl_backend_data_s *priv;
+
+  /* thissystem may not be fully initialized yet, so flags are checked later in the callback */
+
+  new_backend = hwloc_backend_alloc(component);
+  if (!new_backend)
+    return NULL;
+
+  priv = malloc(sizeof(*priv));
+  if (priv) {
+    /* the first callback will initialize those */
+    priv->devices = NULL;
+    priv->nr_devices = (unsigned) -1; /* unknown yet */
+
+    new_backend->private_data = priv;
+    new_backend->notify_new_object = hwloc_opencl_backend_notify_new_object;
+    new_backend->disable = hwloc_opencl_backend_disable;
+    return new_backend;
+  }
+
+  free(new_backend);
+  return NULL;
+}
+
+/* Discovery component descriptor for the "opencl" backend.
+ * NOTE(review): positional initializer; slot meanings come from
+ * struct hwloc_disc_component, declared elsewhere -- confirm there.
+ */
+static struct hwloc_disc_component hwloc_opencl_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_MISC,
+  "opencl",
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  hwloc_opencl_component_instantiate, /* builds the backend for this component */
+  10, /* after pci */
+  NULL
+};
+
+/* Component initialization hook.
+ *
+ * Fails (-1) when any flag is given or when hwloc_plugin_check_namespace()
+ * reports a problem for the "hwloc_backend_alloc" symbol; returns 0 otherwise.
+ */
+static int
+hwloc_opencl_component_init(unsigned long flags)
+{
+  /* the namespace check is only performed when no flags are set */
+  if (flags || hwloc_plugin_check_namespace("opencl", "hwloc_backend_alloc") < 0)
+    return -1;
+  return 0;
+}
+
+#ifdef HWLOC_INSIDE_PLUGIN
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_opencl_component;
+#endif
+
+/* Exported generic component entry wrapping the opencl discovery component.
+ * NOTE(review): positional initializer; slot meanings come from
+ * struct hwloc_component, declared elsewhere -- confirm there.
+ */
+const struct hwloc_component hwloc_opencl_component = {
+  HWLOC_COMPONENT_ABI,
+  hwloc_opencl_component_init, NULL, /* init hook defined above, nothing in the second hook slot */
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_opencl_disc_component /* the discovery descriptor defined above */
+};
diff --git a/ext/hwloc/hwloc/topology-osf.cb b/ext/hwloc/hwloc/topology-osf.cb
new file mode 100644
index 0000000..5715888
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-osf.cb
@@ -0,0 +1,392 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2011 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+
+#include <sys/types.h>
+#ifdef HAVE_DIRENT_H
+#include <dirent.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <pthread.h>
+
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+
+#include <numa.h>
+#include <radset.h>
+#include <cpuset.h>
+#include <sys/mman.h>
+
+/*
+ * TODO
+ *
+ * nsg_init(), nsg_attach_pid(), RAD_MIGRATE/RAD_WAIT
+ * assign_pid_to_pset()
+ *
+ * pthread_use_only_cpu too?
+ */
+
+/* Find the single RAD whose CPU set is exactly hwloc_set.
+ *
+ * On success, *radset is created and contains only that RAD, errno is set to
+ * 0 and 1 is returned; the caller then owns *radset. When no RAD matches,
+ * *radset is left untouched, errno is set to EXDEV and 0 is returned.
+ */
+static int
+prepare_radset(hwloc_topology_t topology __hwloc_attribute_unused, radset_t *radset, hwloc_const_bitmap_t hwloc_set)
+{
+  int nr_rads = rad_get_num();
+  cpuset_t wanted, rad_cpus, diff;
+  radid_t rad;
+  unsigned cpu;
+  int found = 0;
+
+  /* convert the hwloc bitmap into an OSF cpuset */
+  cpusetcreate(&wanted);
+  cpuemptyset(wanted);
+  hwloc_bitmap_foreach_begin(cpu, hwloc_set)
+    cpuaddset(wanted, cpu);
+  hwloc_bitmap_foreach_end();
+
+  cpusetcreate(&rad_cpus);
+  cpusetcreate(&diff);
+  for (rad = 0; !found && rad < nr_rads; rad++) {
+    cpuemptyset(rad_cpus);
+    if (rad_get_cpus(rad, rad_cpus)==-1) {
+      fprintf(stderr,"rad_get_cpus(%d) failed: %s\n",rad,strerror(errno));
+      continue;
+    }
+    /* an empty symmetric difference means both sets are identical */
+    cpuxorset(wanted, rad_cpus, diff);
+    if (cpucountset(diff) == 0) {
+      /* Found it */
+      radsetcreate(radset);
+      rademptyset(*radset);
+      radaddset(*radset, rad);
+      found = 1;
+    }
+  }
+
+  cpusetdestroy(&wanted);
+  cpusetdestroy(&rad_cpus);
+  cpusetdestroy(&diff);
+  /* EXDEV: radset containing exactly this set of CPUs not found */
+  errno = found ? 0 : EXDEV;
+  return found;
+}
+
+/* Note: get_cpubind not available on OSF */
+
+/* Bind a thread to the RAD whose CPUs are exactly hwloc_set.
+ *
+ * Binding to the complete cpuset detaches the thread from any RAD instead.
+ * HWLOC_CPUBIND_NOMEMBIND is refused with ENOSYS because OSF apparently
+ * migrates pages along with the thread. HWLOC_CPUBIND_STRICT selects
+ * pthread_rad_bind() with RAD_INSIST, otherwise pthread_rad_attach() is used.
+ *
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+static int
+hwloc_osf_set_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t thread, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+  radset_t radset;
+  int err;
+
+  if (hwloc_bitmap_isequal(hwloc_set, hwloc_topology_get_complete_cpuset(topology))) {
+    if ((errno = pthread_rad_detach(thread)))
+      return -1;
+    return 0;
+  }
+
+  /* Apparently OSF migrates pages */
+  if (flags & HWLOC_CPUBIND_NOMEMBIND) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  /* prepare_radset() sets errno itself when no matching RAD exists */
+  if (!prepare_radset(topology, &radset, hwloc_set))
+    return -1;
+
+  /* the pthread_rad_*() calls return the error code instead of setting errno */
+  if (flags & HWLOC_CPUBIND_STRICT)
+    err = pthread_rad_bind(thread, radset, RAD_INSIST | RAD_WAIT);
+  else
+    err = pthread_rad_attach(thread, radset, RAD_WAIT);
+
+  /* release the radset on every path, it used to leak when binding failed */
+  radsetdestroy(&radset);
+
+  /* assigned after the destroy so that the cleanup cannot clobber the result */
+  errno = err;
+  return err ? -1 : 0;
+}
+
+/* Bind a process to the RAD whose CPUs are exactly hwloc_set.
+ *
+ * Binding to the complete cpuset detaches the process from any RAD instead.
+ * HWLOC_CPUBIND_NOMEMBIND is refused with ENOSYS because OSF apparently
+ * migrates pages along with the process. HWLOC_CPUBIND_STRICT selects
+ * rad_bind_pid() with RAD_INSIST, otherwise rad_attach_pid() is used.
+ *
+ * Returns 0 on success, -1 on failure (errno is left as set by the failing call).
+ */
+static int
+hwloc_osf_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+  radset_t radset;
+  int err, saved_errno;
+
+  if (hwloc_bitmap_isequal(hwloc_set, hwloc_topology_get_complete_cpuset(topology))) {
+    if (rad_detach_pid(pid))
+      return -1;
+    return 0;
+  }
+
+  /* Apparently OSF migrates pages */
+  if (flags & HWLOC_CPUBIND_NOMEMBIND) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  /* prepare_radset() sets errno itself when no matching RAD exists */
+  if (!prepare_radset(topology, &radset, hwloc_set))
+    return -1;
+
+  if (flags & HWLOC_CPUBIND_STRICT)
+    err = rad_bind_pid(pid, radset, RAD_INSIST | RAD_WAIT);
+  else
+    err = rad_attach_pid(pid, radset, RAD_WAIT);
+
+  /* release the radset on every path (it used to leak when binding failed),
+   * without letting the cleanup overwrite the errno of the failed call
+   */
+  saved_errno = errno;
+  radsetdestroy(&radset);
+
+  if (err) {
+    errno = saved_errno;
+    return -1;
+  }
+  return 0;
+}
+
+/* Bind the calling thread; forwards to hwloc_osf_set_thread_cpubind() with pthread_self(). */
+static int
+hwloc_osf_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+  int status = hwloc_osf_set_thread_cpubind(topology, pthread_self(), hwloc_set, flags);
+
+  return status;
+}
+
+/* Bind the calling process; forwards to hwloc_osf_set_proc_cpubind() with getpid(). */
+static int
+hwloc_osf_set_thisproc_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+  int status = hwloc_osf_set_proc_cpubind(topology, getpid(), hwloc_set, flags);
+
+  return status;
+}
+
+/* Translate a hwloc memory binding request into an OSF memalloc_attr_t.
+ *
+ * The hwloc policy is mapped to an MPOL_* value and every node of nodeset is
+ * added to a freshly created mattr->mattr_radset, which the caller must
+ * destroy. Returns 0 on success. For an unsupported policy, returns -1 with
+ * errno set to ENOSYS and leaves *mattr untouched.
+ */
+static int
+hwloc_osf_prepare_mattr(hwloc_topology_t topology __hwloc_attribute_unused, memalloc_attr_t *mattr, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags __hwloc_attribute_unused)
+{
+  unsigned long mpol;
+  int node;
+
+  if (policy == HWLOC_MEMBIND_FIRSTTOUCH) {
+    mpol = MPOL_THREAD;
+  } else if (policy == HWLOC_MEMBIND_DEFAULT || policy == HWLOC_MEMBIND_BIND) {
+    mpol = MPOL_DIRECTED;
+  } else if (policy == HWLOC_MEMBIND_INTERLEAVE) {
+    mpol = MPOL_STRIPPED;
+  } else if (policy == HWLOC_MEMBIND_REPLICATE) {
+    mpol = MPOL_REPLICATED;
+  } else {
+    /* no OSF equivalent for this policy */
+    errno = ENOSYS;
+    return -1;
+  }
+
+  memset(mattr, 0, sizeof(*mattr));
+  mattr->mattr_rad = RAD_NONE;
+  mattr->mattr_policy = mpol;
+
+  /* collect the requested nodes in the attribute's radset */
+  radsetcreate(&mattr->mattr_radset);
+  rademptyset(mattr->mattr_radset);
+  hwloc_bitmap_foreach_begin(node, nodeset)
+    radaddset(mattr->mattr_radset, node);
+  hwloc_bitmap_foreach_end();
+
+  return 0;
+}
+
+/* Apply a memory binding policy to the area [addr, addr+len).
+ *
+ * HWLOC_MEMBIND_MIGRATE adds MADV_CURRENT and HWLOC_MEMBIND_STRICT adds
+ * MADV_INSIST to the behavior given to nmadvise(). The flags-derived behavior
+ * used to be computed and then ignored in favor of a hard-coded MADV_CURRENT.
+ *
+ * Returns the nmadvise() result, or -1 when the policy cannot be translated
+ * (errno set by hwloc_osf_prepare_mattr()).
+ */
+static int
+hwloc_osf_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  memalloc_attr_t mattr;
+  int behavior = 0;
+  int ret;
+
+  if (flags & HWLOC_MEMBIND_MIGRATE)
+    behavior |= MADV_CURRENT;
+  if (flags & HWLOC_MEMBIND_STRICT)
+    behavior |= MADV_INSIST;
+
+  if (hwloc_osf_prepare_mattr(topology, &mattr, nodeset, policy, flags))
+    return -1;
+
+  /* honor the caller's MIGRATE/STRICT flags instead of always forcing MADV_CURRENT */
+  ret = nmadvise(addr, len, behavior, &mattr);
+  /* the radset was created by hwloc_osf_prepare_mattr() and is owned here */
+  radsetdestroy(&mattr.mattr_radset);
+  return ret;
+}
+
+/* Allocate len bytes of anonymous memory bound according to nodeset/policy.
+ *
+ * When the policy cannot be translated, falls back to hwloc_alloc_or_fail().
+ * Returns the mapped address, or NULL when nmmap() fails. The mmap-style
+ * MAP_FAILED value used to be returned to the caller as if it were a valid
+ * pointer.
+ */
+static void *
+hwloc_osf_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  memalloc_attr_t mattr;
+  void *ptr;
+
+  if (hwloc_osf_prepare_mattr(topology, &mattr, nodeset, policy, flags))
+    return hwloc_alloc_or_fail(topology, len, flags);
+
+  /* TODO: rather use acreate/amalloc ? */
+  ptr = nmmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1,
+               0, &mattr);
+  /* the radset was created by hwloc_osf_prepare_mattr() and is owned here */
+  radsetdestroy(&mattr.mattr_radset);
+  /* report failure as NULL, never as MAP_FAILED */
+  if (ptr == MAP_FAILED)
+    return NULL;
+  return ptr;
+}
+
+/* Discovery callback of the OSF backend.
+ *
+ * Returns 0 without doing anything when the root object already has a cpuset
+ * (another backend discovered the machine). Otherwise creates one NUMA node
+ * object per RAD, fills a nbnodes*nbnodes distance matrix using nloc(),
+ * registers it with hwloc_distances_set(), sets up the PU level, tags the
+ * root object with Backend=OSF and returns 1.
+ */
+static int
+hwloc_look_osf(struct hwloc_backend *backend)
+{
+  struct hwloc_topology *topology = backend->topology;
+  cpu_cursor_t cursor;
+  unsigned nbnodes;
+  radid_t radid, radid2;
+  radset_t radset, radset2;
+  cpuid_t cpuid;
+  cpuset_t cpuset;
+  struct hwloc_obj *obj;
+  unsigned distance;
+
+  if (topology->levels[0][0]->cpuset)
+    /* somebody discovered things */
+    return 0;
+
+  hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+
+  nbnodes = rad_get_num();
+
+  cpusetcreate(&cpuset);
+  radsetcreate(&radset);
+  radsetcreate(&radset2);
+  {
+    /* NOTE(review): none of these three allocations is checked for NULL before use.
+     * They are not freed here; presumably hwloc_distances_set() takes ownership -- confirm.
+     */
+    hwloc_obj_t *nodes = calloc(nbnodes, sizeof(hwloc_obj_t));
+    unsigned *indexes = calloc(nbnodes, sizeof(unsigned));
+    float *distances = calloc(nbnodes*nbnodes, sizeof(float));
+    unsigned nfound;
+    numa_attr_t attr;
+
+    /* nloc() queries below are made relative to radset, which holds the current RAD */
+    attr.nattr_type = R_RAD;
+    attr.nattr_descr.rd_radset = radset;
+    attr.nattr_flags = 0;
+
+    for (radid = 0; radid < (radid_t) nbnodes; radid++) {
+      rademptyset(radset);
+      radaddset(radset, radid);
+      cpuemptyset(cpuset);
+      if (rad_get_cpus(radid, cpuset)==-1) {
+	fprintf(stderr,"rad_get_cpus(%d) failed: %s\n",radid,strerror(errno));
+	/* NOTE(review): nodes[radid] stays NULL (from calloc) for this RAD, yet all
+	 * nbnodes entries are passed to hwloc_distances_set() below -- confirm this is handled.
+	 */
+	continue;
+      }
+
+      indexes[radid] = radid;
+      nodes[radid] = obj = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, radid);
+      obj->nodeset = hwloc_bitmap_alloc();
+      hwloc_bitmap_set(obj->nodeset, radid);
+      obj->cpuset = hwloc_bitmap_alloc();
+      obj->memory.local_memory = rad_get_physmem(radid) * hwloc_getpagesize();
+      /* two page types: the normal page size, and the large page size when the OS exposes it */
+      obj->memory.page_types_len = 2;
+      /* NOTE(review): malloc result is used without a NULL check */
+      obj->memory.page_types = malloc(2*sizeof(*obj->memory.page_types));
+      memset(obj->memory.page_types, 0, 2*sizeof(*obj->memory.page_types));
+      obj->memory.page_types[0].size = hwloc_getpagesize();
+#ifdef HAVE__SC_LARGE_PAGESIZE
+      obj->memory.page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
+#endif
+
+      /* copy the CPUs of this RAD into the object's cpuset */
+      cursor = SET_CURSOR_INIT;
+      while((cpuid = cpu_foreach(cpuset, 0, &cursor)) != CPU_NONE)
+	hwloc_bitmap_set(obj->cpuset, cpuid);
+
+      hwloc_debug_1arg_bitmap("node %d has cpuset %s\n",
+		 radid, obj->cpuset);
+
+      hwloc_insert_object_by_cpuset(topology, obj);
+
+      /* start with every node at the maximal distance, then lower it with the
+       * smallest distance at which nloc() reports the node
+       */
+      nfound = 0;
+      for (radid2 = 0; radid2 < (radid_t) nbnodes; radid2++)
+	distances[radid*nbnodes+radid2] = RAD_DIST_REMOTE;
+      for (distance = RAD_DIST_LOCAL; distance < RAD_DIST_REMOTE; distance++) {
+	attr.nattr_distance = distance;
+	/* get set of NUMA nodes at distance <= DISTANCE */
+	if (nloc(&attr, radset2)) {
+	  fprintf(stderr,"nloc failed: %s\n", strerror(errno));
+	  continue;
+	}
+	cursor = SET_CURSOR_INIT;
+	while ((radid2 = rad_foreach(radset2, 0, &cursor)) != RAD_NONE) {
+	  /* only record the first (smallest) distance at which the node shows up */
+	  if (distances[radid*nbnodes+radid2] == RAD_DIST_REMOTE) {
+            distances[radid*nbnodes+radid2] = (float) distance;
+	    nfound++;
+	  }
+	}
+	if (nfound == nbnodes)
+	  /* Finished finding distances, no need to go up to RAD_DIST_REMOTE */
+	  break;
+      }
+    }
+
+    hwloc_distances_set(topology, HWLOC_OBJ_NUMANODE, nbnodes, indexes, nodes, distances, 0 /* OS cannot force */);
+  }
+  radsetdestroy(&radset2);
+  radsetdestroy(&radset);
+  cpusetdestroy(&cpuset);
+
+  /* add PU objects */
+  hwloc_setup_pu_level(topology, hwloc_fallback_nbprocessors(topology));
+
+  hwloc_obj_add_info(topology->levels[0][0], "Backend", "OSF");
+  if (topology->is_thissystem)
+    hwloc_add_uname_info(topology, NULL);
+  return 1;
+}
+
+/* Install the OSF binding hooks and advertise the supported membind policies.
+ * There is no get_cpubind hook because it is not available on OSF.
+ */
+void
+hwloc_set_osf_hooks(struct hwloc_binding_hooks *hooks,
+		    struct hwloc_topology_support *support)
+{
+  /* CPU binding, for the caller itself and for arbitrary threads/processes */
+  hooks->set_thisthread_cpubind = hwloc_osf_set_thisthread_cpubind;
+  hooks->set_thisproc_cpubind = hwloc_osf_set_thisproc_cpubind;
+  hooks->set_thread_cpubind = hwloc_osf_set_thread_cpubind;
+  hooks->set_proc_cpubind = hwloc_osf_set_proc_cpubind;
+
+  /* memory binding and allocation */
+  hooks->alloc = hwloc_alloc_mmap;
+  hooks->alloc_membind = hwloc_osf_alloc_membind;
+  hooks->free_membind = hwloc_free_mmap;
+  hooks->set_area_membind = hwloc_osf_set_area_membind;
+
+  /* policies that hwloc_osf_prepare_mattr() knows how to translate */
+  support->membind->replicate_membind = 1;
+  support->membind->interleave_membind = 1;
+  support->membind->bind_membind = 1;
+  support->membind->firsttouch_membind = 1;
+}
+
+/* Create the backend object of the "osf" discovery component.
+ *
+ * Returns the backend from hwloc_backend_alloc() with hwloc_look_osf as its
+ * discover callback, or NULL when the allocation fails.
+ * The three opaque data arguments are unused.
+ */
+static struct hwloc_backend *
+hwloc_osf_component_instantiate(struct hwloc_disc_component *component,
+				const void *_data1 __hwloc_attribute_unused,
+				const void *_data2 __hwloc_attribute_unused,
+				const void *_data3 __hwloc_attribute_unused)
+{
+  struct hwloc_backend *new_backend = hwloc_backend_alloc(component);
+
+  if (new_backend)
+    new_backend->discover = hwloc_look_osf;
+  return new_backend;
+}
+
+/* Discovery component descriptor for the "osf" backend.
+ * NOTE(review): positional initializer; slot meanings come from
+ * struct hwloc_disc_component, declared elsewhere -- confirm there.
+ */
+static struct hwloc_disc_component hwloc_osf_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_CPU,
+  "osf",
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  hwloc_osf_component_instantiate, /* builds the backend for this component */
+  50,
+  NULL
+};
+
+/* Exported generic component entry wrapping the osf discovery component.
+ * NOTE(review): positional initializer; slot meanings come from
+ * struct hwloc_component, declared elsewhere -- confirm there.
+ */
+const struct hwloc_component hwloc_osf_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL,
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_osf_disc_component /* the discovery descriptor defined above */
+};
diff --git a/ext/hwloc/hwloc/topology-synthetic.c b/ext/hwloc/hwloc/topology-synthetic.c
new file mode 100644
index 0000000..237729a
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-synthetic.c
@@ -0,0 +1,1128 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2010 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/misc.h>
+#include <private/debug.h>
+
+#include <limits.h>
+#include <assert.h>
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+
+/* Description of one level of a synthetic topology. */
+struct hwloc_synthetic_level_data_s {
+  unsigned arity;
+  unsigned long totalwidth; /* also the number of entries of index_array */
+  hwloc_obj_type_t type;
+  unsigned depth; /* For caches/groups */
+  hwloc_obj_cache_type_t cachetype; /* For caches */
+  hwloc_uint64_t memorysize; /* For caches/memory */
+
+  /* the indexes= attribute before parsing */
+  const char *index_string; /* NULL when the level has no such attribute */
+  unsigned long index_string_length;
+  /* the array of explicit indexes after parsing */
+  unsigned *index_array;
+
+  /* used while filling the topology */
+  unsigned next_os_index; /* id of the next object for that level */
+};
+
+/* Private data of the synthetic backend: the description string and one
+ * level descriptor per depth, up to HWLOC_SYNTHETIC_MAX_DEPTH levels.
+ */
+struct hwloc_synthetic_backend_data_s {
+  /* synthetic backend parameters */
+  char *string;
+#define HWLOC_SYNTHETIC_MAX_DEPTH 128
+  struct hwloc_synthetic_level_data_s level[HWLOC_SYNTHETIC_MAX_DEPTH];
+};
+
+/* One loop of an interleaved index specification: object j contributes
+ * ((j / step) % nb) to its index, scaled by the product of the nb of the
+ * previous loops.
+ */
+struct hwloc_synthetic_intlv_loop_s {
+  unsigned step;
+  unsigned nb;
+  unsigned level_depth; /* only set when the loop is given as an object type */
+};
+
+/* Convert the "indexes" attribute of one synthetic level into an explicit
+ * array of curlevel->totalwidth indexes stored in curlevel->index_array.
+ *
+ * Three syntaxes are accepted:
+ *  - a comma-separated list of numbers (attribute made only of digits and commas),
+ *  - interleaving loops written as step*nb:step*nb:... (attribute starts with a digit),
+ *  - interleaving loops written as type1:type2:..., each type naming a level
+ *    above the current one.
+ *
+ * Does nothing when the level has no index string. On any error the
+ * temporary arrays are freed and index_array is left untouched; a message is
+ * printed to stderr only when verbose is non-zero.
+ */
+static void
+hwloc_synthetic_process_level_indexes(struct hwloc_synthetic_backend_data_s *data,
+				      unsigned curleveldepth,
+				      int verbose)
+{
+  struct hwloc_synthetic_level_data_s *curlevel = &data->level[curleveldepth];
+  unsigned long total = curlevel->totalwidth;
+  const char *attr = curlevel->index_string;
+  unsigned long length = curlevel->index_string_length;
+  unsigned *array = NULL;
+  struct hwloc_synthetic_intlv_loop_s * loops = NULL;
+  unsigned long i;
+
+  if (!attr)
+    return;
+
+  /* zero-filled because the interleaving code below accumulates into it */
+  array = calloc(total, sizeof(*array));
+  if (!array) {
+    if (verbose)
+      fprintf(stderr, "Failed to allocate synthetic index array of size %lu\n", total);
+    goto out;
+  }
+
+  /* the attribute is an explicit list if it only contains digits and commas */
+  i = strspn(attr, "0123456789,");
+  if (i == length) {
+    /* explicit array of indexes */
+
+    for(i=0; i<total; i++) {
+      const char *next;
+      unsigned idx = strtoul(attr, (char **) &next, 10);
+      if (next == attr) {
+	if (verbose)
+	  fprintf(stderr, "Failed to read synthetic index #%lu at '%s'\n", i, attr);
+	goto out_with_array;
+      }
+
+      array[i] = idx;
+      /* every index but the last one must be followed by a comma */
+      if (i != total-1) {
+	if (*next != ',') {
+	  if (verbose)
+	    fprintf(stderr, "Missing comma after synthetic index #%lu at '%s'\n", i, attr);
+	  goto out_with_array;
+	}
+	attr = next+1;
+      } else {
+	attr = next;
+      }
+    }
+    curlevel->index_array = array;
+
+  } else {
+    /* interleaving */
+    unsigned nr_loops = 1, cur_loop;
+    unsigned minstep = total;
+    unsigned long nbs = 1;
+    unsigned j, mul;
+    const char *tmp;
+
+    /* count the colon-separated fields that lie inside the attribute */
+    tmp = attr;
+    while (tmp) {
+      tmp = strchr(tmp, ':');
+      if (!tmp || tmp >= attr+length)
+	break;
+      nr_loops++;
+      tmp++;
+    }
+    /* nr_loops colon-separated fields, but we may need one more at the end */
+    loops = malloc((nr_loops+1)*sizeof(*loops));
+    if (!loops) {
+      if (verbose)
+	fprintf(stderr, "Failed to allocate synthetic index interleave loop array of size %u\n", nr_loops);
+      goto out_with_array;
+    }
+
+    if (*attr >= '0' && *attr <= '9') {
+      /* interleaving as x*y:z*t:... */
+      unsigned step, nb;
+
+      tmp = attr;
+      cur_loop = 0;
+      while (tmp) {
+	char *tmp2, *tmp3;
+	step = (unsigned) strtol(tmp, &tmp2, 0);
+	if (tmp2 == tmp || *tmp2 != '*') {
+	  if (verbose)
+	    fprintf(stderr, "Failed to read synthetic index interleaving loop '%s' without number before '*'\n", tmp);
+	  goto out_with_loops;
+	}
+	if (!step) {
+	  if (verbose)
+	    fprintf(stderr, "Invalid interleaving loop with step 0 at '%s'\n", tmp);
+	  goto out_with_loops;
+	}
+	tmp2++;
+	nb = (unsigned) strtol(tmp2, &tmp3, 0);
+	if (tmp3 == tmp2 || (*tmp3 && *tmp3 != ':' && *tmp3 != ')' && *tmp3 != ' ')) {
+	  if (verbose)
+	    fprintf(stderr, "Failed to read synthetic index interleaving loop '%s' without number between '*' and ':'\n", tmp);
+	  goto out_with_loops;
+	}
+	if (!nb) {
+	  if (verbose)
+	    fprintf(stderr, "Invalid interleaving loop with number 0 at '%s'\n", tmp2);
+	  goto out_with_loops;
+	}
+	loops[cur_loop].step = step;
+	loops[cur_loop].nb = nb;
+	if (step < minstep)
+	  minstep = step;
+	nbs *= nb;
+	cur_loop++;
+	/* ')' or ' ' ends the attribute, anything else is skipped as a separator */
+	if (*tmp3 == ')' || *tmp3 == ' ')
+	  break;
+	tmp = (const char*) (tmp3+1);
+      }
+
+    } else {
+      /* interleaving as type1:type2:... */
+      hwloc_obj_type_t type;
+      hwloc_obj_cache_type_t cachetypeattr;
+      int depthattr;
+      int err;
+
+      /* find level depths for each interleaving loop */
+      tmp = attr;
+      cur_loop = 0;
+      while (tmp) {
+	err = hwloc_obj_type_sscanf(tmp, &type, &depthattr, &cachetypeattr, sizeof(cachetypeattr));
+	if (err < 0) {
+	  if (verbose)
+	    fprintf(stderr, "Failed to read synthetic index interleaving loop type '%s'\n", tmp);
+	  goto out_with_loops;
+	}
+	if (type == HWLOC_OBJ_MISC || type == HWLOC_OBJ_BRIDGE || type == HWLOC_OBJ_PCI_DEVICE || type == HWLOC_OBJ_OS_DEVICE) {
+	  if (verbose)
+	    fprintf(stderr, "Misc object type disallowed in synthetic index interleaving loop type '%s'\n", tmp);
+	  goto out_with_loops;
+	}
+	/* look for a level above the current one matching the type (and depth/cache type if given) */
+	for(i=0; i<curleveldepth; i++) {
+	  if (type != data->level[i].type)
+	    continue;
+	  if ((type == HWLOC_OBJ_GROUP || type == HWLOC_OBJ_CACHE)
+	      && depthattr != -1
+	      && (unsigned) depthattr != data->level[i].depth)
+	    continue;
+	  if (type == HWLOC_OBJ_CACHE
+	      && cachetypeattr != (hwloc_obj_cache_type_t) -1
+	      && cachetypeattr != data->level[i].cachetype)
+	    continue;
+	  loops[cur_loop].level_depth = i;
+	  break;
+	}
+	if (i == curleveldepth) {
+	  if (verbose)
+	    fprintf(stderr, "Failed to find level for synthetic index interleaving loop type '%s' above '%s'\n",
+		    tmp, hwloc_obj_type_string(curlevel->type));
+	  goto out_with_loops;
+	}
+	tmp = strchr(tmp, ':');
+	if (!tmp || tmp > attr+length)
+	  break;
+	tmp++;
+	cur_loop++;
+      }
+
+      /* compute actual loop step/nb */
+      for(cur_loop=0; cur_loop<nr_loops; cur_loop++) {
+	unsigned mydepth = loops[cur_loop].level_depth;
+	unsigned prevdepth = 0;
+	unsigned step, nb;
+	/* reject duplicates and find the deepest listed level that is above mydepth */
+	for(i=0; i<nr_loops; i++) {
+	  if (loops[i].level_depth == mydepth && i != cur_loop) {
+	    if (verbose)
+	      fprintf(stderr, "Invalid duplicate interleaving loop type in synthetic index '%s'\n", attr);
+	    goto out_with_loops;
+	  }
+	  if (loops[i].level_depth < mydepth
+	      && loops[i].level_depth > prevdepth)
+	    prevdepth = loops[i].level_depth;
+	}
+	step = curlevel->totalwidth / data->level[mydepth].totalwidth; /* number of objects below us */
+	nb = data->level[mydepth].totalwidth / data->level[prevdepth].totalwidth; /* number of us within parent */
+
+	loops[cur_loop].step = step;
+	loops[cur_loop].nb = nb;
+	assert(nb);
+	assert(step);
+	if (step < minstep)
+	  minstep = step;
+	nbs *= nb;
+      }
+    }
+    assert(nbs);
+
+    if (nbs != total) {
+      /* one loop of total/nbs steps is missing, add it if it's just the smallest one */
+      if (minstep == total/nbs) {
+	/* uses the spare slot allocated above */
+	loops[nr_loops].step = 1;
+	loops[nr_loops].nb = total/nbs;
+	nr_loops++;
+      } else {
+	if (verbose)
+	  fprintf(stderr, "Invalid index interleaving total width %lu instead of %lu\n", nbs, total);
+	goto out_with_loops;
+      }
+    }
+
+    /* generate the array of indexes */
+    mul = 1;
+    for(i=0; i<nr_loops; i++) {
+      unsigned step = loops[i].step;
+      unsigned nb = loops[i].nb;
+      for(j=0; j<total; j++)
+	array[j] += ((j / step) % nb) * mul;
+      mul *= nb;
+    }
+
+    /* check that we have the right values (cannot pass total, cannot give duplicate 0) */
+    for(j=0; j<total; j++) {
+      if (array[j] >= total) {
+	if (verbose)
+	  fprintf(stderr, "Invalid index interleaving generates out-of-range index %u\n", array[j]);
+	goto out_with_loops;
+      }
+      if (!array[j] && j) {
+	if (verbose)
+	  fprintf(stderr, "Invalid index interleaving generates duplicate index values\n");
+	goto out_with_loops;
+      }
+    }
+
+    free(loops);
+    curlevel->index_array = array;
+  }
+
+  return;
+
+ out_with_loops:
+  free(loops);
+ out_with_array:
+  free(array);
+ out:
+  return;
+}
+
+/* Parse a size attribute: a number read by strtoull() in base 0, optionally
+ * followed by one of the two-letter units TB, GB, MB or kB (compared with
+ * hwloc_strncasecmp(), so presumably case-insensitive).  The unit scales the
+ * number by 2^40, 2^30, 2^20 or 2^10 respectively.
+ * Returns the scaled value and stores in *endp the address of the first
+ * character that was not consumed.
+ */
+static hwloc_uint64_t
+hwloc_synthetic_parse_memory_attr(const char *attr, const char **endp)
+{
+  /* units in the order they are tried, with the matching left shift */
+  static const struct {
+    const char *suffix;
+    unsigned shift;
+  } units[] = {
+    { "TB", 40 },
+    { "GB", 30 },
+    { "MB", 20 },
+    { "kB", 10 }
+  };
+  const char *cursor;
+  hwloc_uint64_t value;
+  unsigned u;
+
+  value = strtoull(attr, (char **) &cursor, 0);
+
+  for(u=0; u<sizeof(units)/sizeof(units[0]); u++) {
+    if (hwloc_strncasecmp(cursor, units[u].suffix, 2))
+      continue;
+    /* first matching unit wins */
+    value <<= units[u].shift;
+    cursor += 2;
+    break;
+  }
+
+  *endp = cursor;
+  return value;
+}
+
+/* Parse the attribute list of one synthetic level.
+ *
+ * attrs points to the first character after the opening '('.  The list ends
+ * at the first ')' and contains attributes separated by a single space:
+ *   size=<mem>     accepted for cache levels only
+ *   memory=<mem>   accepted for non-cache levels only
+ *   indexes=<str>  kept unparsed, as a pointer into attrs plus a length
+ * <mem> is parsed by hwloc_synthetic_parse_memory_attr().
+ *
+ * On success, store the results in curlevel (memorysize, index_string,
+ * index_string_length), store in *next_posp the address following the
+ * closing ')' and return 0.
+ * On error, set errno to EINVAL and return -1 without modifying curlevel or
+ * *next_posp.  Messages are printed on stderr only when verbose is non-zero.
+ */
+static int
+hwloc_synthetic_parse_level_attrs(const char *attrs, const char **next_posp,
+                                  struct hwloc_synthetic_level_data_s *curlevel,
+                                  int verbose)
+{
+  hwloc_obj_type_t type = curlevel->type;
+  const char *next_pos;
+  hwloc_uint64_t memorysize = 0;
+  const char *index_string = NULL;
+  unsigned long index_string_length = 0;
+
+  /* the closing bracket must exist, this also bounds the parsing loop below */
+  next_pos = (const char *) strchr(attrs, ')');
+  if (!next_pos) {
+    if (verbose)
+      fprintf(stderr, "Missing attribute closing bracket in synthetic string at '%s'\n", attrs);
+    errno = EINVAL;
+    return -1;
+  }
+
+  while (')' != *attrs) {
+    if (HWLOC_OBJ_CACHE == type && !strncmp("size=", attrs, 5)) {
+      memorysize = hwloc_synthetic_parse_memory_attr(attrs+5, &attrs);
+
+    } else if (HWLOC_OBJ_CACHE != type && !strncmp("memory=", attrs, 7)) {
+      memorysize = hwloc_synthetic_parse_memory_attr(attrs+7, &attrs);
+
+    } else if (!strncmp("indexes=", attrs, 8)) {
+      /* only remember where the indexes are, they are processed later */
+      index_string = attrs+8;
+      attrs += 8;
+      index_string_length = strcspn(attrs, " )");
+      attrs += index_string_length;
+
+    } else {
+      if (verbose)
+        fprintf(stderr, "Unknown attribute at '%s'\n", attrs);
+      errno = EINVAL;
+      return -1;
+    }
+
+    /* an attribute is followed by a space or by the closing bracket */
+    if (' ' == *attrs)
+      attrs++;
+    else if (')' != *attrs) {
+      if (verbose)
+        fprintf(stderr, "Missing parameter separator at '%s'\n", attrs);
+      errno = EINVAL;
+      return -1;
+    }
+  }
+
+  curlevel->memorysize = memorysize;
+  curlevel->index_string = index_string;
+  curlevel->index_string_length = index_string_length;
+  *next_posp = next_pos+1;
+  return 0;
+}
+
+/* Parse a synthetic topology description and fill data->level[] with it.
+ *
+ * description is an optional "(attrs)" list for the root, followed by
+ * space-separated "[type:]count[(attrs)]" items, one per level below the
+ * root.  Levels without an explicit type get one deduced from the level
+ * below them (the last level defaults to PU).  A NUMA node level with a
+ * single object per parent is inserted when the description has none.
+ * Setting HWLOC_SYNTHETIC_VERBOSE to a non-zero number in the environment
+ * enables error messages on stderr.
+ *
+ * On success, return zero and store in data->string a copy of the
+ * description (starting after the root attributes, if any).
+ * On error, return a negative value; the checks in this function set errno
+ * to EINVAL.
+ */
+static int
+hwloc_backend_synthetic_init(struct hwloc_synthetic_backend_data_s *data,
+                             const char *description)
+{
+  const char *pos, *next_pos;
+  unsigned long item, count;
+  unsigned i;
+  int cache_depth = 0, group_depth = 0;
+  int nb_machine_levels = 0, nb_node_levels = 0;
+  int nb_pu_levels = 0;
+  int verbose = 0;
+  const char *env = getenv("HWLOC_SYNTHETIC_VERBOSE");
+  int err;
+  unsigned long totalarity = 1;
+
+  if (env)
+    verbose = atoi(env);
+
+  /* default values before we add root attributes */
+  data->level[0].totalwidth = 1;
+  data->level[0].type = HWLOC_OBJ_MACHINE;
+  data->level[0].index_string = NULL;
+  data->level[0].index_array = NULL;
+  data->level[0].memorysize = 0;
+  if (*description == '(') {
+    err = hwloc_synthetic_parse_level_attrs(description+1, &description, &data->level[0], verbose);
+    if (err < 0)
+      return err;
+  }
+
+  /* count is the index of the level being parsed, level 0 is the root */
+  for (pos = description, count = 1; *pos; pos = next_pos) {
+#define HWLOC_OBJ_TYPE_UNKNOWN ((hwloc_obj_type_t) -1)
+    hwloc_obj_type_t type = HWLOC_OBJ_TYPE_UNKNOWN;
+    int typedepth = -1;
+    hwloc_obj_cache_type_t cachetype = (hwloc_obj_cache_type_t) -1;
+
+    /* initialize parent arity to 0 so that the levels are not infinite */
+    data->level[count-1].arity = 0;
+
+    while (*pos == ' ')
+      pos++;
+
+    if (!*pos)
+      break;
+
+    /* optional "type:" prefix before the number of objects */
+    if (*pos < '0' || *pos > '9') {
+      if (hwloc_obj_type_sscanf(pos, &type, &typedepth, &cachetype, sizeof(cachetype)) < 0) {
+        if (verbose)
+          fprintf(stderr, "Synthetic string with unknown object type at '%s'\n", pos);
+        errno = EINVAL;
+        goto error;
+      }
+      if (type == HWLOC_OBJ_MISC || type == HWLOC_OBJ_BRIDGE || type == HWLOC_OBJ_PCI_DEVICE || type == HWLOC_OBJ_OS_DEVICE) {
+        if (verbose)
+          fprintf(stderr, "Synthetic string with disallowed object type at '%s'\n", pos);
+        errno = EINVAL;
+        goto error;
+      }
+
+      next_pos = strchr(pos, ':');
+      if (!next_pos) {
+        if (verbose)
+          fprintf(stderr,"Synthetic string doesn't have a `:' after object type at '%s'\n", pos);
+        errno = EINVAL;
+        goto error;
+      }
+      pos = next_pos + 1;
+    }
+    data->level[count].type = type;
+    data->level[count].depth = (unsigned) typedepth;
+    data->level[count].cachetype = cachetype;
+
+    item = strtoul(pos, (char **)&next_pos, 0);
+    if (next_pos == pos) {
+      if (verbose)
+        fprintf(stderr,"Synthetic string doesn't have a number of objects at '%s'\n", pos);
+      errno = EINVAL;
+      goto error;
+    }
+    /* A zero count would give zero total widths (later used as divisors when
+     * processing indexes) and would truncate every walk that stops at the
+     * first level whose arity is 0.
+     */
+    if (!item) {
+      if (verbose)
+        fprintf(stderr,"Synthetic string with disallowed 0 number of objects at '%s'\n", pos);
+      errno = EINVAL;
+      goto error;
+    }
+    data->level[count-1].arity = (unsigned)item;
+
+    totalarity *= item;
+    data->level[count].totalwidth = totalarity;
+    data->level[count].index_string = NULL;
+    data->level[count].index_array = NULL;
+    data->level[count].memorysize = 0;
+    /* data is not zero-initialized by its allocator: terminate the list of
+     * levels here so that the cleanup loop under the error label never reads
+     * an uninitialized arity.
+     */
+    data->level[count].arity = 0;
+    if (*next_pos == '(') {
+      err = hwloc_synthetic_parse_level_attrs(next_pos+1, &next_pos, &data->level[count], verbose);
+      if (err < 0)
+        goto error;
+    }
+
+    if (count + 1 >= HWLOC_SYNTHETIC_MAX_DEPTH) {
+      if (verbose)
+        fprintf(stderr,"Too many synthetic levels, max %d\n", HWLOC_SYNTHETIC_MAX_DEPTH);
+      errno = EINVAL;
+      goto error;
+    }
+    if (item > UINT_MAX) {
+      if (verbose)
+        fprintf(stderr,"Too big arity, max %u\n", UINT_MAX);
+      errno = EINVAL;
+      goto error;
+    }
+
+    count++;
+  }
+
+  /* count starts at 1 and only grows, so there is always at least the root
+   * here; a description without any level is rejected below because it has
+   * no PU level.
+   */
+
+  /* walk the levels bottom-up to fill missing types and count special levels */
+  for(i=count-1; i>0; i--) {
+    struct hwloc_synthetic_level_data_s *curlevel = &data->level[i];
+    hwloc_obj_type_t type;
+
+    type = curlevel->type;
+
+    if (type == HWLOC_OBJ_TYPE_UNKNOWN) {
+      if (i == count-1)
+        type = HWLOC_OBJ_PU;
+      else {
+        /* deduce the type from the level right below */
+        switch (data->level[i+1].type) {
+        case HWLOC_OBJ_PU: type = HWLOC_OBJ_CORE; break;
+        case HWLOC_OBJ_CORE: type = HWLOC_OBJ_CACHE; break;
+        case HWLOC_OBJ_CACHE: type = HWLOC_OBJ_PACKAGE; break;
+        case HWLOC_OBJ_PACKAGE: type = HWLOC_OBJ_NUMANODE; break;
+        case HWLOC_OBJ_NUMANODE:
+        case HWLOC_OBJ_MACHINE:
+        case HWLOC_OBJ_GROUP: type = HWLOC_OBJ_GROUP; break;
+        default:
+          assert(0);
+        }
+      }
+      curlevel->type = type;
+    }
+    switch (type) {
+      case HWLOC_OBJ_PU:
+        nb_pu_levels++;
+        break;
+      case HWLOC_OBJ_CACHE:
+        cache_depth++;
+        break;
+      case HWLOC_OBJ_GROUP:
+        group_depth++;
+        break;
+      case HWLOC_OBJ_NUMANODE:
+        nb_node_levels++;
+        break;
+      case HWLOC_OBJ_MACHINE:
+        nb_machine_levels++;
+        break;
+      default:
+        break;
+    }
+  }
+
+  /* no index_array has been allocated yet, returning directly is enough */
+  if (!nb_pu_levels) {
+    if (verbose)
+      fprintf(stderr, "Synthetic string missing ending number of PUs\n");
+    errno = EINVAL;
+    return -1;
+  }
+  if (nb_pu_levels > 1) {
+    if (verbose)
+      fprintf(stderr, "Synthetic string can not have several PU levels\n");
+    errno = EINVAL;
+    return -1;
+  }
+  if (nb_node_levels > 1) {
+    if (verbose)
+      fprintf(stderr, "Synthetic string can not have several NUMA node levels\n");
+    errno = EINVAL;
+    return -1;
+  }
+  if (nb_machine_levels > 1) {
+    if (verbose)
+      fprintf(stderr, "Synthetic string can not have several machine levels\n");
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* an explicit machine level turns the root into a system object */
+  if (nb_machine_levels)
+    data->level[0].type = HWLOC_OBJ_SYSTEM;
+  else {
+    data->level[0].type = HWLOC_OBJ_MACHINE;
+    nb_machine_levels++;
+  }
+
+  /* enforce a NUMA level */
+  if (!nb_node_levels) {
+    /* insert a NUMA level and the machine level */
+    if (data->level[1].type == HWLOC_OBJ_MACHINE)
+      /* there's an explicit machine level after the automatic system root, insert below both */
+      i = 2;
+    else
+      /* insert below the automatic machine root */
+      i = 1;
+    if (verbose)
+      fprintf(stderr, "Inserting a NUMA level with a single object at depth %u\n", i);
+    /* move existing levels by one: levels i..count-1 become i+1..count */
+    memmove(&data->level[i+1], &data->level[i], (count-i)*sizeof(struct hwloc_synthetic_level_data_s));
+    data->level[i].type = HWLOC_OBJ_NUMANODE;
+    data->level[i].index_string = NULL;
+    data->level[i].index_array = NULL;
+    data->level[i].memorysize = 0;
+    data->level[i].totalwidth = data->level[i-1].totalwidth;
+    /* update arity to insert a single NUMA node per parent */
+    data->level[i].arity = data->level[i-1].arity;
+    data->level[i-1].arity = 1;
+    count++;
+  }
+
+  if (cache_depth == 1)
+    /* if there is a single cache level, make it L2 */
+    cache_depth = 2;
+
+  /* fill default depths, cache types and memory sizes, then process indexes */
+  for (i=0; i<count; i++) {
+    struct hwloc_synthetic_level_data_s *curlevel = &data->level[i];
+    hwloc_obj_type_t type = curlevel->type;
+
+    if (type == HWLOC_OBJ_GROUP) {
+      if (curlevel->depth == (unsigned)-1)
+        curlevel->depth = group_depth--;
+
+    } else if (type == HWLOC_OBJ_CACHE) {
+      if (curlevel->depth == (unsigned)-1)
+        curlevel->depth = cache_depth--;
+      if (curlevel->cachetype == (hwloc_obj_cache_type_t) -1)
+        curlevel->cachetype = curlevel->depth == 1 ? HWLOC_OBJ_CACHE_DATA : HWLOC_OBJ_CACHE_UNIFIED;
+      if (!curlevel->memorysize) {
+        if (1 == curlevel->depth)
+          /* 32Kb in L1 */
+          curlevel->memorysize = 32*1024;
+        else
+          /* *4 at each level, which gives 4MB for L2;
+           * shift a 64bit value so that deep caches do not overflow an int
+           */
+          curlevel->memorysize = ((hwloc_uint64_t) 256*1024) << (2*curlevel->depth);
+      }
+
+    } else if (type == HWLOC_OBJ_NUMANODE && !curlevel->memorysize) {
+      /* 1GB in memory nodes. */
+      curlevel->memorysize = 1024*1024*1024;
+    }
+
+    hwloc_synthetic_process_level_indexes(data, i, verbose);
+  }
+
+  data->string = strdup(description);
+  data->level[count-1].arity = 0;
+  return 0;
+
+ error:
+  /* release index arrays up to and including the first level without children */
+  for(i=0; i<HWLOC_SYNTHETIC_MAX_DEPTH; i++) {
+    struct hwloc_synthetic_level_data_s *curlevel = &data->level[i];
+    free(curlevel->index_array);
+    if (!curlevel->arity)
+      break;
+  }
+  return -1;
+}
+
+/* Fill the attributes of obj that come from its synthetic level:
+ * - the depth of group objects,
+ * - the depth, type, size and a 64-byte line size for cache objects,
+ * - for non-cache objects whose level has a memory size, the local memory
+ *   together with a single page type made of 4096-byte pages.
+ * If the page type array cannot be allocated, the local memory is still set
+ * and the object is left without page types.
+ */
+static void
+hwloc_synthetic__post_look_hooks(struct hwloc_synthetic_level_data_s *curlevel,
+                                 hwloc_obj_t obj)
+{
+  switch (obj->type) {
+  case HWLOC_OBJ_GROUP:
+    obj->attr->group.depth = curlevel->depth;
+    break;
+  case HWLOC_OBJ_SYSTEM:
+    break;
+  case HWLOC_OBJ_MACHINE:
+    break;
+  case HWLOC_OBJ_NUMANODE:
+    break;
+  case HWLOC_OBJ_PACKAGE:
+    break;
+  case HWLOC_OBJ_CACHE:
+    obj->attr->cache.depth = curlevel->depth;
+    obj->attr->cache.linesize = 64;
+    obj->attr->cache.type = curlevel->cachetype;
+    obj->attr->cache.size = curlevel->memorysize;
+    break;
+  case HWLOC_OBJ_CORE:
+    break;
+  case HWLOC_OBJ_PU:
+    break;
+  case HWLOC_OBJ_BRIDGE:
+  case HWLOC_OBJ_PCI_DEVICE:
+  case HWLOC_OBJ_OS_DEVICE:
+  case HWLOC_OBJ_MISC:
+  case HWLOC_OBJ_TYPE_MAX:
+    /* Should never happen */
+    assert(0);
+    break;
+  }
+  if (curlevel->memorysize && HWLOC_OBJ_CACHE != obj->type) {
+    obj->memory.local_memory = curlevel->memorysize;
+    obj->memory.page_types = malloc(sizeof(*obj->memory.page_types));
+    if (obj->memory.page_types) {
+      memset(obj->memory.page_types, 0, sizeof(*obj->memory.page_types));
+      obj->memory.page_types_len = 1;
+      obj->memory.page_types[0].size = 4096;
+      obj->memory.page_types[0].count = curlevel->memorysize / 4096;
+    } else {
+      /* do not advertise a page type that could not be allocated */
+      obj->memory.page_types_len = 0;
+    }
+  }
+}
+
+/*
+ * Recursively build one object of the given synthetic level and its subtree.
+ * - level is the index in data->level[] of the level the object belongs to.
+ * - the next_os_index counter of the level gives the rank of the new object;
+ *   the rank is translated through index_array when the level has one, and
+ *   the result is used as OS index.
+ * - objects of the last level (arity 0) set their own OS index in their
+ *   cpuset, other objects get the union of their children's cpusets.
+ * - the cpuset of the new object is OR'ed into parent_cpuset.
+ * - the new object is inserted in topology by cpuset after its children.
+ */
+static void
+hwloc__look_synthetic(struct hwloc_topology *topology,
+                      struct hwloc_synthetic_backend_data_s *data,
+                      int level,
+                      hwloc_bitmap_t parent_cpuset)
+{
+  struct hwloc_synthetic_level_data_s *curlevel = &data->level[level];
+  hwloc_obj_type_t type = curlevel->type;
+  hwloc_obj_t obj;
+  unsigned os_index;
+  unsigned child;
+
+  /* pre-hooks: nothing to prepare for the types that may appear here */
+  switch (type) {
+    case HWLOC_OBJ_GROUP:
+    case HWLOC_OBJ_MACHINE:
+    case HWLOC_OBJ_NUMANODE:
+    case HWLOC_OBJ_PACKAGE:
+    case HWLOC_OBJ_CACHE:
+    case HWLOC_OBJ_CORE:
+    case HWLOC_OBJ_PU:
+      break;
+    case HWLOC_OBJ_SYSTEM:
+    case HWLOC_OBJ_BRIDGE:
+    case HWLOC_OBJ_PCI_DEVICE:
+    case HWLOC_OBJ_OS_DEVICE:
+    case HWLOC_OBJ_MISC:
+    case HWLOC_OBJ_TYPE_MAX:
+      /* Should never happen */
+      assert(0);
+      break;
+  }
+
+  /* take the next rank in this level, optionally remapped by the indexes */
+  os_index = curlevel->next_os_index++;
+  if (curlevel->index_array)
+    os_index = curlevel->index_array[os_index];
+
+  obj = hwloc_alloc_setup_object(type, os_index);
+  obj->cpuset = hwloc_bitmap_alloc();
+
+  if (curlevel->arity) {
+    /* children accumulate their cpusets into ours */
+    for (child = 0; child < curlevel->arity; child++)
+      hwloc__look_synthetic(topology, data, level + 1, obj->cpuset);
+  } else {
+    hwloc_bitmap_set(obj->cpuset, os_index);
+  }
+
+  if (type == HWLOC_OBJ_NUMANODE) {
+    obj->nodeset = hwloc_bitmap_alloc();
+    hwloc_bitmap_set(obj->nodeset, os_index);
+  }
+
+  hwloc_bitmap_or(parent_cpuset, parent_cpuset, obj->cpuset);
+
+  hwloc_synthetic__post_look_hooks(curlevel, obj);
+
+  hwloc_insert_object_by_cpuset(topology, obj);
+}
+
+/* Discovery callback of the synthetic backend.
+ * Sets up the root object from level 0 of the parsed description, builds
+ * one subtree per child of the root with hwloc__look_synthetic(), and
+ * records the "Backend" and "SyntheticDescription" info on the root.
+ * Always returns 1.
+ */
+static int
+hwloc_look_synthetic(struct hwloc_backend *backend)
+{
+  struct hwloc_topology *topology = backend->topology;
+  struct hwloc_synthetic_backend_data_s *data = backend->private_data;
+  hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();
+  unsigned i;
+
+  /* the root must not have been filled by anybody else */
+  assert(!topology->levels[0][0]->cpuset);
+
+  hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+
+  topology->support.discovery->pu = 1;
+
+  /* start with os_index 0 for each level,
+   * up to and including the first level without children
+   */
+  i = 0;
+  do {
+    data->level[i].next_os_index = 0;
+  } while (data->level[i++].arity > 0);
+
+  /* update first level type according to the synthetic type array */
+  topology->levels[0][0]->type = data->level[0].type;
+  hwloc_synthetic__post_look_hooks(&data->level[0], topology->levels[0][0]);
+
+  /* cpuset only collects the children cpusets, it is dropped right after */
+  for (i = 0; i < data->level[0].arity; i++)
+    hwloc__look_synthetic(topology, data, 1, cpuset);
+
+  hwloc_bitmap_free(cpuset);
+
+  hwloc_obj_add_info(topology->levels[0][0], "Backend", "Synthetic");
+  hwloc_obj_add_info(topology->levels[0][0], "SyntheticDescription", data->string);
+  return 1;
+}
+
+/* Release the private data of a synthetic backend: the index array of each
+ * level up to and including the first level whose arity is 0 (at most
+ * HWLOC_SYNTHETIC_MAX_DEPTH levels), the description string and the data
+ * structure itself.
+ */
+static void
+hwloc_synthetic_backend_disable(struct hwloc_backend *backend)
+{
+  struct hwloc_synthetic_backend_data_s *data = backend->private_data;
+  unsigned depth = 0;
+
+  do {
+    free(data->level[depth].index_array);
+    /* stop after the level without children, or at the end of the array */
+  } while (data->level[depth].arity && ++depth < HWLOC_SYNTHETIC_MAX_DEPTH);
+
+  free(data->string);
+  free(data);
+}
+
+/* Create a synthetic backend for the given component.
+ * _data1 is the synthetic description string; when it is NULL, the
+ * HWLOC_SYNTHETIC environment variable is used instead.  _data2 and _data3
+ * are unused.
+ * Returns the new backend, or NULL on error: errno is EINVAL when no
+ * description is available and ENOMEM when the private data cannot be
+ * allocated; other failures come from hwloc_backend_alloc() and
+ * hwloc_backend_synthetic_init().
+ */
+static struct hwloc_backend *
+hwloc_synthetic_component_instantiate(struct hwloc_disc_component *component,
+                                      const void *_data1,
+                                      const void *_data2 __hwloc_attribute_unused,
+                                      const void *_data3 __hwloc_attribute_unused)
+{
+  const char *description = (const char *) _data1;
+  struct hwloc_backend *backend;
+  struct hwloc_synthetic_backend_data_s *data;
+
+  if (!description) {
+    /* 'synthetic' was given in HWLOC_COMPONENTS without a description */
+    description = getenv("HWLOC_SYNTHETIC");
+    if (!description) {
+      errno = EINVAL;
+      return NULL;
+    }
+  }
+
+  backend = hwloc_backend_alloc(component);
+  if (!backend)
+    return NULL;
+
+  data = malloc(sizeof(*data));
+  if (!data) {
+    errno = ENOMEM;
+    free(backend);
+    return NULL;
+  }
+
+  if (hwloc_backend_synthetic_init(data, description) < 0) {
+    free(data);
+    free(backend);
+    return NULL;
+  }
+
+  backend->private_data = data;
+  backend->discover = hwloc_look_synthetic;
+  backend->disable = hwloc_synthetic_backend_disable;
+  backend->is_thissystem = 0;
+
+  return backend;
+}
+
+/* Discovery component descriptor of the synthetic backend, referenced by
+ * hwloc_synthetic_component.  The initializer is positional: the meaning of
+ * each value is given by struct hwloc_disc_component, which is declared
+ * elsewhere (presumably type, name, excluded types, instantiate callback,
+ * priority and a list link -- to be confirmed against that declaration).
+ */
+static struct hwloc_disc_component hwloc_synthetic_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  "synthetic",
+  ~0,
+  hwloc_synthetic_component_instantiate,
+  30,
+  NULL
+};
+
+/* Exported component descriptor wrapping hwloc_synthetic_disc_component as a
+ * HWLOC_COMPONENT_TYPE_DISC component.  The initializer is positional: see
+ * struct hwloc_component (declared elsewhere) for the meaning of each value.
+ */
+const struct hwloc_component hwloc_synthetic_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL,
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_synthetic_disc_component
+};
+
+/* Write the value of an "indexes=" attribute, including the closing ')',
+ * for the level of obj.
+ *
+ * The OS indexes of the level are first matched against an interleaving made
+ * of nested loops; on success they are written as "step*nb" items separated
+ * by ':'.  Otherwise all OS indexes are written as a comma-separated list,
+ * walking the next_cousin links from obj (so obj is presumably the first
+ * object of its level -- to be confirmed against callers).
+ *
+ * At most buflen bytes are written to buffer.  Returns the sum of the
+ * lengths reported by hwloc_snprintf() for each item, which may exceed what
+ * fit in buffer, or -1 if formatting failed.
+ */
+static int hwloc_topology_export_synthetic_indexes(struct hwloc_topology * topology,
+                                                   hwloc_obj_t obj,
+                                                   char *buffer, size_t buflen)
+{
+  unsigned depth = obj->depth;
+  unsigned total = topology->level_nbobjects[depth];
+  unsigned step = 1;
+  unsigned nr_loops = 0;
+  struct hwloc_synthetic_intlv_loop_s *loops = NULL, *newloops;
+  hwloc_obj_t cur;
+  unsigned i, j;
+  ssize_t tmplen = buflen;
+  char *tmp = buffer;
+  int res, ret = 0;
+
+  /* must start with 0 */
+  if (obj->os_index)
+    goto exportall;
+
+  while (step != total) {
+    /* must be a divider of the total */
+    if (total % step)
+      goto exportall;
+
+    /* look for os_index == step */
+    for(i=1; i<total; i++)
+      if (topology->levels[depth][i]->os_index == step)
+        break;
+    if (i == total)
+      goto exportall;
+    /* count how many multiples of step follow every i objects */
+    for(j=2; j<total/i; j++)
+      if (topology->levels[depth][i*j]->os_index != step*j)
+        break;
+
+    /* grow through a temporary pointer so that the array already built is
+     * not lost (and is still released below) when realloc() fails
+     */
+    newloops = realloc(loops, (nr_loops+1)*sizeof(*loops));
+    if (!newloops)
+      goto exportall;
+    loops = newloops;
+    loops[nr_loops].step = i;
+    loops[nr_loops].nb = j;
+    nr_loops++;
+    step *= j;
+  }
+
+  /* check this interleaving */
+  for(i=0; i<total; i++) {
+    unsigned ind = 0;
+    unsigned mul = 1;
+    for(j=0; j<nr_loops; j++) {
+      ind += (i / loops[j].step) % loops[j].nb * mul;
+      mul *= loops[j].nb;
+    }
+    if (topology->levels[depth][i]->os_index != ind)
+      goto exportall;
+  }
+
+  /* success, print it */
+  for(j=0; j<nr_loops; j++) {
+    res = hwloc_snprintf(tmp, tmplen, "%u*%u%s", loops[j].step, loops[j].nb,
+                         j == nr_loops-1 ? ")" : ":");
+    if (res < 0) {
+      free(loops);
+      return -1;
+    }
+    ret += res;
+    /* never advance past the end of the buffer when the output is truncated */
+    if (res >= tmplen)
+      res = tmplen>0 ? tmplen - 1 : 0;
+    tmp += res;
+    tmplen -= res;
+  }
+
+  free(loops);
+
+  return ret;
+
+ exportall:
+  free(loops);
+
+  /* dump all indexes */
+  cur = obj;
+  while (cur) {
+    res = hwloc_snprintf(tmp, tmplen, "%u%s", cur->os_index,
+                         cur->next_cousin ? "," : ")");
+    if (res < 0)
+      return -1;
+    ret += res;
+    if (res >= tmplen)
+      res = tmplen>0 ? tmplen - 1 : 0;
+    tmp += res;
+    tmplen -= res;
+    cur = cur->next_cousin;
+  }
+  return ret;
+}
+
+/* Write the "(...)" attribute list of the level of obj, if it needs one:
+ * - "size=" for cache objects with a non-zero cache size,
+ * - "memory=" for objects with local memory,
+ * - "indexes=" for PU and NUMA node levels in which at least one object,
+ *   following next_cousin from obj, has an OS index that differs from its
+ *   logical index.
+ * Writes nothing and returns 0 when no attribute applies.  Otherwise returns
+ * the sum of the lengths reported for each part, which may exceed what fit
+ * in the buflen bytes of buffer, or -1 if formatting failed.
+ */
+static int hwloc_topology_export_synthetic_obj_attr(struct hwloc_topology * topology,
+                                                    hwloc_obj_t obj,
+                                                    char *buffer, size_t buflen)
+{
+  const char * separator = " ";
+  const char * prefix = "(";
+  char cachesize[64] = "";
+  char memsize[64] = "";
+  int needindexes = 0;
+  ssize_t remaining = buflen;
+  char *out = buffer;
+  int written, total = 0;
+  hwloc_obj_t sibling;
+
+  /* the first attribute opens the bracket, the next ones are space-separated */
+  if (HWLOC_OBJ_CACHE == obj->type && obj->attr->cache.size) {
+    snprintf(cachesize, sizeof(cachesize), "%ssize=%llu",
+             prefix, (unsigned long long) obj->attr->cache.size);
+    prefix = separator;
+  }
+  if (obj->memory.local_memory) {
+    snprintf(memsize, sizeof(memsize), "%smemory=%llu",
+             prefix, (unsigned long long) obj->memory.local_memory);
+    prefix = separator;
+  }
+  if (obj->type == HWLOC_OBJ_PU || obj->type == HWLOC_OBJ_NUMANODE) {
+    for (sibling = obj; sibling; sibling = sibling->next_cousin) {
+      if (sibling->os_index != sibling->logical_index) {
+        needindexes = 1;
+        break;
+      }
+    }
+  }
+
+  if (cachesize[0] == '\0' && memsize[0] == '\0' && !needindexes)
+    return 0;
+
+  /* sizes first; the bracket is closed here unless indexes follow */
+  written = hwloc_snprintf(out, remaining, "%s%s%s", cachesize, memsize, needindexes ? "" : ")");
+  if (written < 0)
+    return -1;
+  total += written;
+  if (written >= remaining)
+    written = remaining>0 ? remaining - 1 : 0;
+  out += written;
+  remaining -= written;
+
+  if (!needindexes)
+    return total;
+
+  written = snprintf(out, remaining, "%sindexes=", prefix);
+  if (written < 0)
+    return -1;
+  total += written;
+  if (written >= remaining)
+    written = remaining>0 ? remaining - 1 : 0;
+  out += written;
+  remaining -= written;
+
+  /* the indexes writer also emits the closing bracket */
+  written = hwloc_topology_export_synthetic_indexes(topology, obj, out, remaining);
+  if (written < 0)
+    return -1;
+  total += written;
+
+  return total;
+}
+
+/* Export topology as a synthetic description string into buffer.
+ *
+ * flags may combine
+ * HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES (print the names
+ * given by hwloc_obj_type_string() instead of hwloc_obj_type_snprintf()) and
+ * HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS (do not print attributes).
+ * One "type:arity" item is written per level, following the first child of
+ * each level, with a space between items.
+ *
+ * At most buflen bytes are written.  Returns the sum of the lengths reported
+ * for each part, which may exceed what fit in buffer, or -1 on error; errno
+ * is set to EINVAL for unknown flags and when the root object does not have
+ * symmetric_subtree set.
+ */
+int
+hwloc_topology_export_synthetic(struct hwloc_topology * topology,
+                                char *buffer, size_t buflen,
+                                unsigned long flags)
+{
+  hwloc_obj_t obj = hwloc_get_root_obj(topology);
+  ssize_t tmplen = buflen;
+  char *tmp = buffer;
+  int res, ret = 0;
+  /* unsigned because it is printed with %u and tested against zero only */
+  unsigned arity;
+  const char * separator = " ";
+  const char * prefix = "";
+
+  if (flags & ~(HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES|HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* TODO: add a flag to ignore symmetric_subtree and I/Os.
+   * just assume things are symmetric with the left branches of the tree.
+   * but the number of objects per level may be wrong, what to do with OS index array in this case?
+   * only allow ignoring symmetric_subtree if the level width remains OK?
+   */
+
+  /* TODO: add a root object by default, with a prefix such as tree=
+   * so that we can backward-compatibly recognize whether there's a root or not.
+   * and add a flag to disable it.
+   */
+
+  /* TODO: flag to force all indexes, not only for PU and NUMA? */
+
+  if (!obj->symmetric_subtree) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (!(flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)) {
+    /* root attributes */
+    res = hwloc_topology_export_synthetic_obj_attr(topology, obj, tmp, tmplen);
+    if (res < 0)
+      return -1;
+    ret += res;
+    /* separate the first level from the root attributes if any were written */
+    if (ret > 0)
+      prefix = separator;
+    if (res >= tmplen)
+      res = tmplen>0 ? tmplen - 1 : 0;
+    tmp += res;
+    tmplen -= res;
+  }
+
+  arity = obj->arity;
+  while (arity) {
+    /* for each level */
+    obj = obj->first_child;
+    if (flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES) {
+      res = hwloc_snprintf(tmp, tmplen, "%s%s:%u", prefix, hwloc_obj_type_string(obj->type), arity);
+    } else {
+      char types[64];
+      hwloc_obj_type_snprintf(types, sizeof(types), obj, 1);
+      res = hwloc_snprintf(tmp, tmplen, "%s%s:%u", prefix, types, arity);
+    }
+    if (res < 0)
+      return -1;
+    ret += res;
+    /* never advance past the end of the buffer when the output is truncated */
+    if (res >= tmplen)
+      res = tmplen>0 ? tmplen - 1 : 0;
+    tmp += res;
+    tmplen -= res;
+
+    if (!(flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)) {
+      /* obj attributes */
+      res = hwloc_topology_export_synthetic_obj_attr(topology, obj, tmp, tmplen);
+      if (res < 0)
+        return -1;
+      ret += res;
+      if (res >= tmplen)
+        res = tmplen>0 ? tmplen - 1 : 0;
+      tmp += res;
+      tmplen -= res;
+    }
+
+    /* next level */
+    prefix = separator;
+    arity = obj->arity;
+  }
+
+  return ret;
+}
diff --git a/ext/hwloc/hwloc/topology-x86.c b/ext/hwloc/hwloc/topology-x86.c
new file mode 100644
index 0000000..1234ce4
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-x86.c
@@ -0,0 +1,1386 @@
+/*
+ * Copyright © 2010-2015 Inria.  All rights reserved.
+ * Copyright © 2010-2013 Université Bordeaux
+ * Copyright © 2010-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ *
+ *
+ * This backend is only used when the operating system does not export
+ * the necessary hardware topology information to user-space applications.
+ * Currently, only the FreeBSD backend relies on this x86 backend.
+ *
+ * Other backends such as Linux have their own way to retrieve various
+ * pieces of hardware topology information from the operating system
+ * on various architectures, without having to use this x86-specific code.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+#include <private/misc.h>
+
+#include <private/cpuid-x86.h>
+
+#include <sys/types.h>
+#include <dirent.h>
+
+/* Private data of the x86 backend (look_proc() gets it from
+ * backend->private_data).  The field notes below are inferred from names
+ * only, their users are outside this part of the file -- to be confirmed.
+ */
+struct hwloc_x86_backend_data_s {
+  unsigned nbprocs; /* presumably the number of processors to look at */
+  hwloc_bitmap_t apicid_set; /* presumably the APIC ids seen so far */
+  int apicid_unique; /* presumably whether all APIC ids seen were distinct */
+  char *src_cpuiddump_path; /* presumably the directory read by cpuiddump_read(), if any */
+};
+
+/************************************
+ * Management of cpuid dump as input
+ */
+
+/* In-memory copy of the cpuid dump of one PU, as loaded by cpuiddump_read().
+ * Each entry maps input register values to the output register values that
+ * cpuiddump_find_by_input() returns for them.
+ */
+struct cpuiddump {
+  unsigned nr; /* number of valid entries; entries is only freed when this is non-zero */
+  struct cpuiddump_entry {
+    unsigned inmask; /* which of ine[abcd]x are set on input */
+                     /* 0x1 = eax, 0x2 = ebx, 0x4 = ecx, 0x8 = edx; unset registers match anything */
+    unsigned ineax;
+    unsigned inebx;
+    unsigned inecx;
+    unsigned inedx;
+    /* register values reported when the inputs above match */
+    unsigned outeax;
+    unsigned outebx;
+    unsigned outecx;
+    unsigned outedx;
+  } *entries;
+};
+
+/* Release a dump returned by cpuiddump_read(). */
+static void
+cpuiddump_free(struct cpuiddump *cpuiddump)
+{
+  /* entries is not initialized in dumps that cpuiddump_read() returned
+   * early with nr == 0, so it must only be touched for non-empty dumps
+   */
+  if (cpuiddump->nr != 0) {
+    free(cpuiddump->entries);
+  }
+
+  free(cpuiddump);
+}
+
+/* Load the cpuid dump of PU #idx from the file "<dirpath>/pu<idx>".
+ *
+ * Lines starting with '#' are skipped.  Other lines are expected to contain
+ * nine hexadecimal numbers as
+ *   inmask ineax inebx inecx inedx => outeax outebx outecx outedx
+ * and are ignored when they do not.
+ *
+ * Returns a dump to release with cpuiddump_free().  When the file cannot be
+ * opened, contains no usable line, or a buffer cannot be allocated, the
+ * returned dump has nr == 0, so lookups in it match nothing.
+ */
+static struct cpuiddump *
+cpuiddump_read(const char *dirpath, unsigned idx)
+{
+  struct cpuiddump *cpuiddump;
+  struct cpuiddump_entry *entries, *cur;
+  char *filename;
+  size_t filenamelen = strlen(dirpath) + 15; /* room for "/pu", the index digits and the final \0 */
+  FILE *file;
+  char line[128];
+  unsigned nr;
+
+  /* NOTE(review): this allocation is still unchecked.  cpuiddump_free()
+   * dereferences its argument and cpuid_or_from_dump() takes a NULL dump as
+   * a request for the native cpuid instruction, so returning NULL here is
+   * not safe without reviewing the callers.
+   */
+  cpuiddump = malloc(sizeof(*cpuiddump));
+  cpuiddump->nr = 0; /* return a cpuiddump that will raise errors because it matches nothing */
+  cpuiddump->entries = NULL;
+
+  filename = malloc(filenamelen);
+  if (!filename) {
+    fprintf(stderr, "Could not allocate the dumped cpuid filename for PU #%u\n", idx);
+    return cpuiddump;
+  }
+  snprintf(filename, filenamelen, "%s/pu%u", dirpath, idx);
+  file = fopen(filename, "r");
+  if (!file) {
+    fprintf(stderr, "Could not read dumped cpuid file %s\n", filename);
+    free(filename);
+    return cpuiddump;
+  }
+  free(filename);
+
+  /* First pass: count the fgets() chunks.  This is an upper bound of the
+   * number of entries since the second pass reads the very same chunks and
+   * stores at most one entry per chunk.
+   */
+  nr = 0;
+  while (fgets(line, sizeof(line), file))
+    nr++;
+  if (!nr) {
+    fclose(file);
+    return cpuiddump;
+  }
+  entries = malloc(nr * sizeof(struct cpuiddump_entry));
+  if (!entries) {
+    fprintf(stderr, "Could not allocate %u dumped cpuid entries for PU #%u\n", nr, idx);
+    fclose(file);
+    return cpuiddump;
+  }
+
+  /* second pass: parse the entries */
+  fseek(file, 0, SEEK_SET);
+  cur = &entries[0];
+  nr = 0;
+  while (fgets(line, sizeof(line), file)) {
+    if (*line == '#')
+      continue;
+    if (sscanf(line, "%x %x %x %x %x => %x %x %x %x",
+              &cur->inmask,
+              &cur->ineax, &cur->inebx, &cur->inecx, &cur->inedx,
+              &cur->outeax, &cur->outebx, &cur->outecx, &cur->outedx) == 9) {
+      cur++;
+      nr++;
+    }
+  }
+  fclose(file);
+
+  if (!nr) {
+    /* cpuiddump_free() ignores entries of empty dumps, release them here */
+    free(entries);
+    return cpuiddump;
+  }
+
+  cpuiddump->entries = entries;
+  cpuiddump->nr = nr;
+  return cpuiddump;
+}
+
+/* Emulate a cpuid call from a dump: look for the first entry whose input
+ * registers selected by inmask equal *eax, *ebx, *ecx and *edx, and replace
+ * the four values with the outputs of that entry.  When no entry matches, a
+ * message is printed on stderr and the four values are set to 0.
+ */
+static void
+cpuiddump_find_by_input(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx, struct cpuiddump *cpuiddump)
+{
+  unsigned idx = 0;
+
+  while (idx < cpuiddump->nr) {
+    const struct cpuiddump_entry *candidate = &cpuiddump->entries[idx++];
+    const unsigned mask = candidate->inmask;
+
+    /* a register is only compared (and read) when its mask bit is set */
+    if (((mask & 0x1) && candidate->ineax != *eax)
+        || ((mask & 0x2) && candidate->inebx != *ebx)
+        || ((mask & 0x4) && candidate->inecx != *ecx)
+        || ((mask & 0x8) && candidate->inedx != *edx))
+      continue;
+
+    *eax = candidate->outeax;
+    *ebx = candidate->outebx;
+    *ecx = candidate->outecx;
+    *edx = candidate->outedx;
+    return;
+  }
+
+  fprintf(stderr, "Couldn't find %x,%x,%x,%x in dumped cpuid, returning 0s.\n",
+          *eax, *ebx, *ecx, *edx);
+  *eax = *ebx = *ecx = *edx = 0;
+}
+
+/* Run cpuid on the given registers: through hwloc_x86_cpuid() when
+ * src_cpuiddump is NULL, by lookup in the dump otherwise.
+ */
+static void cpuid_or_from_dump(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx, struct cpuiddump *src_cpuiddump)
+{
+  if (!src_cpuiddump) {
+    hwloc_x86_cpuid(eax, ebx, ecx, edx);
+    return;
+  }
+
+  cpuiddump_find_by_input(eax, ebx, ecx, edx, src_cpuiddump);
+}
+
+/*******************************
+ * Core detection routines and structures
+ */
+
+/* Feature tests on an array of unsigned feature words (its layout is
+ * defined by the code that fills it, outside this part of the file):
+ * has_topoext() tests bit 22 of word 6, has_x2apic() tests bit 21 of word 4.
+ */
+#define has_topoext(features) ((features)[6] & (1 << 22))
+#define has_x2apic(features) ((features)[4] & (1 << 21))
+
+/* Description of one cache of a processor, stored in procinfo.cache[]. */
+struct cacheinfo {
+  unsigned type; /* cache type, stored as given by the detection code */
+  unsigned level; /* cache level, fill_amd_cache() handles 1 to 3 */
+  unsigned nbthreads_sharing; /* number of threads sharing this cache */
+
+  unsigned linesize;
+  unsigned linepart;
+  int ways; /* -1 when fully associative */
+  unsigned sets;
+  unsigned long size; /* in bytes (fill_amd_cache() prints size >> 10 as KB) */
+  char inclusiveness; /* 0 or 1 */
+
+};
+
+/* Information gathered by look_proc() about one processor.
+ * According to look_proc(), packageid, nodeid, unitid, coreid and threadid
+ * are set to an id or to -1, while max_log_proc, max_nbcores, max_nbthreads
+ * and logprocid are only used temporarily during the detection.
+ */
+struct procinfo {
+  unsigned present; /* set to 1 by look_proc() */
+  unsigned apicid; /* bits 31-24 of ebx from cpuid 0x01 */
+  unsigned max_log_proc;
+  unsigned max_nbcores;
+  unsigned max_nbthreads;
+  unsigned packageid;
+  unsigned nodeid;
+  unsigned unitid;
+  unsigned logprocid;
+  unsigned threadid;
+  unsigned coreid;
+  unsigned *otherids; /* levels slots */
+  unsigned levels;
+  unsigned numcaches; /* number of slots in cache[] */
+  struct cacheinfo *cache;
+  char cpuvendor[13]; /* 12 bytes copied from cpuid 0x00 plus the ending \0 */
+  char cpumodel[3*4*4+1]; /* model string from cpuid 0x80000002-4 plus the ending \0 */
+  unsigned cpustepping;
+  unsigned cpumodelnumber;
+  unsigned cpufamilynumber;
+};
+
+/* Processor vendor family, used by look_proc() to choose how the family and
+ * model numbers are decoded.
+ */
+enum cpuid_type {
+  intel,
+  amd,
+  unknown
+};
+
+/* Append to infos->cache[] the cache described by a legacy AMD cpuid register.
+ *
+ * level is the cache level (1 to 3), type is stored as is, and cpuid is the
+ * raw register value.  The size is decoded from bits 31-24 (in KB) for L1,
+ * from bits 31-16 (in KB) for L2 and from bits 31-18 (in 512KB units) for
+ * L3.  Nothing is added when the decoded size is 0, when level is not 1 to
+ * 3, or when the cache array cannot be extended.
+ */
+static void fill_amd_cache(struct procinfo *infos, unsigned level, int type, unsigned cpuid)
+{
+  /* associativity encoded in 4 bits for L2 and L3, -1 means fully associative */
+  static const int ways_tab[] = { 0, 1, 2, 0, 4, 0, 8, 0, 16, 0, 32, 48, 64, 96, 128, -1 };
+  struct cacheinfo *cache, *newcaches;
+  unsigned long size = 0;
+
+  if (level == 1)
+    size = ((cpuid >> 24)) << 10;
+  else if (level == 2)
+    size = ((cpuid >> 16)) << 10;
+  else if (level == 3)
+    size = ((cpuid >> 18)) << 19;
+  if (!size)
+    return;
+
+  /* grow through a temporary pointer and only count the new cache once the
+   * array was extended, so that a failure leaves the existing list intact
+   */
+  newcaches = realloc(infos->cache, (infos->numcaches+1)*sizeof(*infos->cache));
+  if (!newcaches)
+    return;
+  infos->cache = newcaches;
+  cache = &infos->cache[infos->numcaches++];
+
+  cache->type = type;
+  cache->level = level;
+  if (level <= 2)
+    cache->nbthreads_sharing = 1;
+  else
+    cache->nbthreads_sharing = infos->max_log_proc;
+  cache->linesize = cpuid & 0xff;
+  cache->linepart = 0;
+  if (level == 1) {
+    /* inclusiveness on old AMD: L1 is supposed not to be inclusive */
+    cache->inclusiveness = 0;
+
+    cache->ways = (cpuid >> 16) & 0xff;
+    if (cache->ways == 0xff)
+      /* Fully associative */
+      cache->ways = -1;
+  } else {
+    /* inclusiveness on old AMD: L2 and L3 are supposed to be inclusive */
+    cache->inclusiveness = 1;
+
+    cache->ways = ways_tab[(cpuid >> 12) & 0xf];
+  }
+  cache->size = size;
+  cache->sets = 0;
+
+  hwloc_debug("cache L%u t%u linesize %u ways %d size %luKB\n", cache->level, cache->nbthreads_sharing, cache->linesize, cache->ways, cache->size >> 10);
+}
+
+/* Fetch information from the processor itself thanks to cpuid and store it in
+ * infos for summarize to analyze them globally.
+ *
+ * backend:           x86 backend; its private data records the APIC ids seen so far.
+ * infos:             per-processor slot to fill.
+ * highest_cpuid:     highest supported basic cpuid leaf.
+ * highest_ext_cpuid: highest supported extended cpuid leaf.
+ * features:          cpuid feature words, consulted through has_topoext()/has_x2apic().
+ * cpuid_type:        vendor detected by the caller.
+ * src_cpuiddump:     handed to cpuid_or_from_dump() for every query.
+ */
+static void look_proc(struct hwloc_backend *backend, struct procinfo *infos, unsigned highest_cpuid, unsigned highest_ext_cpuid, unsigned *features, enum cpuid_type cpuid_type, struct cpuiddump *src_cpuiddump)
+{
+  struct hwloc_x86_backend_data_s *data = backend->private_data;
+  unsigned eax, ebx, ecx = 0, edx;
+  unsigned cachenum;
+  struct cacheinfo *cache;
+  unsigned regs[4];
+  unsigned _model, _extendedmodel, _family, _extendedfamily;
+
+  infos->present = 1;
+
+  /* on return from this function, the following fields must be set in infos:
+   * packageid, nodeid, unitid, coreid, threadid, or -1
+   * apicid
+   * levels and levels slots in otherids[]
+   * numcaches and numcaches slots in caches[]
+   *
+   * max_log_proc, max_nbthreads, max_nbcores, logprocid
+   * are only used temporarily inside this function and its callees.
+   */
+
+  /* Get apicid, max_log_proc, packageid, logprocid from cpuid 0x01 */
+  eax = 0x01;
+  cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+  infos->apicid = ebx >> 24;
+  if (edx & (1 << 28))
+    infos->max_log_proc = 1 << hwloc_flsl(((ebx >> 16) & 0xff) - 1);
+  else
+    infos->max_log_proc = 1;
+  hwloc_debug("APIC ID 0x%02x max_log_proc %u\n", infos->apicid, infos->max_log_proc);
+  infos->packageid = infos->apicid / infos->max_log_proc;
+  infos->logprocid = infos->apicid % infos->max_log_proc;
+  hwloc_debug("phys %u thread %u\n", infos->packageid, infos->logprocid);
+
+  /* Get cpu model/family/stepping numbers from same cpuid */
+  _model          = (eax>>4) & 0xf;
+  _extendedmodel  = (eax>>16) & 0xf;
+  _family         = (eax>>8) & 0xf;
+  _extendedfamily = (eax>>20) & 0xff;
+  if ((cpuid_type == intel || cpuid_type == amd) && _family == 0xf) {
+    infos->cpufamilynumber = _family + _extendedfamily;
+  } else {
+    infos->cpufamilynumber = _family;
+  }
+  if ((cpuid_type == intel && (_family == 0x6 || _family == 0xf))
+      || (cpuid_type == amd && _family == 0xf)) {
+    infos->cpumodelnumber = _model + (_extendedmodel << 4);
+  } else {
+    infos->cpumodelnumber = _model;
+  }
+  infos->cpustepping = eax & 0xf;
+
+  /* Get cpu vendor string from cpuid 0x00.
+   * ecx and edx are deliberately swapped so that regs[1..3] hold
+   * ebx, edx, ecx, which is the order the vendor string is copied in.
+   */
+  memset(regs, 0, sizeof(regs));
+  regs[0] = 0;
+  cpuid_or_from_dump(&regs[0], &regs[1], &regs[3], &regs[2], src_cpuiddump);
+  memcpy(infos->cpuvendor, regs+1, 4*3);
+  /* infos was calloc'ed, already ends with \0 */
+
+  /* Get cpu model string from cpuid 0x80000002-4 */
+  if (highest_ext_cpuid >= 0x80000004) {
+    memset(regs, 0, sizeof(regs));
+    regs[0] = 0x80000002;
+    cpuid_or_from_dump(&regs[0], &regs[1], &regs[2], &regs[3], src_cpuiddump);
+    memcpy(infos->cpumodel, regs, 4*4);
+    regs[0] = 0x80000003;
+    cpuid_or_from_dump(&regs[0], &regs[1], &regs[2], &regs[3], src_cpuiddump);
+    memcpy(infos->cpumodel + 4*4, regs, 4*4);
+    regs[0] = 0x80000004;
+    cpuid_or_from_dump(&regs[0], &regs[1], &regs[2], &regs[3], src_cpuiddump);
+    memcpy(infos->cpumodel + 4*4*2, regs, 4*4);
+    /* infos was calloc'ed, already ends with \0 */
+  }
+
+  /* Get core/thread information from cpuid 0x80000008
+   * (not supported on Intel)
+   */
+  if (cpuid_type != intel && highest_ext_cpuid >= 0x80000008) {
+    unsigned coreidsize;
+    eax = 0x80000008;
+    cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+    coreidsize = (ecx >> 12) & 0xf;
+    hwloc_debug("core ID size: %u\n", coreidsize);
+    if (!coreidsize) {
+      infos->max_nbcores = (ecx & 0xff) + 1;
+    } else
+      infos->max_nbcores = 1 << coreidsize;
+    hwloc_debug("Thus max # of cores: %u\n", infos->max_nbcores);
+    /* Still no multithreaded AMD */
+    infos->max_nbthreads = 1 ;
+    hwloc_debug("and max # of threads: %u\n", infos->max_nbthreads);
+    /* The legacy max_log_proc is deprecated, it can be smaller than max_nbcores,
+     * which is the maximum number of cores that the processor could theoretically support
+     * (see "Multiple Core Calculation" in the AMD CPUID specification).
+     * Recompute packageid/logprocid/threadid/coreid accordingly.
+     */
+    infos->packageid = infos->apicid / infos->max_nbcores;
+    infos->logprocid = infos->apicid % infos->max_nbcores;
+    infos->threadid = infos->logprocid % infos->max_nbthreads;
+    infos->coreid = infos->logprocid / infos->max_nbthreads;
+    hwloc_debug("this is thread %u of core %u\n", infos->threadid, infos->coreid);
+  }
+
+  infos->numcaches = 0;
+  infos->cache = NULL;
+
+  /* Get apicid, nodeid, unitid from cpuid 0x8000001e
+   * and cache information from cpuid 0x8000001d
+   * (AMD topology extension)
+   */
+  if (cpuid_type != intel && has_topoext(features)) {
+    unsigned apic_id, node_id, nodes_per_proc, unit_id, cores_per_unit;
+
+    eax = 0x8000001e;
+    cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+    infos->apicid = apic_id = eax;
+    infos->nodeid = node_id = ecx & 0xff;
+    nodes_per_proc = ((ecx >> 8) & 7) + 1;
+    if (nodes_per_proc > 2) {
+      hwloc_debug("warning: undefined value %d, assuming it means %d\n", nodes_per_proc, nodes_per_proc);
+    }
+    infos->unitid = unit_id = ebx & 0xff;
+    cores_per_unit = ((ebx >> 8) & 3) + 1;
+    hwloc_debug("x2APIC %08x, %d nodes, node %d, %d cores in unit %d\n", apic_id, nodes_per_proc, node_id, cores_per_unit, unit_id);
+
+    /* First pass: count the caches so that the array can be sized */
+    for (cachenum = 0; ; cachenum++) {
+      unsigned type;
+      eax = 0x8000001d;
+      ecx = cachenum;
+      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+      type = eax & 0x1f;
+      if (type == 0)
+        break;
+      infos->numcaches++;
+    }
+
+    cache = infos->cache = malloc(infos->numcaches * sizeof(*infos->cache));
+    if (!cache) {
+      /* No storage: advertise no cache instead of writing through NULL below */
+      infos->numcaches = 0;
+    } else {
+      /* Second pass: fill the array */
+      for (cachenum = 0; ; cachenum++) {
+        unsigned long linesize, linepart, ways, sets;
+        unsigned type;
+        eax = 0x8000001d;
+        ecx = cachenum;
+        cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+
+        type = eax & 0x1f;
+
+        if (type == 0)
+          break;
+
+        cache->type = type;
+        cache->level = (eax >> 5) & 0x7;
+        /* Note: actually number of cores */
+        cache->nbthreads_sharing = ((eax >> 14) &  0xfff) + 1;
+
+        cache->linesize = linesize = (ebx & 0xfff) + 1;
+        cache->linepart = linepart = ((ebx >> 12) & 0x3ff) + 1;
+        ways = ((ebx >> 22) & 0x3ff) + 1;
+
+        if (eax & (1 << 9))
+          /* Fully associative */
+          cache->ways = -1;
+        else
+          cache->ways = ways;
+        cache->sets = sets = ecx + 1;
+        cache->size = linesize * linepart * ways * sets;
+        cache->inclusiveness = edx & 0x2;
+
+        /* size is printed with %lu, like fill_amd_cache() does for the same field */
+        hwloc_debug("cache %u type %u L%u t%u c%u linesize %lu linepart %lu ways %lu sets %lu, size %luKB\n", cachenum, cache->type, cache->level, cache->nbthreads_sharing, infos->max_nbcores, linesize, linepart, ways, sets, cache->size >> 10);
+
+        cache++;
+      }
+    }
+  } else {
+    /* If there's no topoext,
+     * get cache information from cpuid 0x80000005 and 0x80000006
+     * (not supported on Intel)
+     */
+    if (cpuid_type != intel && highest_ext_cpuid >= 0x80000005) {
+      eax = 0x80000005;
+      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+      fill_amd_cache(infos, 1, 1, ecx); /* L1d */
+      fill_amd_cache(infos, 1, 2, edx); /* L1i */
+    }
+    if (cpuid_type != intel && highest_ext_cpuid >= 0x80000006) {
+      eax = 0x80000006;
+      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+      if (ecx & 0xf000)
+        /* This is actually supported on Intel but LinePerTag isn't returned in bits 8-11.
+         * Could be useful if some Intels (at least before Core micro-architecture)
+         * support this leaf without leaf 0x4.
+         */
+        fill_amd_cache(infos, 2, 3, ecx); /* L2u */
+      if (edx & 0xf000)
+        fill_amd_cache(infos, 3, 3, edx); /* L3u */
+      /* FIXME: AMD MagnyCours family 0x10 model 0x9 with 8 cores or more actually
+       * have the L3 split in two halves, and associativity is divided as well (48)
+       */
+    }
+  }
+
+  /* Get thread/core + cache information from cpuid 0x04
+   * (not supported on AMD)
+   */
+  if (cpuid_type != amd && highest_cpuid >= 0x04) {
+    /* When the vendor is neither Intel nor AMD, the AMD-style leaves above may
+     * already have recorded caches: append to them instead of overwriting
+     * infos->cache (which leaked the old array and left numcaches too large).
+     */
+    unsigned oldnumcaches = infos->numcaches;
+
+    for (cachenum = 0; ; cachenum++) {
+      unsigned type;
+      eax = 0x04;
+      ecx = cachenum;
+      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+
+      type = eax & 0x1f;
+
+      hwloc_debug("cache %u type %u\n", cachenum, type);
+
+      if (type == 0)
+        break;
+      infos->numcaches++;
+
+      if (!cachenum) {
+        /* by the way, get thread/core information from the first cache */
+        infos->max_nbcores = ((eax >> 26) & 0x3f) + 1;
+        infos->max_nbthreads = infos->max_log_proc / infos->max_nbcores;
+        if (!infos->max_nbthreads)
+          /* inconsistent cpuid (more cores than logical processors):
+           * assume one thread per core rather than dividing by zero below */
+          infos->max_nbthreads = 1;
+        hwloc_debug("thus %u threads\n", infos->max_nbthreads);
+        infos->threadid = infos->logprocid % infos->max_nbthreads;
+        infos->coreid = infos->logprocid / infos->max_nbthreads;
+        hwloc_debug("this is thread %u of core %u\n", infos->threadid, infos->coreid);
+      }
+    }
+
+    if (infos->numcaches > oldnumcaches) {
+      struct cacheinfo *tmpcaches = realloc(infos->cache, infos->numcaches * sizeof(*infos->cache));
+      if (!tmpcaches) {
+        /* keep whatever was recorded before, drop the new entries */
+        infos->numcaches = oldnumcaches;
+      } else {
+        infos->cache = tmpcaches;
+        cache = &infos->cache[oldnumcaches];
+
+        for (cachenum = 0; ; cachenum++) {
+          unsigned long linesize, linepart, ways, sets;
+          unsigned type;
+          eax = 0x04;
+          ecx = cachenum;
+          cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+
+          type = eax & 0x1f;
+
+          if (type == 0)
+            break;
+
+          cache->type = type;
+          cache->level = (eax >> 5) & 0x7;
+          cache->nbthreads_sharing = ((eax >> 14) & 0xfff) + 1;
+
+          cache->linesize = linesize = (ebx & 0xfff) + 1;
+          cache->linepart = linepart = ((ebx >> 12) & 0x3ff) + 1;
+          ways = ((ebx >> 22) & 0x3ff) + 1;
+          if (eax & (1 << 9))
+            /* Fully associative */
+            cache->ways = -1;
+          else
+            cache->ways = ways;
+          cache->sets = sets = ecx + 1;
+          cache->size = linesize * linepart * ways * sets;
+          cache->inclusiveness = edx & 0x2;
+
+          /* size is printed with %lu, like fill_amd_cache() does for the same field */
+          hwloc_debug("cache %u type %u L%u t%u c%u linesize %lu linepart %lu ways %lu sets %lu, size %luKB\n", cachenum, cache->type, cache->level, cache->nbthreads_sharing, infos->max_nbcores, linesize, linepart, ways, sets, cache->size >> 10);
+
+          cache++;
+        }
+      }
+    }
+  }
+
+  /* Get package/core/thread information from cpuid 0x0b
+   * (Intel x2APIC)
+   */
+  if (cpuid_type == intel && has_x2apic(features)) {
+    unsigned level, apic_nextshift, apic_number, apic_type, apic_id = 0, apic_shift = 0, id;
+    /* First pass: count the levels */
+    for (level = 0; ; level++) {
+      ecx = level;
+      eax = 0x0b;
+      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+      if (!eax && !ebx)
+        break;
+    }
+    if (level) {
+      infos->otherids = malloc(level * sizeof(*infos->otherids));
+      /* Without storage for otherids[], keep the ids computed from the legacy
+       * leaves above instead of writing through NULL. */
+      if (infos->otherids) {
+        infos->levels = level;
+        for (level = 0; ; level++) {
+          ecx = level;
+          eax = 0x0b;
+          cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+          if (!eax && !ebx)
+            break;
+          apic_nextshift = eax & 0x1f;
+          apic_number = ebx & 0xffff;
+          apic_type = (ecx & 0xff00) >> 8;
+          apic_id = edx;
+          /* bits [apic_shift, apic_nextshift) of the x2APIC id identify this level */
+          id = (apic_id >> apic_shift) & ((1 << (apic_nextshift - apic_shift)) - 1);
+          hwloc_debug("x2APIC %08x %d: nextshift %d num %2d type %d id %2d\n", apic_id, level, apic_nextshift, apic_number, apic_type, id);
+          infos->apicid = apic_id;
+          /* UINT_MAX marks a level that summarize() must not turn into a group */
+          infos->otherids[level] = UINT_MAX;
+          switch (apic_type) {
+          case 1:
+            infos->threadid = id;
+            break;
+          case 2:
+            infos->coreid = id;
+            break;
+          default:
+            hwloc_debug("x2APIC %d: unknown type %d\n", level, apic_type);
+            infos->otherids[level] = apic_id >> apic_shift;
+            break;
+          }
+          apic_shift = apic_nextshift;
+        }
+        infos->apicid = apic_id;
+        infos->packageid = apic_id >> apic_shift;
+        hwloc_debug("x2APIC remainder: %d\n", infos->packageid);
+        hwloc_debug("this is thread %u of core %u\n", infos->threadid, infos->coreid);
+      }
+    }
+  }
+
+  /* Remember whether every processor reported a distinct APIC id */
+  if (hwloc_bitmap_isset(data->apicid_set, infos->apicid))
+    data->apicid_unique = 0;
+  else
+    hwloc_bitmap_set(data->apicid_set, infos->apicid);
+}
+
+/* Attach the CPU identification gathered in info to obj as info attributes:
+ * CPUVendor, CPUFamilyNumber, CPUModelNumber, CPUModel (only when a model
+ * string is available, with leading spaces skipped) and CPUStepping.
+ * nodup is forwarded unchanged to hwloc_obj_add_info_nodup().
+ */
+static void
+hwloc_x86_add_cpuinfos(hwloc_obj_t obj, struct procinfo *info, int nodup)
+{
+  char buf[8];
+  const char *model;
+
+  hwloc_obj_add_info_nodup(obj, "CPUVendor", info->cpuvendor, nodup);
+
+  snprintf(buf, sizeof(buf), "%u", info->cpufamilynumber);
+  hwloc_obj_add_info_nodup(obj, "CPUFamilyNumber", buf, nodup);
+
+  snprintf(buf, sizeof(buf), "%u", info->cpumodelnumber);
+  hwloc_obj_add_info_nodup(obj, "CPUModelNumber", buf, nodup);
+
+  if (info->cpumodel[0] != '\0') {
+    /* skip the padding spaces in front of the model string */
+    for (model = info->cpumodel; *model == ' '; model++)
+      ;
+    hwloc_obj_add_info_nodup(obj, "CPUModel", model, nodup);
+  }
+
+  snprintf(buf, sizeof(buf), "%u", info->cpustepping);
+  hwloc_obj_add_info_nodup(obj, "CPUStepping", buf, nodup);
+}
+
+/* Analyse information stored in infos, and build/annotate topology levels accordingly.
+ *
+ * backend:       x86 backend (gives access to the topology and to nbprocs).
+ * infos:         array of nbprocs entries filled by look_proc(); the per-entry
+ *                cache and otherids arrays are freed before returning, unless
+ *                no processor at all was marked present.
+ * fulldiscovery: when non-zero, packages, NUMA nodes, compute units, cores and
+ *                caches are inserted in the topology; when zero, existing cache
+ *                and package objects are only annotated.
+ */
+static void summarize(struct hwloc_backend *backend, struct procinfo *infos, int fulldiscovery)
+{
+  struct hwloc_topology *topology = backend->topology;
+  struct hwloc_x86_backend_data_s *data = backend->private_data;
+  unsigned nbprocs = data->nbprocs;
+  hwloc_bitmap_t complete_cpuset = hwloc_bitmap_alloc();
+  unsigned i, j, l, level, type;
+  unsigned nbpackages = 0;
+  int one = -1;
+  unsigned next_group_depth = topology->next_group_depth;
+
+  /* Collect the processors that were actually probed; "one" ends up being the last of them */
+  for (i = 0; i < nbprocs; i++)
+    if (infos[i].present) {
+      hwloc_bitmap_set(complete_cpuset, i);
+      one = i;
+    }
+
+  if (one == -1) {
+    hwloc_bitmap_free(complete_cpuset);
+    return;
+  }
+
+  /* Ideally, when fulldiscovery=0, we could add any object that doesn't exist yet.
+   * But what if the x86 and the native backends disagree because one is buggy? Which one to trust?
+   * Only annotate existing objects for now.
+   */
+
+  /* Annotate previously existing objects */
+  if (!fulldiscovery) {
+    hwloc_obj_t pu = NULL;
+
+    nbpackages = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PACKAGE);
+    while ((pu = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PU, pu)) != NULL) {
+      unsigned infoId = pu->os_index;
+      hwloc_obj_t object;
+
+      /* os_index is unsigned: an unknown index shows up as a huge value, not a
+       * negative one. Skip PUs we have no cpuid data for. */
+      if (infoId >= nbprocs || !infos[infoId].present)
+        continue;
+
+      /* Walk from the PU up to the root and annotate what we recognize */
+      for (object = pu; object != NULL; object = object->parent) {
+        switch (object->type) {
+        case HWLOC_OBJ_CACHE:
+          {
+            /* Annotate previously-existing cache */
+            unsigned wanted_type = 0;
+            unsigned c;
+            int cacheId = -1;
+
+            if (hwloc_obj_get_info_by_name(object, "inclusiveness"))
+              break;
+
+            switch (object->attr->cache.type) {
+            case HWLOC_OBJ_CACHE_DATA:
+              wanted_type = 1;
+              break;
+            case HWLOC_OBJ_CACHE_INSTRUCTION:
+              wanted_type = 2;
+              break;
+            case HWLOC_OBJ_CACHE_UNIFIED:
+              wanted_type = 3;
+              break;
+            default:
+              break;
+            }
+
+            /* The level must match exactly. Prefer a cache of the right type,
+             * otherwise fall back to the last cache seen at that level. */
+            for (c = 0; c < infos[infoId].numcaches; c++)
+              if (infos[infoId].cache[c].level == object->attr->cache.depth) {
+                cacheId = (int) c;
+                if (infos[infoId].cache[c].type == wanted_type)
+                  break;
+              }
+
+            /* cpuid may not describe that level at all: then there is nothing to add */
+            if (cacheId >= 0)
+              hwloc_obj_add_info(object, "inclusiveness", infos[infoId].cache[cacheId].inclusiveness?"true":"false");
+          }
+          break;
+        case HWLOC_OBJ_PACKAGE:
+          {
+            /* Annotate previously-existing package */
+            /* FIXME: ideally, we should check all bits in case x86 and the native backend disagree. */
+
+            /* The package is already known from the native backend.
+             * Only annotate it if the package detected by x86 doesn't disagree. */
+            if (infos[infoId].packageid == object->os_index || object->os_index == (unsigned) -1) {
+              hwloc_x86_add_cpuinfos(object, &infos[infoId], 1);
+            }
+          }
+          break;
+        default:
+          break;
+        }
+      }
+    }
+  }
+
+
+  /* Look for packages */
+  if (fulldiscovery) {
+    hwloc_bitmap_t packages_cpuset = hwloc_bitmap_dup(complete_cpuset);
+    hwloc_bitmap_t package_cpuset;
+    hwloc_obj_t package;
+
+    while ((i = hwloc_bitmap_first(packages_cpuset)) != (unsigned) -1) {
+      unsigned packageid = infos[i].packageid;
+
+      package_cpuset = hwloc_bitmap_alloc();
+      for (j = i; j < nbprocs; j++) {
+        if (infos[j].packageid == packageid) {
+          hwloc_bitmap_set(package_cpuset, j);
+          hwloc_bitmap_clr(packages_cpuset, j);
+        }
+      }
+      package = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, packageid);
+      package->cpuset = package_cpuset;
+
+      hwloc_x86_add_cpuinfos(package, &infos[i], 0);
+
+      hwloc_debug_1arg_bitmap("os package %u has cpuset %s\n",
+          packageid, package_cpuset);
+      hwloc_insert_object_by_cpuset(topology, package);
+      nbpackages++;
+    }
+    hwloc_bitmap_free(packages_cpuset);
+
+  }
+
+  /* If there was no package, annotate the Machine instead */
+  if ((!nbpackages) && infos[0].cpumodel[0]) {
+    hwloc_x86_add_cpuinfos(hwloc_get_root_obj(topology), &infos[0], 1);
+  }
+
+  /* Look for Numa nodes inside packages */
+  if (fulldiscovery) {
+    hwloc_bitmap_t nodes_cpuset = hwloc_bitmap_dup(complete_cpuset);
+    hwloc_bitmap_t node_cpuset;
+    hwloc_obj_t node;
+
+    /* FIXME: if there's memory inside the root object, divide it into NUMA nodes? */
+
+    while ((i = hwloc_bitmap_first(nodes_cpuset)) != (unsigned) -1) {
+      unsigned packageid = infos[i].packageid;
+      unsigned nodeid = infos[i].nodeid;
+
+      if (nodeid == (unsigned)-1) {
+        hwloc_bitmap_clr(nodes_cpuset, i);
+        continue;
+      }
+
+      node_cpuset = hwloc_bitmap_alloc();
+      for (j = i; j < nbprocs; j++) {
+        if (infos[j].nodeid == (unsigned) -1) {
+          hwloc_bitmap_clr(nodes_cpuset, j);
+          continue;
+        }
+
+        if (infos[j].packageid == packageid && infos[j].nodeid == nodeid) {
+          hwloc_bitmap_set(node_cpuset, j);
+          hwloc_bitmap_clr(nodes_cpuset, j);
+        }
+      }
+      node = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, nodeid);
+      node->cpuset = node_cpuset;
+      node->nodeset = hwloc_bitmap_alloc();
+      hwloc_bitmap_set(node->nodeset, nodeid);
+      hwloc_debug_1arg_bitmap("os node %u has cpuset %s\n",
+          nodeid, node_cpuset);
+      hwloc_insert_object_by_cpuset(topology, node);
+    }
+    hwloc_bitmap_free(nodes_cpuset);
+  }
+
+  /* Look for Compute units inside packages */
+  if (fulldiscovery) {
+    hwloc_bitmap_t units_cpuset = hwloc_bitmap_dup(complete_cpuset);
+    hwloc_bitmap_t unit_cpuset;
+    hwloc_obj_t unit;
+
+    while ((i = hwloc_bitmap_first(units_cpuset)) != (unsigned) -1) {
+      unsigned packageid = infos[i].packageid;
+      unsigned unitid = infos[i].unitid;
+
+      if (unitid == (unsigned)-1) {
+        hwloc_bitmap_clr(units_cpuset, i);
+        continue;
+      }
+
+      unit_cpuset = hwloc_bitmap_alloc();
+      for (j = i; j < nbprocs; j++) {
+        if (infos[j].unitid == (unsigned) -1) {
+          hwloc_bitmap_clr(units_cpuset, j);
+          continue;
+        }
+
+        if (infos[j].packageid == packageid && infos[j].unitid == unitid) {
+          hwloc_bitmap_set(unit_cpuset, j);
+          hwloc_bitmap_clr(units_cpuset, j);
+        }
+      }
+      unit = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, unitid);
+      unit->cpuset = unit_cpuset;
+      hwloc_debug_1arg_bitmap("os unit %u has cpuset %s\n",
+          unitid, unit_cpuset);
+      hwloc_insert_object_by_cpuset(topology, unit);
+    }
+    hwloc_bitmap_free(units_cpuset);
+  }
+
+  /* Look for unknown objects */
+  /* NOTE(review): unlike the other insertions, this one is not restricted to
+   * fulldiscovery -- confirm whether that is intended. */
+  if (infos[one].otherids) {
+    /* level is unsigned: decrementing past 0 wraps around and ends the loop */
+    for (level = infos[one].levels-1; level <= infos[one].levels-1; level--) {
+      if (infos[one].otherids[level] != UINT_MAX) {
+        hwloc_bitmap_t unknowns_cpuset = hwloc_bitmap_dup(complete_cpuset);
+        hwloc_bitmap_t unknown_cpuset;
+        hwloc_obj_t unknown_obj;
+
+        while ((i = hwloc_bitmap_first(unknowns_cpuset)) != (unsigned) -1) {
+          unsigned unknownid;
+
+          if (!infos[i].otherids) {
+            /* no x2APIC levels recorded for this processor */
+            hwloc_bitmap_clr(unknowns_cpuset, i);
+            continue;
+          }
+          unknownid = infos[i].otherids[level];
+
+          unknown_cpuset = hwloc_bitmap_alloc();
+          for (j = i; j < nbprocs; j++) {
+            /* processors that were never probed have otherids == NULL */
+            if (infos[j].otherids && infos[j].otherids[level] == unknownid) {
+              hwloc_bitmap_set(unknown_cpuset, j);
+              hwloc_bitmap_clr(unknowns_cpuset, j);
+            }
+          }
+          unknown_obj = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, unknownid);
+          unknown_obj->cpuset = unknown_cpuset;
+          unknown_obj->attr->group.depth = topology->next_group_depth + level;
+          if (next_group_depth <= topology->next_group_depth + level)
+            next_group_depth = topology->next_group_depth + level + 1;
+          hwloc_debug_2args_bitmap("os unknown%d %u has cpuset %s\n",
+              level, unknownid, unknown_cpuset);
+          hwloc_insert_object_by_cpuset(topology, unknown_obj);
+        }
+        hwloc_bitmap_free(unknowns_cpuset);
+      }
+    }
+  }
+
+  /* Look for cores */
+  if (fulldiscovery) {
+    hwloc_bitmap_t cores_cpuset = hwloc_bitmap_dup(complete_cpuset);
+    hwloc_bitmap_t core_cpuset;
+    hwloc_obj_t core;
+
+    while ((i = hwloc_bitmap_first(cores_cpuset)) != (unsigned) -1) {
+      unsigned packageid = infos[i].packageid;
+      unsigned coreid = infos[i].coreid;
+
+      if (coreid == (unsigned) -1) {
+        hwloc_bitmap_clr(cores_cpuset, i);
+        continue;
+      }
+
+      core_cpuset = hwloc_bitmap_alloc();
+      for (j = i; j < nbprocs; j++) {
+        if (infos[j].coreid == (unsigned) -1) {
+          hwloc_bitmap_clr(cores_cpuset, j);
+          continue;
+        }
+
+        if (infos[j].packageid == packageid && infos[j].coreid == coreid) {
+          hwloc_bitmap_set(core_cpuset, j);
+          hwloc_bitmap_clr(cores_cpuset, j);
+        }
+      }
+      core = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, coreid);
+      core->cpuset = core_cpuset;
+      hwloc_debug_1arg_bitmap("os core %u has cpuset %s\n",
+          coreid, core_cpuset);
+      hwloc_insert_object_by_cpuset(topology, core);
+    }
+    hwloc_bitmap_free(cores_cpuset);
+  }
+
+  /* Look for caches */
+  /* First find max level */
+  level = 0;
+  for (i = 0; i < nbprocs; i++)
+    for (j = 0; j < infos[i].numcaches; j++)
+      if (infos[i].cache[j].level > level)
+        level = infos[i].cache[j].level;
+
+  /* Look for known types */
+  if (fulldiscovery) while (level > 0) {
+    for (type = 1; type <= 3; type++) {
+      /* Look for caches of that type at level level */
+      {
+        hwloc_bitmap_t caches_cpuset = hwloc_bitmap_dup(complete_cpuset);
+        hwloc_bitmap_t cache_cpuset;
+        hwloc_obj_t cache;
+
+        while ((i = hwloc_bitmap_first(caches_cpuset)) != (unsigned) -1) {
+          unsigned packageid = infos[i].packageid;
+
+          for (l = 0; l < infos[i].numcaches; l++) {
+            if (infos[i].cache[l].level == level && infos[i].cache[l].type == type)
+              break;
+          }
+          if (l == infos[i].numcaches) {
+            /* no cache Llevel of that type in i */
+            hwloc_bitmap_clr(caches_cpuset, i);
+            continue;
+          }
+
+          /* Found a matching cache, now look for others sharing it */
+          {
+            unsigned cacheid = infos[i].apicid / infos[i].cache[l].nbthreads_sharing;
+
+            cache_cpuset = hwloc_bitmap_alloc();
+            for (j = i; j < nbprocs; j++) {
+              unsigned l2;
+              for (l2 = 0; l2 < infos[j].numcaches; l2++) {
+                if (infos[j].cache[l2].level == level && infos[j].cache[l2].type == type)
+                  break;
+              }
+              if (l2 == infos[j].numcaches) {
+                /* no cache Llevel of that type in j */
+                hwloc_bitmap_clr(caches_cpuset, j);
+                continue;
+              }
+              if (infos[j].packageid == packageid && infos[j].apicid / infos[j].cache[l2].nbthreads_sharing == cacheid) {
+                hwloc_bitmap_set(cache_cpuset, j);
+                hwloc_bitmap_clr(caches_cpuset, j);
+              }
+            }
+            cache = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, cacheid);
+            cache->attr->cache.depth = level;
+            cache->attr->cache.size = infos[i].cache[l].size;
+            cache->attr->cache.linesize = infos[i].cache[l].linesize;
+            cache->attr->cache.associativity = infos[i].cache[l].ways;
+            switch (infos[i].cache[l].type) {
+              case 1:
+                cache->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
+                break;
+              case 2:
+                cache->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
+                break;
+              case 3:
+                cache->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
+                break;
+            }
+            hwloc_obj_add_info(cache,"inclusiveness",infos[i].cache[l].inclusiveness?"true":"false");
+            cache->cpuset = cache_cpuset;
+            hwloc_debug_2args_bitmap("os L%u cache %u has cpuset %s\n",
+                level, cacheid, cache_cpuset);
+            hwloc_insert_object_by_cpuset(topology, cache);
+          }
+        }
+        hwloc_bitmap_free(caches_cpuset);
+      }
+    }
+    level--;
+  }
+
+  /* The per-processor arrays are not needed anymore */
+  for (i = 0; i < nbprocs; i++) {
+    free(infos[i].cache);
+    if (infos[i].otherids)
+      free(infos[i].otherids);
+  }
+
+  hwloc_bitmap_free(complete_cpuset);
+  topology->next_group_depth = next_group_depth;
+}
+
+/* Run look_proc() on every processor, then summarize() the results.
+ *
+ * Without a cpuid dump, the caller is bound to each processor in turn through
+ * set_cpubind() and the original binding (read with get_cpubind()) is restored
+ * afterwards. With a dump, each processor's registers are read through
+ * cpuiddump_read() and no binding is performed.
+ *
+ * Returns -1 if the current binding cannot be read; otherwise returns
+ * fulldiscovery, forced to 0 when the APIC ids were not unique.
+ */
+static int
+look_procs(struct hwloc_backend *backend, struct procinfo *infos, int fulldiscovery,
+	   unsigned highest_cpuid, unsigned highest_ext_cpuid, unsigned *features, enum cpuid_type cpuid_type,
+	   int (*get_cpubind)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags),
+	   int (*set_cpubind)(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags))
+{
+  struct hwloc_x86_backend_data_s *data = backend->private_data;
+  struct hwloc_topology *topology = backend->topology;
+  unsigned nbprocs = data->nbprocs;
+  hwloc_bitmap_t orig_cpuset = NULL;
+  hwloc_bitmap_t set = NULL;
+  unsigned i;
+
+  if (!data->src_cpuiddump_path) {
+    /* remember the current binding so that it can be restored below */
+    orig_cpuset = hwloc_bitmap_alloc();
+    if (get_cpubind(topology, orig_cpuset, HWLOC_CPUBIND_STRICT)) {
+      hwloc_bitmap_free(orig_cpuset);
+      return -1;
+    }
+    set = hwloc_bitmap_alloc();
+  }
+
+  for (i = 0; i < nbprocs; i++) {
+    struct cpuiddump *src_cpuiddump = NULL;
+    if (data->src_cpuiddump_path) {
+      src_cpuiddump = cpuiddump_read(data->src_cpuiddump_path, i);
+      if (!src_cpuiddump)
+        /* No dump for this processor (presumably cpuiddump_read() returns NULL
+         * on failure). Skip it: passing NULL down would make look_proc() query
+         * the live CPU instead of the dump. */
+        continue;
+    } else {
+      hwloc_bitmap_only(set, i);
+      hwloc_debug("binding to CPU%d\n", i);
+      if (set_cpubind(topology, set, HWLOC_CPUBIND_STRICT)) {
+        hwloc_debug("could not bind to CPU%d: %s\n", i, strerror(errno));
+        continue;
+      }
+    }
+
+    look_proc(backend, &infos[i], highest_cpuid, highest_ext_cpuid, features, cpuid_type, src_cpuiddump);
+
+    if (data->src_cpuiddump_path) {
+      cpuiddump_free(src_cpuiddump);
+    }
+  }
+
+  if (!data->src_cpuiddump_path) {
+    set_cpubind(topology, orig_cpuset, 0);
+    hwloc_bitmap_free(set);
+    hwloc_bitmap_free(orig_cpuset);
+  }
+
+  /* duplicate APIC ids make the computed ids unreliable: only annotate then */
+  if (!data->apicid_unique)
+    fulldiscovery = 0;
+  summarize(backend, infos, fulldiscovery);
+  return fulldiscovery; /* success, but objects added only if fulldiscovery */
+}
+
+#if defined HWLOC_FREEBSD_SYS && defined HAVE_CPUSET_SETID
+#include <sys/param.h>
+#include <sys/cpuset.h>
+typedef cpusetid_t hwloc_x86_os_state_t;
+
+/* Save the current cpuset id in *state and switch to cpuset 0 so that all
+ * cpus are temporarily available during discovery.
+ * Does nothing when a cpuid dump is used.
+ */
+static void hwloc_x86_os_state_save(hwloc_x86_os_state_t *state, struct cpuiddump *src_cpuiddump)
+{
+  if (src_cpuiddump)
+    return;
+
+  cpuset_getid(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, state);
+  cpuset_setid(CPU_WHICH_PID, -1, 0);
+}
+
+/* Put back the cpuset id saved by hwloc_x86_os_state_save().
+ * Does nothing when a cpuid dump is used.
+ */
+static void hwloc_x86_os_state_restore(hwloc_x86_os_state_t *state, struct cpuiddump *src_cpuiddump)
+{
+  if (src_cpuiddump)
+    return;
+
+  cpuset_setid(CPU_WHICH_PID, -1, *state);
+}
+#else /* !defined HWLOC_FREEBSD_SYS || !defined HAVE_CPUSET_SETID */
+typedef void * hwloc_x86_os_state_t;
+
+/* Nothing to save in this configuration */
+static void hwloc_x86_os_state_save(hwloc_x86_os_state_t *state __hwloc_attribute_unused,
+				    struct cpuiddump *src_cpuiddump __hwloc_attribute_unused)
+{
+}
+
+/* Nothing to restore in this configuration */
+static void hwloc_x86_os_state_restore(hwloc_x86_os_state_t *state __hwloc_attribute_unused,
+				       struct cpuiddump *src_cpuiddump __hwloc_attribute_unused)
+{
+}
+#endif /* !defined HWLOC_FREEBSD_SYS || !defined HAVE_CPUSET_SETID */
+
+
+/* Vendor signatures as compared against the registers returned by cpuid 0x00:
+ * each register packs four characters, first character in the low byte.
+ */
+
+/* "Genu" "ineI" "ntel" */
+#define INTEL_EBX (('u'<<24) | ('n'<<16) | ('e'<<8) | 'G')
+#define INTEL_EDX (('I'<<24) | ('e'<<16) | ('n'<<8) | 'i')
+#define INTEL_ECX (('l'<<24) | ('e'<<16) | ('t'<<8) | 'n')
+
+/* "Auth" "enti" "cAMD" */
+#define AMD_EBX (('h'<<24) | ('t'<<16) | ('u'<<8) | 'A')
+#define AMD_EDX (('i'<<24) | ('t'<<16) | ('n'<<8) | 'e')
+#define AMD_ECX (('D'<<24) | ('M'<<16) | ('A'<<8) | 'c')
+
+/* Stand-in binding hooks for when nbprocs=1 and there is no binding support:
+ * both ignore all their arguments and report success.
+ */
+static int
+fake_get_cpubind(hwloc_topology_t topology __hwloc_attribute_unused,
+		 hwloc_cpuset_t set __hwloc_attribute_unused,
+		 int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+
+static int
+fake_set_cpubind(hwloc_topology_t topology __hwloc_attribute_unused,
+		 hwloc_const_cpuset_t set __hwloc_attribute_unused,
+		 int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+
+/* Entry point of the cpuid-based discovery.
+ *
+ * Selects the binding hooks (or the fake ones when there is a single
+ * processor), reads the global cpuid leaves (vendor, highest leaves, feature
+ * words) and then lets look_procs() probe every processor.
+ *
+ * Returns -1 on failure, otherwise the value of look_procs() (or fulldiscovery
+ * in the single-processor fallback).
+ */
+static
+int hwloc_look_x86(struct hwloc_backend *backend, int fulldiscovery)
+{
+  struct hwloc_x86_backend_data_s *data = backend->private_data;
+  unsigned nbprocs = data->nbprocs;
+  unsigned eax, ebx, ecx = 0, edx;
+  unsigned i;
+  unsigned highest_cpuid;
+  unsigned highest_ext_cpuid;
+  /* This stores cpuid features with the same indexing as Linux */
+  unsigned features[10] = { 0 };
+  struct procinfo *infos = NULL;
+  enum cpuid_type cpuid_type = unknown;
+  hwloc_x86_os_state_t os_state;
+  struct hwloc_binding_hooks hooks;
+  struct hwloc_topology_support support;
+  struct hwloc_topology_membind_support memsupport __hwloc_attribute_unused;
+  int (*get_cpubind)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags) = NULL;
+  int (*set_cpubind)(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags) = NULL;
+  struct cpuiddump *src_cpuiddump = NULL;
+  int ret = -1;
+
+  if (data->src_cpuiddump_path) {
+    /* just read cpuid from the dump */
+    src_cpuiddump = cpuiddump_read(data->src_cpuiddump_path, 0);
+    if (!src_cpuiddump)
+      /* The dump could not be loaded (presumably cpuiddump_read() returns NULL
+       * on failure). Give up rather than silently querying the live CPU
+       * without any binding hook. */
+      goto out;
+  } else {
+    /* otherwise check if binding works */
+    memset(&hooks, 0, sizeof(hooks));
+    support.membind = &memsupport;
+    hwloc_set_native_binding_hooks(&hooks, &support);
+    if (hooks.get_thisproc_cpubind && hooks.set_thisproc_cpubind) {
+      get_cpubind = hooks.get_thisproc_cpubind;
+      set_cpubind = hooks.set_thisproc_cpubind;
+    } else if (hooks.get_thisthread_cpubind && hooks.set_thisthread_cpubind) {
+      get_cpubind = hooks.get_thisthread_cpubind;
+      set_cpubind = hooks.set_thisthread_cpubind;
+    } else {
+      /* we need binding support if there are multiple PUs */
+      if (nbprocs > 1)
+        goto out;
+      get_cpubind = fake_get_cpubind;
+      set_cpubind = fake_set_cpubind;
+    }
+  }
+
+  if (!src_cpuiddump && !hwloc_have_x86_cpuid())
+    goto out;
+
+  infos = calloc(nbprocs, sizeof(struct procinfo));
+  if (NULL == infos)
+    goto out;
+  /* every id is unknown until look_proc() says otherwise */
+  for (i = 0; i < nbprocs; i++) {
+    infos[i].nodeid = (unsigned) -1;
+    infos[i].packageid = (unsigned) -1;
+    infos[i].unitid = (unsigned) -1;
+    infos[i].coreid = (unsigned) -1;
+    infos[i].threadid = (unsigned) -1;
+  }
+
+  /* leaf 0x00: highest basic leaf and vendor signature */
+  eax = 0x00;
+  cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+  highest_cpuid = eax;
+  if (ebx == INTEL_EBX && ecx == INTEL_ECX && edx == INTEL_EDX)
+    cpuid_type = intel;
+  if (ebx == AMD_EBX && ecx == AMD_ECX && edx == AMD_EDX)
+    cpuid_type = amd;
+
+  hwloc_debug("highest cpuid %x, cpuid type %u\n", highest_cpuid, cpuid_type);
+  if (highest_cpuid < 0x01) {
+      goto out_with_infos;
+  }
+
+  eax = 0x01;
+  cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+  features[0] = edx;
+  features[4] = ecx;
+
+  eax = 0x80000000;
+  cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+  highest_ext_cpuid = eax;
+
+  hwloc_debug("highest extended cpuid %x\n", highest_ext_cpuid);
+
+  if (highest_cpuid >= 0x7) {
+    eax = 0x7;
+    /* leaf 0x7 takes a sub-leaf index in ecx: ask for sub-leaf 0 explicitly
+     * instead of whatever the previous query left there */
+    ecx = 0;
+    cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+    features[9] = ebx;
+  }
+
+  if (cpuid_type != intel && highest_ext_cpuid >= 0x80000001) {
+    eax = 0x80000001;
+    cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+    features[1] = edx;
+    features[6] = ecx;
+  }
+
+  hwloc_x86_os_state_save(&os_state, src_cpuiddump);
+
+  ret = look_procs(backend, infos, fulldiscovery,
+		   highest_cpuid, highest_ext_cpuid, features, cpuid_type,
+		   get_cpubind, set_cpubind);
+  if (ret >= 0)
+    /* success, we're done */
+    goto out_with_os_state;
+
+  if (nbprocs == 1) {
+    /* only one processor, no need to bind */
+    look_proc(backend, &infos[0], highest_cpuid, highest_ext_cpuid, features, cpuid_type, src_cpuiddump);
+    summarize(backend, infos, fulldiscovery);
+    ret = fulldiscovery;
+  }
+
+out_with_os_state:
+  hwloc_x86_os_state_restore(&os_state, src_cpuiddump);
+
+out_with_infos:
+  if (NULL != infos) {
+      free(infos);
+  }
+
+out:
+  if (src_cpuiddump)
+    cpuiddump_free(src_cpuiddump);
+  return ret;
+}
+
+/* Discovery callback of the x86 backend.
+ *
+ * When no CPUID dump directory is in use, the processor count is taken from
+ * hwloc_fallback_nbprocessors() and nothing is discovered unless the
+ * topology describes the current system.
+ *
+ * Returns 1 after going through the full discovery path, and 0 when nothing
+ * was done or when an already-populated topology was only annotated.
+ */
+static int
+hwloc_x86_discover(struct hwloc_backend *backend)
+{
+  struct hwloc_x86_backend_data_s *data = backend->private_data;
+  struct hwloc_topology *topology = backend->topology;
+  int alreadypus = 0;
+  int ret;
+
+  if (!data->src_cpuiddump_path) {
+    /* live discovery: ask the OS how many processors there are */
+    data->nbprocs = hwloc_fallback_nbprocessors(topology);
+
+    if (!topology->is_thissystem) {
+      hwloc_debug("%s", "\nno x86 detection (not thissystem)\n");
+      return 0;
+    }
+  }
+
+  if (topology->levels[0][0]->cpuset) {
+    /* somebody else discovered things */
+    if (topology->nb_levels == 2 && topology->level_nbobjects[1] == data->nbprocs) {
+      /* only PUs were discovered, as much as we would, complete the topology with everything else */
+      alreadypus = 1;
+      goto fulldiscovery;
+    }
+
+    /* several object types were added, we can't easily complete, just annotate a bit */
+    ret = hwloc_look_x86(backend, 0);
+    /* NOTE(review): the Backend info is added for any non-zero return value,
+     * which would include a negative one -- confirm this is intended. */
+    if (ret)
+      hwloc_obj_add_info(topology->levels[0][0], "Backend", "x86");
+    return 0;
+  } else {
+    /* topology is empty, initialize it */
+    hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+  }
+
+fulldiscovery:
+  hwloc_look_x86(backend, 1);
+  /* if failed, just continue and create PUs */
+
+  /* PUs are only created here when no other backend already added them */
+  if (!alreadypus)
+    hwloc_setup_pu_level(topology, data->nbprocs);
+
+  hwloc_obj_add_info(topology->levels[0][0], "Backend", "x86");
+
+  if (!data->src_cpuiddump_path) { /* CPUID dump works for both x86 and x86_64 */
+#ifdef HAVE_UNAME
+    hwloc_add_uname_info(topology, NULL); /* we already know is_thissystem() is true */
+#else
+    /* uname isn't available, manually setup the "Architecture" info */
+#ifdef HWLOC_X86_64_ARCH
+    hwloc_obj_add_info(topology->levels[0][0], "Architecture", "x86_64");
+#else
+    hwloc_obj_add_info(topology->levels[0][0], "Architecture", "x86");
+#endif
+#endif
+  }
+
+  return 1;
+}
+
+/* Validate a directory of dumped CPUID data and collect its PU indexes.
+ *
+ * The directory must contain a "hwloc-cpuid-info" file whose first line is
+ * exactly "Architecture: x86\n". Every directory entry named "pu<N>", where
+ * <N> is a plain decimal number, sets bit N in \p set.
+ *
+ * Returns 0 when the summary file is valid and the collected indexes form a
+ * non-empty contiguous range starting at 0, and -1 otherwise (a message is
+ * printed on stderr for every failure except opendir() and malloc()).
+ */
+static int
+hwloc_x86_check_cpuiddump_input(const char *src_cpuiddump_path, hwloc_bitmap_t set)
+{
+  struct dirent *dirent;
+  DIR *dir;
+  char *path;
+  FILE *file;
+  char line [32];
+
+  dir = opendir(src_cpuiddump_path);
+  if (!dir)
+    return -1;
+
+  path = malloc(strlen(src_cpuiddump_path) + strlen("/hwloc-cpuid-info") + 1);
+  if (!path)
+    goto out_with_dir;
+
+  sprintf(path, "%s/hwloc-cpuid-info", src_cpuiddump_path);
+  file = fopen(path, "r");
+  if (!file) {
+    fprintf(stderr, "Couldn't open dumped cpuid summary %s\n", path);
+    goto out_with_path;
+  }
+  if (!fgets(line, sizeof(line), file)) {
+    fprintf(stderr, "Couldn't read dumped cpuid summary in %s\n", path);
+    fclose(file);
+    goto out_with_path;
+  }
+  fclose(file);
+  if (strcmp(line, "Architecture: x86\n")) {
+    fprintf(stderr, "Found non-x86 dumped cpuid summary in %s: %s\n", path, line);
+    goto out_with_path;
+  }
+  free(path);
+
+  while ((dirent = readdir(dir)) != NULL) {
+    if (!strncmp(dirent->d_name, "pu", 2)) {
+      const char *digits = dirent->d_name+2;
+      char *end;
+      unsigned long idx = strtoul(digits, &end, 10);
+      /* Require a leading digit: strtoul() alone would accept an empty
+       * suffix ("pu" would become index 0) as well as a sign or blanks.
+       */
+      if (digits[0] >= '0' && digits[0] <= '9' && !*end)
+	hwloc_bitmap_set(set, idx);
+      else
+	fprintf(stderr, "Ignoring invalid dirent `%s' in dumped cpuid directory `%s'\n",
+		dirent->d_name, src_cpuiddump_path);
+    }
+  }
+  closedir(dir);
+
+  if (hwloc_bitmap_iszero(set)) {
+    fprintf(stderr, "Did not find any valid pu%%u entry in dumped cpuid directory `%s'\n",
+	    src_cpuiddump_path);
+    return -1;
+  } else if (hwloc_bitmap_last(set) != hwloc_bitmap_weight(set) - 1) {
+    /* The x86 backend enforces a contiguous set of PUs starting at 0 so far */
+    fprintf(stderr, "Found non-contigous pu%%u range in dumped cpuid directory `%s'\n",
+	    src_cpuiddump_path);
+    return -1;
+  }
+
+  return 0;
+
+out_with_path:
+  free(path);
+out_with_dir:
+  closedir(dir);
+  return -1;
+}
+
+/* Disable callback of the x86 backend: release the APIC id bitmap, the
+ * duplicated CPUID dump path (when there is one) and the private data
+ * structure itself.
+ */
+static void
+hwloc_x86_backend_disable(struct hwloc_backend *backend)
+{
+  struct hwloc_x86_backend_data_s *priv = backend->private_data;
+
+  hwloc_bitmap_free(priv->apicid_set);
+
+  if (priv->src_cpuiddump_path != NULL) {
+    free(priv->src_cpuiddump_path);
+  }
+
+  free(priv);
+}
+
+/* Instantiate the x86 backend for the given discovery component.
+ *
+ * The private data gets its default values, then the HWLOC_CPUID_PATH
+ * environment variable is looked up: when it designates a valid CPUID dump
+ * directory, the backend is marked as not describing this system and the
+ * number of processors is taken from the dump.
+ *
+ * Returns the new backend, or NULL when an allocation fails (errno is set to
+ * ENOMEM when the private data cannot be allocated).
+ */
+static struct hwloc_backend *
+hwloc_x86_component_instantiate(struct hwloc_disc_component *component,
+				const void *_data1 __hwloc_attribute_unused,
+				const void *_data2 __hwloc_attribute_unused,
+				const void *_data3 __hwloc_attribute_unused)
+{
+  struct hwloc_backend *backend = hwloc_backend_alloc(component);
+  struct hwloc_x86_backend_data_s *data;
+  const char *dump_dir;
+
+  if (!backend)
+    return NULL;
+
+  data = malloc(sizeof(*data));
+  if (!data) {
+    errno = ENOMEM;
+    free(backend);
+    return NULL;
+  }
+
+  /* hook the private data and the callbacks into the backend */
+  backend->private_data = data;
+  backend->flags = HWLOC_BACKEND_FLAG_NEED_LEVELS;
+  backend->discover = hwloc_x86_discover;
+  backend->disable = hwloc_x86_backend_disable;
+
+  /* default values */
+  data->apicid_set = hwloc_bitmap_alloc();
+  data->apicid_unique = 1;
+  data->src_cpuiddump_path = NULL;
+
+  dump_dir = getenv("HWLOC_CPUID_PATH");
+  if (dump_dir) {
+    hwloc_bitmap_t pus = hwloc_bitmap_alloc();
+    if (hwloc_x86_check_cpuiddump_input(dump_dir, pus) == 0) {
+      /* valid dump: discovery will read it instead of the local machine */
+      backend->is_thissystem = 0;
+      data->src_cpuiddump_path = strdup(dump_dir);
+      data->nbprocs = hwloc_bitmap_weight(pus);
+    } else {
+      fprintf(stderr, "Ignoring dumped cpuid directory.\n");
+    }
+    hwloc_bitmap_free(pus);
+  }
+
+  return backend;
+}
+
+/* Discovery component descriptor of the x86 backend, named "x86"; it
+ * references hwloc_x86_component_instantiate() as its instantiate callback.
+ */
+static struct hwloc_disc_component hwloc_x86_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_CPU,
+  "x86",
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  hwloc_x86_component_instantiate,
+  45, /* between native and no_os */
+  NULL
+};
+
+/* Exported component entry wrapping hwloc_x86_disc_component as a discovery
+ * (HWLOC_COMPONENT_TYPE_DISC) component.
+ */
+const struct hwloc_component hwloc_x86_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL,
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_x86_disc_component
+};
diff --git a/ext/hwloc/hwloc/topology.c b/ext/hwloc/hwloc/topology.c
new file mode 100644
index 0000000..a67d036
--- /dev/null
+++ b/ext/hwloc/hwloc/topology.c
@@ -0,0 +1,3436 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+
+#define _ATFILE_SOURCE
+#include <assert.h>
+#include <sys/types.h>
+#ifdef HAVE_DIRENT_H
+#include <dirent.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <float.h>
+
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+#include <private/misc.h>
+
+#ifdef HAVE_MACH_MACH_INIT_H
+#include <mach/mach_init.h>
+#endif
+#ifdef HAVE_MACH_MACH_HOST_H
+#include <mach/mach_host.h>
+#endif
+
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+
+#ifdef HAVE_SYS_SYSCTL_H
+#include <sys/sysctl.h>
+#endif
+
+#ifdef HWLOC_WIN_SYS
+#include <windows.h>
+#endif
+
+/* Return the HWLOC_API_VERSION value this file was compiled with. */
+unsigned hwloc_get_api_version(void)
+{
+  return HWLOC_API_VERSION;
+}
+
+/* Return the integer value of the HWLOC_HIDE_ERRORS environment variable,
+ * or 0 when it is not set.
+ *
+ * The environment is only consulted on the first call; the result is kept
+ * in static storage and returned as is by later calls.
+ */
+int hwloc_hide_errors(void)
+{
+  static int cached_value = 0;
+  static int initialized = 0;
+  const char *env;
+
+  if (initialized)
+    return cached_value;
+
+  env = getenv("HWLOC_HIDE_ERRORS");
+  if (env != NULL)
+    cached_value = atoi(env);
+  initialized = 1;
+
+  return cached_value;
+}
+
+void hwloc_report_os_error(const char *msg, int line)
+{
+    static int reported = 0;
+
+    if (!reported && !hwloc_hide_errors()) {
+        fprintf(stderr, "****************************************************************************\n");
+        fprintf(stderr, "* hwloc %s has encountered what looks like an error from the operating system.\n", HWLOC_VERSION);
+        fprintf(stderr, "*\n");
+        fprintf(stderr, "* %s\n", msg);
+        fprintf(stderr, "* Error occurred in topology.c line %d\n", line);
+        fprintf(stderr, "*\n");
+        fprintf(stderr, "* The following FAQ entry in the hwloc documentation may help:\n");
+        fprintf(stderr, "*   What should I do when hwloc reports \"operating system\" warnings?\n");
+        fprintf(stderr, "* Otherwise please report this error message to the hwloc user's mailing list,\n");
+#ifdef HWLOC_LINUX_SYS
+        fprintf(stderr, "* along with the output+tarball generated by the hwloc-gather-topology script.\n");
+#else
+	fprintf(stderr, "* along with any relevant topology information from your platform.\n");
+#endif
+        fprintf(stderr, "****************************************************************************\n");
+        reported = 1;
+    }
+}
+
+#if defined(HAVE_SYSCTLBYNAME)
+/* Read the integer sysctl called \p name into \p *ret.
+ *
+ * The value is accepted when the size reported by sysctlbyname() is that of
+ * an int32_t or of an int64_t.
+ * Returns 0 on success, -1 when sysctlbyname() fails or when the value has
+ * another size.
+ */
+int hwloc_get_sysctlbyname(const char *name, int64_t *ret)
+{
+  /* buffer large enough for both supported widths */
+  union {
+    int32_t i32;
+    int64_t i64;
+  } n;
+  size_t size = sizeof(n);
+  if (sysctlbyname(name, &n, &size, NULL, 0))
+    return -1;
+  /* size now holds what sysctlbyname() reported, use it to pick the member */
+  switch (size) {
+    case sizeof(n.i32):
+      *ret = n.i32;
+      break;
+    case sizeof(n.i64):
+      *ret = n.i64;
+      break;
+    default:
+      return -1;
+  }
+  return 0;
+}
+#endif
+
+#if defined(HAVE_SYSCTL)
+/* Read the int-sized sysctl designated by the \p namelen entries of
+ * \p name into \p *ret.
+ *
+ * Returns 0 on success, -1 when sysctl() fails or when the reported size
+ * is not that of an int.
+ */
+int hwloc_get_sysctl(int name[], unsigned namelen, int *ret)
+{
+  int n;
+  size_t size = sizeof(n);
+  if (sysctl(name, namelen, &n, &size, NULL, 0))
+    return -1;
+  if (size != sizeof(n))
+    return -1;
+  *ret = n;
+  return 0;
+}
+#endif
+
+/* Return the OS-provided number of processors.  Unlike other methods such as
+   reading sysfs on Linux, this method is not virtualizable; thus it's only
+   used as a fall-back method, allowing virtual backends (FSROOT, etc) to
+   have the desired effect.
+
+   The first method available at build time is used.  When it reports at
+   least one processor, topology->support.discovery->pu is set to 1;
+   otherwise the function falls back to returning 1.  */
+unsigned
+hwloc_fallback_nbprocessors(struct hwloc_topology *topology) {
+  int n;
+#if HAVE_DECL__SC_NPROCESSORS_ONLN
+  n = sysconf(_SC_NPROCESSORS_ONLN);
+#elif HAVE_DECL__SC_NPROC_ONLN
+  n = sysconf(_SC_NPROC_ONLN);
+#elif HAVE_DECL__SC_NPROCESSORS_CONF
+  n = sysconf(_SC_NPROCESSORS_CONF);
+#elif HAVE_DECL__SC_NPROC_CONF
+  n = sysconf(_SC_NPROC_CONF);
+#elif defined(HAVE_HOST_INFO) && HAVE_HOST_INFO
+  struct host_basic_info info;
+  mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
+  host_info(mach_host_self(), HOST_BASIC_INFO, (integer_t*) &info, &count);
+  n = info.avail_cpus;
+#elif defined(HAVE_SYSCTLBYNAME)
+  int64_t nn;
+  if (hwloc_get_sysctlbyname("hw.ncpu", &nn))
+    nn = -1;
+  n = nn;
+#elif defined(HAVE_SYSCTL) && HAVE_DECL_CTL_HW && HAVE_DECL_HW_NCPU
+  /* The MIB name is HW_NCPU, and &n is the third argument of
+     hwloc_get_sysctl(), not the right operand of a comma expression.  */
+  static int name[2] = {CTL_HW, HW_NCPU};
+  if (hwloc_get_sysctl(name, sizeof(name)/sizeof(*name), &n))
+    n = -1;
+#elif defined(HWLOC_WIN_SYS)
+  SYSTEM_INFO sysinfo;
+  GetSystemInfo(&sysinfo);
+  n = sysinfo.dwNumberOfProcessors;
+#else
+#ifdef __GNUC__
+#warning No known way to discover number of available processors on this system
+#warning hwloc_fallback_nbprocessors will default to 1
+#endif
+  n = -1;
+#endif
+  if (n >= 1)
+    topology->support.discovery->pu = 1;
+  else
+    /* unknown or failed: pretend there is a single processor */
+    n = 1;
+  return n;
+}
+
+/*
+ * Create nb_pus PU objects with OS indexes 0 to nb_pus-1, each with a cpuset
+ * containing only its own index, and insert them in the topology by cpuset.
+ */
+void
+hwloc_setup_pu_level(struct hwloc_topology *topology,
+		     unsigned nb_pus)
+{
+  unsigned cpu = 0;
+  unsigned oscpu = 0;
+
+  hwloc_debug("%s", "\n\n * CPU cpusets *\n\n");
+
+  while (cpu < nb_pus) {
+    struct hwloc_obj *pu = hwloc_alloc_setup_object(HWLOC_OBJ_PU, oscpu);
+
+    pu->cpuset = hwloc_bitmap_alloc();
+    hwloc_bitmap_only(pu->cpuset, oscpu);
+
+    hwloc_debug_2args_bitmap("cpu %u (os %u) has cpuset %s\n",
+		 cpu, oscpu, pu->cpuset);
+    hwloc_insert_object_by_cpuset(topology, pu);
+
+    /* logical and OS indexes advance together */
+    cpu++;
+    oscpu++;
+  }
+}
+
+#ifdef HWLOC_DEBUG
+/* Just for debugging.
+ * Print one line describing obj, indented by 2*indent columns: type, OS
+ * index, attributes, name, the cpusets and nodesets that are set, and the
+ * arity when it is not zero.
+ */
+static void
+hwloc_debug_print_object(int indent __hwloc_attribute_unused, hwloc_obj_t obj)
+{
+  /* idx holds '#', up to 10 decimal digits of an unsigned and the final NUL */
+  char type[64], idx[12], attr[1024], *cpuset = NULL;
+  hwloc_debug("%*s", 2*indent, "");
+  hwloc_obj_type_snprintf(type, sizeof(type), obj, 1);
+  if (obj->os_index != (unsigned) -1)
+    snprintf(idx, sizeof(idx), "#%u", obj->os_index);
+  else
+    *idx = '\0';
+  hwloc_obj_attr_snprintf(attr, sizeof(attr), obj, " ", 1);
+  /* attributes are only wrapped in parentheses when there are some */
+  hwloc_debug("%s%s%s%s%s", type, idx, *attr ? "(" : "", attr, *attr ? ")" : "");
+  if (obj->name)
+    hwloc_debug(" name %s", obj->name);
+  if (obj->cpuset) {
+    hwloc_bitmap_asprintf(&cpuset, obj->cpuset);
+    hwloc_debug(" cpuset %s", cpuset);
+    free(cpuset);
+  }
+  if (obj->complete_cpuset) {
+    hwloc_bitmap_asprintf(&cpuset, obj->complete_cpuset);
+    hwloc_debug(" complete %s", cpuset);
+    free(cpuset);
+  }
+  if (obj->allowed_cpuset) {
+    hwloc_bitmap_asprintf(&cpuset, obj->allowed_cpuset);
+    hwloc_debug(" allowed %s", cpuset);
+    free(cpuset);
+  }
+  if (obj->nodeset) {
+    hwloc_bitmap_asprintf(&cpuset, obj->nodeset);
+    hwloc_debug(" nodeset %s", cpuset);
+    free(cpuset);
+  }
+  if (obj->complete_nodeset) {
+    hwloc_bitmap_asprintf(&cpuset, obj->complete_nodeset);
+    hwloc_debug(" completeN %s", cpuset);
+    free(cpuset);
+  }
+  if (obj->allowed_nodeset) {
+    hwloc_bitmap_asprintf(&cpuset, obj->allowed_nodeset);
+    hwloc_debug(" allowedN %s", cpuset);
+    free(cpuset);
+  }
+  if (obj->arity)
+    hwloc_debug(" arity %u", obj->arity);
+  hwloc_debug("%s", "\n");
+}
+
+/* Print obj, then recursively its normal, I/O and Misc children (in that
+ * order) with one more level of indentation.
+ */
+static void
+hwloc_debug_print_objects(int indent __hwloc_attribute_unused, hwloc_obj_t obj)
+{
+  hwloc_obj_t heads[3];
+  hwloc_obj_t cur;
+  unsigned k;
+
+  hwloc_debug_print_object(indent, obj);
+
+  heads[0] = obj->first_child;
+  heads[1] = obj->io_first_child;
+  heads[2] = obj->misc_first_child;
+
+  for (k = 0; k < 3; k++) {
+    cur = heads[k];
+    while (cur) {
+      hwloc_debug_print_objects(indent + 1, cur);
+      cur = cur->next_sibling;
+    }
+  }
+}
+#else /* !HWLOC_DEBUG */
+/* Without HWLOC_DEBUG the printing helpers expand to empty statements. */
+#define hwloc_debug_print_object(indent, obj) do { /* nothing */ } while (0)
+#define hwloc_debug_print_objects(indent, obj) do { /* nothing */ } while (0)
+#endif /* !HWLOC_DEBUG */
+
+/* Free the name and value strings of the first count entries of infos,
+ * then the infos array itself.
+ */
+void hwloc__free_infos(struct hwloc_obj_info_s *infos, unsigned count)
+{
+  struct hwloc_obj_info_s *entry = infos;
+  unsigned remaining;
+
+  for (remaining = count; remaining > 0; remaining--, entry++) {
+    free(entry->name);
+    free(entry->value);
+  }
+
+  free(infos);
+}
+
+/* Append a copy of the (name, value) pair to the array *infosp, which
+ * currently holds *countp entries.  value may be NULL.
+ *
+ * On success *infosp and *countp are updated.  When memory cannot be
+ * allocated the new info is silently dropped: *countp is left unchanged and
+ * the existing entries stay valid (*infosp may still have been updated to
+ * the reallocated array).
+ */
+void hwloc__add_info(struct hwloc_obj_info_s **infosp, unsigned *countp, const char *name, const char *value)
+{
+  unsigned count = *countp;
+  struct hwloc_obj_info_s *infos = *infosp;
+  struct hwloc_obj_info_s *tmpinfos;
+  char *newname;
+  char *newvalue = NULL;
+#define OBJECT_INFO_ALLOC 8
+  /* nothing allocated initially, (re-)allocate by multiple of 8 */
+  unsigned alloccount = (count + 1 + (OBJECT_INFO_ALLOC-1)) & ~(OBJECT_INFO_ALLOC-1);
+
+  /* alloccount is always larger than count, so the array is always passed
+   * through realloc(); keep the old pointer until we know it succeeded.
+   */
+  tmpinfos = realloc(infos, alloccount*sizeof(*infos));
+  if (!tmpinfos)
+    /* failed to allocate, ignore this info */
+    return;
+  infos = tmpinfos;
+  /* the old array pointer may be stale now, publish the new one right away */
+  *infosp = infos;
+
+  newname = strdup(name);
+  if (!newname)
+    return;
+  if (value) {
+    newvalue = strdup(value);
+    if (!newvalue) {
+      free(newname);
+      return;
+    }
+  }
+
+  infos[count].name = newname;
+  infos[count].value = newvalue;
+  *countp = count+1;
+}
+
+/* Return the address of the value pointer of the info called name in
+ * *infosp.  When there is no such info yet, one is appended with a NULL
+ * value and the address of that new value pointer is returned.
+ *
+ * Returns NULL when the info did not exist and could not be appended.
+ */
+char ** hwloc__find_info_slot(struct hwloc_obj_info_s **infosp, unsigned *countp, const char *name)
+{
+  unsigned oldcount = *countp;
+  unsigned i;
+  for(i=0; i<oldcount; i++) {
+    if (!strcmp((*infosp)[i].name, name))
+      return &(*infosp)[i].value;
+  }
+  hwloc__add_info(infosp, countp, name, NULL);
+  if (*countp == oldcount)
+    /* nothing was appended, there is no slot to return */
+    return NULL;
+  return &(*infosp)[*countp-1].value;
+}
+
+/* Move all the infos of the source array to the end of the destination
+ * array.  The name and value strings are transferred, not copied.
+ *
+ * The source array is always freed and *src_infosp / *src_countp are reset
+ * to NULL / 0.  When the destination cannot be enlarged, the source infos
+ * are dropped (their strings are freed) and the destination is unchanged.
+ */
+void hwloc__move_infos(struct hwloc_obj_info_s **dst_infosp, unsigned *dst_countp,
+		       struct hwloc_obj_info_s **src_infosp, unsigned *src_countp)
+{
+  unsigned dst_count = *dst_countp;
+  struct hwloc_obj_info_s *dst_infos = *dst_infosp;
+  unsigned src_count = *src_countp;
+  struct hwloc_obj_info_s *src_infos = *src_infosp;
+  unsigned i;
+#define OBJECT_INFO_ALLOC 8
+  /* nothing allocated initially, (re-)allocate by multiple of 8 */
+  unsigned alloccount = (dst_count + src_count + (OBJECT_INFO_ALLOC-1)) & ~(OBJECT_INFO_ALLOC-1);
+  if (dst_count != alloccount) {
+    struct hwloc_obj_info_s *tmp_infos = realloc(dst_infos, alloccount*sizeof(*dst_infos));
+    if (!tmp_infos) {
+      /* failed to enlarge the destination (still valid), drop the source infos */
+      for(i=0; i<src_count; i++) {
+	free(src_infos[i].name);
+	free(src_infos[i].value);
+      }
+      goto out_with_src;
+    }
+    dst_infos = tmp_infos;
+  }
+  for(i=0; i<src_count; i++, dst_count++) {
+    dst_infos[dst_count].name = src_infos[i].name;
+    dst_infos[dst_count].value = src_infos[i].value;
+  }
+  *dst_infosp = dst_infos;
+  *dst_countp = dst_count;
+
+ out_with_src:
+  free(src_infos);
+  *src_infosp = NULL;
+  *src_countp = 0;
+}
+
+/* Append the (name, value) info pair to obj through hwloc__add_info(). */
+void hwloc_obj_add_info(hwloc_obj_t obj, const char *name, const char *value)
+{
+  hwloc__add_info(&obj->infos, &obj->infos_count, name, value);
+}
+
+/* Append the (name, value) info pair to obj, except when nodup is non-zero
+ * and hwloc_obj_get_info_by_name() already finds an info called name.
+ */
+void hwloc_obj_add_info_nodup(hwloc_obj_t obj, const char *name, const char *value, int nodup)
+{
+  if (!nodup || !hwloc_obj_get_info_by_name(obj, name)) {
+    hwloc__add_info(&obj->infos, &obj->infos_count, name, value);
+  }
+}
+
+/* Return non-zero when type lies between HWLOC_OBJ_MISC and
+ * HWLOC_OBJ_OS_DEVICE (inclusive).  The build-time assertions check that
+ * Misc, Bridge, PCI device and OS device are consecutive enum values, which
+ * is what makes the range test valid.
+ */
+static int hwloc_obj_type_is_special (hwloc_obj_type_t type)
+{
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_MISC + 1 == HWLOC_OBJ_BRIDGE);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_BRIDGE + 1 == HWLOC_OBJ_PCI_DEVICE);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_PCI_DEVICE + 1 == HWLOC_OBJ_OS_DEVICE);
+  return type >= HWLOC_OBJ_MISC && type <= HWLOC_OBJ_OS_DEVICE;
+}
+/* Return non-zero when type lies between HWLOC_OBJ_BRIDGE and
+ * HWLOC_OBJ_OS_DEVICE (inclusive).  The build-time assertions check that
+ * Bridge, PCI device and OS device are consecutive enum values.
+ */
+static int hwloc_obj_type_is_io (hwloc_obj_type_t type)
+{
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_BRIDGE + 1 == HWLOC_OBJ_PCI_DEVICE);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_PCI_DEVICE + 1 == HWLOC_OBJ_OS_DEVICE);
+  return type >= HWLOC_OBJ_BRIDGE && type <= HWLOC_OBJ_OS_DEVICE;
+}
+
+/* Traverse the children of a parent in a way that tolerates the removal of
+ * the current child by the loop body: pchild always points to the link
+ * through which child was reached, and it only advances to
+ * &child->next_sibling when that link still designates child once the body
+ * has run.  Otherwise the same link is read again to get the next child.
+ *
+ * The three variants walk the first_child, io_first_child and
+ * misc_first_child lists respectively.
+ */
+#define for_each_child_safe(child, parent, pchild) \
+  for (pchild = &(parent)->first_child, child = *pchild; \
+       child; \
+       /* Check whether the current child was not dropped.  */ \
+       (*pchild == child ? pchild = &(child->next_sibling) : NULL), \
+       /* Get pointer to next child.  */ \
+        child = *pchild)
+#define for_each_io_child_safe(child, parent, pchild) \
+  for (pchild = &(parent)->io_first_child, child = *pchild; \
+       child; \
+       /* Check whether the current child was not dropped.  */ \
+       (*pchild == child ? pchild = &(child->next_sibling) : NULL), \
+       /* Get pointer to next child.  */ \
+        child = *pchild)
+#define for_each_misc_child_safe(child, parent, pchild) \
+  for (pchild = &(parent)->misc_first_child, child = *pchild; \
+       child; \
+       /* Check whether the current child was not dropped.  */ \
+       (*pchild == child ? pchild = &(child->next_sibling) : NULL), \
+       /* Get pointer to next child.  */ \
+        child = *pchild)
+
+/* Release an object that is no longer linked in any list, together with
+ * everything it owns: infos, distances, page types, attributes, children
+ * array, name and all its cpusets and nodesets.
+ */
+void
+hwloc_free_unlinked_object(hwloc_obj_t obj)
+{
+  /* no type-specific content to release */
+
+  hwloc__free_infos(obj->infos, obj->infos_count);
+  hwloc_clear_object_distances(obj);
+
+  /* plain heap buffers */
+  free(obj->memory.page_types);
+  free(obj->attr);
+  free(obj->children);
+  free(obj->name);
+
+  /* CPU sets */
+  hwloc_bitmap_free(obj->cpuset);
+  hwloc_bitmap_free(obj->complete_cpuset);
+  hwloc_bitmap_free(obj->allowed_cpuset);
+
+  /* node sets */
+  hwloc_bitmap_free(obj->nodeset);
+  hwloc_bitmap_free(obj->complete_nodeset);
+  hwloc_bitmap_free(obj->allowed_nodeset);
+
+  free(obj);
+}
+
+/* Store the (non-empty) sibling list starting at firstnew into *firstp and
+ * make newparent the parent of each of its elements.
+ * Returns the address of the next_sibling pointer of the last element.
+ */
+static hwloc_obj_t *
+insert_siblings_list(hwloc_obj_t *firstp, hwloc_obj_t firstnew, hwloc_obj_t newparent)
+{
+  hwloc_obj_t cur;
+  hwloc_obj_t last = NULL;
+
+  assert(firstnew);
+  *firstp = firstnew;
+
+  for (cur = firstnew; cur; cur = cur->next_sibling) {
+    cur->parent = newparent;
+    last = cur;
+  }
+
+  return &last->next_sibling;
+}
+
+/* Append the sibling list starting at firstnew to the end of the list
+ * whose head pointer is *firstp, and make newparent the parent of each
+ * appended element.
+ */
+static void
+append_siblings_list(hwloc_obj_t *firstp, hwloc_obj_t firstnew, hwloc_obj_t newparent)
+{
+  hwloc_obj_t *tailp = firstp;
+  hwloc_obj_t cur;
+
+  /* walk to the terminating NULL link of the existing list */
+  while (*tailp)
+    tailp = &(*tailp)->next_sibling;
+  *tailp = firstnew;
+
+  /* reparent what was just attached */
+  cur = firstnew;
+  while (cur) {
+    cur->parent = newparent;
+    cur = cur->next_sibling;
+  }
+}
+
+/* Remove an object from its parent and free it.
+ * Only updates next_sibling/first_child pointers,
+ * so may only be used during early discovery.
+ *
+ * pparent is the address of the link (first-child or next_sibling pointer)
+ * that currently designates the object to remove.
+ *
+ * Children are inserted in the parent.
+ * If children should be inserted somewhere else (e.g. when merging with a child),
+ * the caller should move them before calling this function.
+ */
+static void
+unlink_and_free_single_object(hwloc_obj_t *pparent)
+{
+  hwloc_obj_t old = *pparent;
+  hwloc_obj_t *lastp;
+
+  if (old->type == HWLOC_OBJ_MISC) {
+    /* Misc object */
+
+    /* no normal children */
+    assert(!old->first_child);
+
+    /* no I/O children */
+    assert(!old->io_first_child);
+
+    if (old->misc_first_child)
+      /* insert old misc object children as new siblings below parent instead of old */
+      lastp = insert_siblings_list(pparent, old->misc_first_child, old->parent);
+    else
+      lastp = pparent;
+    /* append old siblings back */
+    *lastp = old->next_sibling;
+
+  } else if (hwloc_obj_type_is_io(old->type)) {
+    /* I/O object */
+
+    /* no normal children */
+    assert(!old->first_child);
+
+    if (old->io_first_child)
+      /* insert old I/O object children as new siblings below parent instead of old */
+      lastp = insert_siblings_list(pparent, old->io_first_child, old->parent);
+    else
+      lastp = pparent;
+    /* append old siblings back */
+    *lastp = old->next_sibling;
+
+    /* append old Misc children to parent */
+    if (old->misc_first_child)
+      append_siblings_list(&old->parent->misc_first_child, old->misc_first_child, old->parent);
+
+  } else {
+    /* Normal object */
+
+    if (old->first_child)
+      /* insert old object children as new siblings below parent instead of old */
+      lastp = insert_siblings_list(pparent, old->first_child, old->parent);
+    else
+      lastp = pparent;
+    /* append old siblings back */
+    *lastp = old->next_sibling;
+
+    /* append old I/O and Misc children to parent
+     * old->parent cannot be NULL (removing root), misc children should have been moved by the caller earlier.
+     */
+    if (old->io_first_child)
+      append_siblings_list(&old->parent->io_first_child, old->io_first_child, old->parent);
+    if (old->misc_first_child)
+      append_siblings_list(&old->parent->misc_first_child, old->misc_first_child, old->parent);
+  }
+
+  /* old is not referenced by any list anymore, release it */
+  hwloc_free_unlinked_object(old);
+}
+
+/* Remove an object and its children from its parent and free them.
+ * Only updates next_sibling/first_child pointers,
+ * so may only be used during early discovery.
+ *
+ * pobj is the address of the link that currently designates the object;
+ * on return it designates the object's former next sibling.
+ */
+static void
+unlink_and_free_object_and_children(hwloc_obj_t *pobj)
+{
+  hwloc_obj_t obj = *pobj, child, *pchild;
+
+  /* Each recursive call removes the child that pchild designates, which is
+   * why the deletion-tolerant iteration macros are used here.
+   */
+  for_each_child_safe(child, obj, pchild)
+    unlink_and_free_object_and_children(pchild);
+  for_each_io_child_safe(child, obj, pchild)
+    unlink_and_free_object_and_children(pchild);
+  for_each_misc_child_safe(child, obj, pchild)
+    unlink_and_free_object_and_children(pchild);
+
+  /* bypass obj in its sibling list before freeing it */
+  *pobj = obj->next_sibling;
+  hwloc_free_unlinked_object(obj);
+}
+
+/* Copy the contents of src into the already allocated object newobj:
+ * type, OS index, name, userdata pointer, memory attributes (with a private
+ * copy of the page types), type attributes, cpusets, nodesets and infos.
+ *
+ * Links between objects (parent, children, siblings) are not touched, and
+ * distances are not duplicated.  When the page types array cannot be
+ * allocated, the copy gets no page types instead of crashing.
+ */
+static void
+hwloc__duplicate_object(struct hwloc_obj *newobj,
+			struct hwloc_obj *src)
+{
+  size_t len;
+  unsigned i;
+
+  newobj->type = src->type;
+  newobj->os_index = src->os_index;
+
+  if (src->name)
+    newobj->name = strdup(src->name);
+  /* the userdata pointer itself is shared, not what it points to */
+  newobj->userdata = src->userdata;
+
+  memcpy(&newobj->memory, &src->memory, sizeof(struct hwloc_obj_memory_s));
+  /* The structure copy above also copied the page_types pointer of src:
+   * never keep it, both objects would free the same array.
+   */
+  newobj->memory.page_types = NULL;
+  if (src->memory.page_types_len) {
+    len = src->memory.page_types_len * sizeof(struct hwloc_obj_memory_page_type_s);
+    newobj->memory.page_types = malloc(len);
+    if (newobj->memory.page_types)
+      memcpy(newobj->memory.page_types, src->memory.page_types, len);
+    else
+      /* allocation failed, keep the length consistent with the NULL array */
+      newobj->memory.page_types_len = 0;
+  }
+
+  memcpy(newobj->attr, src->attr, sizeof(*newobj->attr));
+
+  newobj->cpuset = hwloc_bitmap_dup(src->cpuset);
+  newobj->complete_cpuset = hwloc_bitmap_dup(src->complete_cpuset);
+  newobj->allowed_cpuset = hwloc_bitmap_dup(src->allowed_cpuset);
+  newobj->nodeset = hwloc_bitmap_dup(src->nodeset);
+  newobj->complete_nodeset = hwloc_bitmap_dup(src->complete_nodeset);
+  newobj->allowed_nodeset = hwloc_bitmap_dup(src->allowed_nodeset);
+
+  /* don't duplicate distances, they'll be recreated at the end of the topology build */
+
+  for(i=0; i<src->infos_count; i++)
+    hwloc__add_info(&newobj->infos, &newobj->infos_count, src->infos[i].name, src->infos[i].value);
+}
+
+/* Recursively duplicate the subtree rooted at src and insert the copy
+ * below newparent in newtopology.  The normal, I/O and Misc children of
+ * src are duplicated (in that order) below the copy before the copy itself
+ * is inserted.
+ */
+void
+hwloc__duplicate_objects(struct hwloc_topology *newtopology,
+			 struct hwloc_obj *newparent,
+			 struct hwloc_obj *src)
+{
+  hwloc_obj_t copy = hwloc_alloc_setup_object(src->type, src->os_index);
+  hwloc_obj_t heads[3];
+  hwloc_obj_t cur;
+  unsigned k;
+
+  hwloc__duplicate_object(copy, src);
+
+  heads[0] = src->first_child;
+  heads[1] = src->io_first_child;
+  heads[2] = src->misc_first_child;
+
+  for (k = 0; k < 3; k++) {
+    cur = heads[k];
+    while (cur) {
+      hwloc__duplicate_objects(newtopology, copy, cur);
+      cur = cur->next_sibling;
+    }
+  }
+
+  /* no need to check the children order here, the source topology
+   * is supposed to be OK already, and we have debug asserts.
+   */
+  hwloc_insert_object_by_parent(newtopology, newparent, copy);
+}
+
+/* Duplicate the loaded topology old into a newly initialized topology
+ * returned in *newp.
+ *
+ * Flags, ignored types, binding hooks, support arrays, userdata callbacks,
+ * the whole object tree and the OS distances are copied; backends are not.
+ *
+ * Returns 0 on success.  Returns -1 with errno set to EINVAL when old is
+ * not loaded, and -1 when the new topology cannot be initialized, when a
+ * distance structure cannot be allocated or when levels cannot be connected.
+ */
+int
+hwloc_topology_dup(hwloc_topology_t *newp,
+		   hwloc_topology_t old)
+{
+  hwloc_topology_t new;
+  hwloc_obj_t newroot;
+  hwloc_obj_t oldroot = hwloc_get_root_obj(old);
+  hwloc_obj_t child;
+
+  if (!old->is_loaded) {
+    /* errno values are positive */
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (hwloc_topology_init(&new) < 0)
+    return -1;
+
+  new->flags = old->flags;
+  memcpy(new->ignored_types, old->ignored_types, sizeof(old->ignored_types));
+  new->is_thissystem = old->is_thissystem;
+  new->is_loaded = 1;
+  new->pid = old->pid;
+
+  memcpy(&new->binding_hooks, &old->binding_hooks, sizeof(old->binding_hooks));
+
+  memcpy(new->support.discovery, old->support.discovery, sizeof(*old->support.discovery));
+  memcpy(new->support.cpubind, old->support.cpubind, sizeof(*old->support.cpubind));
+  memcpy(new->support.membind, old->support.membind, sizeof(*old->support.membind));
+
+  new->userdata_export_cb = old->userdata_export_cb;
+  new->userdata_import_cb = old->userdata_import_cb;
+
+  /* the root already exists in the new topology, only fill it */
+  newroot = hwloc_get_root_obj(new);
+  hwloc__duplicate_object(newroot, oldroot);
+
+  for(child = oldroot->first_child; child; child = child->next_sibling)
+    hwloc__duplicate_objects(new, newroot, child);
+  for(child = oldroot->io_first_child; child; child = child->next_sibling)
+    hwloc__duplicate_objects(new, newroot, child);
+  for(child = oldroot->misc_first_child; child; child = child->next_sibling)
+    hwloc__duplicate_objects(new, newroot, child);
+
+  if (old->first_osdist) {
+    struct hwloc_os_distances_s *olddist = old->first_osdist;
+    while (olddist) {
+      struct hwloc_os_distances_s *newdist = malloc(sizeof(*newdist));
+      if (!newdist)
+	goto out;
+      newdist->type = olddist->type;
+      newdist->nbobjs = olddist->nbobjs;
+      newdist->indexes = malloc(newdist->nbobjs * sizeof(*newdist->indexes));
+      newdist->distances = malloc(newdist->nbobjs * newdist->nbobjs * sizeof(*newdist->distances));
+      /* malloc(0) may legitimately return NULL, only fail for real sizes */
+      if (newdist->nbobjs && (!newdist->indexes || !newdist->distances)) {
+	free(newdist->indexes);
+	free(newdist->distances);
+	free(newdist);
+	goto out;
+      }
+      memcpy(newdist->indexes, olddist->indexes, newdist->nbobjs * sizeof(*newdist->indexes));
+      newdist->objs = NULL; /* will be recomputed when needed */
+      memcpy(newdist->distances, olddist->distances, newdist->nbobjs * newdist->nbobjs * sizeof(*newdist->distances));
+
+      newdist->forced = olddist->forced;
+      /* append to the list of the new topology */
+      if (new->first_osdist) {
+	new->last_osdist->next = newdist;
+	newdist->prev = new->last_osdist;
+      } else {
+	new->first_osdist = newdist;
+	newdist->prev = NULL;
+      }
+      new->last_osdist = newdist;
+      newdist->next = NULL;
+
+      olddist = olddist->next;
+    }
+  } else
+    /* only the new topology is modified, the source one is left untouched */
+    new->first_osdist = new->last_osdist = NULL;
+
+  /* no need to duplicate backends, topology is already loaded */
+  new->backends = NULL;
+
+  hwloc_connect_children(new->levels[0][0]);
+  if (hwloc_connect_levels(new) < 0)
+    goto out;
+  new->modified = 0;
+
+  hwloc_distances_finalize_os(new);
+  hwloc_distances_finalize_logical(new);
+
+#ifndef HWLOC_DEBUG
+  if (getenv("HWLOC_DEBUG_CHECK"))
+#endif
+    hwloc_topology_check(new);
+
+  *newp = new;
+  return 0;
+
+ out:
+  /* NOTE(review): the new topology is reset to its defaults here but it is
+   * neither destroyed nor returned to the caller -- confirm whether it
+   * should be destroyed instead.
+   */
+  hwloc_topology_clear(new);
+  hwloc_distances_destroy(new);
+  hwloc_topology_setup_defaults(new);
+  return -1;
+}
+
+/*
+ * How to compare objects based on types.
+ *
+ * Note that HIGHER/DEEPER is only a (consistent) heuristic, used to sort
+ * objects with same cpuset consistently.
+ * Only EQUAL / not EQUAL can be relied upon.
+ */
+
+/* Result of hwloc_type_cmp(obj1, obj2), describing obj1 relatively to obj2. */
+enum hwloc_type_cmp_e {
+  HWLOC_TYPE_HIGHER,	/* obj1 sorts above obj2 */
+  HWLOC_TYPE_DEEPER,	/* obj1 sorts below obj2 */
+  HWLOC_TYPE_EQUAL	/* same level, or the types cannot be ordered */
+};
+
+/* WARNING: The indexes of this array MUST match the ordering of
+   the obj_order_type[] array, below.  Specifically, the values must
+   be laid out such that:
+
+       obj_order_type[obj_type_order[N]] = N
+
+   for all HWLOC_OBJ_* values of N.  Put differently:
+
+       obj_type_order[A] = B
+
+   where the A values are in order of the hwloc_obj_type_t enum, and
+   the B values are the corresponding indexes of obj_order_type.
+
+   We can't use C99 syntax to initialize this in a little safer manner
+   -- bummer.  :-(
+
+   *************************************************************
+   *** DO NOT CHANGE THE ORDERING OF THIS ARRAY WITHOUT TRIPLE
+   *** CHECKING ITS CORRECTNESS!
+   *************************************************************
+   */
+static const unsigned obj_type_order[] = {
+    /* first entry is HWLOC_OBJ_SYSTEM */  0,
+    /* next entry is HWLOC_OBJ_MACHINE */  1,
+    /* next entry is HWLOC_OBJ_NUMANODE */ 3,
+    /* next entry is HWLOC_OBJ_PACKAGE */  4,
+    /* next entry is HWLOC_OBJ_CACHE */    5,
+    /* next entry is HWLOC_OBJ_CORE */     6,
+    /* next entry is HWLOC_OBJ_PU */       10,
+    /* next entry is HWLOC_OBJ_GROUP */    2,
+    /* next entry is HWLOC_OBJ_MISC */     11,
+    /* next entry is HWLOC_OBJ_BRIDGE */   7,
+    /* next entry is HWLOC_OBJ_PCI_DEVICE */  8,
+    /* next entry is HWLOC_OBJ_OS_DEVICE */   9
+};
+
+/* Object types listed by increasing order value: entry B is the type whose
+ * obj_type_order[] value is B.  Must be kept in sync with obj_type_order[].
+ */
+static const hwloc_obj_type_t obj_order_type[] = {
+  HWLOC_OBJ_SYSTEM,
+  HWLOC_OBJ_MACHINE,
+  HWLOC_OBJ_GROUP,
+  HWLOC_OBJ_NUMANODE,
+  HWLOC_OBJ_PACKAGE,
+  HWLOC_OBJ_CACHE,
+  HWLOC_OBJ_CORE,
+  HWLOC_OBJ_BRIDGE,
+  HWLOC_OBJ_PCI_DEVICE,
+  HWLOC_OBJ_OS_DEVICE,
+  HWLOC_OBJ_PU,
+  HWLOC_OBJ_MISC,
+};
+
+/* priority to be used when merging identical parent/children object
+ * (in merge_useless_child), keep the highest priority one.
+ *
+ * As encoded in the table below, from the highest priority to the lowest:
+ * NUMANode/PU/PCIDev/OSDev (100)
+ * then Machine (90)
+ * then System (80)
+ * then Core (60)
+ * then Package (40)
+ * then Cache (20)
+ * then always drop Group/Misc/Bridge (0).
+ *
+ * NOTE(review): this comment used to list Machine in the top tier and Node
+ * together with System, which does not match the values -- confirm which
+ * one is intended.
+ *
+ * Some type won't actually ever be involved in such merging.
+ */
+static const int obj_type_priority[] = {
+  /* first entry is HWLOC_OBJ_SYSTEM */     80,
+  /* next entry is HWLOC_OBJ_MACHINE */     90,
+  /* next entry is HWLOC_OBJ_NUMANODE */    100,
+  /* next entry is HWLOC_OBJ_PACKAGE */     40,
+  /* next entry is HWLOC_OBJ_CACHE */       20,
+  /* next entry is HWLOC_OBJ_CORE */        60,
+  /* next entry is HWLOC_OBJ_PU */          100,
+  /* next entry is HWLOC_OBJ_GROUP */       0,
+  /* next entry is HWLOC_OBJ_MISC */        0,
+  /* next entry is HWLOC_OBJ_BRIDGE */      0,
+  /* next entry is HWLOC_OBJ_PCI_DEVICE */  100,
+  /* next entry is HWLOC_OBJ_OS_DEVICE */   100
+};
+
+/* Return the obj_type_order[] value of type.  The table is indexed
+ * directly, type is not range-checked.
+ */
+static unsigned __hwloc_attribute_const
+hwloc_get_type_order(hwloc_obj_type_t type)
+{
+  return obj_type_order[type];
+}
+
+#if !defined(NDEBUG)
+/* Reverse of hwloc_get_type_order(): return the obj_order_type[] entry at
+ * index order (not range-checked).  Only compiled when NDEBUG is not defined.
+ */
+static hwloc_obj_type_t hwloc_get_order_type(int order)
+{
+  return obj_order_type[order];
+}
+#endif
+
+/* Compare two object types.
+ *
+ * Returns HWLOC_TYPE_UNORDERED when exactly one of the types is an I/O type
+ * and the other one is neither System nor Machine.  Otherwise returns the
+ * difference between the order values of type1 and type2.
+ */
+int hwloc_compare_types (hwloc_obj_type_t type1, hwloc_obj_type_t type2)
+{
+  int io1 = hwloc_obj_type_is_io(type1);
+  int io2 = hwloc_obj_type_is_io(type2);
+
+  /* I/O are only comparable with each others and with machine and system */
+  if (io1 != io2) {
+    hwloc_obj_type_t other = io1 ? type2 : type1;
+    if (other != HWLOC_OBJ_SYSTEM && other != HWLOC_OBJ_MACHINE)
+      return HWLOC_TYPE_UNORDERED;
+  }
+
+  return hwloc_get_type_order(type1) - hwloc_get_type_order(type2);
+}
+
+/* Compare two objects by type, then by the depth attribute of their type
+ * for caches, groups and bridges.
+ *
+ * Returns HWLOC_TYPE_HIGHER or HWLOC_TYPE_DEEPER to tell where obj1 sorts
+ * relatively to obj2, and HWLOC_TYPE_EQUAL when no difference is found or
+ * when the types cannot be ordered.
+ */
+static enum hwloc_type_cmp_e
+hwloc_type_cmp(hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+  hwloc_obj_type_t type1 = obj1->type;
+  hwloc_obj_type_t type2 = obj2->type;
+  int compare;
+
+  compare = hwloc_compare_types(type1, type2);
+  if (compare == HWLOC_TYPE_UNORDERED)
+    return HWLOC_TYPE_EQUAL; /* we cannot do better */
+  if (compare > 0)
+    return HWLOC_TYPE_DEEPER;
+  if (compare < 0)
+    return HWLOC_TYPE_HIGHER;
+
+  /* from here on the types compared equal, so type1 stands for both objects */
+
+  /* Caches have the same types but can have different depths.  */
+  if (type1 == HWLOC_OBJ_CACHE) {
+    /* a smaller cache depth sorts deeper */
+    if (obj1->attr->cache.depth < obj2->attr->cache.depth)
+      return HWLOC_TYPE_DEEPER;
+    else if (obj1->attr->cache.depth > obj2->attr->cache.depth)
+      return HWLOC_TYPE_HIGHER;
+    else if (obj1->attr->cache.type > obj2->attr->cache.type)
+      /* consider icache deeper than dcache and dcache deeper than unified */
+      return HWLOC_TYPE_DEEPER;
+    else if (obj1->attr->cache.type < obj2->attr->cache.type)
+      /* same rule as above, seen from the other side */
+      return HWLOC_TYPE_HIGHER;
+  }
+
+  /* Group objects have the same types but can have different depths.  */
+  if (type1 == HWLOC_OBJ_GROUP) {
+    /* a group depth of (unsigned) -1 on either side prevents ordering the groups */
+    if (obj1->attr->group.depth == (unsigned) -1
+	|| obj2->attr->group.depth == (unsigned) -1)
+      return HWLOC_TYPE_EQUAL;
+    if (obj1->attr->group.depth < obj2->attr->group.depth)
+      return HWLOC_TYPE_DEEPER;
+    else if (obj1->attr->group.depth > obj2->attr->group.depth)
+      return HWLOC_TYPE_HIGHER;
+  }
+
+  /* Bridges objects have the same types but can have different depths.  */
+  if (type1 == HWLOC_OBJ_BRIDGE) {
+    if (obj1->attr->bridge.depth < obj2->attr->bridge.depth)
+      return HWLOC_TYPE_DEEPER;
+    else if (obj1->attr->bridge.depth > obj2->attr->bridge.depth)
+      return HWLOC_TYPE_HIGHER;
+  }
+
+  return HWLOC_TYPE_EQUAL;
+}
+
+/*
+ * How to compare objects based on cpusets.
+ *
+ * Each value aliases the matching HWLOC_BITMAP_* constant, so that results of
+ * hwloc_bitmap_compare_inclusion() can be compared against these directly
+ * (as hwloc_obj_cmp_sets() does).
+ */
+
+enum hwloc_obj_cmp_e {
+  HWLOC_OBJ_EQUAL = HWLOC_BITMAP_EQUAL,			/**< \brief Equal */
+  HWLOC_OBJ_INCLUDED = HWLOC_BITMAP_INCLUDED,		/**< \brief Strictly included into */
+  HWLOC_OBJ_CONTAINS = HWLOC_BITMAP_CONTAINS,		/**< \brief Strictly contains */
+  HWLOC_OBJ_INTERSECTS = HWLOC_BITMAP_INTERSECTS,	/**< \brief Intersects, but no inclusion! */
+  HWLOC_OBJ_DIFFERENT = HWLOC_BITMAP_DIFFERENT		/**< \brief No intersection */
+};
+
+/* Compare OBJ1 and OBJ2 by their cpusets first, then by their nodesets,
+ * and combine both results into a single enum hwloc_obj_cmp_e value.
+ *
+ * The complete_* sets are used when both objects have them, otherwise the
+ * main sets are used. A pair of sets is only compared when both sets exist
+ * and are non-empty.
+ *
+ * Returns HWLOC_OBJ_DIFFERENT when neither pair of sets could be compared,
+ * and HWLOC_OBJ_INTERSECTS as soon as one pair intersects or when cpusets
+ * and nodesets report opposite inclusions.
+ */
+static int
+hwloc_obj_cmp_sets(hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+  hwloc_bitmap_t set1, set2;
+  int res = HWLOC_OBJ_DIFFERENT;
+
+  assert(!hwloc_obj_type_is_special(obj1->type));
+  assert(!hwloc_obj_type_is_special(obj2->type));
+
+  /* compare cpusets first */
+  if (obj1->complete_cpuset && obj2->complete_cpuset) {
+    set1 = obj1->complete_cpuset;
+    set2 = obj2->complete_cpuset;
+  } else {
+    set1 = obj1->cpuset;
+    set2 = obj2->cpuset;
+  }
+  if (set1 && set2 && !hwloc_bitmap_iszero(set1) && !hwloc_bitmap_iszero(set2)) {
+    res = hwloc_bitmap_compare_inclusion(set1, set2);
+    /* intersecting cpusets are final, nodesets cannot improve that */
+    if (res == HWLOC_OBJ_INTERSECTS)
+      return HWLOC_OBJ_INTERSECTS;
+  }
+
+  /* then compare nodesets, and combine the results */
+  if (obj1->complete_nodeset && obj2->complete_nodeset) {
+    set1 = obj1->complete_nodeset;
+    set2 = obj2->complete_nodeset;
+  } else {
+    set1 = obj1->nodeset;
+    set2 = obj2->nodeset;
+  }
+  if (set1 && set2 && !hwloc_bitmap_iszero(set1) && !hwloc_bitmap_iszero(set2)) {
+    int noderes = hwloc_bitmap_compare_inclusion(set1, set2);
+    /* deal with conflicting cpusets/nodesets inclusions */
+    if (noderes == HWLOC_OBJ_INCLUDED) {
+      if (res == HWLOC_OBJ_CONTAINS)
+	/* contradicting order for cpusets and nodesets */
+	return HWLOC_OBJ_INTERSECTS;
+      /* nodeset inclusion overrides an EQUAL/DIFFERENT cpuset result */
+      res = HWLOC_OBJ_INCLUDED;
+
+    } else if (noderes == HWLOC_OBJ_CONTAINS) {
+      if (res == HWLOC_OBJ_INCLUDED)
+	/* contradicting order for cpusets and nodesets */
+	return HWLOC_OBJ_INTERSECTS;
+      /* nodeset inclusion overrides an EQUAL/DIFFERENT cpuset result */
+      res = HWLOC_OBJ_CONTAINS;
+
+    } else if (noderes == HWLOC_OBJ_INTERSECTS) {
+      return HWLOC_OBJ_INTERSECTS;
+
+    } else {
+      /* nodesets are equal or different, keep the cpuset order */
+      /* FIXME: with upcoming multiple levels of NUMA, we may have to report INCLUDED or CONTAINED here */
+
+    }
+  }
+
+  return res;
+}
+
+/* Tie-breaker for objects whose sets are identical: translate the type
+ * comparison of OBJ1 against OBJ2 into an inclusion result, so that the
+ * ordering stays consistent.
+ */
+static int
+hwloc_obj_cmp_types(hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+  switch (hwloc_type_cmp(obj1, obj2)) {
+  case HWLOC_TYPE_DEEPER:
+    /* a deeper type goes below the other object */
+    return HWLOC_OBJ_INCLUDED;
+  case HWLOC_TYPE_HIGHER:
+    /* a higher type goes above the other object */
+    return HWLOC_OBJ_CONTAINS;
+  default:
+    /* Same sets and types!  Let's hope it's coherent.  */
+    return HWLOC_OBJ_EQUAL;
+  }
+}
+
+/* Order two objects by the first bit of their cpusets.
+ *
+ * The complete_cpuset of both objects is used when both are defined (always
+ * correctly ordered); otherwise fall back to the main cpusets (only correctly
+ * ordered during early insert, before disallowed bits are cleared).
+ *
+ * This is the sane way to compare objects within a horizontal level.
+ */
+int
+hwloc__object_cpusets_compare_first(hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+  hwloc_bitmap_t first = obj1->cpuset;
+  hwloc_bitmap_t second = obj2->cpuset;
+
+  if (obj1->complete_cpuset && obj2->complete_cpuset) {
+    first = obj1->complete_cpuset;
+    second = obj2->complete_cpuset;
+  }
+  return hwloc_bitmap_compare_first(first, second);
+}
+
+/* Write a short description of OBJ (type string, OS index when it is set,
+ * and cpuset) into BUF, for use in error messages.
+ * At most BUFLEN bytes are written, as with snprintf().
+ */
+static void
+hwloc__report_error_format_obj(char *buf, size_t buflen, hwloc_obj_t obj)
+{
+	char typestr[64];
+	char *cpusetstr;
+	const int has_os_index = (obj->os_index != (unsigned) -1);
+
+	hwloc_obj_type_snprintf(typestr, sizeof(typestr), obj, 0);
+	hwloc_bitmap_asprintf(&cpusetstr, obj->cpuset);
+
+	if (!has_os_index) {
+	  snprintf(buf, buflen, "%s (cpuset %s)",
+		   typestr, cpusetstr);
+	} else {
+	  snprintf(buf, buflen, "%s (P#%u cpuset %s)",
+		   typestr, obj->os_index, cpusetstr);
+	}
+
+	/* the cpuset string was allocated by hwloc_bitmap_asprintf() */
+	free(cpusetstr);
+}
+
+/*
+ * How to insert objects into the topology.
+ *
+ * Note: during detection, only the first_child and next_sibling pointers are
+ * kept up to date.  Others are computed only once topology detection is
+ * complete.
+ */
+
+/* Copy FIELD from NEW into OLD when OLD's FIELD is still (TYPE) -1.
+ * Expands to a bare if statement: only use it as a full statement.
+ */
+#define merge_index(new, old, field, type) \
+  if ((old)->field == (type) -1) \
+    (old)->field = (new)->field;
+/* Copy FIELD from NEW into OLD when OLD's FIELD is zero.
+ * Expands to a bare if statement: only use it as a full statement.
+ */
+#define merge_sizes(new, old, field) \
+  if (!(old)->field) \
+    (old)->field = (new)->field;
+#ifdef HWLOC_DEBUG
+/* Debug builds only: when NEW has a non-zero FIELD, assert that OLD agrees. */
+#define check_sizes(new, old, field) \
+  if ((new)->field) \
+    assert((old)->field == (new)->field)
+#else
+/* Non-debug builds: no check at all. */
+#define check_sizes(new, old, field)
+#endif
+
+/* Merge the contents of NEW into OLD when both describe the same object.
+ *
+ * Everything that is moved into OLD is detached from NEW (pointers reset to
+ * NULL, counts reset to 0), so that NEW can be released afterwards without
+ * touching what OLD now owns.
+ *
+ * - os_index is taken from NEW only if OLD has none.
+ * - distances of NEW are appended after those of OLD.
+ * - infos of NEW are moved to OLD.
+ * - the name of NEW, if any, replaces the name of OLD.
+ * - NUMA node memory sizes and cache sizes are taken from NEW only where OLD
+ *   has a zero value; the larger page_types array is kept.
+ */
+static void
+merge_insert_equal(hwloc_obj_t new, hwloc_obj_t old)
+{
+  merge_index(new, old, os_index, unsigned);
+
+  if (new->distances_count) {
+    if (old->distances_count) {
+      unsigned total = old->distances_count + new->distances_count;
+      void *grown = realloc(old->distances, total * sizeof(*old->distances));
+      if (grown) {
+	old->distances = grown;
+	/* append after the existing entries of OLD (the count is not updated yet) */
+	memcpy(old->distances + old->distances_count, new->distances, new->distances_count * sizeof(*old->distances));
+	old->distances_count = total;
+	free(new->distances);
+	new->distances_count = 0;
+	new->distances = NULL;
+      }
+      /* else: out of memory, OLD keeps its own distances unchanged and NEW
+       * keeps its own, presumably released together with NEW by the caller.
+       */
+    } else {
+      old->distances_count = new->distances_count;
+      old->distances = new->distances;
+      new->distances_count = 0;
+      new->distances = NULL;
+    }
+  }
+
+  if (new->infos_count) {
+    hwloc__move_infos(&old->infos, &old->infos_count,
+		      &new->infos, &new->infos_count);
+  }
+
+  if (new->name) {
+    if (old->name)
+      free(old->name);
+    old->name = new->name;
+    new->name = NULL;
+  }
+
+  /* Ignore userdata. It will be NULL before load().
+   * It may be non-NULL if alloc+insert_group() after load().
+   */
+
+  switch(new->type) {
+  case HWLOC_OBJ_NUMANODE:
+    /* Do not check these, it may change between calls */
+    merge_sizes(new, old, memory.local_memory);
+    merge_sizes(new, old, memory.total_memory);
+    /* if both objects have a page_types array, just keep the biggest one for now */
+    if (new->memory.page_types_len && old->memory.page_types_len)
+      hwloc_debug("%s", "merging page_types by keeping the biggest one only\n");
+    if (new->memory.page_types_len < old->memory.page_types_len) {
+      free(new->memory.page_types);
+      /* do not leave a dangling pointer behind in NEW */
+      new->memory.page_types = NULL;
+      new->memory.page_types_len = 0;
+    } else {
+      free(old->memory.page_types);
+      old->memory.page_types_len = new->memory.page_types_len;
+      old->memory.page_types = new->memory.page_types;
+      new->memory.page_types = NULL;
+      new->memory.page_types_len = 0;
+    }
+    break;
+  case HWLOC_OBJ_CACHE:
+    merge_sizes(new, old, attr->cache.size);
+    check_sizes(new, old, attr->cache.size);
+    merge_sizes(new, old, attr->cache.linesize);
+    check_sizes(new, old, attr->cache.linesize);
+    break;
+  default:
+    break;
+  }
+}
+
+/* Try to insert OBJ in CUR, recurse if needed.
+ * Returns the object if it was inserted,
+ * the remaining object if it was merged,
+ * NULL if failed to insert.
+ *
+ * REPORT_ERROR, when non-NULL, is called with a message when OBJ intersects
+ * a child of CUR without inclusion.
+ * In that case, children of CUR that were already moved below OBJ are put
+ * back below CUR (including their parent pointer) before returning NULL.
+ */
+static struct hwloc_obj *
+hwloc___insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t cur, hwloc_obj_t obj,
+			        hwloc_report_error_t report_error)
+{
+  hwloc_obj_t child, next_child = NULL;
+  /* These will always point to the pointer to their next last child. */
+  hwloc_obj_t *cur_children = &cur->first_child;
+  hwloc_obj_t *obj_children = &obj->first_child;
+  /* Pointer where OBJ should be put */
+  hwloc_obj_t *putp = NULL; /* OBJ position isn't found yet */
+
+  /* Make sure we haven't gone too deep.  */
+  if (!hwloc_bitmap_isincluded(obj->cpuset, cur->cpuset)) {
+    fprintf(stderr,"recursion has gone too deep?!\n");
+    return NULL;
+  }
+
+  /* Iteration with prefetching to be completely safe against CHILD removal.
+   * The list is already sorted by cpuset, and there's no intersection between siblings.
+   */
+  for (child = cur->first_child, child ? next_child = child->next_sibling : NULL;
+       child;
+       child = next_child, child ? next_child = child->next_sibling : NULL) {
+
+    int res = hwloc_obj_cmp_sets(obj, child);
+
+    if (res == HWLOC_OBJ_EQUAL) {
+      if (obj->type == HWLOC_OBJ_GROUP) {
+	/* Group are ignored keep_structure. ignored always are handled earlier. Non-ignored Groups isn't possible. */
+	assert(topology->ignored_types[HWLOC_OBJ_GROUP] == HWLOC_IGNORE_TYPE_KEEP_STRUCTURE);
+        /* Remove the Group now. The normal ignore code path wouldn't tell us whether the Group was removed or not,
+	 * while some callers need to know (at least hwloc_topology_insert_group()).
+	 *
+	 * Keep EQUAL so that the Group gets merged.
+	 */
+      } else {
+	/* otherwise compare actual types to decide of the inclusion */
+	res = hwloc_obj_cmp_types(obj, child);
+      }
+    }
+
+    switch (res) {
+      case HWLOC_OBJ_EQUAL:
+	/* Can be two objects with same type. Or one Group and anything else. */
+	if (obj->type == child->type
+	    && (obj->type == HWLOC_OBJ_PU || obj->type == HWLOC_OBJ_NUMANODE)
+	    && obj->os_index != child->os_index) {
+	  static int reported = 0;
+	  if (!reported && !hwloc_hide_errors()) {
+	    fprintf(stderr, "Cannot merge similar %s objects with different OS indexes %u and %u\n",
+		    hwloc_obj_type_string(obj->type), child->os_index, obj->os_index);
+	    reported = 1;
+	  }
+          return NULL;
+	}
+	merge_insert_equal(obj, child);
+	/* Already present, no need to insert.  */
+	return child;
+
+      case HWLOC_OBJ_INCLUDED:
+	/* OBJ is strictly contained is some child of CUR, go deeper.  */
+	return hwloc___insert_object_by_cpuset(topology, child, obj, report_error);
+
+      case HWLOC_OBJ_INTERSECTS:
+        if (report_error) {
+	  char childstr[512];
+	  char objstr[512];
+	  char msg[1024];
+	  hwloc__report_error_format_obj(objstr, sizeof(objstr), obj);
+	  hwloc__report_error_format_obj(childstr, sizeof(childstr), child);
+	  snprintf(msg, sizeof(msg), "%s intersects with %s without inclusion!", objstr, childstr);
+	  report_error(msg, __LINE__);
+	}
+	goto putback;
+
+      case HWLOC_OBJ_DIFFERENT:
+        /* OBJ should be a child of CUR before CHILD, mark its position if not found yet. */
+	if (!putp && hwloc__object_cpusets_compare_first(obj, child) < 0)
+	  /* Don't insert yet, there could be intersect errors later */
+	  putp = cur_children;
+	/* Advance cur_children.  */
+	cur_children = &child->next_sibling;
+	break;
+
+      case HWLOC_OBJ_CONTAINS:
+	/* OBJ contains CHILD, remove CHILD from CUR */
+	*cur_children = child->next_sibling;
+	child->next_sibling = NULL;
+	/* Put CHILD in OBJ */
+	*obj_children = child;
+	obj_children = &child->next_sibling;
+	child->parent = obj;
+	break;
+    }
+  }
+  /* cur/obj_children points to last CUR/OBJ child next_sibling pointer, which must be NULL. */
+  assert(!*obj_children);
+  assert(!*cur_children);
+
+  /* Put OBJ where it belongs, or in last in CUR's children.  */
+  if (!putp)
+    putp = cur_children;
+  obj->next_sibling = *putp;
+  *putp = obj;
+  obj->parent = cur;
+
+  topology->modified = 1;
+  return obj;
+
+ putback:
+  /* Put-back OBJ children in CUR and return an error. */
+  if (putp)
+    cur_children = putp; /* No need to try to insert before where OBJ was supposed to go */
+  else
+    cur_children = &cur->first_child; /* Start from the beginning */
+  /* We can insert in order, but there can be holes in the middle. */
+  while ((child = obj->first_child) != NULL) {
+    /* Remove from OBJ */
+    obj->first_child = child->next_sibling;
+    /* CHILD was re-parented to OBJ when it was moved, restore CUR as its parent
+     * so that it does not keep pointing to an object that was never inserted.
+     */
+    child->parent = cur;
+    /* Find child position in CUR, and insert. */
+    while (*cur_children && hwloc__object_cpusets_compare_first(*cur_children, child) < 0)
+      cur_children = &(*cur_children)->next_sibling;
+    child->next_sibling = *cur_children;
+    *cur_children = child;
+  }
+  return NULL;
+}
+
+/* insertion routine that lets you change the error reporting callback */
+struct hwloc_obj *
+hwloc__insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t obj,
+			       hwloc_report_error_t report_error)
+{
+  struct hwloc_obj *result;
+
+  assert(!hwloc_obj_type_is_special(obj->type));
+
+  /* Start at the top.  */
+  result = hwloc___insert_object_by_cpuset(topology, topology->levels[0][0], obj, report_error);
+  if (result != obj) {
+    /* either failed to insert, or got merged, free the original object */
+    hwloc_free_unlinked_object(obj);
+  } else {
+    /* Add the cpuset to the top */
+    hwloc_bitmap_or(topology->levels[0][0]->complete_cpuset, topology->levels[0][0]->complete_cpuset, obj->cpuset);
+    if (obj->nodeset)
+      hwloc_bitmap_or(topology->levels[0][0]->complete_nodeset, topology->levels[0][0]->complete_nodeset, obj->nodeset);
+  }
+  return result;
+}
+
+/* the default insertion routine warns in case of error.
+ * it's used by most backends.
+ *
+ * Same as hwloc__insert_object_by_cpuset() with hwloc_report_os_error
+ * as the error reporting callback; the return value is passed through.
+ */
+struct hwloc_obj *
+hwloc_insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t obj)
+{
+  return hwloc__insert_object_by_cpuset(topology, obj, hwloc_report_os_error);
+}
+
+/* Append OBJ at the end of one of PARENT's children lists, selected from
+ * the type of OBJ: the Misc list, the I/O list, or the normal list.
+ * Marks the topology as modified.
+ */
+void
+hwloc_insert_object_by_parent(struct hwloc_topology *topology, hwloc_obj_t parent, hwloc_obj_t obj)
+{
+  hwloc_obj_t *tail;
+
+  /* Select the head of the list that OBJ belongs to */
+  if (obj->type == HWLOC_OBJ_MISC)
+    tail = &parent->misc_first_child;
+  else if (hwloc_obj_type_is_io(obj->type))
+    tail = &parent->io_first_child;
+  else
+    /* Normal children.
+     * The caller takes care of inserting children in the right cpuset order, without intersection between them.
+     * Duplicating doesn't need to check the order since the source topology is supposed to be OK already.
+     * XML reorders if needed, and fails on intersecting siblings.
+     * Other callers just insert random objects such as I/O or Misc, no cpuset issue there.
+     */
+    tail = &parent->first_child;
+
+  /* Walk to the end of that list */
+  while (*tail)
+    tail = &(*tail)->next_sibling;
+
+  /* Link OBJ there */
+  *tail = obj;
+  obj->parent = parent;
+  obj->next_sibling = NULL;
+  topology->modified = 1;
+}
+
+/* Allocate a Group object with no OS index and a group depth of -1.
+ * Returns NULL if the allocation failed. TOPOLOGY is unused.
+ */
+hwloc_obj_t
+hwloc_topology_alloc_group_object(struct hwloc_topology *topology __hwloc_attribute_unused)
+{
+  hwloc_obj_t group = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, -1);
+  if (group)
+    group->attr->group.depth = -1;
+  return group;
+}
+
+/* Insert the Group object OBJ into an already loaded topology, at the
+ * position given by its cpusets/nodesets.
+ *
+ * Returns OBJ if it was inserted, another object if OBJ was merged into it,
+ * or NULL on error. OBJ is freed on every path except when it is returned.
+ * errno is set to EINVAL when the topology is not loaded, when Groups are
+ * always ignored, or when all sets of OBJ are missing or empty.
+ */
+hwloc_obj_t
+hwloc_topology_insert_group_object(struct hwloc_topology *topology, hwloc_obj_t obj)
+{
+  hwloc_obj_t res;
+
+  if (!topology->is_loaded) {
+    /* this could actually work, we would just need to disable connect_children/levels below */
+    hwloc_free_unlinked_object(obj);
+    errno = EINVAL;
+    return NULL;
+  }
+
+  if (topology->ignored_types[HWLOC_OBJ_GROUP] == HWLOC_IGNORE_TYPE_ALWAYS) {
+    hwloc_free_unlinked_object(obj);
+    errno = EINVAL;
+    return NULL;
+  }
+
+  /* refuse objects that have no non-empty set at all, they cannot be placed */
+  if ((!obj->cpuset || hwloc_bitmap_iszero(obj->cpuset))
+      && (!obj->complete_cpuset || hwloc_bitmap_iszero(obj->complete_cpuset))
+      && (!obj->nodeset || hwloc_bitmap_iszero(obj->nodeset))
+      && (!obj->complete_nodeset || hwloc_bitmap_iszero(obj->complete_nodeset))) {
+    hwloc_free_unlinked_object(obj);
+    errno = EINVAL;
+    return NULL;
+  }
+
+  /* hwloc__insert_object_by_cpuset() frees OBJ itself when it does not return it */
+  res = hwloc__insert_object_by_cpuset(topology, obj, NULL /* do not show errors on stdout */);
+  if (!res)
+    return NULL;
+  if (res != obj)
+    /* merged */
+    return res;
+
+  /* properly inserted */
+  hwloc_obj_add_children_sets(obj);
+  /* rebuild the children links and the levels now that the tree changed */
+  hwloc_connect_children(topology->levels[0][0]);
+  if (hwloc_connect_levels(topology) < 0)
+    return NULL;
+  topology->modified = 0;
+  return obj;
+}
+
+static void hwloc_connect_misc_level(hwloc_topology_t topology);
+
+/* Create a Misc object named NAME (may be NULL) and append it to the Misc
+ * children of PARENT in an already loaded topology.
+ *
+ * Returns the new object, or NULL on error: errno is set to EINVAL when
+ * Misc objects are always ignored or when the topology is not loaded;
+ * NULL is also returned when the object cannot be allocated.
+ */
+hwloc_obj_t
+hwloc_topology_insert_misc_object(struct hwloc_topology *topology, hwloc_obj_t parent, const char *name)
+{
+  hwloc_obj_t obj;
+
+  if (topology->ignored_types[HWLOC_OBJ_MISC] == HWLOC_IGNORE_TYPE_ALWAYS) {
+    errno = EINVAL;
+    return NULL;
+  }
+
+  if (!topology->is_loaded) {
+    errno = EINVAL;
+    return NULL;
+  }
+
+  obj = hwloc_alloc_setup_object(HWLOC_OBJ_MISC, -1);
+  if (!obj)
+    /* allocation failed, do not dereference it below
+     * (same handling as in hwloc_topology_alloc_group_object())
+     */
+    return NULL;
+  if (name)
+    obj->name = strdup(name);
+
+  hwloc_insert_object_by_parent(topology, parent, obj);
+
+  hwloc_connect_children(parent); /* FIXME: only connect misc children */
+  hwloc_connect_misc_level(topology);
+  topology->modified = 0;
+
+  return obj;
+}
+
+/* qsort() comparison callback for struct hwloc_obj_memory_page_type_s:
+ * sort by increasing size, with 0-size entries placed at the end.
+ *
+ * The result is consistent in both directions (compare(a,b) is always the
+ * opposite of compare(b,a), and two 0-size entries compare equal), as
+ * qsort() requires.
+ */
+static int hwloc_memory_page_type_compare(const void *_a, const void *_b)
+{
+  const struct hwloc_obj_memory_page_type_s *a = _a;
+  const struct hwloc_obj_memory_page_type_s *b = _b;
+  /* equal sizes, including two 0-size entries */
+  if (b->size == a->size)
+    return 0;
+  /* consider 0 as larger so that 0-size page_type go to the end */
+  if (!b->size)
+    return -1;
+  if (!a->size)
+    return 1;
+  /* don't cast a-b in int since those are ullongs */
+  return a->size < b->size ? -1 : 1;
+}
+
+/* Propagate memory counts.
+ *
+ * Recursively recomputes memory.total_memory of OBJ as its local memory plus
+ * the total memory of all its normal children, then sorts the page_types
+ * array of OBJ and trims the trailing 0-size entries from its length.
+ */
+static void
+propagate_total_memory(hwloc_obj_t obj)
+{
+  hwloc_obj_t *temp, child;
+  unsigned i;
+
+  /* reset total before counting local and children memory */
+  obj->memory.total_memory = 0;
+
+  /* Propagate memory up. */
+  for_each_child_safe(child, obj, temp) {
+    propagate_total_memory(child);
+    obj->memory.total_memory += child->memory.total_memory;
+  }
+  /* No memory under I/O or Misc */
+
+  obj->memory.total_memory += obj->memory.local_memory;
+
+  /* By the way, sort the page_type array.
+   * Cannot do it on insert since some backends (e.g. XML) add page_types after inserting the object.
+   */
+  qsort(obj->memory.page_types, obj->memory.page_types_len, sizeof(*obj->memory.page_types), hwloc_memory_page_type_compare);
+  /* Ignore 0-size page_types, they are at the end */
+  /* walk backwards until the last entry with a non-zero size; i ends up as the new length */
+  for(i=obj->memory.page_types_len; i>=1; i--)
+    if (obj->memory.page_types[i-1].size)
+      break;
+  obj->memory.page_types_len = i;
+}
+
+/* Collect the cpuset of all the PU objects.
+ *
+ * SYS is NULL until the first object with a cpuset is met while walking
+ * down from OBJ; that object becomes SYS, its cpuset is cleared, and the
+ * cpusets of all PUs found below it are then OR'ed into it.
+ */
+static void
+collect_proc_cpuset(hwloc_obj_t obj, hwloc_obj_t sys)
+{
+  hwloc_obj_t child, *temp;
+
+  if (sys) {
+    /* We are already given a pointer to a system object */
+    if (obj->type == HWLOC_OBJ_PU)
+      hwloc_bitmap_or(sys->cpuset, sys->cpuset, obj->cpuset);
+  } else {
+    if (obj->cpuset) {
+      /* This object is the root of a machine */
+      sys = obj;
+      /* Assume no PU for now */
+      hwloc_bitmap_zero(obj->cpuset);
+    }
+  }
+
+  /* recurse into normal children, passing SYS down once it is known */
+  for_each_child_safe(child, obj, temp)
+    collect_proc_cpuset(child, sys);
+  /* No PU under I/O or Misc */
+}
+
+/* While traversing down and up, propagate the disallowed cpus by
+ * and'ing them to and from the first object that has a cpuset.
+ *
+ * SYS is NULL until the first object with a cpuset is met; that object
+ * becomes SYS and gets its complete_cpuset and allowed_cpuset created if
+ * missing. Objects below it get their cpuset, complete_cpuset and
+ * allowed_cpuset restricted by (or created from) those of SYS.
+ */
+static void
+propagate_unused_cpuset(hwloc_obj_t obj, hwloc_obj_t sys)
+{
+  hwloc_obj_t child, *temp;
+
+  if (obj->cpuset) {
+    if (sys) {
+      /* We are already given a pointer to an system object, update it and update ourselves */
+      hwloc_bitmap_t mask = hwloc_bitmap_alloc();
+
+      /* Apply the topology cpuset */
+      hwloc_bitmap_and(obj->cpuset, obj->cpuset, sys->cpuset);
+
+      /* Update complete cpuset down */
+      if (obj->complete_cpuset) {
+	hwloc_bitmap_and(obj->complete_cpuset, obj->complete_cpuset, sys->complete_cpuset);
+      } else {
+	/* no complete cpuset yet: take the part of SYS's one that covers our cpuset */
+	obj->complete_cpuset = hwloc_bitmap_dup(sys->complete_cpuset);
+	hwloc_bitmap_and(obj->complete_cpuset, obj->complete_cpuset, obj->cpuset);
+      }
+
+      /* Update allowed cpusets */
+      if (obj->allowed_cpuset) {
+	/* Update ours */
+	hwloc_bitmap_and(obj->allowed_cpuset, obj->allowed_cpuset, sys->allowed_cpuset);
+
+	/* Update the given cpuset, but only what we know */
+	/* mask = (everything outside our cpuset) | (what we allow):
+	 * only bits of our cpuset that we disallow get cleared in SYS
+	 */
+	hwloc_bitmap_copy(mask, obj->cpuset);
+	hwloc_bitmap_not(mask, mask);
+	hwloc_bitmap_or(mask, mask, obj->allowed_cpuset);
+	hwloc_bitmap_and(sys->allowed_cpuset, sys->allowed_cpuset, mask);
+      } else {
+	/* Just take it as such */
+	obj->allowed_cpuset = hwloc_bitmap_dup(sys->allowed_cpuset);
+	hwloc_bitmap_and(obj->allowed_cpuset, obj->allowed_cpuset, obj->cpuset);
+      }
+
+      hwloc_bitmap_free(mask);
+    } else {
+      /* This object is the root of a machine */
+      sys = obj;
+      /* Apply complete_cpuset to cpuset and allowed_cpuset, it
+       * will automatically be applied below */
+      if (obj->complete_cpuset)
+        hwloc_bitmap_and(obj->cpuset, obj->cpuset, obj->complete_cpuset);
+      else
+        obj->complete_cpuset = hwloc_bitmap_dup(obj->cpuset);
+      if (obj->allowed_cpuset)
+        hwloc_bitmap_and(obj->allowed_cpuset, obj->allowed_cpuset, obj->complete_cpuset);
+      else
+        obj->allowed_cpuset = hwloc_bitmap_dup(obj->complete_cpuset);
+    }
+  }
+
+  /* recurse into normal children, passing SYS down once it is known */
+  for_each_child_safe(child, obj, temp)
+    propagate_unused_cpuset(child, sys);
+  /* No PU under I/O or Misc */
+}
+
+/* OR each cpuset/nodeset of SRC (main, complete and allowed variants) into
+ * the corresponding set of DST. Sets that SRC does not have are skipped;
+ * sets that DST does not have yet are allocated first.
+ * Always returns 0.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_add_other_obj_sets(hwloc_obj_t dst, hwloc_obj_t src)
+{
+/* Handle a single set field; note that the macro is not #undef'ed afterwards. */
+#define ADD_OTHER_OBJ_SET(_dst, _src, _set)			\
+  if ((_src)->_set) {						\
+    if (!(_dst)->_set)						\
+      (_dst)->_set = hwloc_bitmap_alloc();			\
+    hwloc_bitmap_or((_dst)->_set, (_dst)->_set, (_src)->_set);	\
+  }
+  ADD_OTHER_OBJ_SET(dst, src, cpuset);
+  ADD_OTHER_OBJ_SET(dst, src, complete_cpuset);
+  ADD_OTHER_OBJ_SET(dst, src, allowed_cpuset);
+  ADD_OTHER_OBJ_SET(dst, src, nodeset);
+  ADD_OTHER_OBJ_SET(dst, src, complete_nodeset);
+  ADD_OTHER_OBJ_SET(dst, src, allowed_nodeset);
+  return 0;
+}
+
+/* Setup the cpusets/nodesets of OBJ by OR'ing those of its normal children
+ * into them. Both OBJ and every child must already have a cpuset.
+ * Always returns 0.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_add_children_sets(hwloc_obj_t obj)
+{
+  hwloc_obj_t child;
+
+  assert(obj->cpuset != NULL);
+  for (child = obj->first_child; child; child = child->next_sibling) {
+    assert(child->cpuset != NULL);
+    hwloc_obj_add_other_obj_sets(obj, child);
+  }
+  /* No need to look at Misc children, they contain no PU. */
+  return 0;
+}
+
+/* Propagate nodesets up and down.
+ *
+ * SYS is NULL until the first object with a nodeset is met; that object
+ * becomes SYS and gets its complete_nodeset and allowed_nodeset created if
+ * missing. Below SYS, objects without a nodeset get an empty one, singleton
+ * nodesets are copied down to the children, and children nodesets are OR'ed
+ * back into their parent.
+ */
+static void
+propagate_nodeset(hwloc_obj_t obj, hwloc_obj_t sys)
+{
+  hwloc_obj_t child, *temp;
+  hwloc_bitmap_t parent_nodeset = NULL;
+  int parent_weight = 0;
+
+  if (!sys && obj->nodeset) {
+    sys = obj;
+    if (!obj->complete_nodeset)
+      obj->complete_nodeset = hwloc_bitmap_dup(obj->nodeset);
+    if (!obj->allowed_nodeset)
+      obj->allowed_nodeset = hwloc_bitmap_dup(obj->complete_nodeset);
+  }
+
+  if (sys) {
+    if (obj->nodeset) {
+      /* Some existing nodeset coming from above, to possibly propagate down */
+      parent_nodeset = obj->nodeset;
+      parent_weight = hwloc_bitmap_weight(parent_nodeset);
+    } else
+      /* start empty, it is filled from the children below */
+      obj->nodeset = hwloc_bitmap_alloc();
+  }
+
+  for_each_child_safe(child, obj, temp) {
+    /* Propagate singleton nodesets down */
+    if (parent_weight == 1) {
+      if (!child->nodeset)
+        child->nodeset = hwloc_bitmap_dup(obj->nodeset);
+      else if (!hwloc_bitmap_isequal(child->nodeset, parent_nodeset)) {
+        hwloc_debug_bitmap("Oops, parent nodeset %s", parent_nodeset);
+        hwloc_debug_bitmap(" is different from child nodeset %s, ignoring the child one\n", child->nodeset);
+        hwloc_bitmap_copy(child->nodeset, parent_nodeset);
+      }
+    }
+
+    /* Recurse */
+    propagate_nodeset(child, sys);
+
+    /* Propagate children nodesets up */
+    if (sys && child->nodeset)
+      hwloc_bitmap_or(obj->nodeset, obj->nodeset, child->nodeset);
+  }
+  /* No nodeset under I/O or Misc */
+}
+
+/* Propagate allowed and complete nodesets.
+ *
+ * For each normal child of an OBJ that has a nodeset: restrict (or create)
+ * the complete and allowed nodesets of the child from those of OBJ, recurse,
+ * then remove from OBJ's allowed nodeset the nodes that the child has but
+ * does not allow. Finally restrict OBJ's own nodeset and allowed nodeset by
+ * its complete nodeset, creating the missing ones.
+ */
+static void
+propagate_nodesets(hwloc_obj_t obj)
+{
+  hwloc_bitmap_t mask = hwloc_bitmap_alloc();
+  hwloc_obj_t child, *temp;
+
+  for_each_child_safe(child, obj, temp) {
+    if (obj->nodeset) {
+      /* Update complete nodesets down */
+      if (child->complete_nodeset) {
+        hwloc_bitmap_and(child->complete_nodeset, child->complete_nodeset, obj->complete_nodeset);
+      } else if (child->nodeset) {
+        child->complete_nodeset = hwloc_bitmap_dup(obj->complete_nodeset);
+        hwloc_bitmap_and(child->complete_nodeset, child->complete_nodeset, child->nodeset);
+      } /* else the child doesn't have nodeset information, we can not provide a complete nodeset */
+
+      /* Update allowed nodesets down */
+      if (child->allowed_nodeset) {
+        hwloc_bitmap_and(child->allowed_nodeset, child->allowed_nodeset, obj->allowed_nodeset);
+      } else if (child->nodeset) {
+        child->allowed_nodeset = hwloc_bitmap_dup(obj->allowed_nodeset);
+        hwloc_bitmap_and(child->allowed_nodeset, child->allowed_nodeset, child->nodeset);
+      }
+    }
+
+    propagate_nodesets(child);
+
+    if (obj->nodeset) {
+      /* Update allowed nodesets up */
+      if (child->nodeset && child->allowed_nodeset) {
+        /* mask = nodes of the child that the child does not allow */
+        hwloc_bitmap_copy(mask, child->nodeset);
+        hwloc_bitmap_andnot(mask, mask, child->allowed_nodeset);
+        hwloc_bitmap_andnot(obj->allowed_nodeset, obj->allowed_nodeset, mask);
+      }
+    }
+  }
+  hwloc_bitmap_free(mask);
+  /* No nodeset under I/O or Misc */
+
+  if (obj->nodeset) {
+    /* Apply complete nodeset to nodeset and allowed_nodeset */
+    if (obj->complete_nodeset)
+      hwloc_bitmap_and(obj->nodeset, obj->nodeset, obj->complete_nodeset);
+    else
+      obj->complete_nodeset = hwloc_bitmap_dup(obj->nodeset);
+    if (obj->allowed_nodeset)
+      hwloc_bitmap_and(obj->allowed_nodeset, obj->allowed_nodeset, obj->complete_nodeset);
+    else
+      obj->allowed_nodeset = hwloc_bitmap_dup(obj->complete_nodeset);
+  }
+}
+
+/* Recursively restrict the cpuset and nodeset of OBJ and of its normal
+ * children to their allowed counterparts, and clear the memory counters of
+ * NUMA nodes whose OS index is not in their allowed nodeset.
+ *
+ * NOTE(review): allowed_cpuset/allowed_nodeset are used without a NULL check,
+ * presumably the propagate_*() passes have created them earlier - confirm.
+ */
+static void
+remove_unused_sets(hwloc_obj_t obj)
+{
+  hwloc_obj_t child, *temp;
+
+  if (obj->cpuset) {
+    hwloc_bitmap_and(obj->cpuset, obj->cpuset, obj->allowed_cpuset);
+  }
+  if (obj->nodeset) {
+    hwloc_bitmap_and(obj->nodeset, obj->nodeset, obj->allowed_nodeset);
+  }
+  if (obj->type == HWLOC_OBJ_NUMANODE && obj->os_index != (unsigned) -1 &&
+      !hwloc_bitmap_isset(obj->allowed_nodeset, obj->os_index)) {
+    unsigned i;
+    hwloc_debug("Dropping memory from disallowed node %u\n", obj->os_index);
+    obj->memory.local_memory = 0;
+    obj->memory.total_memory = 0;
+    /* keep the page_types entries, only reset their page counts */
+    for(i=0; i<obj->memory.page_types_len; i++)
+      obj->memory.page_types[i].count = 0;
+  }
+
+  for_each_child_safe(child, obj, temp)
+    remove_unused_sets(child);
+  /* No cpuset under I/O or Misc */
+}
+
+/* Re-sort the normal children of PARENT with
+ * hwloc__object_cpusets_compare_first(), by detaching the whole list and
+ * re-inserting the children one by one at their sorted position.
+ */
+void
+hwloc__reorder_children(hwloc_obj_t parent)
+{
+  /* children that still have to be re-inserted */
+  hwloc_obj_t pending = parent->first_child;
+
+  /* start again from an empty children list */
+  parent->first_child = NULL;
+
+  while (pending) {
+    hwloc_obj_t cur = pending;
+    hwloc_obj_t *slot = &parent->first_child;
+
+    /* dequeue it from the pending list */
+    pending = cur->next_sibling;
+
+    /* skip the already inserted children that must stay before it */
+    for (; *slot && hwloc__object_cpusets_compare_first(cur, *slot) > 0; slot = &((*slot)->next_sibling))
+      ;
+
+    /* link it there */
+    cur->next_sibling = *slot;
+    *slot = cur;
+  }
+  /* No ordering to enforce for Misc children. */
+}
+
+/* Remove objects that are ignored in any case.
+ * Returns 1 if *pparent were replaced, which means the caller need to reorder its children.
+ * Returns 0 otherwise.
+ *
+ * All children (normal, I/O and Misc) are processed first. The object
+ * itself is then dropped when its type is always ignored (except for the
+ * root object), or when it is an instruction cache and the topology flag
+ * HWLOC_TOPOLOGY_FLAG_ICACHES is not set.
+ */
+static int
+ignore_type_always(hwloc_topology_t topology, hwloc_obj_t *pparent)
+{
+  hwloc_obj_t parent = *pparent, child, *pchild;
+  int dropped_children = 0;
+  int dropped = 0;
+
+  /* account dropped normal children only, others don't required reordering */
+  for_each_child_safe(child, parent, pchild)
+    dropped_children += ignore_type_always(topology, pchild);
+  for_each_io_child_safe(child, parent, pchild) /* There can be Misc under I/O */
+    ignore_type_always(topology, pchild);
+  for_each_misc_child_safe(child, parent, pchild)
+    ignore_type_always(topology, pchild);
+
+  if ((parent != topology->levels[0][0] &&
+       topology->ignored_types[parent->type] == HWLOC_IGNORE_TYPE_ALWAYS)
+      || (parent->type == HWLOC_OBJ_CACHE && parent->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION
+	  && !(topology->flags & HWLOC_TOPOLOGY_FLAG_ICACHES))) {
+    hwloc_debug("%s", "\nDropping ignored object ");
+    hwloc_debug_print_object(0, parent);
+    /* PARENT must not be used anymore after this call */
+    unlink_and_free_single_object(pparent);
+    topology->modified = 1;
+    dropped = 1;
+
+  } else if (dropped_children) {
+    /* we keep this object but its children changed, reorder them by complete_cpuset */
+    hwloc__reorder_children(parent);
+  }
+
+  return dropped;
+}
+
+/* Remove all children whose cpuset is empty, except NUMA nodes
+ * since we want to keep memory information, and except PCI bridges and devices.
+ *
+ * Works bottom-up on the normal children of *POBJ: an object is removed only
+ * if it is not a NUMA node, has no normal child left, has no I/O child, and
+ * has an empty cpuset.
+ */
+static void
+remove_empty(hwloc_topology_t topology, hwloc_obj_t *pobj)
+{
+  hwloc_obj_t obj = *pobj, child, *pchild;
+
+  for_each_child_safe(child, obj, pchild)
+    remove_empty(topology, pchild);
+  /* No cpuset under I/O or Misc */
+
+  if (obj->type != HWLOC_OBJ_NUMANODE
+      && !obj->first_child /* only remove if all children were removed above, so that we don't remove parents of NUMAnode */
+      && !obj->io_first_child /* only remove if no I/O is attached there */
+      && hwloc_bitmap_iszero(obj->cpuset)) {
+    /* Remove empty children (even if it has Misc children) */
+    hwloc_debug("%s", "\nRemoving empty object ");
+    hwloc_debug_print_object(0, obj);
+    /* OBJ must not be used anymore after this call */
+    unlink_and_free_single_object(pobj);
+    topology->modified = 1;
+  }
+}
+
+/* Remove objects that are ignored with keep structure flag.
+ * Returns 1 if *pparent were replaced, which means the caller need to reorder its children.
+ * Returns 0 otherwise.
+ *
+ * Children are processed first. Then, if the object has exactly one normal
+ * child, either the object or that child is dropped when its type is
+ * configured as HWLOC_IGNORE_TYPE_KEEP_STRUCTURE; when both could be
+ * dropped, obj_type_priority decides which one is kept.
+ *
+ * Note that an object without any normal child returns early, before its
+ * I/O and Misc children are visited.
+ */
+static int
+ignore_type_keep_structure(hwloc_topology_t topology, hwloc_obj_t *pparent)
+{
+  hwloc_obj_t parent = *pparent, child, *pchild;
+  int replacechild = 0, replaceparent = 0, droppedchildren = 0;
+
+  if (!parent->first_child) /* can't use arity yet */
+    /* There are no children, nothing to merge. */
+    return 0;
+
+  /* account dropped normal children only, others don't required reordering */
+  for_each_child_safe(child, parent, pchild)
+    droppedchildren += ignore_type_keep_structure(topology, pchild);
+  for_each_io_child_safe(child, parent, pchild)
+    ignore_type_keep_structure(topology, pchild);
+  for_each_misc_child_safe(child, parent, pchild)
+    ignore_type_keep_structure(topology, pchild);
+
+  if (droppedchildren)
+    hwloc__reorder_children(parent);
+
+  child = parent->first_child;
+  /* we don't merge if there are multiple "important" children. */
+  if (child->next_sibling) /* can't use arity yet */
+    return 0;
+
+  /* Check whether parent and/or child can be replaced */
+  if (topology->ignored_types[parent->type] == HWLOC_IGNORE_TYPE_KEEP_STRUCTURE) {
+    /* Parent can be ignored in favor of the child.  */
+    replaceparent = 1;
+  }
+  if (topology->ignored_types[child->type] == HWLOC_IGNORE_TYPE_KEEP_STRUCTURE) {
+    /* Child can be ignored in favor of the parent.  */
+    replacechild = 1;
+  }
+
+  /* Decide which one to actually replace */
+  if (replaceparent && replacechild) {
+    /* If both may be replaced, look at obj_type_priority */
+    /* the parent is kept only with a strictly higher priority, ties keep the child */
+    if (obj_type_priority[parent->type] > obj_type_priority[child->type])
+      replaceparent = 0;
+    else
+      replacechild = 0;
+  }
+
+  if (replaceparent) {
+    /* Replace parent with child */
+    hwloc_debug("%s", "\nIgnoring parent ");
+    hwloc_debug_print_object(0, parent);
+    /* move children to child, so that unlink_and_free_single_object() doesn't move them to the grandparent */
+    if (parent->io_first_child) {
+      append_siblings_list(&child->io_first_child, parent->io_first_child, child);
+      parent->io_first_child = NULL;
+    }
+    if (parent->misc_first_child) {
+      append_siblings_list(&child->misc_first_child, parent->misc_first_child, child);
+      parent->misc_first_child = NULL;
+    }
+    unlink_and_free_single_object(pparent);
+    topology->modified = 1;
+
+  } else if (replacechild) {
+    /* Replace child with parent */
+    hwloc_debug("%s", "\nIgnoring child ");
+    hwloc_debug_print_object(0, child);
+    unlink_and_free_single_object(&parent->first_child);
+    topology->modified = 1;
+  }
+
+  return replaceparent ? 1 : 0;
+}
+
+/* Unlink and free every I/O child found below root.
+ * Normal children are walked recursively; Misc children are not visited.
+ * topology->modified is raised for each I/O child that gets removed.
+ */
+static void
+hwloc_drop_all_io(hwloc_topology_t topology, hwloc_obj_t root)
+{
+  hwloc_obj_t cur, *pcur;
+
+  /* go down the normal children first */
+  for_each_child_safe(cur, root, pcur)
+    hwloc_drop_all_io(topology, cur);
+
+  /* then remove each I/O child of root together with its own children */
+  for_each_io_child_safe(cur, root, pcur) {
+    unlink_and_free_object_and_children(pcur);
+    topology->modified = 1;
+  }
+
+  /* No I/O under Misc */
+}
+
+/*
+ * Filter the I/O objects below root according to the topology flags.
+ * Unless WHOLE_IO is set, PCI devices whose class is not in the list below
+ * are removed with their children, and bridges without I/O children are removed.
+ * Unless IO_BRIDGES or WHOLE_IO is set, bridges that still have I/O children
+ * are removed too, except those whose upstream type is HWLOC_OBJ_BRIDGE_HOST.
+ */
+static void
+hwloc_drop_useless_io(hwloc_topology_t topology, hwloc_obj_t root)
+{
+  hwloc_obj_t child, *pchild;
+  const int whole_io = (topology->flags & HWLOC_TOPOLOGY_FLAG_WHOLE_IO) != 0;
+
+  /* normal children are only traversed */
+  for_each_child_safe(child, root, pchild)
+    hwloc_drop_useless_io(topology, child);
+
+  /* I/O children are filtered, then traversed */
+  for_each_io_child_safe(child, root, pchild) {
+    int drop_bridge = 0;
+
+    if (!whole_io && child->type == HWLOC_OBJ_PCI_DEVICE) {
+      unsigned classid = child->attr->pcidev.class_id;
+      int interesting;
+
+      switch (classid >> 8) {
+      case 0x01: /* PCI_BASE_CLASS_STORAGE */
+      case 0x02: /* PCI_BASE_CLASS_NETWORK */
+      case 0x03: /* PCI_BASE_CLASS_DISPLAY */
+      case 0x0b: /* PCI_BASE_CLASS_PROCESSOR */
+      case 0x12: /* Processing Accelerators */
+        interesting = 1;
+        break;
+      default:
+        interesting = (classid == 0x0c06); /* PCI_CLASS_SERIAL_INFINIBAND */
+        break;
+      }
+
+      if (!interesting) {
+        /* the device goes away with everything below it */
+        unlink_and_free_object_and_children(pchild);
+        topology->modified = 1;
+        continue;
+      }
+    }
+
+    /* filter grand-children etc. before looking at the bridge itself */
+    hwloc_drop_useless_io(topology, child);
+
+    if (child->type != HWLOC_OBJ_BRIDGE)
+      continue;
+
+    if (!child->io_first_child) {
+      /* bridges with no children are removed if WHOLE_IO isn't given */
+      drop_bridge = !whole_io;
+    } else if (child->attr->bridge.upstream_type != HWLOC_OBJ_BRIDGE_HOST) {
+      /* only hostbridges are kept if WHOLE_IO or IO_BRIDGES are not given */
+      drop_bridge = !(topology->flags & (HWLOC_TOPOLOGY_FLAG_IO_BRIDGES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO));
+    }
+
+    if (drop_bridge) {
+      unlink_and_free_single_object(pchild);
+      topology->modified = 1;
+    }
+  }
+
+  /* No I/O under Misc */
+}
+
+/* Store in each bridge its nesting depth below root: a bridge found among the
+ * I/O children gets `depth`, and the bridges below it get depth+1, and so on.
+ * Normal children, and I/O-list children whose type is not an I/O type,
+ * restart the count at 0.
+ */
+static void
+hwloc_propagate_bridge_depth(hwloc_topology_t topology, hwloc_obj_t root, unsigned depth)
+{
+  hwloc_obj_t cur;
+
+  cur = root->first_child;
+  while (cur) {
+    assert(depth == 0); /* no normal children under I/O */
+    hwloc_propagate_bridge_depth(topology, cur, 0);
+    cur = cur->next_sibling;
+  }
+
+  cur = root->io_first_child;
+  while (cur) {
+    if (cur->type == HWLOC_OBJ_BRIDGE) {
+      cur->attr->bridge.depth = depth;
+      hwloc_propagate_bridge_depth(topology, cur, depth + 1);
+    } else if (!hwloc_obj_type_is_io(cur->type)) {
+      hwloc_propagate_bridge_depth(topology, cur, 0);
+    }
+    cur = cur->next_sibling;
+  }
+
+  /* No I/O under Misc children */
+}
+
+/* Compute symmetric_subtree for root and, recursively, for all its normal descendants.
+ * An object without normal children is symmetric. Otherwise it is symmetric when
+ * all its normal children are symmetric and, walking down the first child of each
+ * child in lockstep, depth and arity match at every level.
+ * I/O and Misc children are ignored.
+ * If the temporary array cannot be allocated, root is left flagged as not symmetric.
+ */
+static void
+hwloc_propagate_symmetric_subtree(hwloc_topology_t topology, hwloc_obj_t root)
+{
+  hwloc_obj_t child, *array;
+  int ok;
+
+  /* assume we're not symmetric by default */
+  root->symmetric_subtree = 0;
+
+  /* if no child, we are symmetric */
+  if (!root->arity) {
+    root->symmetric_subtree = 1;
+    return;
+  }
+
+  /* look at normal children only, I/O and Misc are ignored.
+   * every child is visited (so that its own flag gets computed),
+   * then we return if any child is not symmetric.
+   */
+  ok = 1;
+  for(child = root->first_child; child; child = child->next_sibling) {
+    hwloc_propagate_symmetric_subtree(topology, child);
+    if (!child->symmetric_subtree)
+      ok = 0;
+  }
+  if (!ok)
+    return;
+  /* Misc and I/O children do not care about symmetric_subtree */
+
+  /* now check that children subtrees are identical.
+   * just walk down the first child in each tree and compare their depth and arities
+   */
+  array = malloc(root->arity * sizeof(*array));
+  if (!array)
+    /* cannot compare the subtrees, keep the conservative "not symmetric" answer */
+    return;
+  memcpy(array, root->children, root->arity * sizeof(*array));
+  while (1) {
+    unsigned i;
+    /* check current level arities and depth */
+    for(i=1; i<root->arity; i++) {
+      if (array[i]->depth != array[0]->depth
+          || array[i]->arity != array[0]->arity) {
+        free(array);
+        return;
+      }
+    }
+    if (!array[0]->arity)
+      /* no more children level, we're ok */
+      break;
+    /* look at first child of each element now */
+    for(i=0; i<root->arity; i++)
+      array[i] = array[i]->first_child;
+  }
+  free(array);
+
+  /* everything went fine, we're symmetric */
+  root->symmetric_subtree = 1;
+}
+
+/*
+ * Rebuild the per-parent bookkeeping from the first_child/next_sibling lists,
+ * recursively for everything below parent.
+ * Normal children get sibling_rank and prev_sibling, and parent gets last_child,
+ * arity and the children[] array.
+ * I/O and Misc children get parent, sibling_rank and prev_sibling, and parent
+ * gets io_arity and misc_arity.
+ * The remaining fields (levels, cousins, logical_index, depth, ...) will
+ * be set up later in hwloc_connect_levels().
+ *
+ * Can be called several times, so an existing children[] array is kept when
+ * it is still accurate, and reused when it is large enough.
+ */
+void
+hwloc_connect_children(hwloc_obj_t parent)
+{
+  const unsigned previous_arity = parent->arity;
+  unsigned count, idx;
+  hwloc_obj_t cur, prev;
+  int array_uptodate = 1;
+
+  /* Normal children list */
+
+  prev = NULL;
+  count = 0;
+  cur = parent->first_child;
+  while (cur) {
+    cur->sibling_rank = count;
+    cur->prev_sibling = prev;
+    /* does the existing array already hold this child at this rank? */
+    if (count >= previous_arity || parent->children[count] != cur)
+      array_uptodate = 0;
+    /* recurse */
+    hwloc_connect_children(cur);
+    count++;
+    prev = cur;
+    cur = cur->next_sibling;
+  }
+  parent->last_child = prev;
+  parent->arity = count;
+
+  if (count == 0) {
+    /* no need for an array anymore */
+    free(parent->children);
+    parent->children = NULL;
+  } else if (!array_uptodate) {
+    /* the array is only replaced when it is too small, otherwise it is refilled in place */
+    if (previous_arity < count) {
+      free(parent->children);
+      parent->children = malloc(count * sizeof(*parent->children));
+    }
+    idx = 0;
+    for (cur = parent->first_child; cur; cur = cur->next_sibling)
+      parent->children[idx++] = cur;
+  }
+  /* else: array is already OK (even if too large) */
+
+  /* I/O children list */
+
+  prev = NULL;
+  count = 0;
+  for (cur = parent->io_first_child; cur; cur = cur->next_sibling) {
+    cur->parent = parent;
+    cur->sibling_rank = count++;
+    cur->prev_sibling = prev;
+    hwloc_connect_children(cur);
+    prev = cur;
+  }
+  parent->io_arity = count;
+
+  /* Misc children list */
+
+  prev = NULL;
+  count = 0;
+  for (cur = parent->misc_first_child; cur; cur = cur->next_sibling) {
+    cur->parent = parent;
+    cur->sibling_rank = count++;
+    cur->prev_sibling = prev;
+    hwloc_connect_children(cur);
+    prev = cur;
+  }
+  parent->misc_arity = count;
+}
+
+/*
+ * Tell whether ROOT, or any object reachable from it through normal children,
+ * compares HWLOC_TYPE_EQUAL with OBJ. Returns 1 if so, 0 otherwise.
+ */
+static int
+find_same_type(hwloc_obj_t root, hwloc_obj_t obj)
+{
+  hwloc_obj_t sub;
+  int found = 0;
+
+  if (hwloc_type_cmp(root, obj) == HWLOC_TYPE_EQUAL)
+    found = 1;
+
+  /* stop at the first matching subtree */
+  for (sub = root->first_child; !found && sub; sub = sub->next_sibling)
+    found = find_same_type(sub, obj) ? 1 : 0;
+
+  return found;
+}
+
+/* Split current_objs into the objects that go into the new level and the rest.
+ * An object that compares HWLOC_TYPE_EQUAL with top_obj is stored in taken_objs
+ * and replaced by its normal children in remaining_objs; any other object is
+ * copied to remaining_objs unchanged. Relative order is preserved.
+ * Returns the number of entries written to remaining_objs.
+ * n_taken_objs and n_remaining_objs are only used for sanity checks under HWLOC_DEBUG.
+ */
+static int
+hwloc_level_take_objects(hwloc_obj_t top_obj,
+			 hwloc_obj_t *current_objs, unsigned n_current_objs,
+			 hwloc_obj_t *taken_objs, unsigned n_taken_objs __hwloc_attribute_unused,
+			 hwloc_obj_t *remaining_objs, unsigned n_remaining_objs __hwloc_attribute_unused)
+{
+  unsigned n_taken = 0;
+  unsigned n_left = 0;
+  unsigned cur;
+
+  for (cur = 0; cur < n_current_objs; cur++) {
+    hwloc_obj_t obj = current_objs[cur];
+    unsigned k;
+
+    if (hwloc_type_cmp(top_obj, obj) != HWLOC_TYPE_EQUAL) {
+      /* Leave it.  */
+      remaining_objs[n_left++] = obj;
+      continue;
+    }
+
+    /* Take it, and queue its main children instead.  */
+    taken_objs[n_taken++] = obj;
+    for (k = 0; k < obj->arity; k++)
+      remaining_objs[n_left++] = obj->children[k];
+  }
+
+#ifdef HWLOC_DEBUG
+  /* Make sure we didn't mess up.  */
+  assert(n_taken == n_taken_objs);
+  assert(n_left == n_current_objs - n_taken_objs + n_remaining_objs);
+#endif
+
+  return n_left;
+}
+
+/* Turn the next_cousin-linked list starting at `first` into a newly allocated
+ * array stored in *levelp, and number the objects by setting logical_index to
+ * their position in the list.
+ * Returns the number of objects in the array.
+ * When the list is empty, or when the array cannot be allocated, *levelp is set
+ * to NULL and 0 is returned, so that the array and its count stay consistent.
+ */
+static unsigned
+hwloc_build_level_from_list(struct hwloc_obj *first, struct hwloc_obj ***levelp)
+{
+  unsigned i, nb;
+  struct hwloc_obj * obj;
+
+  /* count */
+  nb = 0;
+  for (obj = first; obj; obj = obj->next_cousin)
+    nb++;
+
+  if (!nb) {
+    /* nothing to store, avoid a zero-sized allocation */
+    *levelp = NULL;
+    return 0;
+  }
+
+  /* allocate and fill level */
+  *levelp = malloc(nb * sizeof(struct hwloc_obj *));
+  if (!*levelp)
+    /* out of memory: report an empty level instead of writing through NULL */
+    return 0;
+
+  for (i = 0, obj = first; obj; i++, obj = obj->next_cousin) {
+    obj->logical_index = i;
+    (*levelp)[i] = obj;
+  }
+
+  return nb;
+}
+
+/* Walk obj and everything below it through normal and I/O children, and append
+ * each Bridge, PCI device and OS device to the matching first_x/last_x cousin list
+ * of the topology, setting its depth to the special depth of its type.
+ */
+static void
+hwloc_list_io_objects(hwloc_topology_t topology, hwloc_obj_t obj)
+{
+  hwloc_obj_t child, *temp;
+
+  if (hwloc_obj_type_is_io(obj->type)) {
+    /* head and tail of the list this object belongs to, if any */
+    hwloc_obj_t *firstp = NULL, *lastp = NULL;
+
+    /* make sure we don't have remaining stale pointers from a previous load */
+    obj->next_cousin = NULL;
+    obj->prev_cousin = NULL;
+
+    switch (obj->type) {
+    case HWLOC_OBJ_BRIDGE:
+      obj->depth = HWLOC_TYPE_DEPTH_BRIDGE;
+      firstp = &topology->first_bridge;
+      lastp = &topology->last_bridge;
+      break;
+    case HWLOC_OBJ_PCI_DEVICE:
+      obj->depth = HWLOC_TYPE_DEPTH_PCI_DEVICE;
+      firstp = &topology->first_pcidev;
+      lastp = &topology->last_pcidev;
+      break;
+    case HWLOC_OBJ_OS_DEVICE:
+      obj->depth = HWLOC_TYPE_DEPTH_OS_DEVICE;
+      firstp = &topology->first_osdev;
+      lastp = &topology->last_osdev;
+      break;
+    default:
+      /* other types are not listed */
+      break;
+    }
+
+    if (firstp) {
+      if (*firstp) {
+        /* append after the current tail */
+        obj->prev_cousin = *lastp;
+        obj->prev_cousin->next_cousin = obj;
+        *lastp = obj;
+      } else {
+        /* first object of this list */
+        *firstp = *lastp = obj;
+      }
+    }
+  }
+
+  for_each_child_safe(child, obj, temp)
+    hwloc_list_io_objects(topology, child);
+  for_each_io_child_safe(child, obj, temp)
+    hwloc_list_io_objects(topology, child);
+  /* No I/O under Misc */
+}
+
+/* Rebuild the Bridge, PCI device and OS device levels from scratch:
+ * drop the previous arrays and lists, list the I/O objects starting from the
+ * root, then build one array per type.
+ */
+static void
+hwloc_connect_io_levels(hwloc_topology_t topology)
+{
+  /* release what a previous call built */
+  free(topology->bridge_level);
+  free(topology->pcidev_level);
+  free(topology->osdev_level);
+
+  /* empty arrays */
+  topology->bridge_level = NULL;
+  topology->pcidev_level = NULL;
+  topology->osdev_level = NULL;
+  topology->bridge_nbobjects = 0;
+  topology->pcidev_nbobjects = 0;
+  topology->osdev_nbobjects = 0;
+
+  /* empty cousin lists */
+  topology->first_bridge = topology->last_bridge = NULL;
+  topology->first_pcidev = topology->last_pcidev = NULL;
+  topology->first_osdev = topology->last_osdev = NULL;
+
+  /* these types live at fixed special depths */
+  topology->type_depth[HWLOC_OBJ_BRIDGE] = HWLOC_TYPE_DEPTH_BRIDGE;
+  topology->type_depth[HWLOC_OBJ_PCI_DEVICE] = HWLOC_TYPE_DEPTH_PCI_DEVICE;
+  topology->type_depth[HWLOC_OBJ_OS_DEVICE] = HWLOC_TYPE_DEPTH_OS_DEVICE;
+
+  /* fill the lists, then turn each of them into an array */
+  hwloc_list_io_objects(topology, topology->levels[0][0]);
+  topology->bridge_nbobjects = hwloc_build_level_from_list(topology->first_bridge, &topology->bridge_level);
+  topology->pcidev_nbobjects = hwloc_build_level_from_list(topology->first_pcidev, &topology->pcidev_level);
+  topology->osdev_nbobjects = hwloc_build_level_from_list(topology->first_osdev, &topology->osdev_level);
+}
+
+/* Walk obj and everything below it through normal, I/O and Misc children, and
+ * append each Misc object to the first_misc/last_misc cousin list of the
+ * topology, setting its depth to HWLOC_TYPE_DEPTH_MISC.
+ */
+static void
+hwloc_list_misc_objects(hwloc_topology_t topology, hwloc_obj_t obj)
+{
+  hwloc_obj_t child, *temp;
+
+  if (obj->type == HWLOC_OBJ_MISC) {
+    /* make sure we don't have remaining stale pointers from a previous pass,
+     * same as what hwloc_list_io_objects() does for I/O objects: the list is
+     * rebuilt from scratch and the old neighbours may be gone.
+     */
+    obj->next_cousin = NULL;
+    obj->prev_cousin = NULL;
+
+    obj->depth = HWLOC_TYPE_DEPTH_MISC;
+    /* Insert in the main Misc list */
+    if (topology->first_misc) {
+      obj->prev_cousin = topology->last_misc;
+      obj->prev_cousin->next_cousin = obj;
+      topology->last_misc = obj;
+    } else {
+      topology->first_misc = topology->last_misc = obj;
+    }
+  }
+
+  for_each_child_safe(child, obj, temp)
+    hwloc_list_misc_objects(topology, child);
+  for_each_io_child_safe(child, obj, temp)
+    hwloc_list_misc_objects(topology, child);
+  for_each_misc_child_safe(child, obj, temp)
+    hwloc_list_misc_objects(topology, child);
+}
+
+/* Rebuild the Misc level from scratch: drop the previous array and list,
+ * list the Misc objects starting from the root, then build the array.
+ */
+static void
+hwloc_connect_misc_level(hwloc_topology_t topology)
+{
+  hwloc_obj_t root = topology->levels[0][0];
+
+  /* release what a previous call built and start from an empty list */
+  free(topology->misc_level);
+  topology->misc_level = NULL;
+  topology->misc_nbobjects = 0;
+  topology->first_misc = NULL;
+  topology->last_misc = NULL;
+
+  /* Misc objects live at a fixed special depth */
+  topology->type_depth[HWLOC_OBJ_MISC] = HWLOC_TYPE_DEPTH_MISC;
+
+  hwloc_list_misc_objects(topology, root);
+  topology->misc_nbobjects = hwloc_build_level_from_list(topology->first_misc, &topology->misc_level);
+}
+
+/*
+ * Do the remaining work that hwloc_connect_children() did not do earlier:
+ * rebuild all non-root levels (arrays, depth, logical_index, cousin links and
+ * type_depth), then the I/O and Misc levels, then symmetric_subtree.
+ *
+ * Returns 0 on success. Returns -1 with errno set to ENOMEM when an array
+ * cannot be allocated; the levels completed before the failure stay attached
+ * to the topology.
+ */
+int
+hwloc_connect_levels(hwloc_topology_t topology)
+{
+  unsigned l, i=0;
+  hwloc_obj_t *objs, *taken_objs, *new_objs, top_obj, root;
+  unsigned n_objs, n_taken_objs, n_new_objs;
+
+  /* reset non-root levels (root was initialized during init and will not change here) */
+  for(l=1; l<HWLOC_DEPTH_MAX; l++)
+    free(topology->levels[l]);
+  memset(topology->levels+1, 0, (HWLOC_DEPTH_MAX-1)*sizeof(*topology->levels));
+  memset(topology->level_nbobjects+1, 0,  (HWLOC_DEPTH_MAX-1)*sizeof(*topology->level_nbobjects));
+  topology->nb_levels = 1;
+  /* don't touch next_group_depth, the Group objects are still here */
+
+  /* initialize all depth to unknown */
+  for (l = HWLOC_OBJ_SYSTEM; l < HWLOC_OBJ_TYPE_MAX; l++)
+    topology->type_depth[l] = HWLOC_TYPE_DEPTH_UNKNOWN;
+
+  /* initialize root type depth */
+  root = topology->levels[0][0];
+  root->depth = 0;
+  topology->type_depth[root->type] = 0;
+  /* root level */
+  root->logical_index = 0;
+  root->prev_cousin = NULL;
+  root->next_cousin = NULL;
+  /* root as a child of nothing */
+  root->parent = NULL;
+  root->sibling_rank = 0;
+  root->prev_sibling = NULL;
+  root->next_sibling = NULL;
+
+  /* Start with children of the whole system.
+   * Nothing is allocated for a childless root: malloc(0) is allowed to return
+   * NULL, and that must not be reported as ENOMEM.
+   */
+  n_objs = root->arity;
+  objs = NULL;
+  if (n_objs) {
+    objs = malloc(n_objs * sizeof(objs[0]));
+    if (!objs) {
+      errno = ENOMEM;
+      return -1;
+    }
+    memcpy(objs, root->children, n_objs*sizeof(objs[0]));
+  }
+
+  /* Keep building levels while there are objects left in OBJS.  */
+  while (n_objs) {
+    /* At this point, the objs array contains only objects that may go into levels */
+
+    /* First find which type of object is the topmost.
+     * Don't use PU if there are other types since we want to keep PU at the bottom.
+     */
+
+    /* Look for the first non-PU object, and use the first PU if we really find nothing else */
+    for (i = 0; i < n_objs; i++)
+      if (objs[i]->type != HWLOC_OBJ_PU)
+        break;
+    top_obj = i == n_objs ? objs[0] : objs[i];
+
+    /* See if this is actually the topmost object */
+    for (i = 0; i < n_objs; i++) {
+      if (hwloc_type_cmp(top_obj, objs[i]) != HWLOC_TYPE_EQUAL) {
+        if (find_same_type(objs[i], top_obj)) {
+          /* OBJS[i] is strictly above an object of the same type as TOP_OBJ, so it
+           * is above TOP_OBJ.  */
+          top_obj = objs[i];
+        }
+      }
+    }
+
+    /* Now peek all objects of the same type, build a level with that and
+     * replace them with their children.  */
+
+    /* First count them.  */
+    n_taken_objs = 0;
+    n_new_objs = 0;
+    for (i = 0; i < n_objs; i++)
+      if (hwloc_type_cmp(top_obj, objs[i]) == HWLOC_TYPE_EQUAL) {
+        n_taken_objs++;
+        n_new_objs += objs[i]->arity;
+      }
+
+    /* New level (one extra slot for the NULL terminator).  */
+    taken_objs = malloc((n_taken_objs + 1) * sizeof(taken_objs[0]));
+    if (!taken_objs) {
+      free(objs);
+      errno = ENOMEM;
+      return -1;
+    }
+    /* New list of pending objects.  */
+    if (n_objs - n_taken_objs + n_new_objs) {
+      new_objs = malloc((n_objs - n_taken_objs + n_new_objs) * sizeof(new_objs[0]));
+      if (!new_objs) {
+        free(taken_objs);
+        free(objs);
+        errno = ENOMEM;
+        return -1;
+      }
+    } else {
+#ifdef HWLOC_DEBUG
+      assert(!n_new_objs);
+      assert(n_objs == n_taken_objs);
+#endif
+      new_objs = NULL;
+    }
+
+    n_new_objs = hwloc_level_take_objects(top_obj,
+                                          objs, n_objs,
+                                          taken_objs, n_taken_objs,
+                                          new_objs, n_new_objs);
+
+    /* Ok, put numbers in the level and link cousins.  */
+    for (i = 0; i < n_taken_objs; i++) {
+      taken_objs[i]->depth = topology->nb_levels;
+      taken_objs[i]->logical_index = i;
+      if (i) {
+        taken_objs[i]->prev_cousin = taken_objs[i-1];
+        taken_objs[i-1]->next_cousin = taken_objs[i];
+      }
+    }
+    taken_objs[0]->prev_cousin = NULL;
+    taken_objs[n_taken_objs-1]->next_cousin = NULL;
+
+    /* One more level!  */
+    if (top_obj->type == HWLOC_OBJ_CACHE)
+      hwloc_debug("--- Cache level depth %u", top_obj->attr->cache.depth);
+    else
+      hwloc_debug("--- %s level", hwloc_obj_type_string(top_obj->type));
+    hwloc_debug(" has number %u\n\n", topology->nb_levels);
+
+    if (topology->type_depth[top_obj->type] == HWLOC_TYPE_DEPTH_UNKNOWN)
+      topology->type_depth[top_obj->type] = topology->nb_levels;
+    else
+      /* this type was already seen at another depth */
+      topology->type_depth[top_obj->type] = HWLOC_TYPE_DEPTH_MULTIPLE;
+
+    taken_objs[n_taken_objs] = NULL;
+
+    topology->level_nbobjects[topology->nb_levels] = n_taken_objs;
+    topology->levels[topology->nb_levels] = taken_objs;
+
+    topology->nb_levels++;
+
+    free(objs);
+
+    /* Switch to new_objs */
+    objs = new_objs;
+    n_objs = n_new_objs;
+  }
+
+  /* It's empty now (free(NULL) is a no-op).  */
+  free(objs);
+
+  hwloc_connect_io_levels(topology);
+  hwloc_connect_misc_level(topology);
+
+  hwloc_propagate_symmetric_subtree(topology, topology->levels[0][0]);
+
+  return 0;
+}
+
+/* Allocate whichever of the six cpuset/nodeset bitmaps of obj are still NULL.
+ * cpuset, allowed_cpuset and allowed_nodeset are created with
+ * hwloc_bitmap_alloc_full(); complete_cpuset, nodeset and complete_nodeset
+ * with hwloc_bitmap_alloc(). Bitmaps that already exist are left untouched.
+ */
+void hwloc_alloc_obj_cpusets(hwloc_obj_t obj)
+{
+  /* CPU sets */
+  if (obj->cpuset == NULL)
+    obj->cpuset = hwloc_bitmap_alloc_full();
+  if (obj->complete_cpuset == NULL)
+    obj->complete_cpuset = hwloc_bitmap_alloc();
+  if (obj->allowed_cpuset == NULL)
+    obj->allowed_cpuset = hwloc_bitmap_alloc_full();
+
+  /* node sets */
+  if (obj->nodeset == NULL)
+    obj->nodeset = hwloc_bitmap_alloc();
+  if (obj->complete_nodeset == NULL)
+    obj->complete_nodeset = hwloc_bitmap_alloc();
+  if (obj->allowed_nodeset == NULL)
+    obj->allowed_nodeset = hwloc_bitmap_alloc_full();
+}
+
+/* Main discovery loop.
+ *
+ * Steps, in order:
+ * - run discover() of the CPU and GLOBAL backends,
+ * - fix up cpusets and nodesets, adding a NUMA node if there is none,
+ * - group by distances and connect the tree if it was modified,
+ * - run discover() of the remaining backends,
+ * - filter I/O objects if any backend reported some,
+ * - remove ignored, empty and mergeable objects, then reconnect,
+ * - propagate total memory, finalize distances, add identification info
+ *   and set the binding hooks.
+ *
+ * Returns 0 on success.
+ * Returns -1 with errno set to EINVAL when no CPU/GLOBAL discover() succeeded,
+ * and -1 when hwloc_connect_levels() fails.
+ */
+static int
+hwloc_discover(struct hwloc_topology *topology)
+{
+  struct hwloc_backend *backend;
+  int gotsomeio = 0;
+  unsigned discoveries = 0;
+
+  topology->modified = 0; /* no need to reconnect yet */
+
+  /* discover() callbacks should use hwloc_insert to add objects initialized
+   * through hwloc_alloc_setup_object.
+   * For node levels, nodeset and memory must be initialized.
+   * For cache levels, memory and type/depth must be initialized.
+   * For group levels, depth must be initialized.
+   */
+
+  /* There must be at least a PU object for each logical processor, at worst
+   * produced by hwloc_setup_pu_level()
+   */
+
+  /* To be able to just use hwloc_insert_object_by_cpuset to insert the object
+   * in the topology according to the cpuset, the cpuset field must be
+   * initialized.
+   */
+
+  /* A priori, All processors are visible in the topology, and allowed
+   * for the application.
+   *
+   * - If some processors exist but topology information is unknown for them
+   *   (and thus the backend couldn't create objects for them), they should be
+   *   added to the complete_cpuset field of the lowest object where the object
+   *   could reside.
+   *
+   * - If some processors are not allowed for the application (e.g. for
+   *   administration reasons), they should be dropped from the allowed_cpuset
+   *   field.
+   *
+   * The same applies to the node sets complete_nodeset and allowed_nodeset.
+   *
+   * If such field doesn't exist yet, it can be allocated, and initialized to
+   * zero (for complete), or to full (for allowed). The values are
+   * automatically propagated to the whole tree after detection.
+   */
+
+  /*
+   * Discover CPUs first
+   */
+  backend = topology->backends;
+  while (NULL != backend) {
+    int err;
+    if (backend->component->type != HWLOC_DISC_COMPONENT_TYPE_CPU
+	&& backend->component->type != HWLOC_DISC_COMPONENT_TYPE_GLOBAL)
+      /* not yet */
+      goto next_cpubackend;
+    if (!backend->discover)
+      goto next_cpubackend;
+
+    /* this backend wants up-to-date levels: reconnect if the tree changed */
+    if (topology->modified && (backend->flags & HWLOC_BACKEND_FLAG_NEED_LEVELS)) {
+      hwloc_debug("Backend %s forcing a reconnect of levels\n", backend->component->name);
+      hwloc_connect_children(topology->levels[0][0]);
+      if (hwloc_connect_levels(topology) < 0)
+	return -1;
+      topology->modified = 0;
+    }
+
+    err = backend->discover(backend);
+    if (err >= 0) {
+      /* only GLOBAL backends contribute to gotsomeio in this first pass */
+      if (backend->component->type == HWLOC_DISC_COMPONENT_TYPE_GLOBAL)
+        gotsomeio += err;
+      discoveries++;
+    }
+    hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+next_cpubackend:
+    backend = backend->next;
+  }
+
+  if (!discoveries) {
+    hwloc_debug("%s", "No CPU backend enabled or no discovery succeeded\n");
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* Update objects cpusets and nodesets now that the CPU/GLOBAL backend populated PUs and nodes */
+
+  hwloc_debug("%s", "\nRestrict topology cpusets to existing PU and NODE objects\n");
+  collect_proc_cpuset(topology->levels[0][0], NULL);
+
+  hwloc_debug("%s", "\nPropagate disallowed cpus down and up\n");
+  propagate_unused_cpuset(topology->levels[0][0], NULL);
+
+  /* Backends must allocate root->*nodeset.
+   *
+   * Most of them call hwloc_alloc_obj_cpusets() on the root to do so.
+   * root->complete_nodeset is empty by default, and filled by the core
+   * when NUMA nodes are added with insert_by_cpuset().
+   * root->allowed_nodeset is everything by default, unless reduced by backends.
+   *
+   * The XML backend takes care of everything to properly support old XML input
+   * with missing nodesets and/or NUMA nodes. It checks nodesets and fixes them if needed.
+   */
+  assert(topology->levels[0][0]->nodeset);
+  assert(topology->levels[0][0]->complete_nodeset);
+  assert(topology->levels[0][0]->allowed_nodeset);
+  /* If there's no NUMA node, add one with all the memory */
+  if (hwloc_bitmap_iszero(topology->levels[0][0]->complete_nodeset)) {
+    hwloc_obj_t node = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, 0);
+    node->cpuset = hwloc_bitmap_dup(topology->levels[0][0]->cpuset); /* requires root cpuset to be initialized above */
+    node->complete_cpuset = hwloc_bitmap_dup(topology->levels[0][0]->complete_cpuset); /* requires root cpuset to be initialized above */
+    node->allowed_cpuset = hwloc_bitmap_dup(topology->levels[0][0]->allowed_cpuset); /* requires root cpuset to be initialized above */
+    node->nodeset = hwloc_bitmap_alloc();
+    /* other nodesets will be filled below */
+    hwloc_bitmap_set(node->nodeset, 0);
+    /* move the root memory attributes into the new node */
+    memcpy(&node->memory, &topology->levels[0][0]->memory, sizeof(node->memory));
+    memset(&topology->levels[0][0]->memory, 0, sizeof(node->memory));
+    hwloc_insert_object_by_cpuset(topology, node);
+  }
+  hwloc_debug("%s", "\nPropagate nodesets\n");
+  propagate_nodeset(topology->levels[0][0], NULL);
+  propagate_nodesets(topology->levels[0][0]);
+
+  hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+  if (!(topology->flags & HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM)) {
+    hwloc_debug("%s", "\nRemoving unauthorized sets from all sets\n");
+    remove_unused_sets(topology->levels[0][0]);
+    hwloc_debug_print_objects(0, topology->levels[0][0]);
+  }
+
+  /*
+   * All object cpusets and nodesets are properly set now.
+   */
+
+  /*
+   * Group levels by distances
+   */
+  hwloc_distances_finalize_os(topology);
+  hwloc_group_by_distances(topology);
+
+  /* Now connect handy pointers to make remaining discovery easier. */
+  hwloc_debug("%s", "\nOk, finished tweaking, now connect\n");
+  if (topology->modified) {
+    hwloc_connect_children(topology->levels[0][0]);
+    if (hwloc_connect_levels(topology) < 0)
+      return -1;
+    topology->modified = 0;
+  }
+  hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+  /*
+   * Additional discovery with other backends
+   */
+
+  backend = topology->backends;
+  while (NULL != backend) {
+    int err;
+    if (backend->component->type == HWLOC_DISC_COMPONENT_TYPE_CPU
+	|| backend->component->type == HWLOC_DISC_COMPONENT_TYPE_GLOBAL)
+      /* already done above */
+      goto next_noncpubackend;
+    if (!backend->discover)
+      goto next_noncpubackend;
+
+    /* this backend wants up-to-date levels: reconnect if the tree changed */
+    if (topology->modified && (backend->flags & HWLOC_BACKEND_FLAG_NEED_LEVELS)) {
+      hwloc_debug("Backend %s forcing a reconnect of levels\n", backend->component->name);
+      hwloc_connect_children(topology->levels[0][0]);
+      if (hwloc_connect_levels(topology) < 0)
+	return -1;
+      topology->modified = 0;
+    }
+
+    err = backend->discover(backend);
+    if (err >= 0) {
+      /* every non-negative return of these backends is accumulated */
+      gotsomeio += err;
+    }
+    hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+next_noncpubackend:
+    backend = backend->next;
+  }
+
+  /* if we got anything, filter interesting objects and update the tree */
+  if (gotsomeio) {
+    if (!(topology->flags & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO)))
+      /* drop all I/O children */
+      hwloc_drop_all_io(topology, topology->levels[0][0]);
+    else
+      hwloc_drop_useless_io(topology, topology->levels[0][0]);
+    hwloc_debug("%s", "\nNow reconnecting\n");
+    hwloc_debug_print_objects(0, topology->levels[0][0]);
+    hwloc_propagate_bridge_depth(topology, topology->levels[0][0], 0);
+  }
+
+  /* Remove some stuff */
+
+  hwloc_debug("%s", "\nRemoving ignored objects\n");
+  ignore_type_always(topology, &topology->levels[0][0]);
+  hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+  hwloc_debug("%s", "\nRemoving empty objects except numa nodes and PCI devices\n");
+  remove_empty(topology, &topology->levels[0][0]);
+    if (!topology->levels[0][0]) {
+    /* even the root was removed, nothing is left to work on */
+    fprintf(stderr, "Topology became empty, aborting!\n");
+    abort();
+  }
+  hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+  hwloc_debug("%s", "\nRemoving objects whose type has HWLOC_IGNORE_TYPE_KEEP_STRUCTURE and have only one child or are the only child\n");
+  ignore_type_keep_structure(topology, &topology->levels[0][0]);
+  hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+  /* Reconnect things after all these changes */
+  if (topology->modified) {
+    /* Often raised because of Groups inserted for I/Os */
+    hwloc_connect_children(topology->levels[0][0]);
+    if (hwloc_connect_levels(topology) < 0)
+      return -1;
+    topology->modified = 0;
+  }
+
+  /* accumulate children memory in total_memory fields (only once parent is set) */
+  hwloc_debug("%s", "\nPropagate total memory up\n");
+  propagate_total_memory(topology->levels[0][0]);
+
+  /*
+   * Now that objects are numbered, take distance matrices from backends and put them in the main topology.
+   *
+   * Some objects may have disappeared (in removed_empty or removed_ignored) since we setup os distances
+   * (hwloc_distances_finalize_os()) above. Reset them so as to not point to disappeared objects anymore.
+   */
+  hwloc_distances_restrict_os(topology);
+  hwloc_distances_finalize_os(topology);
+  hwloc_distances_finalize_logical(topology);
+
+  /* add some identification attributes if not loading from XML
+   * (only the name of the first backend in the list is checked)
+   */
+  if (topology->backends
+      && strcmp(topology->backends->component->name, "xml")) {
+    char *value;
+    /* add a hwlocVersion */
+    hwloc_obj_add_info(topology->levels[0][0], "hwlocVersion", VERSION);
+    /* add a ProcessName */
+    value = hwloc_progname(topology);
+    if (value) {
+      hwloc_obj_add_info(topology->levels[0][0], "ProcessName", value);
+      free(value);
+    }
+  }
+
+  /*
+   * Now set binding hooks according to topology->is_thissystem
+   * and what the native OS backend offers.
+   */
+  hwloc_set_binding_hooks(topology);
+
+  return 0;
+}
+
+/* To be called before discovery is actually launched.
+ * Resets everything in case a previous load initialized some stuff: binding
+ * hooks and support structures are zeroed, the levels are reduced to a single
+ * level holding a freshly allocated Machine object, and the I/O and Misc
+ * levels and lists are cleared.
+ */
+void
+hwloc_topology_setup_defaults(struct hwloc_topology *topology)
+{
+  struct hwloc_obj *machine;
+
+  /* reset support */
+  memset(&topology->binding_hooks, 0, sizeof(topology->binding_hooks));
+  memset(topology->support.discovery, 0, sizeof(*topology->support.discovery));
+  memset(topology->support.cpubind, 0, sizeof(*topology->support.cpubind));
+  memset(topology->support.membind, 0, sizeof(*topology->support.membind));
+
+  /* a single level with a single object by default */
+  topology->nb_levels = 1;
+  topology->next_group_depth = 0;
+  topology->levels[0] = malloc (sizeof (hwloc_obj_t));
+  topology->level_nbobjects[0] = 1;
+  /* NULLify other levels so that we can detect and free old ones in hwloc_connect_levels() if needed */
+  memset(topology->levels+1, 0, (HWLOC_DEPTH_MAX-1)*sizeof(*topology->levels));
+
+  /* no I/O level yet */
+  topology->bridge_level = NULL;
+  topology->first_bridge = topology->last_bridge = NULL;
+  topology->pcidev_level = NULL;
+  topology->first_pcidev = topology->last_pcidev = NULL;
+  topology->osdev_level = NULL;
+  topology->first_osdev = topology->last_osdev = NULL;
+
+  /* no Misc level yet */
+  topology->misc_level = NULL;
+  topology->first_misc = topology->last_misc = NULL;
+
+  /* Create the actual machine object, but don't touch its attributes yet
+   * since the OS backend may still change the object into something else
+   * (for instance System)
+   */
+  machine = hwloc_alloc_setup_object(HWLOC_OBJ_MACHINE, 0);
+  topology->levels[0][0] = machine;
+}
+
+/* Allocate a topology context and give it default settings.
+ * On success, the new topology is stored in *topologyp and 0 is returned.
+ * On allocation failure, -1 is returned, *topologyp is left untouched and
+ * nothing stays allocated.
+ */
+int
+hwloc_topology_init (struct hwloc_topology **topologyp)
+{
+  struct hwloc_topology *topology;
+  void *discovery, *cpubind, *membind;
+  int i;
+
+  topology = malloc (sizeof (struct hwloc_topology));
+  if(!topology)
+    return -1;
+
+  /* Allocate the support structures before touching anything else, so that a
+   * failure can be undone with plain free(). They must be valid pointers since
+   * hwloc_topology_setup_defaults() below clears them with memset().
+   */
+  discovery = malloc(sizeof(*topology->support.discovery));
+  cpubind = malloc(sizeof(*topology->support.cpubind));
+  membind = malloc(sizeof(*topology->support.membind));
+  if (!discovery || !cpubind || !membind) {
+    free(discovery);
+    free(cpubind);
+    free(membind);
+    free(topology);
+    errno = ENOMEM;
+    return -1;
+  }
+
+  hwloc_components_init(topology);
+
+  /* Setup topology context */
+  topology->is_loaded = 0;
+  topology->flags = 0;
+  topology->is_thissystem = 1;
+  topology->pid = 0;
+  topology->userdata = NULL;
+
+  topology->support.discovery = discovery;
+  topology->support.cpubind = cpubind;
+  topology->support.membind = membind;
+
+  /* Only ignore useless cruft by default */
+  for(i = HWLOC_OBJ_SYSTEM; i < HWLOC_OBJ_TYPE_MAX; i++)
+    topology->ignored_types[i] = HWLOC_IGNORE_TYPE_NEVER;
+  topology->ignored_types[HWLOC_OBJ_GROUP] = HWLOC_IGNORE_TYPE_KEEP_STRUCTURE;
+
+  hwloc_distances_init(topology);
+
+  topology->userdata_export_cb = NULL;
+  topology->userdata_import_cb = NULL;
+
+  /* Make the topology look like something coherent but empty */
+  hwloc_topology_setup_defaults(topology);
+
+  *topologyp = topology;
+  return 0;
+}
+
+/* Store the given pid in the topology context.
+ *
+ * Only available when built with HWLOC_LINUX_SYS, in which case 0 is returned.
+ * Other builds leave the topology untouched, set errno to ENOSYS and return -1.
+ */
+int
+hwloc_topology_set_pid(struct hwloc_topology *topology __hwloc_attribute_unused,
+                       hwloc_pid_t pid __hwloc_attribute_unused)
+{
+#ifndef HWLOC_LINUX_SYS
+  errno = ENOSYS;
+  return -1;
+#else /* HWLOC_LINUX_SYS */
+  /* this does *not* change the backend */
+  topology->pid = pid;
+  return 0;
+#endif /* HWLOC_LINUX_SYS */
+}
+
+/* Force-enable the "synthetic" discovery component with the given description.
+ * Returns whatever hwloc_disc_component_force_enable() returns.
+ */
+int
+hwloc_topology_set_synthetic(struct hwloc_topology *topology, const char *description)
+{
+  const int from_env = 0; /* requested through the API, not the environment */
+  int err;
+
+  err = hwloc_disc_component_force_enable(topology, from_env,
+                                          -1, "synthetic",
+                                          description, NULL, NULL);
+  return err;
+}
+
+/* Force-enable the "xml" discovery component, passing it the given path.
+ * Returns whatever hwloc_disc_component_force_enable() returns.
+ */
+int
+hwloc_topology_set_xml(struct hwloc_topology *topology,
+                       const char *xmlpath)
+{
+  const int from_env = 0; /* requested through the API, not the environment */
+  int err;
+
+  err = hwloc_disc_component_force_enable(topology, from_env,
+                                          -1, "xml",
+                                          xmlpath, NULL, NULL);
+  return err;
+}
+
+/* Force-enable the "xml" discovery component, passing it an in-memory buffer.
+ * The buffer size travels through the last (pointer) argument of
+ * hwloc_disc_component_force_enable(), whose return value is returned as is.
+ */
+int
+hwloc_topology_set_xmlbuffer(struct hwloc_topology *topology,
+                             const char *xmlbuffer,
+                             int size)
+{
+  const int from_env = 0; /* requested through the API, not the environment */
+  void *size_as_ptr = (void*) (uintptr_t) size;
+
+  return hwloc_disc_component_force_enable(topology, from_env,
+                                           -1, "xml", NULL,
+                                           xmlbuffer, size_as_ptr);
+}
+
+/* Replace the topology flags.
+ * Refused with errno set to EBUSY (return -1) once the topology is loaded;
+ * returns 0 otherwise.
+ */
+int
+hwloc_topology_set_flags (struct hwloc_topology *topology, unsigned long flags)
+{
+  if (!topology->is_loaded) {
+    topology->flags = flags;
+    return 0;
+  }
+
+  /* actually harmless */
+  errno = EBUSY;
+  return -1;
+}
+
+/* Return the flags currently stored in the topology context. */
+unsigned long
+hwloc_topology_get_flags (struct hwloc_topology *topology)
+{
+  const unsigned long current = topology->flags;
+  return current;
+}
+
+/* Mark an object type as always ignored.
+ *
+ * Returns 0 on success. Returns -1 with errno set to EINVAL when the type is
+ * out of range, is PU or NUMANODE (those levels are required), or is an I/O
+ * type (I/O objects are controlled through topology flags instead).
+ */
+int
+hwloc_topology_ignore_type(struct hwloc_topology *topology, hwloc_obj_type_t type)
+{
+  /* Compare as unsigned: whether the enum type is signed is up to the
+   * compiler, and a negative value such as the (hwloc_obj_type_t) -1 returned
+   * by hwloc_obj_type_of_string() must not be used to index ignored_types[].
+   */
+  if ((unsigned) type >= HWLOC_OBJ_TYPE_MAX) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (type == HWLOC_OBJ_PU || type == HWLOC_OBJ_NUMANODE) {
+    /* we need the PU and NUMA levels */
+    errno = EINVAL;
+    return -1;
+  } else if (hwloc_obj_type_is_io(type)) {
+    /* I/O devices aren't in any level, use topology flags to ignore them */
+    errno = EINVAL;
+    return -1;
+  }
+
+  topology->ignored_types[type] = HWLOC_IGNORE_TYPE_ALWAYS;
+  return 0;
+}
+
+/* Mark an object type as ignored unless it brings structure.
+ *
+ * Returns 0 on success. Returns -1 with errno set to EINVAL when the type is
+ * out of range, is PU, NUMANODE or MISC, or is an I/O type.
+ */
+int
+hwloc_topology_ignore_type_keep_structure(struct hwloc_topology *topology, hwloc_obj_type_t type)
+{
+  /* Compare as unsigned: whether the enum type is signed is up to the
+   * compiler, and a negative value such as the (hwloc_obj_type_t) -1 returned
+   * by hwloc_obj_type_of_string() must not be used to index ignored_types[].
+   */
+  if ((unsigned) type >= HWLOC_OBJ_TYPE_MAX) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (type == HWLOC_OBJ_PU || type == HWLOC_OBJ_NUMANODE || type == HWLOC_OBJ_MISC) {
+    /* We need the PU and NUMA levels.
+     * Misc are outside of the main topology structure, makes no sense.
+     */
+    errno = EINVAL;
+    return -1;
+  } else if (hwloc_obj_type_is_io(type)) {
+    /* I/O devices aren't in any level, use topology flags to ignore them */
+    errno = EINVAL;
+    return -1;
+  }
+
+  topology->ignored_types[type] = HWLOC_IGNORE_TYPE_KEEP_STRUCTURE;
+  return 0;
+}
+
+/* Apply the keep-structure ignore policy to every object type except
+ * PU, NUMANODE and the I/O types. Always returns 0.
+ */
+int
+hwloc_topology_ignore_all_keep_structure(struct hwloc_topology *topology)
+{
+  unsigned type = HWLOC_OBJ_SYSTEM;
+
+  while (type < HWLOC_OBJ_TYPE_MAX) {
+    /* the PU and NUMA levels are always needed */
+    int required = (type == HWLOC_OBJ_PU || type == HWLOC_OBJ_NUMANODE);
+
+    if (!required && !hwloc_obj_type_is_io((hwloc_obj_type_t) type))
+      topology->ignored_types[type] = HWLOC_IGNORE_TYPE_KEEP_STRUCTURE;
+    type++;
+  }
+  return 0;
+}
+
+/* Recursively free the subtree rooted at root, root included.
+ * Only first_child/io_first_child/misc_first_child and next_sibling are
+ * followed, so that it works before load() and may be used when switching
+ * between backends.
+ */
+static void
+hwloc_topology_clear_tree (struct hwloc_topology *topology, struct hwloc_obj *root)
+{
+  hwloc_obj_t cur, following;
+
+  /* the next sibling is always saved first since the recursion frees cur */
+
+  /* normal children */
+  for (cur = root->first_child; cur; cur = following) {
+    following = cur->next_sibling;
+    hwloc_topology_clear_tree (topology, cur);
+  }
+
+  /* I/O children */
+  for (cur = root->io_first_child; cur; cur = following) {
+    following = cur->next_sibling;
+    hwloc_topology_clear_tree (topology, cur);
+  }
+
+  /* Misc children */
+  for (cur = root->misc_first_child; cur; cur = following) {
+    following = cur->next_sibling;
+    hwloc_topology_clear_tree (topology, cur);
+  }
+
+  hwloc_free_unlinked_object (root);
+}
+
+/* Free the whole object tree, the per-depth level arrays (which are reset
+ * to NULL) and the bridge/pcidev/osdev/misc level arrays.
+ */
+void
+hwloc_topology_clear (struct hwloc_topology *topology)
+{
+  unsigned depth = 0;
+
+  /* objects first, starting from the root */
+  hwloc_topology_clear_tree (topology, topology->levels[0][0]);
+
+  /* then the arrays that were pointing to them */
+  while (depth < topology->nb_levels) {
+    free(topology->levels[depth]);
+    topology->levels[depth] = NULL;
+    depth++;
+  }
+
+  /* and finally the special levels */
+  free(topology->bridge_level);
+  free(topology->pcidev_level);
+  free(topology->osdev_level);
+  free(topology->misc_level);
+}
+
+/* Release everything attached to a topology context, then the context itself.
+ * The topology pointer must not be used after this call returns.
+ */
+void
+hwloc_topology_destroy (struct hwloc_topology *topology)
+{
+  /* shut down backends and components first */
+  hwloc_backends_disable_all(topology);
+  hwloc_components_destroy_all(topology);
+
+  /* free the object tree and level arrays, then the distance information */
+  hwloc_topology_clear(topology);
+  hwloc_distances_destroy(topology);
+
+  /* the support structures were malloc()ed by hwloc_topology_init() */
+  free(topology->support.discovery);
+  free(topology->support.cpubind);
+  free(topology->support.membind);
+  free(topology);
+}
+
+/* Enable backends and run the actual discovery.
+ *
+ * Returns 0 and marks the topology as loaded on success.
+ * Returns -1 with errno set to EBUSY if the topology is already loaded.
+ * Returns -1 if discovery fails, after resetting the topology to its
+ * default empty state and disabling all backends.
+ */
+int
+hwloc_topology_load (struct hwloc_topology *topology)
+{
+  int err;
+
+  if (topology->is_loaded) {
+    errno = EBUSY;
+    return -1;
+  }
+
+  /* Only apply variables if we have not changed the backend yet.
+   * Only the last one will be kept.
+   * Check for XML last (that's the one that may be set system-wide by administrators)
+   * so that it's only used if other variables are not set,
+   * to allow users to override easily.
+   */
+  if (!topology->backends) {
+    const char *synthetic_env = getenv("HWLOC_SYNTHETIC");
+    if (synthetic_env)
+      hwloc_disc_component_force_enable(topology,
+					1 /* env force */,
+					-1, "synthetic",
+					synthetic_env, NULL, NULL);
+  }
+  if (!topology->backends) {
+    const char *fsroot_path_env = getenv("HWLOC_FSROOT");
+    if (fsroot_path_env)
+      hwloc_disc_component_force_enable(topology,
+					1 /* env force */,
+					HWLOC_DISC_COMPONENT_TYPE_CPU, "linux",
+					fsroot_path_env, NULL, NULL);
+  }
+  if (!topology->backends) {
+    const char *xmlpath_env = getenv("HWLOC_XMLFILE");
+    if (xmlpath_env)
+      hwloc_disc_component_force_enable(topology,
+					1 /* env force */,
+					-1, "xml",
+					xmlpath_env, NULL, NULL);
+  }
+
+  /* instantiate all possible other backends now */
+  hwloc_disc_components_enable_others(topology);
+  /* now that backends are enabled, update the thissystem flag */
+  hwloc_backends_is_thissystem(topology);
+
+  /* get distance matrices from the environment and store them (as indexes) in the topology.
+   * indexes will be converted into objects later, once the tree is filled
+   */
+  hwloc_distances_set_from_env(topology);
+
+  /* actual topology discovery */
+  err = hwloc_discover(topology);
+  if (err < 0)
+    goto out;
+
+  /* consistency checks always run in HWLOC_DEBUG builds,
+   * and only when HWLOC_DEBUG_CHECK is set in the environment otherwise
+   */
+#ifndef HWLOC_DEBUG
+  if (getenv("HWLOC_DEBUG_CHECK"))
+#endif
+    hwloc_topology_check(topology);
+
+  topology->is_loaded = 1;
+  return 0;
+
+ out:
+  /* discovery failed: drop whatever was built and go back to the default empty topology */
+  hwloc_topology_clear(topology);
+  hwloc_distances_destroy(topology);
+  hwloc_topology_setup_defaults(topology);
+  hwloc_backends_disable_all(topology);
+  return -1;
+}
+
+/* Adjust object cpusets according to the given droppedcpuset,
+ * drop objects whose cpuset becomes empty,
+ * and mark dropped NUMA nodes in droppednodeset.
+ *
+ * pobj points to the link holding the object, so that the object can be
+ * unlinked in place. flags are the HWLOC_RESTRICT_FLAG_* given to
+ * hwloc_topology_restrict(). droppingparent is non-zero when an ancestor
+ * is being dropped, which forces this object to be dropped as well.
+ */
+static void
+restrict_object(hwloc_topology_t topology, unsigned long flags, hwloc_obj_t *pobj, hwloc_const_cpuset_t droppedcpuset, hwloc_nodeset_t droppednodeset, int droppingparent)
+{
+  hwloc_obj_t obj = *pobj, child, *pchild;
+  int dropping;
+  /* must be computed before the complete cpuset is reduced below */
+  int modified = hwloc_bitmap_intersects(obj->complete_cpuset, droppedcpuset);
+
+  hwloc_clear_object_distances(obj);
+
+  hwloc_bitmap_andnot(obj->cpuset, obj->cpuset, droppedcpuset);
+  hwloc_bitmap_andnot(obj->complete_cpuset, obj->complete_cpuset, droppedcpuset);
+  hwloc_bitmap_andnot(obj->allowed_cpuset, obj->allowed_cpuset, droppedcpuset);
+
+  dropping = droppingparent || hwloc_bitmap_iszero(obj->cpuset);
+
+  /* children are only visited if this object lost some PUs */
+  if (modified) {
+    for_each_child_safe(child, obj, pchild)
+      restrict_object(topology, flags, pchild, droppedcpuset, droppednodeset, dropping);
+    /* Nothing to restrict under I/O or Misc */
+  }
+
+  if (dropping) {
+    hwloc_debug("%s", "\nRemoving object during restrict");
+    hwloc_debug_print_object(0, obj);
+    if (obj->type == HWLOC_OBJ_NUMANODE)
+      hwloc_bitmap_set(droppednodeset, obj->os_index);
+    /* I/O and Misc children are freed with the object unless the corresponding ADAPT flag is given */
+    if (obj->io_first_child && !(flags & HWLOC_RESTRICT_FLAG_ADAPT_IO))
+      unlink_and_free_object_and_children(&obj->io_first_child);
+    if (obj->misc_first_child && !(flags & HWLOC_RESTRICT_FLAG_ADAPT_MISC))
+      unlink_and_free_object_and_children(&obj->misc_first_child);
+    unlink_and_free_single_object(pobj);
+    /* tell the caller that the tree must be reconnected */
+    topology->modified = 1;
+    /* do not remove children. if they were to be removed, they would have been already */
+  }
+}
+
+/* Remove the nodes listed in droppednodeset from the nodesets of the object
+ * pointed to by pobj, and recursively from those of its normal children.
+ */
+static void
+restrict_object_nodeset(hwloc_topology_t topology, hwloc_obj_t *pobj, hwloc_nodeset_t droppednodeset)
+{
+  hwloc_obj_t obj = *pobj, child, *pchild;
+
+  /* an object that contains no dropped node cannot have children that do, only handle the others */
+  if (hwloc_bitmap_intersects(obj->complete_nodeset, droppednodeset)) {
+    hwloc_bitmap_andnot(obj->nodeset, obj->nodeset, droppednodeset);
+    hwloc_bitmap_andnot(obj->complete_nodeset, obj->complete_nodeset, droppednodeset);
+    hwloc_bitmap_andnot(obj->allowed_nodeset, obj->allowed_nodeset, droppednodeset);
+
+    /* Nothing to restrict under I/O and Misc, normal children are enough */
+    for_each_child_safe(child, obj, pchild) {
+      restrict_object_nodeset(topology, pchild, droppednodeset);
+    }
+  }
+}
+
+/* Restrict the topology to the PUs listed in cpuset.
+ *
+ * flags are HWLOC_RESTRICT_FLAG_* values; they are passed to the object and
+ * distance restriction code.
+ *
+ * Returns 0 on success.
+ * Returns -1 with errno set to EINVAL, without touching the topology, if
+ * cpuset does not intersect the root cpuset.
+ * Returns -1 with errno set to ENOMEM, without touching the topology, if the
+ * temporary bitmaps cannot be allocated.
+ * Returns -1 after re-initializing the topology to its default empty state
+ * if the levels cannot be reconnected.
+ */
+int
+hwloc_topology_restrict(struct hwloc_topology *topology, hwloc_const_cpuset_t cpuset, unsigned long flags)
+{
+  hwloc_bitmap_t droppedcpuset, droppednodeset;
+
+  /* make sure we'll keep something in the topology */
+  if (!hwloc_bitmap_intersects(cpuset, topology->levels[0][0]->cpuset)) {
+    errno = EINVAL; /* easy failure, just don't touch the topology */
+    return -1;
+  }
+
+  droppedcpuset = hwloc_bitmap_alloc();
+  droppednodeset = hwloc_bitmap_alloc();
+  if (!droppedcpuset || !droppednodeset) {
+    /* fail before modifying anything rather than dereferencing NULL bitmaps below */
+    if (droppedcpuset)
+      hwloc_bitmap_free(droppedcpuset);
+    if (droppednodeset)
+      hwloc_bitmap_free(droppednodeset);
+    errno = ENOMEM;
+    return -1;
+  }
+
+  /* drop object based on the reverse of cpuset, and fill the 'dropped' nodeset */
+  hwloc_bitmap_not(droppedcpuset, cpuset);
+  restrict_object(topology, flags, &topology->levels[0][0], droppedcpuset, droppednodeset, 0 /* root cannot be removed */);
+  /* update nodesets according to dropped nodeset */
+  restrict_object_nodeset(topology, &topology->levels[0][0], droppednodeset);
+
+  hwloc_bitmap_free(droppedcpuset);
+  hwloc_bitmap_free(droppednodeset);
+
+  /* rebuild children arrays and levels now that objects were removed */
+  hwloc_connect_children(topology->levels[0][0]);
+  if (hwloc_connect_levels(topology) < 0)
+    goto out;
+  topology->modified = 0;
+
+  propagate_total_memory(topology->levels[0][0]);
+  hwloc_distances_restrict(topology, flags);
+  hwloc_distances_finalize_os(topology);
+  hwloc_distances_finalize_logical(topology);
+  return 0;
+
+ out:
+  /* unrecoverable failure, re-init the topology */
+  hwloc_topology_clear(topology);
+  hwloc_distances_destroy(topology);
+  hwloc_topology_setup_defaults(topology);
+  return -1;
+}
+
+/* Return the is_thissystem flag stored in the topology context. */
+int
+hwloc_topology_is_thissystem(struct hwloc_topology *topology)
+{
+  const int thissystem = topology->is_thissystem;
+  return thissystem;
+}
+
+/* Return the number of levels in the topology. */
+unsigned
+hwloc_topology_get_depth(struct hwloc_topology *topology)
+{
+  const unsigned nr_levels = topology->nb_levels;
+  return nr_levels;
+}
+
+/* Return a read-only pointer to the support structure embedded in the
+ * topology context; nothing is copied.
+ */
+const struct hwloc_topology_support *
+hwloc_topology_get_support(struct hwloc_topology * topology)
+{
+  const struct hwloc_topology_support *support = &topology->support;
+  return support;
+}
+
+/* Attach an opaque user pointer to the topology context.
+ * The const qualifier is dropped since the pointer is only stored.
+ */
+void hwloc_topology_set_userdata(struct hwloc_topology * topology, const void *userdata)
+{
+  void *stored = (void *) userdata;
+  topology->userdata = stored;
+}
+
+/* Return the opaque user pointer stored in the topology context. */
+void * hwloc_topology_get_userdata(struct hwloc_topology * topology)
+{
+  void *stored = topology->userdata;
+  return stored;
+}
+
+/****************
+ * Debug Checks *
+ ****************/
+
+/* Forward declaration: hwloc__check_object() and the hwloc__check_*children()
+ * functions below call each other recursively.
+ */
+static void
+hwloc__check_object(hwloc_topology_t topology, hwloc_obj_t obj);
+
+/* Check the normal children of a parent object: consistency of arity and
+ * children pointers, sibling links and ranks, depths, how children cpusets
+ * and nodesets fit inside those of the parent, and children ordering.
+ * Each child is then checked recursively with hwloc__check_object().
+ */
+static void
+hwloc__check_children(hwloc_topology_t topology, hwloc_obj_t parent)
+{
+  unsigned j;
+
+  if (!parent->arity) {
+    /* check whether that parent has no children for real */
+    assert(!parent->children);
+    assert(!parent->first_child);
+    assert(!parent->last_child);
+    return;
+  }
+  /* check whether that parent has children for real */
+  assert(parent->children);
+  assert(parent->first_child);
+  assert(parent->last_child);
+
+  /* sibling checks */
+  for(j=0; j<parent->arity; j++) {
+    hwloc_obj_t child = parent->children[j];
+    assert(child->parent == parent);
+    assert(child->sibling_rank == j);
+    if (j)
+      assert(child->prev_sibling == parent->children[j-1]);
+    else
+      assert(!child->prev_sibling);
+    if (j == parent->arity-1)
+      assert(!child->next_sibling);
+    else
+      assert(child->next_sibling == parent->children[j+1]);
+    if (!hwloc_obj_type_is_io(child->type))
+      assert(child->depth > parent->depth);
+    /* recurse */
+    hwloc__check_object(topology, child);
+  }
+  assert(parent->first_child == parent->children[0]);
+  assert(parent->last_child == parent->children[parent->arity-1]);
+
+  /* we already checked in the caller that objects have either all sets or none */
+
+  {
+    /* check that parent->cpuset == exclusive OR of children
+     * (can be wrong for complete_cpuset since disallowed/offline/unknown PUs can be removed)
+     */
+    hwloc_bitmap_t remaining_parent_cpuset = hwloc_bitmap_dup(parent->cpuset);
+    hwloc_bitmap_t remaining_parent_nodeset = hwloc_bitmap_dup(parent->nodeset);
+    for(j=0; j<parent->arity; j++) {
+      if (!parent->children[j]->cpuset)
+	continue;
+      /* check that child cpuset is included in the remainder of the parent */
+      assert(hwloc_bitmap_isincluded(parent->children[j]->cpuset, remaining_parent_cpuset));
+      hwloc_bitmap_andnot(remaining_parent_cpuset, remaining_parent_cpuset, parent->children[j]->cpuset);
+      /* check that child nodeset is included in the parent (multiple children may have the same nodeset when we're below a NUMA node) */
+      assert(hwloc_bitmap_isincluded(parent->children[j]->nodeset, parent->nodeset));
+      hwloc_bitmap_andnot(remaining_parent_nodeset, remaining_parent_nodeset, parent->children[j]->nodeset);
+    }
+
+    if (parent->type == HWLOC_OBJ_PU) {
+      /* if parent is a PU (with Misc children for instance),
+       * its os_index bit may remain in cpuset. */
+      assert(hwloc_bitmap_weight(remaining_parent_cpuset) == 1);
+      assert(hwloc_bitmap_first(remaining_parent_cpuset) == (int)parent->os_index);
+    } else {
+      /* nothing remains */
+      assert(hwloc_bitmap_iszero(remaining_parent_cpuset));
+    }
+    hwloc_bitmap_free(remaining_parent_cpuset);
+
+    if (parent->type == HWLOC_OBJ_NUMANODE)
+      /* if parent is a NUMA node, its os_index bit may remain.
+       * or it could already have been removed by a child. */
+      hwloc_bitmap_clr(remaining_parent_nodeset, parent->os_index);
+    if (parent->type == HWLOC_OBJ_PU) {
+      /* if parent is a PU (with Misc children for instance),
+       * one bit may remain in nodeset. */
+      assert(hwloc_bitmap_weight(remaining_parent_nodeset) == 1);
+    } else {
+      /* nothing remains */
+      assert(hwloc_bitmap_iszero(remaining_parent_nodeset));
+    }
+    hwloc_bitmap_free(remaining_parent_nodeset);
+  }
+
+  /* check that children complete_cpuset are properly ordered, empty ones may be anywhere
+   * (can be wrong for main cpuset since removed PUs can break the ordering).
+   */
+  {
+    int firstchild;
+    int prev_firstchild = -1; /* -1 works fine with first comparisons below */
+    for(j=0; j<parent->arity; j++) {
+      if (!parent->children[j]->complete_cpuset
+	  || hwloc_bitmap_iszero(parent->children[j]->complete_cpuset))
+	continue;
+
+      /* the first set bit of each non-empty child must be strictly increasing */
+      firstchild = hwloc_bitmap_first(parent->children[j]->complete_cpuset);
+      assert(prev_firstchild < firstchild);
+      prev_firstchild = firstchild;
+    }
+  }
+}
+
+/* Check the I/O children list of a parent object: the list length must match
+ * io_arity, every child must be an I/O object that is correctly linked to its
+ * siblings and parent, and must have no normal children.
+ * Each child is then checked recursively with hwloc__check_object().
+ */
+static void
+hwloc__check_io_children(hwloc_topology_t topology, hwloc_obj_t parent)
+{
+  hwloc_obj_t cur, before = NULL;
+  unsigned rank = 0;
+
+  if (!parent->io_arity) {
+    /* no I/O child is announced, the list must be empty for real */
+    assert(!parent->io_first_child);
+    return;
+  }
+
+  /* I/O children are announced, the list must not be empty */
+  cur = parent->io_first_child;
+  assert(cur);
+
+  while (cur) {
+    /* all children must be I/O */
+    assert(hwloc_obj_type_is_io(cur->type));
+
+    /* check siblings */
+    assert(cur->parent == parent);
+    assert(cur->sibling_rank == rank);
+    assert(!before || before->next_sibling == cur);
+    assert(cur->prev_sibling == before);
+    assert(rank != parent->io_arity-1 || cur->next_sibling == NULL);
+
+    /* only I/O and Misc children, recurse */
+    assert(!cur->first_child);
+    hwloc__check_object(topology, cur);
+
+    before = cur;
+    cur = cur->next_sibling;
+    rank++;
+  }
+
+  /* check arity */
+  assert(rank == parent->io_arity);
+}
+
+/* Check the Misc children list of a parent object: the list length must match
+ * misc_arity, every child must be a Misc object that is correctly linked to
+ * its siblings and parent, and must have neither normal nor I/O children.
+ * Each child is then checked recursively with hwloc__check_object().
+ */
+static void
+hwloc__check_misc_children(hwloc_topology_t topology, hwloc_obj_t parent)
+{
+  hwloc_obj_t cur, before = NULL;
+  unsigned rank = 0;
+
+  if (!parent->misc_arity) {
+    /* no Misc child is announced, the list must be empty for real */
+    assert(!parent->misc_first_child);
+    return;
+  }
+
+  /* Misc children are announced, the list must not be empty */
+  cur = parent->misc_first_child;
+  assert(cur);
+
+  while (cur) {
+    /* all children must be Misc */
+    assert(cur->type == HWLOC_OBJ_MISC);
+
+    /* check siblings */
+    assert(cur->parent == parent);
+    assert(cur->sibling_rank == rank);
+    assert(!before || before->next_sibling == cur);
+    assert(cur->prev_sibling == before);
+    assert(rank != parent->misc_arity-1 || cur->next_sibling == NULL);
+
+    /* only Misc children, recurse */
+    assert(!cur->first_child);
+    assert(!cur->io_first_child);
+    hwloc__check_object(topology, cur);
+
+    before = cur;
+    cur = cur->next_sibling;
+    rank++;
+  }
+
+  /* check arity */
+  assert(rank == parent->misc_arity);
+}
+
+/* Check a single object (sets and depth), then recursively check its
+ * normal, I/O and Misc children.
+ */
+static void
+hwloc__check_object(hwloc_topology_t topology, hwloc_obj_t obj)
+{
+  /* check sets and depth: special objects have no cpuset and a type-specific depth,
+   * other objects have a cpuset and a non-negative depth
+   */
+  if (hwloc_obj_type_is_special(obj->type)) {
+    assert(!obj->cpuset);
+    if (obj->type == HWLOC_OBJ_BRIDGE)
+      assert(obj->depth == (unsigned) HWLOC_TYPE_DEPTH_BRIDGE);
+    else if (obj->type == HWLOC_OBJ_PCI_DEVICE)
+      assert(obj->depth == (unsigned) HWLOC_TYPE_DEPTH_PCI_DEVICE);
+    else if (obj->type == HWLOC_OBJ_OS_DEVICE)
+      assert(obj->depth == (unsigned) HWLOC_TYPE_DEPTH_OS_DEVICE);
+    else if (obj->type == HWLOC_OBJ_MISC)
+      assert(obj->depth == (unsigned) HWLOC_TYPE_DEPTH_MISC);
+  } else {
+    assert(obj->cpuset);
+    assert((int) obj->depth >= 0);
+  }
+
+  /* there's other cpusets and nodesets if and only if there's a main cpuset */
+  assert(!!obj->cpuset == !!obj->complete_cpuset);
+  assert(!!obj->cpuset == !!obj->allowed_cpuset);
+  assert(!!obj->cpuset == !!obj->nodeset);
+  assert(!!obj->nodeset == !!obj->complete_nodeset);
+  assert(!!obj->nodeset == !!obj->allowed_nodeset);
+
+  /* check that main sets are included in complete sets, and that allowed sets
+   * are included in the main sets (whole-system flag) or equal to them (otherwise)
+   */
+  if (obj->cpuset) {
+    assert(hwloc_bitmap_isincluded(obj->cpuset, obj->complete_cpuset));
+    assert(hwloc_bitmap_isincluded(obj->nodeset, obj->complete_nodeset));
+    if (topology->flags & HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM) {
+      assert(hwloc_bitmap_isincluded(obj->allowed_cpuset, obj->cpuset));
+      assert(hwloc_bitmap_isincluded(obj->allowed_nodeset, obj->nodeset));
+    } else {
+      assert(hwloc_bitmap_isequal(obj->allowed_cpuset, obj->cpuset));
+      assert(hwloc_bitmap_isequal(obj->allowed_nodeset, obj->nodeset));
+    }
+  }
+
+  /* check children */
+  hwloc__check_children(topology, obj);
+  hwloc__check_io_children(topology, obj);
+  hwloc__check_misc_children(topology, obj);
+}
+
+/* Check all objects of the level at the given depth: placement, indexes,
+ * cousin links, type homogeneity, and the cpuset/nodeset of PUs/NUMA nodes.
+ */
+static void
+hwloc__check_level(struct hwloc_topology *topology, unsigned depth)
+{
+  unsigned width = hwloc_get_nbobjs_by_depth(topology, depth);
+  struct hwloc_obj *prev = NULL;
+  hwloc_obj_t obj;
+  unsigned j;
+
+  /* check each object of the level */
+  for(j=0; j<width; j++) {
+    obj = hwloc_get_obj_by_depth(topology, depth, j);
+    /* check that the object is correctly placed horizontally and vertically */
+    assert(obj);
+    assert(obj->depth == depth);
+    assert(obj->logical_index == j);
+    /* check that all objects in the level have the same type */
+    if (prev) {
+      assert(hwloc_type_cmp(obj, prev) == HWLOC_TYPE_EQUAL);
+      assert(prev->next_cousin == obj);
+    }
+    assert(obj->prev_cousin == prev);
+
+    /* check that PUs and NUMA nodes have correct cpuset/nodeset */
+    if (obj->type == HWLOC_OBJ_PU) {
+      assert(hwloc_bitmap_weight(obj->complete_cpuset) == 1);
+      assert(hwloc_bitmap_first(obj->complete_cpuset) == (int) obj->os_index);
+    }
+    if (obj->type == HWLOC_OBJ_NUMANODE) {
+      assert(hwloc_bitmap_weight(obj->complete_nodeset) == 1);
+      assert(hwloc_bitmap_first(obj->complete_nodeset) == (int) obj->os_index);
+    }
+    prev = obj;
+  }
+  /* the last object of the level has no next cousin */
+  if (prev)
+    assert(prev->next_cousin == NULL);
+
+  if (width) {
+    /* check first object of the level */
+    obj = hwloc_get_obj_by_depth(topology, depth, 0);
+    assert(obj);
+    assert(!obj->prev_cousin);
+    /* check type */
+    assert(hwloc_get_depth_type(topology, depth) == obj->type);
+    assert(depth == (unsigned) hwloc_get_type_depth(topology, obj->type)
+	   || HWLOC_TYPE_DEPTH_MULTIPLE == hwloc_get_type_depth(topology, obj->type));
+    /* check last object of the level */
+    obj = hwloc_get_obj_by_depth(topology, depth, width-1);
+    assert(obj);
+    assert(!obj->next_cousin);
+  }
+
+  /* check last+1 object of the level */
+  obj = hwloc_get_obj_by_depth(topology, depth, width);
+  assert(!obj);
+}
+
+/* Check a whole topology structure.
+ * Every check is an assert(): an inconsistency aborts the program when
+ * assertions are enabled.
+ */
+void
+hwloc_topology_check(struct hwloc_topology *topology)
+{
+  struct hwloc_obj *obj;
+  hwloc_obj_type_t type;
+  unsigned i, j, depth;
+
+  depth = hwloc_topology_get_depth(topology);
+
+  /* the tree must have been reconnected after its last modification */
+  assert(!topology->modified);
+
+  /* check type orders */
+  for (type = HWLOC_OBJ_SYSTEM; type < HWLOC_OBJ_TYPE_MAX; type++) {
+    assert(hwloc_get_order_type(hwloc_get_type_order(type)) == type);
+  }
+  for (i = hwloc_get_type_order(HWLOC_OBJ_SYSTEM);
+       i <= hwloc_get_type_order(HWLOC_OBJ_CORE); i++) {
+    assert(i == hwloc_get_type_order(hwloc_get_order_type(i)));
+  }
+
+  /* check that last level is PU */
+  assert(hwloc_get_depth_type(topology, depth-1) == HWLOC_OBJ_PU);
+  assert(hwloc_get_nbobjs_by_depth(topology, depth-1) > 0);
+  for(j=0; j<hwloc_get_nbobjs_by_depth(topology, depth-1); j++) {
+    obj = hwloc_get_obj_by_depth(topology, depth-1, j);
+    assert(obj);
+    assert(obj->type == HWLOC_OBJ_PU);
+  }
+  /* check that other levels are not PU */
+  for(i=1; i<depth-1; i++)
+    assert(hwloc_get_depth_type(topology, i) != HWLOC_OBJ_PU);
+
+  /* check that we have a NUMA level */
+  j = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  assert(j < hwloc_topology_get_depth(topology));
+  assert(hwloc_get_depth_type(topology, j) == HWLOC_OBJ_NUMANODE);
+  /* check that other levels are not NUMA */
+  for(i=0; i<depth-1; i++)
+    if (i != j)
+      assert(hwloc_get_depth_type(topology, i) != HWLOC_OBJ_NUMANODE);
+
+  /* top-level specific checks */
+  assert(hwloc_get_nbobjs_by_depth(topology, 0) == 1);
+  obj = hwloc_get_root_obj(topology);
+  assert(obj);
+  assert(!obj->parent);
+  assert(obj->cpuset);
+  assert(!obj->depth);
+
+  /* check each level */
+  for(i=0; i<depth; i++)
+    hwloc__check_level(topology, i);
+  /* NOTE(review): hwloc__check_level() takes a depth, but the four calls below
+   * pass object types (HWLOC_OBJ_*). hwloc__check_object() compares the depth
+   * of such objects against HWLOC_TYPE_DEPTH_*, so these were presumably meant
+   * to be HWLOC_TYPE_DEPTH_BRIDGE/PCI_DEVICE/OS_DEVICE/MISC. Confirm that the
+   * per-level asserts hold for the special levels before changing them.
+   */
+  hwloc__check_level(topology, HWLOC_OBJ_BRIDGE);
+  hwloc__check_level(topology, HWLOC_OBJ_PCI_DEVICE);
+  hwloc__check_level(topology, HWLOC_OBJ_OS_DEVICE);
+  hwloc__check_level(topology, HWLOC_OBJ_MISC);
+
+  /* recurse and check the tree of children, and type-specific checks */
+  hwloc__check_object(topology, obj);
+}
diff --git a/ext/hwloc/hwloc/traversal.c b/ext/hwloc/hwloc/traversal.c
new file mode 100644
index 0000000..f1e9ba7
--- /dev/null
+++ b/ext/hwloc/hwloc/traversal.c
@@ -0,0 +1,701 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2010 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/misc.h>
+#include <private/debug.h>
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif /* HAVE_STRINGS_H */
+
+/* Return the depth recorded in the topology for the given object type.
+ * The type is used as an index without any range check.
+ */
+int
+hwloc_get_type_depth (struct hwloc_topology *topology, hwloc_obj_type_t type)
+{
+  const int depth = topology->type_depth[type];
+  return depth;
+}
+
+/* Return the type of the objects found at the given depth.
+ * Depths beyond the normal levels are matched against the special
+ * HWLOC_TYPE_DEPTH_* values; (hwloc_obj_type_t) -1 is returned when
+ * nothing matches.
+ */
+hwloc_obj_type_t
+hwloc_get_depth_type (hwloc_topology_t topology, unsigned depth)
+{
+  /* normal level: use the type of its first object */
+  if (depth < topology->nb_levels)
+    return topology->levels[depth][0]->type;
+
+  /* special level, or invalid depth */
+  switch (depth) {
+  case HWLOC_TYPE_DEPTH_BRIDGE:
+    return HWLOC_OBJ_BRIDGE;
+  case HWLOC_TYPE_DEPTH_PCI_DEVICE:
+    return HWLOC_OBJ_PCI_DEVICE;
+  case HWLOC_TYPE_DEPTH_OS_DEVICE:
+    return HWLOC_OBJ_OS_DEVICE;
+  case HWLOC_TYPE_DEPTH_MISC:
+    return HWLOC_OBJ_MISC;
+  default:
+    return (hwloc_obj_type_t) -1;
+  }
+}
+
+/* Return the number of objects at the given depth.
+ * Depths beyond the normal levels are matched against the special
+ * HWLOC_TYPE_DEPTH_* values; 0 is returned when nothing matches.
+ */
+unsigned
+hwloc_get_nbobjs_by_depth (struct hwloc_topology *topology, unsigned depth)
+{
+  /* normal level */
+  if (depth < topology->nb_levels)
+    return topology->level_nbobjects[depth];
+
+  /* special level, or invalid depth */
+  switch (depth) {
+  case HWLOC_TYPE_DEPTH_BRIDGE:
+    return topology->bridge_nbobjects;
+  case HWLOC_TYPE_DEPTH_PCI_DEVICE:
+    return topology->pcidev_nbobjects;
+  case HWLOC_TYPE_DEPTH_OS_DEVICE:
+    return topology->osdev_nbobjects;
+  case HWLOC_TYPE_DEPTH_MISC:
+    return topology->misc_nbobjects;
+  default:
+    return 0;
+  }
+}
+
+/* Return the idx-th object of the level at the given depth, or NULL if the
+ * depth matches no level or idx is beyond the end of the level.
+ * Depths beyond the normal levels are matched against the special
+ * HWLOC_TYPE_DEPTH_* values.
+ */
+struct hwloc_obj *
+hwloc_get_obj_by_depth (struct hwloc_topology *topology, unsigned depth, unsigned idx)
+{
+  /* normal level */
+  if (depth < topology->nb_levels) {
+    if (idx < topology->level_nbobjects[depth])
+      return topology->levels[depth][idx];
+    return NULL;
+  }
+
+  /* special level, or invalid depth */
+  switch (depth) {
+  case HWLOC_TYPE_DEPTH_BRIDGE:
+    if (idx < topology->bridge_nbobjects)
+      return topology->bridge_level[idx];
+    return NULL;
+  case HWLOC_TYPE_DEPTH_PCI_DEVICE:
+    if (idx < topology->pcidev_nbobjects)
+      return topology->pcidev_level[idx];
+    return NULL;
+  case HWLOC_TYPE_DEPTH_OS_DEVICE:
+    if (idx < topology->osdev_nbobjects)
+      return topology->osdev_level[idx];
+    return NULL;
+  case HWLOC_TYPE_DEPTH_MISC:
+    if (idx < topology->misc_nbobjects)
+      return topology->misc_level[idx];
+    return NULL;
+  default:
+    return NULL;
+  }
+}
+
+/* Store in objs up to max objects of the same level as src, walking up the
+ * ancestors of src and, for each ancestor whose cpuset differs from the
+ * previous one, adding the objects it contains that the previous one did
+ * not. Returns the number of objects stored; 0 if src has no cpuset.
+ */
+unsigned hwloc_get_closest_objs (struct hwloc_topology *topology, struct hwloc_obj *src, struct hwloc_obj **objs, unsigned max)
+{
+  struct hwloc_obj *inner, *outer, **level;
+  int idx, level_width;
+  unsigned stored = 0;
+
+  if (!src->cpuset)
+    return 0;
+
+  level_width = topology->level_nbobjects[src->depth];
+  level = topology->levels[src->depth];
+
+  inner = src;
+  while (stored < max) {
+    /* climb until an ancestor with a different cpuset is found */
+    for (;;) {
+      outer = inner->parent;
+      if (!outer)
+        return stored;
+      if (!hwloc_bitmap_isequal(inner->cpuset, outer->cpuset))
+        break;
+      inner = outer;
+    }
+
+    /* traverse src's level and keep objects that are in outer and were not in inner */
+    for (idx = 0; idx < level_width; idx++) {
+      struct hwloc_obj *candidate = level[idx];
+
+      if (!hwloc_bitmap_isincluded(candidate->cpuset, outer->cpuset))
+        continue;
+      if (hwloc_bitmap_isincluded(candidate->cpuset, inner->cpuset))
+        continue;
+
+      objs[stored++] = candidate;
+      if (stored == max)
+        return stored;
+    }
+    inner = outer;
+  }
+
+  return stored;
+}
+
+/* Recursive helper: store through *res the largest objects below current
+ * whose cpusets exactly cover set. *res is advanced and *max decremented for
+ * each stored object. Returns the number of objects stored by this call.
+ */
+static int
+hwloc__get_largest_objs_inside_cpuset (struct hwloc_obj *current, hwloc_const_bitmap_t set,
+                                       struct hwloc_obj ***res, int *max)
+{
+  int found = 0;
+  unsigned idx;
+
+  /* the caller must ensure this */
+  if (*max <= 0)
+    return 0;
+
+  /* exact match: store this object and stop descending */
+  if (hwloc_bitmap_isequal(current->cpuset, set)) {
+    struct hwloc_obj **slot = *res;
+    *slot = current;
+    *res = slot + 1;
+    *max -= 1;
+    return 1;
+  }
+
+  /* stop as soon as there is no more room to store remaining objects */
+  for (idx = 0; idx < current->arity && *max; idx++) {
+    struct hwloc_obj *child = current->children[idx];
+    /* the cpuset part corresponding to this child */
+    hwloc_bitmap_t part = hwloc_bitmap_dup(set);
+
+    hwloc_bitmap_and(part, part, child->cpuset);
+    /* only descend if there's anything to do in this child */
+    if (!hwloc_bitmap_iszero(part))
+      found += hwloc__get_largest_objs_inside_cpuset (child, part, res, max);
+    hwloc_bitmap_free(part);
+  }
+
+  return found;
+}
+
+/* Store in objs up to max of the largest objects whose cpusets exactly cover set.
+ * Returns the number of objects stored, 0 if max is not positive,
+ * or -1 if set is not included in the root cpuset.
+ */
+int
+hwloc_get_largest_objs_inside_cpuset (struct hwloc_topology *topology, hwloc_const_bitmap_t set,
+                                      struct hwloc_obj **objs, int max)
+{
+  struct hwloc_obj *root = topology->levels[0][0];
+  int count;
+
+  if (!hwloc_bitmap_isincluded(set, root->cpuset)) {
+    count = -1;
+  } else if (max <= 0) {
+    count = 0;
+  } else {
+    count = hwloc__get_largest_objs_inside_cpuset (root, set, &objs, &max);
+  }
+  return count;
+}
+
+/* Return a constant string naming the given object type,
+ * or "Unknown" if the type is not one of the known ones.
+ */
+const char *
+hwloc_obj_type_string (hwloc_obj_type_t obj)
+{
+  const char *name;
+
+  switch (obj) {
+  case HWLOC_OBJ_SYSTEM:
+    name = "System";
+    break;
+  case HWLOC_OBJ_MACHINE:
+    name = "Machine";
+    break;
+  case HWLOC_OBJ_MISC:
+    name = "Misc";
+    break;
+  case HWLOC_OBJ_GROUP:
+    name = "Group";
+    break;
+  case HWLOC_OBJ_NUMANODE:
+    name = "NUMANode";
+    break;
+  case HWLOC_OBJ_PACKAGE:
+    name = "Package";
+    break;
+  case HWLOC_OBJ_CACHE:
+    name = "Cache";
+    break;
+  case HWLOC_OBJ_CORE:
+    name = "Core";
+    break;
+  case HWLOC_OBJ_BRIDGE:
+    name = "Bridge";
+    break;
+  case HWLOC_OBJ_PCI_DEVICE:
+    name = "PCIDev";
+    break;
+  case HWLOC_OBJ_OS_DEVICE:
+    name = "OSDev";
+    break;
+  case HWLOC_OBJ_PU:
+    name = "PU";
+    break;
+  default:
+    name = "Unknown";
+    break;
+  }
+
+  return name;
+}
+
+/* Return the object type whose name matches the given string
+ * (case-insensitive, whole string), or (hwloc_obj_type_t) -1 if none does.
+ */
+hwloc_obj_type_t
+hwloc_obj_type_of_string (const char * string)
+{
+  static const struct {
+    const char *name;
+    hwloc_obj_type_t type;
+  } known[] = {
+    { "System",   HWLOC_OBJ_SYSTEM },
+    { "Machine",  HWLOC_OBJ_MACHINE },
+    { "Misc",     HWLOC_OBJ_MISC },
+    { "Group",    HWLOC_OBJ_GROUP },
+    { "NUMANode", HWLOC_OBJ_NUMANODE },
+    { "Node",     HWLOC_OBJ_NUMANODE },
+    { "Package",  HWLOC_OBJ_PACKAGE },
+    { "Socket",   HWLOC_OBJ_PACKAGE }, /* backward compat with v1.10 */
+    { "Cache",    HWLOC_OBJ_CACHE },
+    { "Core",     HWLOC_OBJ_CORE },
+    { "PU",       HWLOC_OBJ_PU },
+    { "Bridge",   HWLOC_OBJ_BRIDGE },
+    { "PCIDev",   HWLOC_OBJ_PCI_DEVICE },
+    { "OSDev",    HWLOC_OBJ_OS_DEVICE }
+  };
+  unsigned k;
+
+  for (k = 0; k < sizeof(known)/sizeof(known[0]); k++) {
+    if (!strcasecmp(string, known[k].name))
+      return known[k].type;
+  }
+  return (hwloc_obj_type_t) -1;
+}
+
+/* Parse an object type from the beginning of a string.
+ *
+ * Type names are matched case-insensitively on their first one or two
+ * characters. "L<number>" selects a cache, with the number as depth attribute
+ * and an optional 'd', 'i' or 'u' suffix selecting the cache type.
+ * "group<number>" selects a group with the number as depth attribute.
+ *
+ * On success, returns 0 after storing the type in *typep, the depth attribute
+ * (-1 if unspecified) in *depthattrp if non-NULL, and, for caches only, the
+ * cache type in *typeattrp if it is non-NULL and typeattrsize is large enough.
+ * Returns -1 without storing anything if no type matches.
+ */
+int
+hwloc_obj_type_sscanf(const char *string, hwloc_obj_type_t *typep, int *depthattrp, void *typeattrp, size_t typeattrsize)
+{
+  /* name prefixes, with the number of leading characters that are compared */
+  static const struct {
+    const char *prefix;
+    size_t cmplen;
+    hwloc_obj_type_t type;
+  } prefixes[] = {
+    { "system",  2, HWLOC_OBJ_SYSTEM },
+    { "machine", 2, HWLOC_OBJ_MACHINE },
+    { "node",    1, HWLOC_OBJ_NUMANODE }, /* matches node and numanode */
+    { "numa",    1, HWLOC_OBJ_NUMANODE },
+    { "package", 2, HWLOC_OBJ_PACKAGE },
+    { "socket",  2, HWLOC_OBJ_PACKAGE }, /* backward compat with v1.10 */
+    { "core",    2, HWLOC_OBJ_CORE },
+    { "pu",      2, HWLOC_OBJ_PU },
+    { "misc",    2, HWLOC_OBJ_MISC },
+    { "bridge",  2, HWLOC_OBJ_BRIDGE },
+    { "pci",     2, HWLOC_OBJ_PCI_DEVICE },
+    { "os",      2, HWLOC_OBJ_OS_DEVICE },
+    { "cache",   2, HWLOC_OBJ_CACHE },
+    { "group",   2, HWLOC_OBJ_GROUP }
+  };
+  hwloc_obj_type_t parsed = (hwloc_obj_type_t) -1;
+  hwloc_obj_cache_type_t cachekind = (hwloc_obj_cache_type_t) -1; /* unspecified */
+  int attrdepth = -1;
+  int matched = 0;
+  char *rest;
+  size_t k;
+
+  for (k = 0; k < sizeof(prefixes)/sizeof(prefixes[0]) && !matched; k++) {
+    if (!hwloc_strncasecmp(string, prefixes[k].prefix, prefixes[k].cmplen)) {
+      parsed = prefixes[k].type;
+      matched = 1;
+    }
+  }
+
+  if (matched) {
+    if (parsed == HWLOC_OBJ_GROUP) {
+      /* an optional depth may directly follow a (possibly truncated) "group" */
+      int namelen = strcspn(string, "0123456789");
+      if (namelen <= 5 && !hwloc_strncasecmp(string, "group", namelen)
+          && string[namelen] >= '0' && string[namelen] <= '9') {
+        attrdepth = strtol(string+namelen, &rest, 10);
+      }
+    }
+
+  } else if ((string[0] == 'l' || string[0] == 'L') && string[1] >= '0' && string[1] <= '9') {
+    /* cache given as L<depth>, optionally followed by its type */
+    parsed = HWLOC_OBJ_CACHE;
+    attrdepth = strtol(string+1, &rest, 10);
+    switch (*rest) {
+    case 'd':
+      cachekind = HWLOC_OBJ_CACHE_DATA;
+      break;
+    case 'i':
+      cachekind = HWLOC_OBJ_CACHE_INSTRUCTION;
+      break;
+    case 'u':
+      cachekind = HWLOC_OBJ_CACHE_UNIFIED;
+      break;
+    default:
+      break;
+    }
+
+  } else {
+    return -1;
+  }
+
+  *typep = parsed;
+  if (depthattrp)
+    *depthattrp = attrdepth;
+  /* the cache type is only reported for caches, and only if it fits */
+  if (typeattrp && parsed == HWLOC_OBJ_CACHE && sizeof(hwloc_obj_cache_type_t) <= typeattrsize)
+    memcpy(typeattrp, &cachekind, sizeof(hwloc_obj_cache_type_t));
+
+  return 0;
+}
+
+/* Return a short, human-readable name for a PCI class ID.
+ *
+ * The full 16-bit class_id is looked up first. When it is not listed,
+ * the name associated with its high byte is returned instead. IDs whose
+ * high byte is not listed either are reported as "PCI".
+ *
+ * The returned strings are string literals: they must not be modified
+ * or freed by the caller.
+ */
+static const char *
+hwloc_pci_class_string(unsigned short class_id)
+{
+  /* Names of the individually known 16-bit class IDs, grouped by their
+   * high byte.
+   */
+  static const struct {
+    unsigned short id;
+    const char *name;
+  } subclasses[] = {
+    /* high byte 0x00 */
+    { 0x0001, "VGA" },
+    /* high byte 0x01 */
+    { 0x0100, "SCSI" },
+    { 0x0101, "IDE" },
+    { 0x0102, "Flop" },
+    { 0x0103, "IPI" },
+    { 0x0104, "RAID" },
+    { 0x0105, "ATA" },
+    { 0x0106, "SATA" },
+    { 0x0107, "SAS" },
+    { 0x0108, "NVMExp" },
+    /* high byte 0x02 */
+    { 0x0200, "Ether" },
+    { 0x0201, "TokRn" },
+    { 0x0202, "FDDI" },
+    { 0x0203, "ATM" },
+    { 0x0204, "ISDN" },
+    { 0x0205, "WrdFip" },
+    { 0x0206, "PICMG" },
+    { 0x0207, "IB" },
+    /* high byte 0x03 */
+    { 0x0300, "VGA" },
+    { 0x0301, "XGA" },
+    { 0x0302, "3D" },
+    /* high byte 0x04 */
+    { 0x0400, "Video" },
+    { 0x0401, "Audio" },
+    { 0x0402, "Phone" },
+    { 0x0403, "Auddv" },
+    /* high byte 0x05 */
+    { 0x0500, "RAM" },
+    { 0x0501, "Flash" },
+    /* high byte 0x06 */
+    { 0x0600, "Host" },
+    { 0x0601, "ISA" },
+    { 0x0602, "EISA" },
+    { 0x0603, "MC" },
+    { 0x0604, "PCI_B" },
+    { 0x0605, "PCMCIA" },
+    { 0x0606, "Nubus" },
+    { 0x0607, "CardBus" },
+    { 0x0608, "RACEway" },
+    { 0x0609, "PCI_SB" },
+    { 0x060a, "IB_B" },
+    /* high byte 0x07 */
+    { 0x0700, "Ser" },
+    { 0x0701, "Para" },
+    { 0x0702, "MSer" },
+    { 0x0703, "Modm" },
+    { 0x0704, "GPIB" },
+    { 0x0705, "SmrtCrd" },
+    /* high byte 0x08 */
+    { 0x0800, "PIC" },
+    { 0x0801, "DMA" },
+    { 0x0802, "Time" },
+    { 0x0803, "RTC" },
+    { 0x0804, "HtPl" },
+    { 0x0805, "SD-HtPl" },
+    { 0x0806, "IOMMU" },
+    /* high byte 0x09 */
+    { 0x0900, "Kbd" },
+    { 0x0901, "Pen" },
+    { 0x0902, "Mouse" },
+    { 0x0903, "Scan" },
+    { 0x0904, "Game" },
+    /* high byte 0x0b */
+    { 0x0b00, "386" },
+    { 0x0b01, "486" },
+    { 0x0b02, "Pent" },
+    { 0x0b10, "Alpha" },
+    { 0x0b20, "PPC" },
+    { 0x0b30, "MIPS" },
+    { 0x0b40, "CoProc" },
+    /* high byte 0x0c */
+    { 0x0c00, "Firw" },
+    { 0x0c01, "ACCES" },
+    { 0x0c02, "SSA" },
+    { 0x0c03, "USB" },
+    { 0x0c04, "Fiber" },
+    { 0x0c05, "SMBus" },
+    { 0x0c06, "IB" },
+    { 0x0c07, "IPMI" },
+    { 0x0c08, "SERCOS" },
+    { 0x0c09, "CANBUS" },
+    /* high byte 0x0d */
+    { 0x0d00, "IRDA" },
+    { 0x0d01, "IR" },
+    { 0x0d10, "RF" },
+    { 0x0d11, "Blueth" },
+    { 0x0d12, "BroadB" },
+    { 0x0d20, "802.1a" },
+    { 0x0d21, "802.1b" },
+    /* high byte 0x0e */
+    { 0x0e00, "I2O" },
+    /* high byte 0x0f */
+    { 0x0f00, "S-TV" },
+    { 0x0f01, "S-Aud" },
+    { 0x0f02, "S-Voice" },
+    { 0x0f03, "S-Data" }
+  };
+
+  /* Fallback names selected by the high byte alone. */
+  static const struct {
+    unsigned char high;
+    const char *name;
+  } classes[] = {
+    { 0x00, "PCI" },
+    { 0x01, "Stor" },
+    { 0x02, "Net" },
+    { 0x03, "Disp" },
+    { 0x04, "MM" },
+    { 0x05, "Mem" },
+    { 0x06, "Bridg" },
+    { 0x07, "Comm" },
+    { 0x08, "Syst" },
+    { 0x09, "In" },
+    { 0x0a, "Dock" },
+    { 0x0b, "Proc" },
+    { 0x0c, "Ser" },
+    { 0x0d, "Wifi" },
+    { 0x0e, "Intll" },
+    { 0x0f, "Satel" },
+    { 0x10, "Crypt" },
+    { 0x11, "Signl" },
+    { 0x12, "Accel" },
+    { 0x13, "Instr" },
+    { 0xff, "Oth" }
+  };
+
+  const unsigned high = (class_id & 0xff00) >> 8;
+  size_t i;
+
+  /* an exact match on the full ID wins */
+  for (i = 0; i < sizeof(subclasses)/sizeof(subclasses[0]); i++)
+    if (subclasses[i].id == class_id)
+      return subclasses[i].name;
+
+  /* otherwise use the name attached to the high byte */
+  for (i = 0; i < sizeof(classes)/sizeof(classes[0]); i++)
+    if (classes[i].high == high)
+      return classes[i].name;
+
+  return "PCI";
+}
+
+static const char* hwloc_obj_cache_type_letter(hwloc_obj_cache_type_t type)
+{
+  /* one-letter suffix for a cache type; unified caches get no suffix */
+  if (type == HWLOC_OBJ_CACHE_UNIFIED)
+    return "";
+  if (type == HWLOC_OBJ_CACHE_DATA)
+    return "d";
+  return type == HWLOC_OBJ_CACHE_INSTRUCTION ? "i" : "unknown";
+}
+
+/* Write a short name for the type of obj into string (at most size bytes);
+ * verbose selects longer names for some types. Returns the value of the
+ * hwloc_snprintf() call that printed the name, or 0 if nothing was printed. */
+int
+hwloc_obj_type_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t obj, int verbose)
+{
+  hwloc_obj_type_t type = obj->type;
+  switch (type) {
+  case HWLOC_OBJ_MISC:
+  case HWLOC_OBJ_SYSTEM:
+  case HWLOC_OBJ_MACHINE:
+  case HWLOC_OBJ_NUMANODE:
+  case HWLOC_OBJ_PACKAGE:
+  case HWLOC_OBJ_CORE:
+  case HWLOC_OBJ_PU:
+    return hwloc_snprintf(string, size, "%s", hwloc_obj_type_string(type));
+  case HWLOC_OBJ_CACHE:
+    return hwloc_snprintf(string, size, "L%u%s%s", obj->attr->cache.depth,
+			  hwloc_obj_cache_type_letter(obj->attr->cache.type),
+			  verbose ? hwloc_obj_type_string(type): "");
+  case HWLOC_OBJ_GROUP:
+    /* TODO: more pretty presentation? */
+    if (obj->attr->group.depth != (unsigned) -1)
+      return hwloc_snprintf(string, size, "%s%u", hwloc_obj_type_string(type), obj->attr->group.depth);
+    return hwloc_snprintf(string, size, "%s", hwloc_obj_type_string(type));
+  case HWLOC_OBJ_BRIDGE: { /* the name is always passed as an argument, never as the format */
+    int pci_upstream = (obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI);
+    if (verbose)
+      return hwloc_snprintf(string, size, "Bridge %s->%s", pci_upstream ? "PCI" : "Host", "PCI");
+    return hwloc_snprintf(string, size, "%s", pci_upstream ? "PCIBridge" : "HostBridge");
+  }
+  case HWLOC_OBJ_PCI_DEVICE:
+    return hwloc_snprintf(string, size, "PCI %04x:%04x",
+			  obj->attr->pcidev.vendor_id, obj->attr->pcidev.device_id);
+  case HWLOC_OBJ_OS_DEVICE:
+    switch (obj->attr->osdev.type) {
+    case HWLOC_OBJ_OSDEV_BLOCK: return hwloc_snprintf(string, size, "Block");
+    case HWLOC_OBJ_OSDEV_NETWORK: return hwloc_snprintf(string, size, verbose ? "Network" : "Net");
+    case HWLOC_OBJ_OSDEV_OPENFABRICS: return hwloc_snprintf(string, size, "OpenFabrics");
+    case HWLOC_OBJ_OSDEV_DMA: return hwloc_snprintf(string, size, "DMA");
+    case HWLOC_OBJ_OSDEV_GPU: return hwloc_snprintf(string, size, "GPU");
+    case HWLOC_OBJ_OSDEV_COPROC: return hwloc_snprintf(string, size, verbose ? "Co-Processor" : "CoProc");
+    default: break; /* unknown OS device type: handled like an unknown object type */
+    }
+    break;
+  default: break;
+  }
+  /* nothing to print: emit an empty string, but never write into a zero-sized buffer */
+  if (size > 0)
+    *string = '\0';
+  return 0;
+}
+
+/* Write the attributes of obj into string (at most size bytes), putting
+ * separator between consecutive attributes: memory sizes first, then the
+ * type-specific attributes, then (if verbose) the info name/value pairs.
+ * Returns the sum of the lengths reported by the individual print calls,
+ * which can exceed size when the output was truncated, or -1 on error.
+ */
+int
+hwloc_obj_attr_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t obj, const char * separator, int verbose)
+{
+  const char *prefix = "";
+  char *tmp = string;
+  ssize_t tmplen = size;
+  int ret = 0;
+  int res;
+
+  /* make sure we output at least an empty string */
+  if (size)
+    *string = '\0';
+
+  /* print memory attributes */
+  res = 0;
+  if (verbose) {
+    if (obj->memory.local_memory)
+      /* each size is printed with the unit computed from that same size */
+      res = hwloc_snprintf(tmp, tmplen, "%slocal=%lu%s%stotal=%lu%s", prefix,
+			   (unsigned long) hwloc_memory_size_printf_value(obj->memory.local_memory, verbose),
+			   hwloc_memory_size_printf_unit(obj->memory.local_memory, verbose),
+			   separator,
+			   (unsigned long) hwloc_memory_size_printf_value(obj->memory.total_memory, verbose),
+			   hwloc_memory_size_printf_unit(obj->memory.total_memory, verbose));
+    else if (obj->memory.total_memory)
+      res = hwloc_snprintf(tmp, tmplen, "%stotal=%lu%s", prefix,
+			   (unsigned long) hwloc_memory_size_printf_value(obj->memory.total_memory, verbose),
+			   hwloc_memory_size_printf_unit(obj->memory.total_memory, verbose));
+  } else {
+    if (obj->memory.local_memory)
+      res = hwloc_snprintf(tmp, tmplen, "%s%lu%s", prefix,
+			   (unsigned long) hwloc_memory_size_printf_value(obj->memory.local_memory, verbose),
+			   hwloc_memory_size_printf_unit(obj->memory.local_memory, verbose));
+  }
+  if (res < 0)
+    return -1;
+  ret += res;
+  if (ret > 0)
+    prefix = separator;
+  if (res >= tmplen)
+    res = tmplen>0 ? tmplen - 1 : 0;
+  tmp += res;
+  tmplen -= res;
+
+  /* printf type-specific attributes */
+  res = 0;
+  switch (obj->type) {
+  case HWLOC_OBJ_CACHE:
+    if (verbose) {
+      char assoc[32];
+      if (obj->attr->cache.associativity == -1)
+	snprintf(assoc, sizeof(assoc), "%sfully-associative", separator);
+      else if (obj->attr->cache.associativity == 0)
+	*assoc = '\0';
+      else
+	snprintf(assoc, sizeof(assoc), "%sways=%d", separator, obj->attr->cache.associativity);
+      res = hwloc_snprintf(tmp, tmplen, "%ssize=%lu%s%slinesize=%u%s", prefix,
+			   (unsigned long) hwloc_memory_size_printf_value(obj->attr->cache.size, verbose),
+			   hwloc_memory_size_printf_unit(obj->attr->cache.size, verbose),
+			   separator, obj->attr->cache.linesize, assoc);
+    } else
+      res = hwloc_snprintf(tmp, tmplen, "%s%lu%s", prefix,
+			   (unsigned long) hwloc_memory_size_printf_value(obj->attr->cache.size, verbose),
+			   hwloc_memory_size_printf_unit(obj->attr->cache.size, verbose));
+    break;
+  case HWLOC_OBJ_BRIDGE:
+    if (verbose) {
+      char up[128], down[64];
+      /* upstream is PCI or HOST */
+      if (obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI) {
+        char linkspeed[64]= "";
+        if (obj->attr->pcidev.linkspeed)
+          snprintf(linkspeed, sizeof(linkspeed), "%slink=%.2fGB/s", separator, obj->attr->pcidev.linkspeed);
+	snprintf(up, sizeof(up), "busid=%04x:%02x:%02x.%01x%sid=%04x:%04x%sclass=%04x(%s)%s",
+		 obj->attr->pcidev.domain, obj->attr->pcidev.bus, obj->attr->pcidev.dev, obj->attr->pcidev.func, separator,
+		 obj->attr->pcidev.vendor_id, obj->attr->pcidev.device_id, separator,
+		 obj->attr->pcidev.class_id, hwloc_pci_class_string(obj->attr->pcidev.class_id), linkspeed);
+      } else
+        *up = '\0';
+      /* downstream is_PCI */
+      snprintf(down, sizeof(down), "buses=%04x:[%02x-%02x]",
+	       obj->attr->bridge.downstream.pci.domain, obj->attr->bridge.downstream.pci.secondary_bus, obj->attr->bridge.downstream.pci.subordinate_bus);
+      /* append after what was already printed instead of restarting at string */
+      if (*up)
+	res = hwloc_snprintf(tmp, tmplen, "%s%s%s%s", prefix, up, separator, down);
+      else
+	res = hwloc_snprintf(tmp, tmplen, "%s%s", prefix, down);
+    }
+    break;
+  case HWLOC_OBJ_PCI_DEVICE:
+    if (verbose) {
+      char linkspeed[64]= "";
+      char busid[16] = "[collapsed]";
+      if (obj->attr->pcidev.linkspeed)
+        snprintf(linkspeed, sizeof(linkspeed), "%slink=%.2fGB/s", separator, obj->attr->pcidev.linkspeed);
+      if (!hwloc_obj_get_info_by_name(obj, "lstopoCollapse"))
+	snprintf(busid, sizeof(busid), "%04x:%02x:%02x.%01x",
+		 obj->attr->pcidev.domain, obj->attr->pcidev.bus, obj->attr->pcidev.dev, obj->attr->pcidev.func);
+      res = hwloc_snprintf(tmp, tmplen, "%sbusid=%s%sclass=%04x(%s)%s",
+			   prefix, busid, separator,
+			   obj->attr->pcidev.class_id, hwloc_pci_class_string(obj->attr->pcidev.class_id), linkspeed);
+    }
+    break;
+  default:
+    break;
+  }
+  if (res < 0)
+    return -1;
+  ret += res;
+  if (ret > 0)
+    prefix = separator;
+  if (res >= tmplen)
+    res = tmplen>0 ? tmplen - 1 : 0;
+  tmp += res;
+  tmplen -= res;
+
+  /* printf infos */
+  if (verbose) {
+    unsigned i;
+    for(i=0; i<obj->infos_count; i++) {
+      if (!strcmp(obj->infos[i].name, "lstopoCollapse"))
+	continue;
+      if (strchr(obj->infos[i].value, ' '))
+	res = hwloc_snprintf(tmp, tmplen, "%s%s=\"%s\"",
+			     prefix, obj->infos[i].name, obj->infos[i].value);
+      else
+	res = hwloc_snprintf(tmp, tmplen, "%s%s=%s",
+			     prefix, obj->infos[i].name, obj->infos[i].value);
+      if (res < 0)
+        return -1;
+      ret += res;
+      if (res >= tmplen)
+        res = tmplen>0 ? tmplen - 1 : 0;
+      tmp += res;
+      tmplen -= res;
+      if (ret > 0)
+        prefix = separator;
+    }
+  }
+
+  return ret;
+}
+
+
+int
+hwloc_obj_snprintf(char *string, size_t size,
+    struct hwloc_topology *topology __hwloc_attribute_unused, struct hwloc_obj *l, const char *_indexprefix, int verbose)
+{
+  /* Formats l as "<type><prefix><os_index>", followed by "(<attributes>)"
+   * when the attribute printer reports a positive length.
+   */
+  char type_str[64];
+  char attr_str[128];
+  char index_str[12] = ""; /* stays empty when the object has no OS index */
+
+  if (l->os_index != (unsigned) -1)
+    hwloc_snprintf(index_str, sizeof(index_str), "%s%u", _indexprefix ? _indexprefix : "#", l->os_index);
+
+  hwloc_obj_type_snprintf(type_str, sizeof(type_str), l, verbose);
+
+  if (hwloc_obj_attr_snprintf(attr_str, sizeof(attr_str), l, " ", verbose) <= 0)
+    return hwloc_snprintf(string, size, "%s%s", type_str, index_str);
+
+  /* attributes were requested with a single space as separator */
+  return hwloc_snprintf(string, size, "%s%s(%s)", type_str, index_str, attr_str);
+}
+
+int hwloc_obj_cpuset_snprintf(char *str, size_t size, size_t nobj, struct hwloc_obj * const *objs)
+{
+  hwloc_bitmap_t set = hwloc_bitmap_alloc(); /* receives the union of the objects' cpusets */
+  int res;
+  size_t i; /* same type as nobj so that the index covers its whole range */
+  if (!set)
+    return -1; /* allocation failed, nothing was written to str */
+  hwloc_bitmap_zero(set);
+  for(i=0; i<nobj; i++)
+    if (objs[i]->cpuset) /* objects without a cpuset contribute nothing */
+      hwloc_bitmap_or(set, set, objs[i]->cpuset);
+  res = hwloc_bitmap_snprintf(str, size, set);
+  hwloc_bitmap_free(set);
+  return res;
+}
diff --git a/ext/hwloc/include/hwloc.h b/ext/hwloc/include/hwloc.h
new file mode 100644
index 0000000..6c8d203
--- /dev/null
+++ b/ext/hwloc/include/hwloc.h
@@ -0,0 +1,2206 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/*=====================================================================
+ *                 PLEASE GO READ THE DOCUMENTATION!
+ *         ------------------------------------------------
+ *               $tarball_directory/doc/doxygen-doc/
+ *                                or
+ *           http://www.open-mpi.org/projects/hwloc/doc/
+ *=====================================================================
+ *
+ * FAIR WARNING: Do NOT expect to be able to figure out all the
+ * subtleties of hwloc by simply reading function prototypes and
+ * constant descriptions here in this file.
+ *
+ * Hwloc has wonderful documentation in both PDF and HTML formats for
+ * your reading pleasure.  The formal documentation explains a LOT of
+ * hwloc-specific concepts, provides definitions, and discusses the
+ * "big picture" for many of the things that you'll find here in this
+ * header file.
+ *
+ * The PDF/HTML documentation was generated via Doxygen; much of what
+ * you'll see in there is also here in this file.  BUT THERE IS A LOT
+ * THAT IS IN THE PDF/HTML THAT IS ***NOT*** IN hwloc.h!
+ *
+ * There are entire paragraph-length descriptions, discussions, and
+ * pretty pictures to explain subtle corner cases, provide concrete
+ * examples, etc.
+ *
+ * Please, go read the documentation.  :-)
+ *
+ * Moreover there are several examples of hwloc use under doc/examples
+ * in the source tree.
+ *
+ *=====================================================================*/
+
+/** \file
+ * \brief The hwloc API.
+ *
+ * See hwloc/bitmap.h for bitmap specific macros.
+ * See hwloc/helper.h for high-level topology traversal helpers.
+ * See hwloc/inlines.h for the actual inline code of some functions below.
+ */
+
+#ifndef HWLOC_H
+#define HWLOC_H
+
+#include <hwloc/autogen/config.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+
+/*
+ * Symbol transforms
+ */
+#include <hwloc/rename.h>
+
+/*
+ * Bitmap definitions
+ */
+
+#include <hwloc/bitmap.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_api_version API version
+ * @{
+ */
+
+/** \brief Indicate at build time which hwloc API version is being used. */
+#define HWLOC_API_VERSION 0x00020000
+
+/** \brief Indicate at runtime which hwloc API version was used at build time. */
+HWLOC_DECLSPEC unsigned hwloc_get_api_version(void);
+
+/** \brief Current component and plugin ABI version (see hwloc/plugins.h) */
+#define HWLOC_COMPONENT_ABI 5
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_object_sets Object Sets (hwloc_cpuset_t and hwloc_nodeset_t)
+ *
+ * Hwloc uses bitmaps to represent two distinct kinds of object sets:
+ * CPU sets (::hwloc_cpuset_t) and NUMA node sets (::hwloc_nodeset_t).
+ * These types are both typedefs to a common back end type
+ * (::hwloc_bitmap_t), and therefore all the hwloc bitmap functions
+ * are applicable to both ::hwloc_cpuset_t and ::hwloc_nodeset_t (see
+ * \ref hwlocality_bitmap).
+ *
+ * The rationale for having two different types is that even though
+ * the actions one wants to perform on these types are the same (e.g.,
+ * enable and disable individual items in the set/mask), they're used
+ * in very different contexts: one for specifying which processors to
+ * use and one for specifying which NUMA nodes to use.  Hence, the
+ * name difference is really just to reflect the intent of where the
+ * type is used.
+ *
+ * @{
+ */
+
+/** \brief A CPU set is a bitmap whose bits are set according to CPU
+ * physical OS indexes.
+ *
+ * It may be consulted and modified with the bitmap API as any
+ * ::hwloc_bitmap_t (see hwloc/bitmap.h).
+ *
+ * Each bit may be converted into a PU object using
+ * hwloc_get_pu_obj_by_os_index().
+ */
+typedef hwloc_bitmap_t hwloc_cpuset_t;
+/** \brief A non-modifiable ::hwloc_cpuset_t. */
+typedef hwloc_const_bitmap_t hwloc_const_cpuset_t;
+
+/** \brief A node set is a bitmap whose bits are set according to NUMA
+ * memory node physical OS indexes.
+ *
+ * It may be consulted and modified with the bitmap API as any
+ * ::hwloc_bitmap_t (see hwloc/bitmap.h).
+ * Each bit may be converted into a NUMA node object using
+ * hwloc_get_numanode_obj_by_os_index().
+ *
+ * When binding memory on a system without any NUMA node,
+ * the single main memory bank is considered as NUMA node #0.
+ *
+ * See also \ref hwlocality_helper_nodeset_convert.
+ */
+typedef hwloc_bitmap_t hwloc_nodeset_t;
+/** \brief A non-modifiable ::hwloc_nodeset_t.
+ */
+typedef hwloc_const_bitmap_t hwloc_const_nodeset_t;
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_object_types Object Types
+ * @{
+ */
+
+/** \brief Type of topology object.
+ *
+ * \note Do not rely on the ordering or completeness of the values as new ones
+ * may be defined in the future!  If you need to compare types, use
+ * hwloc_compare_types() instead.
+ */
+typedef enum {
+    /* ***************************************************************
+       WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
+
+       If new enum values are added here, you MUST also go update the
+       obj_type_order[] and obj_order_type[] arrays in src/topology.c.
+
+       WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
+       *************************************************************** */
+
+  HWLOC_OBJ_SYSTEM,	/**< \brief Whole system (may be a cluster of machines).
+  			  * The whole system that is accessible to hwloc.
+			  * That may comprise several machines in SSI systems.
+			  */
+  HWLOC_OBJ_MACHINE,	/**< \brief Machine.
+			  * The typical root object type.
+			  * A set of processors and memory with cache
+			  * coherency.
+			  */
+  HWLOC_OBJ_NUMANODE,	/**< \brief NUMA node.
+			  * A set of processors around memory which the
+			  * processors can directly access.
+			  *
+			  * There is always at least one such object in the topology
+			  * even if the machine is not NUMA.
+			  */
+  HWLOC_OBJ_PACKAGE,	/**< \brief Physical package, what goes into a socket.
+			  * In the physical meaning, i.e. that you can add
+			  * or remove physically.
+			  */
+  HWLOC_OBJ_CACHE,	/**< \brief Cache.
+			  * Can be L1i, L1d, L2, L3, ...
+			  */
+  HWLOC_OBJ_CORE,	/**< \brief Core.
+			  * A computation unit (may be shared by several
+			  * logical processors).
+			  */
+  HWLOC_OBJ_PU,		/**< \brief Processing Unit, or (Logical) Processor.
+			  * An execution unit (may share a core with some
+			  * other logical processors, e.g. in the case of
+			  * an SMT core).
+			  *
+			  * Objects of this kind are always reported and can
+			  * thus be used as fallback when others are not.
+			  */
+
+  HWLOC_OBJ_GROUP,	/**< \brief Group objects.
+			  * Objects which do not fit in the above but are
+			  * detected by hwloc and are useful to take into
+			  * account for affinity. For instance, some operating systems
+			  * expose their arbitrary processors aggregation this
+			  * way.  And hwloc may insert such objects to group
+			  * NUMA nodes according to their distances.
+			  *
+			  * These objects are ignored when they do not bring
+			  * any structure.
+			  */
+
+  HWLOC_OBJ_MISC,	/**< \brief Miscellaneous objects.
+			  * Objects without particular meaning, that can e.g. be
+			  * added by the application for its own use, or by hwloc
+			  * for miscellaneous objects such as MemoryDevice.
+			  * These objects are not listed in the main children list,
+			  * but rather in the dedicated misc children list.
+			  * Misc objects may only have Misc objects as children,
+			  * and those are in the dedicated misc children list as well.
+			  * Misc objects have NULL CPU and node sets.
+			  */
+
+  HWLOC_OBJ_BRIDGE,	/**< \brief Bridge.
+			  * Any bridge that connects the host or an I/O bus,
+			  * to another I/O bus.
+			  * They are not added to the topology unless I/O discovery
+			  * is enabled with hwloc_topology_set_flags().
+			  * I/O objects are not listed in the main children list,
+			  * but rather in the dedicated io children list.
+			  * I/O objects have NULL CPU and node sets.
+			  */
+  HWLOC_OBJ_PCI_DEVICE,	/**< \brief PCI device.
+			  * They are not added to the topology unless I/O discovery
+			  * is enabled with hwloc_topology_set_flags().
+			  * I/O objects are not listed in the main children list,
+			  * but rather in the dedicated io children list.
+			  * I/O objects have NULL CPU and node sets.
+			  */
+  HWLOC_OBJ_OS_DEVICE,	/**< \brief Operating system device.
+			  * They are not added to the topology unless I/O discovery
+			  * is enabled with hwloc_topology_set_flags().
+			  * I/O objects are not listed in the main children list,
+			  * but rather in the dedicated io children list.
+			  * I/O objects have NULL CPU and node sets.
+			  */
+
+  HWLOC_OBJ_TYPE_MAX    /**< \private Sentinel value */
+
+    /* ***************************************************************
+       WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
+
+       If new enum values are added here, you MUST also go update the
+       obj_type_order[] and obj_order_type[] arrays in src/topology.c.
+
+       WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
+       *************************************************************** */
+} hwloc_obj_type_t;
+
+/** \brief Cache type (unified, data or instruction). */
+typedef enum hwloc_obj_cache_type_e {
+  HWLOC_OBJ_CACHE_UNIFIED,      /**< \brief Unified cache. */
+  HWLOC_OBJ_CACHE_DATA,         /**< \brief Data cache. */
+  HWLOC_OBJ_CACHE_INSTRUCTION   /**< \brief Instruction cache.
+				  * Only used when the HWLOC_TOPOLOGY_FLAG_ICACHES topology flag is set. */
+} hwloc_obj_cache_type_t;
+
+/** \brief Type of one side (upstream or downstream) of an I/O bridge. */
+typedef enum hwloc_obj_bridge_type_e {
+  HWLOC_OBJ_BRIDGE_HOST,	/**< \brief Host side of a bridge; only possible as the upstream side. */
+  HWLOC_OBJ_BRIDGE_PCI		/**< \brief PCI side of a bridge. */
+} hwloc_obj_bridge_type_t;
+
+/** \brief Type of an OS device. */
+typedef enum hwloc_obj_osdev_type_e {
+  HWLOC_OBJ_OSDEV_BLOCK,	/**< \brief Operating system block device.
+				  * For instance "sda" on Linux. */
+  HWLOC_OBJ_OSDEV_GPU,		/**< \brief Operating system GPU device.
+				  * For instance ":0.0" for a GL display,
+				  * "card0" for a Linux DRM device. */
+  HWLOC_OBJ_OSDEV_NETWORK,	/**< \brief Operating system network device.
+				  * For instance the "eth0" interface on Linux. */
+  HWLOC_OBJ_OSDEV_OPENFABRICS,	/**< \brief Operating system OpenFabrics device.
+				  * For instance the "mlx4_0" InfiniBand HCA device on Linux. */
+  HWLOC_OBJ_OSDEV_DMA,		/**< \brief Operating system DMA engine device.
+				  * For instance the "dma0chan0" DMA channel on Linux. */
+  HWLOC_OBJ_OSDEV_COPROC	/**< \brief Operating system co-processor device.
+				  * For instance "mic0" for a Xeon Phi (MIC) on Linux,
+				  * "opencl0d0" for an OpenCL device,
+				  * "cuda0" for a CUDA device. */
+} hwloc_obj_osdev_type_t;
+
+/** \brief Compare the depth of two object types
+ *
+ * Types shouldn't be compared as they are, since newer ones may be added in
+ * the future.  This function returns less than, equal to, or greater than zero
+ * respectively if \p type1 objects usually include \p type2 objects, are the
+ * same as \p type2 objects, or are included in \p type2 objects. If the types
+ * can not be compared (because neither is usually contained in the other),
+ * HWLOC_TYPE_UNORDERED is returned.  Object types containing CPUs can always
+ * be compared (usually, a system contains machines which contain nodes which
+ * contain packages which contain caches, which contain cores, which contain
+ * processors).
+ *
+ * \note HWLOC_OBJ_PU will always be the deepest.
+ * \note This does not mean that the actual topology will respect that order:
+ * e.g. as of today cores may also contain caches, and packages may also contain
+ * nodes. This is thus just to be seen as a fallback comparison method.
+ */
+HWLOC_DECLSPEC int hwloc_compare_types (hwloc_obj_type_t type1, hwloc_obj_type_t type2) __hwloc_attribute_const;
+
+enum hwloc_compare_types_e {
+    HWLOC_TYPE_UNORDERED = INT_MAX	/**< \brief Value returned by hwloc_compare_types() when the types cannot be compared. \hideinitializer */
+};
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_objects Object Structure and Attributes
+ * @{
+ */
+
+union hwloc_obj_attr_u;
+
+/** \brief Object memory */
+struct hwloc_obj_memory_s {
+  hwloc_uint64_t total_memory; /**< \brief Total memory (in bytes) in this object and its children */
+  hwloc_uint64_t local_memory; /**< \brief Local memory (in bytes) */
+
+  /** \brief Size of array \p page_types */
+  unsigned page_types_len;
+  /** \brief Array of local memory page types, \c NULL if no local memory and \p page_types_len is 0.
+   *
+   * The array is sorted by increasing \p size fields.
+   * It contains \p page_types_len slots.
+   */
+  struct hwloc_obj_memory_page_type_s {
+    hwloc_uint64_t size;	/**< \brief Size of pages */
+    hwloc_uint64_t count;	/**< \brief Number of pages of this size */
+  } * page_types;
+};
+
+/** \brief Structure of a topology object
+ *
+ * Applications must not modify any field except hwloc_obj.userdata.
+ */
+struct hwloc_obj {
+  /* physical information */
+  hwloc_obj_type_t type;		/**< \brief Type of object */
+  unsigned os_index;			/**< \brief OS-provided physical index number.
+					 * It is not guaranteed unique across the entire machine,
+					 * except for PUs and NUMA nodes.
+					 */
+  char *name;				/**< \brief Object description if any */
+
+  struct hwloc_obj_memory_s memory;	/**< \brief Memory attributes */
+
+  union hwloc_obj_attr_u *attr;		/**< \brief Object type-specific Attributes,
+					 * may be \c NULL if no attribute value was found */
+
+  /* global position */
+  unsigned depth;			/**< \brief Vertical index in the hierarchy.
+					 * If the topology is symmetric, this is equal to the
+					 * parent depth plus one, and also equal to the number
+					 * of parent/child links from the root object to here.
+					 */
+  unsigned logical_index;		/**< \brief Horizontal index in the whole list of similar objects,
+					 * hence guaranteed unique across the entire machine.
+					 * Could be a "cousin_rank" since it's the rank within the "cousin" list below
+					 */
+
+  /* cousins are all objects of the same type (and depth) across the entire topology */
+  struct hwloc_obj *next_cousin;	/**< \brief Next object of same type and depth */
+  struct hwloc_obj *prev_cousin;	/**< \brief Previous object of same type and depth */
+
+  /* children of the same parent are siblings, even if they may have different type and depth */
+  struct hwloc_obj *parent;		/**< \brief Parent, \c NULL if root (system object) */
+  unsigned sibling_rank;		/**< \brief Index in parent's \c children[] array. Or the index in parent's I/O or Misc children list. */
+  struct hwloc_obj *next_sibling;	/**< \brief Next object below the same parent */
+  struct hwloc_obj *prev_sibling;	/**< \brief Previous object below the same parent */
+
+  /* children array below this object (except I/O and Misc children) */
+  unsigned arity;			/**< \brief Number of children */
+  struct hwloc_obj **children;		/**< \brief Children, \c children[0 .. arity -1] */
+  struct hwloc_obj *first_child;	/**< \brief First child */
+  struct hwloc_obj *last_child;		/**< \brief Last child */
+
+  int symmetric_subtree;		/**< \brief Set if the subtree of normal objects below this object is symmetric,
+					  * which means all children and their children have identical subtrees.
+					  * I/O and Misc children are ignored.
+					  *
+					  * If set in the topology root object, lstopo may export the topology
+					  * as a synthetic string.
+					  */
+
+  /* specific list of I/O children */
+  unsigned io_arity;			/**< \brief Number of I/O children */
+  struct hwloc_obj *io_first_child;	/**< \brief First I/O child */
+
+  /* specific list of Misc children */
+  unsigned misc_arity;			/**< \brief Number of Misc children */
+  struct hwloc_obj *misc_first_child;	/**< \brief First Misc child */
+
+  /* cpusets and nodesets */
+  hwloc_cpuset_t cpuset;		/**< \brief CPUs covered by this object
+                                          *
+                                          * This is the set of CPUs for which there are PU objects in the topology
+                                          * under this object, i.e. which are known to be physically contained in this
+                                          * object and known how (the children path between this object and the PU
+                                          * objects).
+                                          *
+                                          * If the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM configuration flag is set,
+                                          * some of these CPUs may not be allowed for binding, see allowed_cpuset.
+                                          *
+					  * \note All objects have non-NULL CPU and node sets except Misc and I/O objects.
+					  *
+                                          * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+                                          */
+  hwloc_cpuset_t complete_cpuset;       /**< \brief The complete CPU set of logical processors of this object,
+                                          *
+                                          * This may include not only the same as the cpuset field, but also the CPUs for
+                                          * which topology information is unknown or incomplete, the offline CPUs, and
+                                          * the CPUs that are ignored when the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM flag
+                                          * is not set.
+                                          * Thus no corresponding PU object may be found in the topology, because the
+                                          * precise position is undefined. It is however known that it would be somewhere
+                                          * under this object.
+                                          *
+                                          * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+                                          */
+  hwloc_cpuset_t allowed_cpuset;        /**< \brief The CPU set of allowed logical processors
+                                          *
+                                          * This includes the CPUs contained in this object which are allowed for
+                                          * binding, i.e. passing them to the hwloc binding functions should not return
+                                          * permission errors.  This is usually restricted by administration rules.
+                                          *
+                                          * If the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM configuration flag is set,
+                                          * allowed_cpuset may be smaller than cpuset. Otherwise they are identical.
+                                          *
+                                          * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+                                          */
+
+  hwloc_nodeset_t nodeset;              /**< \brief NUMA nodes covered by this object or containing this object
+                                          *
+                                          * This is the set of NUMA nodes for which there are NODE objects in the
+                                          * topology under or above this object, i.e. which are known to be physically
+                                          * contained in this object or containing it and known how (the children path
+                                          * between this object and the NODE objects).
+                                          *
+                                          * In the end, these nodes are those that are close to the current object.
+                                          *
+                                          * If the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM configuration flag is set,
+                                          * some of these nodes may not be allowed for allocation, see allowed_nodeset.
+                                          *
+                                          * If there are no NUMA nodes in the machine, all the memory is close to this
+                                          * object, so only the first bit may be set in \p nodeset.
+                                          *
+					  * \note All objects have non-NULL CPU and node sets except Misc and I/O objects.
+					  *
+                                          * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+                                          */
+  hwloc_nodeset_t complete_nodeset;     /**< \brief The complete NUMA node set of this object,
+                                          *
+                                          * This may include not only the same as the nodeset field, but also the NUMA
+                                          * nodes for which topology information is unknown or incomplete, the offline
+                                          * nodes, and the nodes that are ignored when the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM
+                                          * flag is not set.
+                                          * Thus no corresponding NODE object may be found in the topology, because the
+                                          * precise position is undefined. It is however known that it would be
+                                          * somewhere under this object.
+                                          *
+                                          * If there are no NUMA nodes in the machine, all the memory is close to this
+                                          * object, so only the first bit is set in \p complete_nodeset.
+                                          *
+                                          * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+                                          */
+  hwloc_nodeset_t allowed_nodeset;      /**< \brief The set of allowed NUMA memory nodes
+                                          *
+                                          * This includes the NUMA memory nodes contained in this object which are
+                                          * allowed for memory allocation, i.e. passing them to NUMA node-directed
+                                          * memory allocation should not return permission errors. This is usually
+                                          * restricted by administration rules.
+                                          *
+                                          * If the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM configuration flag is set,
+                                          * allowed_nodeset may be smaller than nodeset. Otherwise they are identical.
+                                          *
+                                          * If there are no NUMA nodes in the machine, all the memory is close to this
+                                          * object, so only the first bit may be set in \p allowed_nodeset.
+                                          *
+                                          * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+                                          */
+
+  struct hwloc_distances_s **distances;	/**< \brief Distances between all objects at same depth below this object */
+  unsigned distances_count;
+
+  struct hwloc_obj_info_s *infos;	/**< \brief Array of stringified info type=name. */
+  unsigned infos_count;			/**< \brief Size of infos array. */
+
+  /* misc */
+  void *userdata;			/**< \brief Application-given private data pointer,
+					 * initialized to \c NULL, use it as you wish.
+					 * See hwloc_topology_set_userdata_export_callback() in hwloc/export.h
+					 * if you wish to export this field to XML. */
+};
+/**
+ * \brief Convenience typedef; a pointer to a struct hwloc_obj.
+ */
+typedef struct hwloc_obj * hwloc_obj_t;
+
+/** \brief Object type-specific Attributes
+ *
+ * One member per kind of object that carries extra attributes:
+ * \c cache, \c group, \c pcidev, \c bridge and \c osdev.
+ */
+union hwloc_obj_attr_u {
+  /** \brief Cache-specific Object Attributes */
+  struct hwloc_cache_attr_s {
+    hwloc_uint64_t size;		  /**< \brief Size of cache in bytes */
+    unsigned depth;			  /**< \brief Depth of cache (e.g., L1, L2, ...etc.) */
+    unsigned linesize;			  /**< \brief Cache-line size in bytes. 0 if unknown */
+    int associativity;			  /**< \brief Ways of associativity,
+    					    *  -1 if fully associative, 0 if unknown */
+    hwloc_obj_cache_type_t type;          /**< \brief Cache type */
+  } cache;
+  /** \brief Group-specific Object Attributes */
+  struct hwloc_group_attr_s {
+    unsigned depth;			  /**< \brief Depth of group object */
+  } group;
+  /** \brief PCI Device specific Object Attributes */
+  struct hwloc_pcidev_attr_s {
+    unsigned short domain;
+    unsigned char bus, dev, func;
+    unsigned short class_id;
+    unsigned short vendor_id, device_id, subvendor_id, subdevice_id;
+    unsigned char revision;
+    float linkspeed; /**< \brief Link speed, in GB/s */
+  } pcidev;
+  /** \brief Bridge specific Object Attributes
+   *
+   * \note (review) \c upstream_type and \c downstream_type presumably tell how
+   * to interpret the \c upstream and \c downstream unions; confirm against the
+   * hwloc_obj_bridge_type_t documentation.
+   */
+  struct hwloc_bridge_attr_s {
+    union {
+      struct hwloc_pcidev_attr_s pci;
+    } upstream;
+    hwloc_obj_bridge_type_t upstream_type;
+    union {
+      struct {
+	unsigned short domain;
+	unsigned char secondary_bus, subordinate_bus;
+      } pci;
+    } downstream;
+    hwloc_obj_bridge_type_t downstream_type;
+    unsigned depth;
+  } bridge;
+  /** \brief OS Device specific Object Attributes */
+  struct hwloc_osdev_attr_s {
+    hwloc_obj_osdev_type_t type;
+  } osdev;
+};
+
+/** \brief Distances between objects
+ *
+ * One object may contain a distance structure describing distances
+ * between all its descendants at a given relative depth. If the
+ * containing object is the root object of the topology, then the
+ * distances are available for all objects in the machine.
+ *
+ * If the \p latency pointer is not \c NULL, the pointed array contains
+ * memory latencies (non-zero values), see below.
+ *
+ * In the future, some other types of distances may be considered.
+ * In these cases, \p latency may be \c NULL.
+ */
+struct hwloc_distances_s {
+  unsigned relative_depth;	/**< \brief Relative depth of the considered objects
+				 * below the object containing this distance information. */
+  unsigned nbobjs;		/**< \brief Number of objects considered in the matrix.
+				 * It is the number of descendant objects at \p relative_depth
+				 * below the containing object.
+				 * It corresponds to the result of hwloc_get_nbobjs_inside_cpuset_by_depth(). */
+
+  float *latency;		/**< \brief Matrix of latencies between objects, stored as a one-dimensional array.
+				 * May be \c NULL if the distances considered here are not latencies.
+				 *
+				 * Unless defined by the user, this currently contains latencies
+				 * between NUMA nodes (as reported in the System Locality Distance Information Table
+				 * (SLIT) in the ACPI specification), which may or may not be accurate.
+				 * It corresponds to the latency for accessing the memory of one node
+				 * from a core in another node.
+				 *
+				 * Values are normalized to get 1.0 as the minimal value in the matrix.
+				 * Latency from i-th to j-th object is stored in slot i*nbobjs+j,
+				 * i.e. the array is an \p nbobjs by \p nbobjs matrix stored row by row.
+				 */
+  float latency_max;		/**< \brief The maximal value in the latency matrix. */
+  float latency_base;		/**< \brief The multiplier that should be applied to latency matrix
+				 * to retrieve the original OS-provided latencies.
+				 * Usually 10 on Linux since ACPI SLIT uses 10 for local latency.
+				 */
+};
+
+/** \brief Object info
+ *
+ * One name/value string pair. Objects hold an array of these pairs in their
+ * \c infos field (\c infos_count entries); see hwloc_obj_get_info_by_name()
+ * to look a value up by name and hwloc_obj_add_info() to append a pair.
+ */
+struct hwloc_obj_info_s {
+  char *name;	/**< \brief Info name */
+  char *value;	/**< \brief Info value */
+};
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_creation Topology Creation and Destruction
+ * @{
+ */
+
+struct hwloc_topology;
+/** \brief Topology context
+ *
+ * To be initialized with hwloc_topology_init() and built with hwloc_topology_load().
+ */
+typedef struct hwloc_topology * hwloc_topology_t;
+
+/** \brief Allocate a topology context.
+ *
+ * \param[out] topologyp is assigned a pointer to the new allocated context.
+ *
+ * \return 0 on success, -1 on error.
+ */
+HWLOC_DECLSPEC int hwloc_topology_init (hwloc_topology_t *topologyp);
+
+/** \brief Build the actual topology
+ *
+ * Build the actual topology once initialized with hwloc_topology_init() and
+ * tuned with \ref hwlocality_configuration and \ref hwlocality_setsource routines.
+ * No other routine may be called earlier using this topology context.
+ *
+ * \param topology is the topology to be loaded with objects.
+ *
+ * \return 0 on success, -1 on error.
+ *
+ * \note On failure, the topology is reinitialized. It should be either
+ * destroyed with hwloc_topology_destroy() or configured and loaded again.
+ *
+ * \note This function may be called only once per topology.
+ *
+ * \sa hwlocality_configuration and hwlocality_setsource
+ */
+HWLOC_DECLSPEC int hwloc_topology_load(hwloc_topology_t topology);
+
+/** \brief Terminate and free a topology context
+ *
+ * \param topology is the topology to be freed
+ */
+HWLOC_DECLSPEC void hwloc_topology_destroy (hwloc_topology_t topology);
+
+/** \brief Duplicate a topology.
+ *
+ * The entire topology structure as well as its objects
+ * are duplicated into a new one.
+ *
+ * This is useful for keeping a backup while modifying a topology.
+ */
+HWLOC_DECLSPEC int hwloc_topology_dup(hwloc_topology_t *newtopology, hwloc_topology_t oldtopology);
+
+/** \brief Run internal checks on a topology structure
+ *
+ * The program aborts if an inconsistency is detected in the given topology.
+ *
+ * \param topology is the topology to be checked
+ *
+ * \note This routine is only useful to developers.
+ *
+ * \note The input topology should have been previously loaded with
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC void hwloc_topology_check(hwloc_topology_t topology);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_levels Object levels, depths and types
+ * @{
+ *
+ * Be sure to see the figure in \ref termsanddefs that shows a
+ * complete topology tree, including depths, child/sibling/cousin
+ * relationships, and an example of an asymmetric topology where one
+ * package has fewer caches than its peers.
+ */
+
+/** \brief Get the depth of the hierarchical tree of objects.
+ *
+ * This is the depth of HWLOC_OBJ_PU objects plus one.
+ */
+HWLOC_DECLSPEC unsigned hwloc_topology_get_depth(hwloc_topology_t __hwloc_restrict topology) __hwloc_attribute_pure;
+
+/** \brief Returns the depth of objects of type \p type.
+ *
+ * If no object of this type is present on the underlying architecture, or if
+ * the OS doesn't provide this kind of information, the function returns
+ * HWLOC_TYPE_DEPTH_UNKNOWN.
+ *
+ * If type is absent but a similar type is acceptable, see also
+ * hwloc_get_type_or_below_depth() and hwloc_get_type_or_above_depth().
+ *
+ * If some objects of the given type exist in different levels,
+ * for instance L1 and L2 caches, or L1i and L1d caches,
+ * the function returns HWLOC_TYPE_DEPTH_MULTIPLE.
+ * See hwloc_get_cache_type_depth() in hwloc/helper.h to better handle this
+ * case.
+ *
+ * If an I/O object type is given, the function returns a virtual value
+ * because I/O objects are stored in special levels that are not CPU-related.
+ * This virtual depth may be passed to other hwloc functions such as
+ * hwloc_get_obj_by_depth() but it should not be considered as an actual
+ * depth by the application. In particular, it should not be compared with
+ * any other object depth or with the entire topology depth.
+ */
+HWLOC_DECLSPEC int hwloc_get_type_depth (hwloc_topology_t topology, hwloc_obj_type_t type);
+
+enum hwloc_get_type_depth_e { /* Special (negative) depth values that hwloc_get_type_depth() may return. */
+    HWLOC_TYPE_DEPTH_UNKNOWN = -1,    /**< \brief No object of given type exists in the topology. \hideinitializer */
+    HWLOC_TYPE_DEPTH_MULTIPLE = -2,   /**< \brief Objects of given type exist at different depths in the topology. \hideinitializer */
+    HWLOC_TYPE_DEPTH_BRIDGE = -3,     /**< \brief Virtual depth for bridge object level. \hideinitializer */
+    HWLOC_TYPE_DEPTH_PCI_DEVICE = -4, /**< \brief Virtual depth for PCI device object level. \hideinitializer */
+    HWLOC_TYPE_DEPTH_OS_DEVICE = -5,  /**< \brief Virtual depth for software device object level. \hideinitializer */
+    HWLOC_TYPE_DEPTH_MISC = -6        /**< \brief Virtual depth for Misc object. \hideinitializer */
+};
+
+/** \brief Returns the depth of objects of type \p type or below
+ *
+ * If no object of this type is present on the underlying architecture, the
+ * function returns the depth of the first "present" object typically found
+ * inside \p type.
+ *
+ * If some objects of the given type exist in different levels, for instance
+ * L1 and L2 caches, the function returns HWLOC_TYPE_DEPTH_MULTIPLE.
+ */
+static __hwloc_inline int
+hwloc_get_type_or_below_depth (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
+
+/** \brief Returns the depth of objects of type \p type or above
+ *
+ * If no object of this type is present on the underlying architecture, the
+ * function returns the depth of the first "present" object typically
+ * containing \p type.
+ *
+ * If some objects of the given type exist in different levels, for instance
+ * L1 and L2 caches, the function returns HWLOC_TYPE_DEPTH_MULTIPLE.
+ */
+static __hwloc_inline int
+hwloc_get_type_or_above_depth (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
+
+/** \brief Returns the type of objects at depth \p depth.
+ *
+ * \return -1 if depth \p depth does not exist.
+ */
+HWLOC_DECLSPEC hwloc_obj_type_t hwloc_get_depth_type (hwloc_topology_t topology, unsigned depth) __hwloc_attribute_pure;
+
+/** \brief Returns the width of level at depth \p depth.
+ */
+HWLOC_DECLSPEC unsigned hwloc_get_nbobjs_by_depth (hwloc_topology_t topology, unsigned depth) __hwloc_attribute_pure;
+
+/** \brief Returns the width of level type \p type
+ *
+ * If no object for that type exists, 0 is returned.
+ * If there are several levels with objects of that type, -1 is returned.
+ */
+static __hwloc_inline int
+hwloc_get_nbobjs_by_type (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
+
+/** \brief Returns the top-object of the topology-tree.
+ *
+ * Its type is typically ::HWLOC_OBJ_MACHINE but it could be different
+ * for complex topologies.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_root_obj (hwloc_topology_t topology) __hwloc_attribute_pure;
+
+/** \brief Returns the topology object at logical index \p idx from depth \p depth */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_get_obj_by_depth (hwloc_topology_t topology, unsigned depth, unsigned idx) __hwloc_attribute_pure;
+
+/** \brief Returns the topology object at logical index \p idx with type \p type
+ *
+ * If no object for that type exists, \c NULL is returned.
+ * If there are several levels with objects of that type, \c NULL is returned
+ * and the caller may fall back to hwloc_get_obj_by_depth().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type, unsigned idx) __hwloc_attribute_pure;
+
+/** \brief Returns the next object at depth \p depth.
+ *
+ * If \p prev is \c NULL, return the first object at depth \p depth.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_depth (hwloc_topology_t topology, unsigned depth, hwloc_obj_t prev);
+
+/** \brief Returns the next object of type \p type.
+ *
+ * If \p prev is \c NULL, return the first object of type \p type.  If
+ * there are multiple depths or no depth for the given type, return \c NULL
+ * and let the caller fall back to hwloc_get_next_obj_by_depth().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type,
+			    hwloc_obj_t prev);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_object_strings Manipulating Object Type, Sets and Attributes as Strings
+ * @{
+ */
+
+/** \brief Return a stringified topology object type */
+HWLOC_DECLSPEC const char * hwloc_obj_type_string (hwloc_obj_type_t type) __hwloc_attribute_const;
+
+/** \brief Return an object type and attributes from a type string.
+ *
+ * Convert strings such as "Package" or "Cache" into the corresponding types.
+ * Matching is case-insensitive, and only the first letters are actually
+ * required to match.
+ *
+ * Types that have specific attributes, for instance caches and groups,
+ * may be returned in \p depthattrp and \p typeattrp. They are ignored
+ * when these pointers are \c NULL.
+ *
+ * For instance "L2i" or "L2iCache" would return
+ * type HWLOC_OBJ_CACHE in \p typep, 2 in \p depthattrp,
+ * and HWLOC_OBJ_CACHE_TYPE_INSTRUCTION in \p typeattrp
+ * (this last pointer should point to a hwloc_obj_cache_type_t).
+ * "Group3" would return type HWLOC_OBJ_GROUP type and 3 in \p depthattrp.
+ * Attributes that are not specified in the string (for instance "Group"
+ * without a depth, or "L2Cache" without a cache type) are set to -1.
+ *
+ * \p typeattrp is only filled if the size specified in \p typeattrsize
+ * is large enough. It is currently only used for caches, and the required
+ * size is at least the size of hwloc_obj_cache_type_t.
+ *
+ * \return 0 if a type was correctly identified, otherwise -1.
+ *
+ * \note This is an extended version of the now deprecated hwloc_obj_type_of_string()
+ */
+HWLOC_DECLSPEC int hwloc_obj_type_sscanf(const char *string,
+					 hwloc_obj_type_t *typep,
+					 int *depthattrp,
+					 void *typeattrp, size_t typeattrsize);
+
+/** \brief Stringify the type of a given topology object into a human-readable form.
+ *
+ * It differs from hwloc_obj_type_string() because it prints type attributes such
+ * as cache depth and type.
+ *
+ * If \p size is 0, \p string may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_obj_type_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t obj,
+				   int verbose);
+
+/** \brief Stringify the attributes of a given topology object into a human-readable form.
+ *
+ * Attribute values are separated by \p separator.
+ *
+ * Only the major attributes are printed in non-verbose mode.
+ *
+ * If \p size is 0, \p string may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_obj_attr_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t obj, const char * __hwloc_restrict separator,
+				   int verbose);
+
+/** \brief Stringify the cpuset containing a set of objects.
+ *
+ * If \p size is 0, \p string may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_obj_cpuset_snprintf(char * __hwloc_restrict str, size_t size, size_t nobj, const hwloc_obj_t * __hwloc_restrict objs);
+
+/** \brief Search the given key name in object infos and return the corresponding value.
+ *
+ * If multiple keys match the given name, only the first one is returned.
+ *
+ * \return \c NULL if no such key exists.
+ */
+static __hwloc_inline const char *
+hwloc_obj_get_info_by_name(hwloc_obj_t obj, const char *name) __hwloc_attribute_pure;
+
+/** \brief Add the given info name and value pair to the given object.
+ *
+ * The info is appended to the existing info array even if another key
+ * with the same name already exists.
+ *
+ * The input strings are copied before being added in the object infos.
+ *
+ * \note This function may be used to enforce object colors in the lstopo
+ * graphical output by using "lstopoStyle" as a name and "Background=#rrggbb"
+ * as a value. See CUSTOM COLORS in the lstopo(1) manpage for details.
+ *
+ * \note If \p value contains some non-printable characters, they will
+ * be dropped when exporting to XML, see hwloc_topology_export_xml() in hwloc/export.h.
+ */
+HWLOC_DECLSPEC void hwloc_obj_add_info(hwloc_obj_t obj, const char *name, const char *value);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_cpubinding CPU binding
+ *
+ * It is often useful to call hwloc_bitmap_singlify() first so that a single CPU
+ * remains in the set. This way, the process will not even migrate between
+ * different CPUs inside the given set.
+ * Some operating systems also only support that kind of binding.
+ *
+ * Some operating systems do not provide all hwloc-supported
+ * mechanisms to bind processes, threads, etc.
+ * hwloc_topology_get_support() may be used to query about the actual CPU
+ * binding support in the currently used operating system.
+ *
+ * When the requested binding operation is not available and the
+ * ::HWLOC_CPUBIND_STRICT flag was passed, the function returns -1.
+ * \p errno is set to \c ENOSYS when it is not possible to bind the requested kind of object
+ * processes/threads. errno is set to \c EXDEV when the requested cpuset
+ * can not be enforced (e.g. some systems only allow one CPU, and some
+ * other systems only allow one NUMA node).
+ *
+ * If ::HWLOC_CPUBIND_STRICT was not passed, the function may fail as well,
+ * or the operating system may use a slightly different operation
+ * (with side-effects, smaller binding set, etc.)
+ * when the requested operation is not exactly supported.
+ *
+ * The most portable version that should be preferred over the others,
+ * whenever possible, is the following one which just binds the current program,
+ * assuming it is single-threaded:
+ *
+ * \code
+ * hwloc_set_cpubind(topology, set, 0),
+ * \endcode
+ *
+ * If the program may be multithreaded, the following one should be preferred
+ * to only bind the current thread:
+ *
+ * \code
+ * hwloc_set_cpubind(topology, set, HWLOC_CPUBIND_THREAD),
+ * \endcode
+ *
+ * \sa Some example codes are available under doc/examples/ in the source tree.
+ *
+ * \note To unbind, just call the binding function with either a full cpuset or
+ * a cpuset equal to the system cpuset.
+ *
+ * \note On some operating systems, CPU binding may have effects on memory binding, see
+ * ::HWLOC_CPUBIND_NOMEMBIND
+ *
+ * \note Running lstopo --top or hwloc-ps can be a very convenient tool to check
+ * how binding actually happened.
+ * @{
+ */
+
+/** \brief Process/Thread binding flags.
+ *
+ * These bit flags can be used to refine the binding policy.
+ *
+ * The default (0) is to bind the current process, assumed to be
+ * single-threaded, in a non-strict way.  This is the most portable
+ * way to bind as all operating systems usually provide it.
+ *
+ * \note Not all systems support all kinds of binding.  See the
+ * "Detailed Description" section of \ref hwlocality_cpubinding for a
+ * description of errors that can occur.
+ */
+typedef enum {
+  /** \brief Bind all threads of the current (possibly) multithreaded process.
+   * \hideinitializer */
+  HWLOC_CPUBIND_PROCESS = (1<<0),
+
+  /** \brief Bind current thread of current process.
+   * \hideinitializer */
+  HWLOC_CPUBIND_THREAD = (1<<1),
+
+  /** \brief Request for strict binding from the OS.
+   *
+   * By default, when the designated CPUs are all busy while other
+   * CPUs are idle, operating systems may execute the thread/process
+   * on those other CPUs instead of the designated CPUs, to let them
+   * progress anyway.  Strict binding means that the thread/process
+   * will _never_ execute on other CPUs than the designated CPUs, even
+   * when those are busy with other tasks and other CPUs are idle.
+   *
+   * \note Depending on the operating system, strict binding may not
+   * be possible (e.g., the OS does not implement it) or not allowed
+   * (e.g., for administrative reasons), and the function will fail
+   * in that case.
+   *
+   * When retrieving the binding of a process, this flag checks
+   * whether all its threads actually have the same binding. If the
+   * flag is not given, the binding of each thread will be
+   * accumulated.
+   *
+   * \note This flag is meaningless when retrieving the binding of a
+   * thread.
+   * \hideinitializer
+   */
+  HWLOC_CPUBIND_STRICT = (1<<2),
+
+  /** \brief Avoid any effect on memory binding
+   *
+   * On some operating systems, some CPU binding function would also
+   * bind the memory on the corresponding NUMA node.  It is often not
+   * a problem for the application, but if it is, setting this flag
+   * will make hwloc avoid using OS functions that would also bind
+   * memory.  This will however reduce the support of CPU bindings,
+   * i.e. potentially return -1 with errno set to ENOSYS in some
+   * cases.
+   *
+   * This flag is only meaningful when used with functions that set
+   * the CPU binding.  It is ignored when used with functions that get
+   * CPU binding information.
+   * \hideinitializer
+   */
+  HWLOC_CPUBIND_NOMEMBIND = (1<<3)
+} hwloc_cpubind_flags_t;
+
+/** \brief Bind current process or thread on cpus given in physical bitmap \p set.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_cpubind(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags);
+
+/** \brief Get current process or thread binding.
+ *
+ * Writes into \p set the physical cpuset which the process or thread (according to \e
+ * flags) was last bound to.
+ */
+HWLOC_DECLSPEC int hwloc_get_cpubind(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+
+/** \brief Bind a process \p pid on cpus given in physical bitmap \p set.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note As a special case on Linux, if a tid (thread ID) is supplied
+ * instead of a pid (process ID) and HWLOC_CPUBIND_THREAD is passed in flags,
+ * the binding is applied to that specific thread.
+ *
+ * \note On non-Linux systems, HWLOC_CPUBIND_THREAD can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t set, int flags);
+
+/** \brief Get the current physical binding of process \p pid.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note As a special case on Linux, if a tid (thread ID) is supplied
+ * instead of a pid (process ID) and HWLOC_CPUBIND_THREAD is passed in flags,
+ * the binding for that specific thread is returned.
+ *
+ * \note On non-Linux systems, HWLOC_CPUBIND_THREAD can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
+
+#ifdef hwloc_thread_t
+/** \brief Bind a thread \p thread on cpus given in physical bitmap \p set.
+ *
+ * \note \p hwloc_thread_t is \p pthread_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note HWLOC_CPUBIND_PROCESS can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_set_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t thread, hwloc_const_cpuset_t set, int flags);
+#endif
+
+#ifdef hwloc_thread_t
+/** \brief Get the current physical binding of thread \p thread.
+ *
+ * \note \p hwloc_thread_t is \p pthread_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note HWLOC_CPUBIND_PROCESS can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t thread, hwloc_cpuset_t set, int flags);
+#endif
+
+/** \brief Get the last physical CPU where the current process or thread ran.
+ *
+ * The operating system may move some tasks from one processor
+ * to another at any time according to their binding,
+ * so this function may return something that is already
+ * outdated.
+ *
+ * \p flags can include either HWLOC_CPUBIND_PROCESS or HWLOC_CPUBIND_THREAD to
+ * specify whether the query should be for the whole process (union of all CPUs
+ * on which all threads are running), or only the current thread. If the
+ * process is single-threaded, flags can be set to zero to let hwloc use
+ * whichever method is available on the underlying OS.
+ */
+HWLOC_DECLSPEC int hwloc_get_last_cpu_location(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+
+/** \brief Get the last physical CPU where a process ran.
+ *
+ * The operating system may move some tasks from one processor
+ * to another at any time according to their binding,
+ * so this function may return something that is already
+ * outdated.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note As a special case on Linux, if a tid (thread ID) is supplied
+ * instead of a pid (process ID) and HWLOC_CPUBIND_THREAD is passed in flags,
+ * the last CPU location of that specific thread is returned.
+ *
+ * \note On non-Linux systems, HWLOC_CPUBIND_THREAD can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_last_cpu_location(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_membinding Memory binding
+ *
+ * Memory binding can be done three ways:
+ *
+ * - explicit memory allocation thanks to hwloc_alloc_membind() and friends:
+ *   the binding will have effect on the memory allocated by these functions.
+ * - implicit memory binding through binding policy: hwloc_set_membind() and
+ *   friends only define the current policy of the process, which will be
+ *   applied to the subsequent calls to malloc() and friends.
+ * - migration of existing memory ranges, thanks to hwloc_set_area_membind()
+ *   and friends, which move already-allocated data.
+ *
+ * Not all operating systems support all three ways.
+ * hwloc_topology_get_support() may be used to query about the actual memory
+ * binding support in the currently used operating system.
+ *
+ * When the requested binding operation is not available and the
+ * ::HWLOC_MEMBIND_STRICT flag was passed, the function returns -1.
+ * \p errno will be set to \c ENOSYS when the system does not support
+ * the specified action or policy
+ * (e.g., some systems only allow binding memory on a per-thread
+ * basis, whereas other systems only allow binding memory for all
+ * threads in a process).
+ * \p errno will be set to \c EXDEV when the requested cpuset can not be enforced
+ * (e.g., some systems only allow binding memory to a single NUMA node).
+ *
+ * If ::HWLOC_MEMBIND_STRICT was not passed, the function may fail as well,
+ * or the operating system may use a slightly different operation
+ * (with side-effects, smaller binding set, etc.)
+ * when the requested operation is not exactly supported.
+ *
+ * The most portable form that should be preferred over the others
+ * whenever possible is as follows.
+ * It allocates some memory hopefully bound to the specified set.
+ * To do so, hwloc will possibly have to change the current memory
+ * binding policy in order to actually get the memory bound, if the OS
+ * does not provide any other way to simply allocate bound memory
+ * without changing the policy for all allocations. That is the
+ * difference with hwloc_alloc_membind(), which will never change the
+ * current memory binding policy.
+ *
+ * \code
+ * hwloc_alloc_membind_policy(topology, size, set,
+ *                            HWLOC_MEMBIND_BIND, 0);
+ * \endcode
+ *
+ * Each hwloc memory binding function is available in two forms: one
+ * that takes a CPU set argument and another that takes a NUMA memory
+ * node set argument (see \ref hwlocality_object_sets and \ref
+ * hwlocality_bitmap for a discussion of CPU sets and NUMA memory node
+ * sets).  The names of the latter form end with _nodeset.  It is also
+ * possible to convert between CPU set and node set using
+ * hwloc_cpuset_to_nodeset() or hwloc_cpuset_from_nodeset().
+ *
+ * \sa Some example codes are available under doc/examples/ in the source tree.
+ *
+ * \note On some operating systems, memory binding affects the CPU
+ * binding; see ::HWLOC_MEMBIND_NOCPUBIND
+ * @{
+ */
+
+/** \brief Memory binding policy.
+ *
+ * These constants can be used to choose the binding policy.  Only one policy can
+ * be used at a time (i.e., the values cannot be OR'ed together).
+ *
+ * Not all systems support all kinds of binding.
+ * hwloc_topology_get_support() may be used to query about the actual memory
+ * binding policy support in the currently used operating system.
+ * See the "Detailed Description" section of \ref hwlocality_membinding
+ * for a description of errors that can occur.
+ */
+typedef enum {
+  /** \brief Reset the memory allocation policy to the system default.
+   * Depending on the operating system, this may correspond to
+   * HWLOC_MEMBIND_FIRSTTOUCH (Linux),
+   * or HWLOC_MEMBIND_BIND (AIX, HP-UX, OSF, Solaris, Windows).
+   * \hideinitializer */
+  HWLOC_MEMBIND_DEFAULT =	0,
+
+  /** \brief Allocate memory
+   * but do not immediately bind it to a specific locality. Instead,
+   * each page in the allocation is bound only when it is first
+   * touched. Each page is individually bound to the local NUMA node of
+   * the first thread that touches it. If there is not enough memory
+   * on the node, allocation may be done in the specified cpuset
+   * before allocating on other nodes.
+   * \hideinitializer */
+  HWLOC_MEMBIND_FIRSTTOUCH =	1,
+
+  /** \brief Allocate memory on the specified nodes.
+   * \hideinitializer */
+  HWLOC_MEMBIND_BIND =		2,
+
+  /** \brief Allocate memory on the given nodes in an interleaved
+   * / round-robin manner.  The precise layout of the memory across
+   * multiple NUMA nodes is OS/system specific. Interleaving can be
+   * useful when threads distributed across the specified NUMA nodes
+   * will all be accessing the whole memory range concurrently, since
+   * the interleave will then balance the memory references.
+   * \hideinitializer */
+  HWLOC_MEMBIND_INTERLEAVE =	3,
+
+  /** \brief Replicate memory on the given nodes; reads from this
+   * memory will attempt to be serviced from the NUMA node local to
+   * the reading thread. Replicating can be useful when multiple
+   * threads from the specified NUMA nodes will be sharing the same
+   * read-only data.
+   *
+   * This policy can only be used with existing memory allocations
+   * (i.e., the hwloc_set_*membind*() functions); it cannot be used
+   * with functions that allocate new memory (i.e., the hwloc_alloc*()
+   * functions).
+   * \hideinitializer */
+  HWLOC_MEMBIND_REPLICATE =	4,
+
+  /** \brief For each page bound with this policy, the next time
+   * it is touched (and the next time only), it is moved from its current
+   * location to the local NUMA node of the thread where the memory
+   * reference occurred (if it needs to be moved at all).
+   * \hideinitializer */
+  HWLOC_MEMBIND_NEXTTOUCH =	5,
+
+  /** \brief Returned by get_membind() functions when multiple
+   * threads or parts of a memory area have differing memory binding
+   * policies.
+   * \hideinitializer */
+  HWLOC_MEMBIND_MIXED = -1
+} hwloc_membind_policy_t;
+
+/** \brief Memory binding flags.
+ *
+ * These flags can be used to refine the binding policy.
+ * All flags can be logically OR'ed together with the exception of
+ * ::HWLOC_MEMBIND_PROCESS and ::HWLOC_MEMBIND_THREAD;
+ * these two flags are mutually exclusive.
+ *
+ * Not all systems support all kinds of binding.
+ * hwloc_topology_get_support() may be used to query about the actual memory
+ * binding support in the currently used operating system.
+ * See the "Detailed Description" section of \ref hwlocality_membinding
+ * for a description of errors that can occur.
+ */
+typedef enum {
+  /** \brief Set policy for all threads of the specified (possibly
+   * multithreaded) process.  This flag is mutually exclusive with
+   * ::HWLOC_MEMBIND_THREAD.
+   * \hideinitializer */
+  HWLOC_MEMBIND_PROCESS =       (1<<0),
+
+  /** \brief Set policy for a specific thread of the current process.
+   * This flag is mutually exclusive with ::HWLOC_MEMBIND_PROCESS.
+   * \hideinitializer */
+  HWLOC_MEMBIND_THREAD =        (1<<1),
+
+  /** \brief Request strict binding from the OS.  The function will fail if
+   * the binding can not be guaranteed / completely enforced.
+   *
+   * This flag has slightly different meanings depending on which
+   * function it is used with.
+   * \hideinitializer  */
+  HWLOC_MEMBIND_STRICT =        (1<<2),
+
+  /** \brief Migrate existing allocated memory.  If the memory cannot
+   * be migrated and the ::HWLOC_MEMBIND_STRICT flag is passed, an error
+   * will be returned.
+   * \hideinitializer  */
+  HWLOC_MEMBIND_MIGRATE =       (1<<3),
+
+  /** \brief Avoid any effect on CPU binding.
+   *
+   * On some operating systems, some underlying memory binding
+   * functions also bind the application to the corresponding CPU(s).
+   * Using this flag will cause hwloc to avoid using OS functions that
+   * could potentially affect CPU bindings.  Note, however, that using
+   * NOCPUBIND may reduce hwloc's overall memory binding
+   * support. Specifically: some of hwloc's memory binding functions
+   * may fail with errno set to ENOSYS when used with NOCPUBIND.
+   * \hideinitializer
+   */
+  HWLOC_MEMBIND_NOCPUBIND =     (1<<4)
+} hwloc_membind_flags_t;
+
+/** \brief Set the default memory binding policy of the current
+ * process or thread to prefer the NUMA node(s) specified by physical \p nodeset
+ *
+ * If neither ::HWLOC_MEMBIND_PROCESS nor ::HWLOC_MEMBIND_THREAD is
+ * specified, the current process is assumed to be single-threaded.
+ * This is the most portable form as it permits hwloc to use either
+ * process-based OS functions or thread-based OS functions, depending
+ * on which are available.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Set the default memory binding policy of the current
+ * process or thread to prefer the NUMA node(s) near the specified physical \p
+ * cpuset
+ *
+ * If neither ::HWLOC_MEMBIND_PROCESS nor ::HWLOC_MEMBIND_THREAD is
+ * specified, the current process is assumed to be single-threaded.
+ * This is the most portable form as it permits hwloc to use either
+ * process-based OS functions or thread-based OS functions, depending
+ * on which are available.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_membind(hwloc_topology_t topology, hwloc_const_cpuset_t cpuset, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * current process or thread.
+ *
+ * This function has two output parameters: \p nodeset and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the current memory binding policies and nodesets in
+ * the queried target.
+ *
+ * Passing the ::HWLOC_MEMBIND_PROCESS flag specifies that the query
+ * target is the current policies and nodesets for all the threads in
+ * the current process.  Passing ::HWLOC_MEMBIND_THREAD specifies that
+ * the query target is the current policy and nodeset for only the
+ * thread invoking this function.
+ *
+ * If neither of these flags is passed (which is the most portable
+ * method), the process is assumed to be single threaded.  This allows
+ * hwloc to use either process-based OS functions or thread-based OS
+ * functions, depending on which are available.
+ *
+ * ::HWLOC_MEMBIND_STRICT is only meaningful when ::HWLOC_MEMBIND_PROCESS
+ * is also specified.  In this case, hwloc will check the default
+ * memory policies and nodesets for all threads in the process.  If
+ * they are not identical, -1 is returned and errno is set to EXDEV.
+ * If they are identical, the values are returned in \p nodeset and \p
+ * policy.
+ *
+ * Otherwise, if ::HWLOC_MEMBIND_PROCESS is specified (and
+ * ::HWLOC_MEMBIND_STRICT is \em not specified), \p nodeset is set to
+ * the logical OR of all threads' default nodeset.  If all threads'
+ * default policies are the same, \p policy is set to that policy.  If
+ * they are different, \p policy is set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * In the ::HWLOC_MEMBIND_THREAD case (or when neither
+ * ::HWLOC_MEMBIND_PROCESS nor ::HWLOC_MEMBIND_THREAD is specified), there
+ * is only one nodeset and policy; they are returned in \p nodeset and
+ * \p policy, respectively.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ */
+HWLOC_DECLSPEC int hwloc_get_membind_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * current process or thread (the locality is returned in \p cpuset as
+ * CPUs near the locality's actual NUMA node(s)).
+ *
+ * This function has two output parameters: \p cpuset and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the current memory binding policies and nodesets in
+ * the queried target.
+ *
+ * Passing the ::HWLOC_MEMBIND_PROCESS flag specifies that the query
+ * target is the current policies and nodesets for all the threads in
+ * the current process.  Passing ::HWLOC_MEMBIND_THREAD specifies that
+ * the query target is the current policy and nodeset for only the
+ * thread invoking this function.
+ *
+ * If neither of these flags is passed (which is the most portable
+ * method), the process is assumed to be single threaded.  This allows
+ * hwloc to use either process-based OS functions or thread-based OS
+ * functions, depending on which are available.
+ *
+ * ::HWLOC_MEMBIND_STRICT is only meaningful when ::HWLOC_MEMBIND_PROCESS
+ * is also specified.  In this case, hwloc will check the default
+ * memory policies and nodesets for all threads in the process.  If
+ * they are not identical, -1 is returned and errno is set to EXDEV.
+ * If they are identical, the policy is returned in \p policy.  \p
+ * cpuset is set to the union of CPUs near the NUMA node(s) in the
+ * nodeset.
+ *
+ * Otherwise, if ::HWLOC_MEMBIND_PROCESS is specified (and
+ * ::HWLOC_MEMBIND_STRICT is \em not specified), the default nodeset
+ * from each thread is logically OR'ed together.  \p cpuset is set to
+ * the union of CPUs near the NUMA node(s) in the resulting nodeset.
+ * If all threads' default policies are the same, \p policy is set to
+ * that policy.  If they are different, \p policy is set to
+ * ::HWLOC_MEMBIND_MIXED.
+ *
+ * In the ::HWLOC_MEMBIND_THREAD case (or when neither
+ * ::HWLOC_MEMBIND_PROCESS nor ::HWLOC_MEMBIND_THREAD is specified), there
+ * is only one nodeset and policy.  The policy is returned in \p
+ * policy; \p cpuset is set to the union of CPUs near the NUMA node(s)
+ * in that nodeset.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ */
+HWLOC_DECLSPEC int hwloc_get_membind(hwloc_topology_t topology, hwloc_cpuset_t cpuset, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Set the default memory binding policy of the specified
+ * process to prefer the NUMA node(s) specified by physical \p nodeset
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ */
+HWLOC_DECLSPEC int hwloc_set_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Set the default memory binding policy of the specified
+ * process to prefer the NUMA node(s) near the specified physical \p cpuset
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ */
+HWLOC_DECLSPEC int hwloc_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t cpuset, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * specified process.
+ *
+ * This function has two output parameters: \p nodeset and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the current memory binding policies and nodesets in
+ * the queried target.
+ *
+ * Passing the ::HWLOC_MEMBIND_PROCESS flag specifies that the query
+ * target is the current policies and nodesets for all the threads in
+ * the specified process.  If ::HWLOC_MEMBIND_PROCESS is not specified
+ * (which is the most portable method), the process is assumed to be
+ * single threaded.  This allows hwloc to use either process-based OS
+ * functions or thread-based OS functions, depending on which are
+ * available.
+ *
+ * Note that it does not make sense to pass ::HWLOC_MEMBIND_THREAD to
+ * this function.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is specified, hwloc will check the default
+ * memory policies and nodesets for all threads in the specified
+ * process.  If they are not identical, -1 is returned and errno is
+ * set to EXDEV.  If they are identical, the values are returned in \p
+ * nodeset and \p policy.
+ *
+ * Otherwise, \p nodeset is set to the logical OR of all threads'
+ * default nodeset.  If all threads' default policies are the same, \p
+ * policy is set to that policy.  If they are different, \p policy is
+ * set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * specified process (the locality is returned in \p cpuset as CPUs
+ * near the locality's actual NUMA node(s)).
+ *
+ * This function has two output parameters: \p cpuset and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the current memory binding policies and nodesets in
+ * the queried target.
+ *
+ * Passing the ::HWLOC_MEMBIND_PROCESS flag specifies that the query
+ * target is the current policies and nodesets for all the threads in
+ * the specified process.  If ::HWLOC_MEMBIND_PROCESS is not specified
+ * (which is the most portable method), the process is assumed to be
+ * single threaded.  This allows hwloc to use either process-based OS
+ * functions or thread-based OS functions, depending on which are
+ * available.
+ *
+ * Note that it does not make sense to pass ::HWLOC_MEMBIND_THREAD to
+ * this function.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is specified, hwloc will check the default
+ * memory policies and nodesets for all threads in the specified
+ * process.  If they are not identical, -1 is returned and errno is
+ * set to EXDEV.  If they are identical, the policy is returned in \p
+ * policy.  \p cpuset is set to the union of CPUs near the NUMA
+ * node(s) in the nodeset.
+ *
+ * Otherwise, the default nodeset from each thread is logically OR'ed
+ * together.  \p cpuset is set to the union of CPUs near the NUMA
+ * node(s) in the resulting nodeset.  If all threads' default policies
+ * are the same, \p policy is set to that policy.  If they are
+ * different, \p policy is set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t cpuset, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Bind the already-allocated memory identified by (\p addr, \p len)
+ * to the NUMA node(s) in physical \p nodeset.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Bind the already-allocated memory identified by (\p addr, \p len)
+ * to the NUMA node(s) near physical \p cpuset.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_cpuset_t cpuset, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Query the physical NUMA node(s) and binding policy of the memory
+ * identified by (\p addr, \p len ).
+ *
+ * This function has two output parameters: \p nodeset and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the memory binding policies and nodesets of the pages
+ * in the address range.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is specified, the target pages are first
+ * checked to see if they all have the same memory binding policy and
+ * nodeset.  If they do not, -1 is returned and errno is set to EXDEV.
+ * If they are identical across all pages, the nodeset and policy are
+ * returned in \p nodeset and \p policy, respectively.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is not specified, \p nodeset is set to the
+ * union of all NUMA node(s) containing pages in the address range.
+ * If all pages in the target have the same policy, it is returned in
+ * \p policy.  Otherwise, \p policy is set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ */
+HWLOC_DECLSPEC int hwloc_get_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Query the CPUs near the physical NUMA node(s) and binding policy of
+ * the memory identified by (\p addr, \p len ).
+ *
+ * This function has two output parameters: \p cpuset and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the memory binding policies and nodesets of the pages
+ * in the address range.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is specified, the target pages are first
+ * checked to see if they all have the same memory binding policy and
+ * nodeset.  If they do not, -1 is returned and errno is set to EXDEV.
+ * If they are identical across all pages, the policy is returned in
+ * \p policy.  \p cpuset is set to the union of CPUs near the NUMA
+ * node(s) in the nodeset.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is not specified, the union of all NUMA
+ * node(s) containing pages in the address range is calculated.  \p
+ * cpuset is then set to the CPUs near the NUMA node(s) in this union.
+ * If all pages in the target have the same policy, it is returned in
+ * \p policy.  Otherwise, \p policy is set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ */
+HWLOC_DECLSPEC int hwloc_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_cpuset_t cpuset, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Allocate some memory
+ *
+ * This is equivalent to malloc(), except that it tries to allocate
+ * page-aligned memory from the OS.
+ *
+ * \note The allocated memory should be freed with hwloc_free().
+ */
+HWLOC_DECLSPEC void *hwloc_alloc(hwloc_topology_t topology, size_t len);
+
+/** \brief Allocate some memory on the given physical nodeset \p nodeset
+ *
+ * \return NULL with errno set to ENOSYS if the action is not supported
+ * and ::HWLOC_MEMBIND_STRICT is given
+ * \return NULL with errno set to EXDEV if the binding cannot be enforced
+ * and ::HWLOC_MEMBIND_STRICT is given
+ * \return NULL with errno set to ENOMEM if the memory allocation failed
+ * even before trying to bind.
+ *
+ * \note The allocated memory should be freed with hwloc_free().
+ */
+HWLOC_DECLSPEC void *hwloc_alloc_membind_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
+
+/** \brief Allocate some memory on memory nodes near the given physical cpuset \p cpuset
+ *
+ * \return NULL with errno set to ENOSYS if the action is not supported
+ * and ::HWLOC_MEMBIND_STRICT is given
+ * \return NULL with errno set to EXDEV if the binding cannot be enforced
+ * and ::HWLOC_MEMBIND_STRICT is given
+ * \return NULL with errno set to ENOMEM if the memory allocation failed
+ * even before trying to bind.
+ *
+ * \note The allocated memory should be freed with hwloc_free().
+ */
+HWLOC_DECLSPEC void *hwloc_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_cpuset_t cpuset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
+
+/** \brief Allocate some memory on the given nodeset \p nodeset
+ *
+ * This is similar to hwloc_alloc_membind_nodeset() except that it is allowed
+ * to change the current memory binding policy, thus providing more binding
+ * support, at the expense of changing the current state.
+ */
+static __hwloc_inline void *
+hwloc_alloc_membind_policy_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
+
+/** \brief Allocate some memory on the memory nodes near the given cpuset \p set
+ *
+ * This is similar to hwloc_alloc_membind_policy_nodeset(), but for a given cpuset.
+ */
+static __hwloc_inline void *
+hwloc_alloc_membind_policy(hwloc_topology_t topology, size_t len, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
+
+/** \brief Free memory that was previously allocated by hwloc_alloc(),
+ * hwloc_alloc_membind() or hwloc_alloc_membind_nodeset().
+ */
+HWLOC_DECLSPEC int hwloc_free(hwloc_topology_t topology, void *addr, size_t len);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_setsource Changing the Source of Topology Discovery
+ *
+ * If none of the functions below is called, the default is to detect all the objects
+ * of the machine that the caller is allowed to access.
+ *
+ * This default behavior may also be modified through environment variables
+ * if the application did not modify it already.
+ * Setting HWLOC_XMLFILE in the environment enforces the discovery from an XML
+ * file as if hwloc_topology_set_xml() had been called.
+ * Setting HWLOC_SYNTHETIC enforces a synthetic topology as if
+ * hwloc_topology_set_synthetic() had been called.
+ * Setting HWLOC_FSROOT switches to reading the topology from the specified Linux
+ * filesystem root.
+ *
+ * Finally, HWLOC_THISSYSTEM enforces the return value of
+ * hwloc_topology_is_thissystem().
+ *
+ * @{
+ */
+
+/** \brief Change which pid the topology is viewed from
+ *
+ * On some systems, processes may have different views of the machine, for
+ * instance the set of allowed CPUs. By default, hwloc exposes the view from
+ * the current process. Calling hwloc_topology_set_pid() makes it
+ * expose the topology of the machine from the point of view of another
+ * process.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note -1 is returned and errno is set to ENOSYS on platforms that do not
+ * support this feature.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_pid(hwloc_topology_t __hwloc_restrict topology, hwloc_pid_t pid);
+
+/** \brief Enable synthetic topology.
+ *
+ * Gather topology information from the given \p description,
+ * a space-separated string of numbers describing
+ * the arity of each level.
+ * Each number may be prefixed with a type and a colon to enforce the type
+ * of a level.  If only some level types are enforced, hwloc will try to
+ * choose the other types according to usual topologies, but it may fail
+ * and you may have to specify more level types manually.
+ * See also the \ref synthetic.
+ *
+ * Setting the environment variable HWLOC_SYNTHETIC
+ * may also result in this behavior.
+ *
+ * If \p description was properly parsed and describes a valid topology
+ * configuration, this function returns 0.
+ * Otherwise -1 is returned and errno is set to EINVAL.
+ *
+ * Note that this function does not actually load topology
+ * information; it just tells hwloc where to load it from.  You'll
+ * still need to invoke hwloc_topology_load() to actually load the
+ * topology information.
+ *
+ * \note For convenience, this backend provides empty binding hooks which just
+ * return success.
+ *
+ * \note On success, the synthetic component replaces the previously enabled
+ * component (if any), but the topology is not actually modified until
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_synthetic(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict description);
+
+/** \brief Enable XML-file based topology.
+ *
+ * Gather topology information from the XML file given at \p xmlpath.
+ * Setting the environment variable HWLOC_XMLFILE may also result in this behavior.
+ * This file may have been generated earlier with hwloc_topology_export_xml() in hwloc/export.h,
+ * or lstopo file.xml.
+ *
+ * Note that this function does not actually load topology
+ * information; it just tells hwloc where to load it from.  You'll
+ * still need to invoke hwloc_topology_load() to actually load the
+ * topology information.
+ *
+ * \return -1 with errno set to EINVAL on failure to read the XML file.
+ *
+ * \note See also hwloc_topology_set_userdata_import_callback()
+ * for importing application-specific object userdata.
+ *
+ * \note For convenience, this backend provides empty binding hooks which just
+ * return success.  To have hwloc still actually call OS-specific hooks, the
+ * HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM has to be set to assert that the loaded
+ * file is really the underlying system.
+ *
+ * \note On success, the XML component replaces the previously enabled
+ * component (if any), but the topology is not actually modified until
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_xml(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict xmlpath);
+
+/** \brief Enable XML based topology using a memory buffer (instead of
+ * a file, as with hwloc_topology_set_xml()).
+ *
+ * Gather topology information from the XML memory buffer given at \p
+ * buffer and of length \p size.  This buffer may have been filled
+ * earlier with hwloc_topology_export_xmlbuffer() in hwloc/export.h.
+ *
+ * Note that this function does not actually load topology
+ * information; it just tells hwloc where to load it from.  You'll
+ * still need to invoke hwloc_topology_load() to actually load the
+ * topology information.
+ *
+ * \return -1 with errno set to EINVAL on failure to read the XML buffer.
+ *
+ * \note See also hwloc_topology_set_userdata_import_callback()
+ * for importing application-specific object userdata.
+ *
+ * \note For convenience, this backend provides empty binding hooks which just
+ * return success.  To have hwloc still actually call OS-specific hooks, the
+ * HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM has to be set to assert that the loaded
+ * buffer really describes the underlying system.
+ *
+ * \note On success, the XML component replaces the previously enabled
+ * component (if any), but the topology is not actually modified until
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_xmlbuffer(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict buffer, int size);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_configuration Topology Detection Configuration and Query
+ *
+ * Several functions can optionally be called between hwloc_topology_init() and
+ * hwloc_topology_load() to configure how the detection should be performed,
+ * e.g. to ignore some object types, define a synthetic topology, etc.
+ *
+ * @{
+ */
+
+/** \brief Flags to be set onto a topology context before load.
+ *
+ * Flags should be given to hwloc_topology_set_flags().
+ * They may also be returned by hwloc_topology_get_flags().
+ */
+enum hwloc_topology_flags_e {
+ /** \brief Detect the whole system, ignore reservations.
+   *
+   * Gather all resources, even if some were disabled by the administrator.
+   * For instance, ignore Linux Cgroup/Cpusets and gather all processors and memory nodes.
+   *
+   * When this flag is set, each object has allowed_cpuset <= cpuset <= complete_cpuset.
+   * Otherwise allowed_cpuset = cpuset <= complete_cpuset.
+   * The same applies to nodesets.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM = (1UL<<0),
+
+ /** \brief Assume that the selected backend provides the topology for the
+   * system on which we are running.
+   *
+   * This forces hwloc_topology_is_thissystem to return 1, i.e. makes hwloc assume that
+   * the selected backend provides the topology for the system on which we are running,
+   * even if it is not the OS-specific backend but the XML backend for instance.
+   * This means making the binding functions actually call the OS-specific
+   * system calls and really do binding, while the XML backend would otherwise
+   * provide empty hooks just returning success.
+   *
+   * Setting the environment variable HWLOC_THISSYSTEM may also result in the
+   * same behavior.
+   *
+   * This can be used for efficiency reasons to first detect the topology once,
+   * save it to an XML file, and quickly reload it later through the XML
+   * backend, but still having binding functions actually do bind.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM = (1UL<<1),
+
+  /** \brief Detect PCI devices.
+   *
+   * By default, I/O devices are ignored. This flag enables I/O device
+   * detection using the pci backend. Only the common PCI devices (GPUs,
+   * NICs, block devices, ...) and host bridges (objects that connect the host
+   * objects to an I/O subsystem) will be added to the topology.
+   * Additionally it also enables MemoryDevice misc objects.
+   * Uncommon devices and other bridges (such as PCI-to-PCI bridges) will be
+   * ignored.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_IO_DEVICES = (1UL<<2),
+
+  /** \brief Detect PCI bridges.
+   *
+   * This flag should be combined with HWLOC_TOPOLOGY_FLAG_IO_DEVICES to enable
+   * the detection of both common devices and of all useful bridges (bridges that
+   * have at least one device behind them).
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_IO_BRIDGES = (1UL<<3),
+
+  /** \brief Detect the whole PCI hierarchy.
+   *
+   * This flag enables detection of all I/O devices (even the uncommon ones)
+   * and bridges (even those that have no device behind them) using the pci
+   * backend.
+   * This implies HWLOC_TOPOLOGY_FLAG_IO_DEVICES.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_WHOLE_IO = (1UL<<4),
+
+  /** \brief Detect instruction caches.
+   *
+   * This flag enables detection of Instruction caches,
+   * instead of only Data and Unified caches.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_ICACHES = (1UL<<5)
+};
+
+/** \brief Set OR'ed flags to non-yet-loaded topology.
+ *
+ * Set an OR'ed set of ::hwloc_topology_flags_e onto a topology that was not yet loaded.
+ *
+ * If this function is called multiple times, the last invocation will erase
+ * and replace the set of flags that was previously set.
+ *
+ * The flags set in a topology may be retrieved with hwloc_topology_get_flags()
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_flags (hwloc_topology_t topology, unsigned long flags);
+
+/** \brief Get OR'ed flags of a topology.
+ *
+ * Get the OR'ed set of ::hwloc_topology_flags_e of a topology.
+ *
+ * \return the flags previously set with hwloc_topology_set_flags().
+ */
+HWLOC_DECLSPEC unsigned long hwloc_topology_get_flags (hwloc_topology_t topology);
+
+/** \brief Does the topology context come from this system?
+ *
+ * \return 1 if this topology context was built using the system
+ * running this program.
+ * \return 0 instead (for instance if using another file-system root,
+ * a XML topology file, or a synthetic topology).
+ */
+HWLOC_DECLSPEC int hwloc_topology_is_thissystem(hwloc_topology_t  __hwloc_restrict topology) __hwloc_attribute_pure;
+
+/** \brief Flags describing actual discovery support for this topology. */
+struct hwloc_topology_discovery_support {
+  /** \brief Detecting the number of PU objects is supported. */
+  unsigned char pu;
+};
+
+/** \brief Flags describing actual PU binding support for this topology. */
+struct hwloc_topology_cpubind_support {
+  /** Binding the whole current process is supported.  */
+  unsigned char set_thisproc_cpubind;
+  /** Getting the binding of the whole current process is supported.  */
+  unsigned char get_thisproc_cpubind;
+  /** Binding a whole given process is supported.  */
+  unsigned char set_proc_cpubind;
+  /** Getting the binding of a whole given process is supported.  */
+  unsigned char get_proc_cpubind;
+  /** Binding the current thread only is supported.  */
+  unsigned char set_thisthread_cpubind;
+  /** Getting the binding of the current thread only is supported.  */
+  unsigned char get_thisthread_cpubind;
+  /** Binding a given thread only is supported.  */
+  unsigned char set_thread_cpubind;
+  /** Getting the binding of a given thread only is supported.  */
+  unsigned char get_thread_cpubind;
+  /** Getting the last processors where the whole current process ran is supported */
+  unsigned char get_thisproc_last_cpu_location;
+  /** Getting the last processors where a whole process ran is supported */
+  unsigned char get_proc_last_cpu_location;
+  /** Getting the last processors where the current thread ran is supported */
+  unsigned char get_thisthread_last_cpu_location;
+};
+
+/** \brief Flags describing actual memory binding support for this topology. */
+struct hwloc_topology_membind_support {
+  /** Binding the whole current process is supported.  */
+  unsigned char set_thisproc_membind;
+  /** Getting the binding of the whole current process is supported.  */
+  unsigned char get_thisproc_membind;
+  /** Binding a whole given process is supported.  */
+  unsigned char set_proc_membind;
+  /** Getting the binding of a whole given process is supported.  */
+  unsigned char get_proc_membind;
+  /** Binding the current thread only is supported.  */
+  unsigned char set_thisthread_membind;
+  /** Getting the binding of the current thread only is supported.  */
+  unsigned char get_thisthread_membind;
+  /** Binding a given memory area is supported. */
+  unsigned char set_area_membind;
+  /** Getting the binding of a given memory area is supported.  */
+  unsigned char get_area_membind;
+  /** Allocating a bound memory area is supported. */
+  unsigned char alloc_membind;
+  /** First-touch policy is supported. */
+  unsigned char firsttouch_membind;
+  /** Bind policy is supported. */
+  unsigned char bind_membind;
+  /** Interleave policy is supported. */
+  unsigned char interleave_membind;
+  /** Replication policy is supported. */
+  unsigned char replicate_membind;
+  /** Next-touch migration policy is supported. */
+  unsigned char nexttouch_membind;
+
+  /** Migration flag is supported. */
+  unsigned char migrate_membind;
+};
+
+/** \brief Set of flags describing actual support for this topology.
+ *
+ * This is retrieved with hwloc_topology_get_support() and will be valid until
+ * the topology object is destroyed.  Note: the values are correct only after
+ * discovery.
+ */
+struct hwloc_topology_support {
+  struct hwloc_topology_discovery_support *discovery;
+  struct hwloc_topology_cpubind_support *cpubind;
+  struct hwloc_topology_membind_support *membind;
+};
+
+/** \brief Retrieve the topology support. */
+HWLOC_DECLSPEC const struct hwloc_topology_support *hwloc_topology_get_support(hwloc_topology_t __hwloc_restrict topology);
+
+/** \brief Ignore an object type.
+ *
+ * Ignore all objects from the given type.
+ * The bottom-level type HWLOC_OBJ_PU and the HWLOC_OBJ_NUMANODE level may not be ignored.
+ * The top-level object of the hierarchy will never be ignored, even if this function
+ * succeeds.
+ * I/O objects may not be ignored, topology flags should be used to configure
+ * their discovery instead.
+ */
+HWLOC_DECLSPEC int hwloc_topology_ignore_type(hwloc_topology_t topology, hwloc_obj_type_t type);
+
+/** \brief Ignore an object type if it does not bring any structure.
+ *
+ * Ignore all objects from the given type as long as they do not bring any structure:
+ * Each ignored object should have a single child or be the only child of its parent.
+ * The bottom-level type HWLOC_OBJ_PU and the HWLOC_OBJ_NUMANODE level may not be ignored.
+ * I/O objects may not be ignored, topology flags should be used to configure
+ * their discovery instead.
+ * Group objects are always ignored if they do not bring any structure
+ * since they are designed to add structure to the topology.
+ * Misc objects cannot be ignored based on the structure since they are only annotations
+ * outside of the main topology structure.
+ */
+HWLOC_DECLSPEC int hwloc_topology_ignore_type_keep_structure(hwloc_topology_t topology, hwloc_obj_type_t type);
+
+/** \brief Ignore all objects that do not bring any structure.
+ *
+ * Ignore all objects that do not bring any structure:
+ * Each ignored object should have a single child or be the only child of its parent.
+ * I/O objects may not be ignored, topology flags should be used to configure
+ * their discovery instead.
+ */
+HWLOC_DECLSPEC int hwloc_topology_ignore_all_keep_structure(hwloc_topology_t topology);
+
+/** \brief Provide a distance matrix.
+ *
+ * Provide the matrix of distances between a set of objects of the given type.
+ * The set may or may not contain all the existing objects of this type.
+ * The objects are specified by their OS/physical index in the \p os_index
+ * array. The \p distances matrix follows the same order.
+ * The distance from object i to object j is stored at index i*nbobjs+j.
+ *
+ * A single latency matrix may be defined for each type.
+ * If another distance matrix already exists for the given type,
+ * either because the user specified it or because the OS offers it,
+ * it will be replaced by the given one.
+ * If \p nbobjs is \c 0, \p os_index is \c NULL and \p distances is \c NULL,
+ * the existing distance matrix for the given type is removed.
+ *
+ * \note Distance matrices are ignored in multi-node topologies.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_distance_matrix(hwloc_topology_t __hwloc_restrict topology,
+						      hwloc_obj_type_t type, unsigned nbobjs,
+						      unsigned *os_index, float *distances);
+
+/** \brief Set the topology-specific userdata pointer.
+ *
+ * Each topology may store one application-given private data pointer.
+ * It is initialized to \c NULL.
+ * hwloc will never modify it.
+ *
+ * Use it as you wish, after hwloc_topology_init() and until hwloc_topology_destroy().
+ *
+ * This pointer is not exported to XML.
+ */
+HWLOC_DECLSPEC void hwloc_topology_set_userdata(hwloc_topology_t topology, const void *userdata);
+
+/** \brief Retrieve the topology-specific userdata pointer.
+ *
+ * Retrieve the application-given private data pointer that was
+ * previously set with hwloc_topology_set_userdata().
+ */
+HWLOC_DECLSPEC void * hwloc_topology_get_userdata(hwloc_topology_t topology);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_tinker Modifying a loaded Topology
+ * @{
+ */
+
+/** \brief Flags to be given to hwloc_topology_restrict(). */
+enum hwloc_restrict_flags_e {
+  /** \brief Adapt distance matrices according to objects being removed during restriction.
+   * If this flag is not set, distance matrices are removed.
+   * \hideinitializer
+   */
+  HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES = (1<<0),
+
+  /** \brief Move Misc objects to ancestors if their parents are removed during restriction.
+   * If this flag is not set, Misc objects are removed when their parents are removed.
+   * \hideinitializer
+   */
+  HWLOC_RESTRICT_FLAG_ADAPT_MISC = (1<<1),
+
+  /** \brief Move I/O objects to ancestors if their parents are removed during restriction.
+   * If this flag is not set, I/O devices and bridges are removed when their parents are removed.
+   * \hideinitializer
+   */
+  HWLOC_RESTRICT_FLAG_ADAPT_IO = (1<<2)
+};
+
+/** \brief Restrict the topology to the given CPU set.
+ *
+ * Topology \p topology is modified so as to remove all objects that
+ * are not included (or partially included) in the CPU set \p cpuset.
+ * All objects CPU and node sets are restricted accordingly.
+ *
+ * \p flags is an OR'ed set of ::hwloc_restrict_flags_e.
+ *
+ * \note This call may not be reverted by restricting back to a larger
+ * cpuset. Once dropped during restriction, objects may not be brought
+ * back, except by loading another topology with hwloc_topology_load().
+ *
+ * \return 0 on success.
+ *
+ * \return -1 with errno set to EINVAL if the input cpuset is invalid.
+ * The topology is not modified in this case.
+ *
+ * \return -1 with errno set to ENOMEM on failure to allocate internal data.
+ * The topology is reinitialized in this case. It should be either
+ * destroyed with hwloc_topology_destroy() or configured and loaded again.
+ */
+HWLOC_DECLSPEC int hwloc_topology_restrict(hwloc_topology_t __hwloc_restrict topology, hwloc_const_cpuset_t cpuset, unsigned long flags);
+
+/** \brief Add a MISC object as a leaf of the topology
+ *
+ * A new MISC object will be created and inserted into the topology at the
+ * position given by parent. It is appended to the list of existing Misc children,
+ * without ever adding any intermediate hierarchy level. This is useful for
+ * annotating the topology without actually changing the hierarchy.
+ *
+ * \p name will be copied to set up the new object attributes.
+ * However, the new leaf object will not have any \p cpuset.
+ *
+ * \return the newly-created object
+ *
+ * \note If \p name contains some non-printable characters, they will
+ * be dropped when exporting to XML, see hwloc_topology_export_xml() in hwloc/export.h.
+ */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_insert_misc_object(hwloc_topology_t topology, hwloc_obj_t parent, const char *name);
+
+/** \brief Allocate a Group object to insert later with hwloc_topology_insert_group_object().
+ *
+ * This function returns a new Group object.
+ * The caller should (at least) initialize its sets before inserting the object.
+ * See hwloc_topology_insert_group_object().
+ *
+ * Custom name/value info pairs may be added with hwloc_obj_add_info() after
+ * insertion. For instance the Type info key allows to display something else
+ * than "Group" as the type name for this object in lstopo.
+ *
+ * It is recommended not to set any other object attribute before insertion,
+ * since the Group may get discarded during insertion.
+ *
+ * The object will be destroyed if passed to hwloc_topology_insert_group_object()
+ * without any set defined.
+ */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_alloc_group_object(hwloc_topology_t topology);
+
+/** \brief Add more structure to the topology by adding an intermediate Group
+ *
+ * The caller should first allocate a new Group object with hwloc_topology_alloc_group_object().
+ * Then it must initialize some of its sets to specify the final location of the Group
+ * in the topology.
+ * Then the object can be passed to this function for actual insertion in the topology.
+ *
+ * Either the cpuset or nodeset field (or both, if compatible) may be used to do so.
+ * If inserting with respect to the complete topology (including disallowed, offline
+ * or unknown object), complete_cpuset and/or complete_nodeset may be used instead.
+ * If grouping several objects, hwloc_obj_add_other_obj_sets() is an easy way to
+ * build the Group sets iteratively.
+ *
+ * \return The inserted object if it was properly inserted.
+ *
+ * \return An existing object if the Group was discarded because the topology already
+ * contained an object at the same location (the Group did not add any locality information).
+ * Any name/info key pair set before inserting is appended to the existing object.
+ *
+ * \return \c NULL if the insertion failed because of conflicting sets in topology tree.
+ *
+ * \return \c NULL if Group objects are always ignored in the topology.
+ *
+ * \return \c NULL if the object was discarded because no set was initialized in the Group
+ * before insert, or all of them were empty.
+ */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_insert_group_object(hwloc_topology_t topology, hwloc_obj_t group);
+
+/** \brief Setup object cpusets/nodesets by OR'ing another object's sets.
+ *
+ * For each defined cpuset or nodeset in \p src, allocate the corresponding set
+ * in \p dst and add \p src to it by OR'ing sets.
+ *
+ * This function is convenient between hwloc_topology_alloc_group_object()
+ * and hwloc_topology_insert_group_object(). It builds the sets of the new Group
+ * that will be inserted as a new intermediate parent of several objects.
+ */
+HWLOC_DECLSPEC int hwloc_obj_add_other_obj_sets(hwloc_obj_t dst, hwloc_obj_t src);
+
+/** @} */
+
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+/* high-level helpers */
+#include <hwloc/helper.h>
+
+/* inline code of some functions above */
+#include <hwloc/inlines.h>
+
+/* exporting to XML or synthetic */
+#include <hwloc/export.h>
+
+/* topology diffs */
+#include <hwloc/diff.h>
+
+/* deprecated headers */
+#include <hwloc/deprecated.h>
+
+#endif /* HWLOC_H */
diff --git a/ext/hwloc/include/hwloc/autogen/config.h b/ext/hwloc/include/hwloc/autogen/config.h
new file mode 100644
index 0000000..3c243ed
--- /dev/null
+++ b/ext/hwloc/include/hwloc/autogen/config.h
@@ -0,0 +1,202 @@
+/* include/hwloc/autogen/config.h.  Generated from config.h.in by configure.  */
+/* -*- c -*-
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* The configuration file */
+
+#ifndef HWLOC_CONFIG_H
+#define HWLOC_CONFIG_H
+
+#if (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95))
+# define __hwloc_restrict __restrict
+#else
+# if __STDC_VERSION__ >= 199901L
+#  define __hwloc_restrict restrict
+# else
+#  define __hwloc_restrict
+# endif
+#endif
+
+/* Note that if we're compiling C++, then just use the "inline"
+   keyword, since it's part of C++ */
+#if defined(c_plusplus) || defined(__cplusplus)
+#  define __hwloc_inline inline
+#elif defined(_MSC_VER) || defined(__HP_cc)
+#  define __hwloc_inline __inline
+#else
+#  define __hwloc_inline __inline__
+#endif
+
+/*
+ * Note: this is public.  We can not assume anything from the compiler used
+ * by the application and thus the HWLOC_HAVE_* macros below are not
+ * fetched from the autoconf result here. We only automatically use a few
+ * well-known easy cases.
+ */
+
+/* Some handy constants to make the logic below a little more readable */
+#if defined(__cplusplus) && \
+    (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+#define GXX_ABOVE_3_4 1 /* C++ compiler reporting GNU C version >= 3.4 */
+#else
+#define GXX_ABOVE_3_4 0 /* not C++, or GNU C version below 3.4 */
+#endif /* GXX_ABOVE_3_4 */
+
+#if !defined(__cplusplus) && \
+    (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95))
+#define GCC_ABOVE_2_95 1
+#else
+#define GCC_ABOVE_2_95 0
+#endif
+
+#if !defined(__cplusplus) && \
+    (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96))
+#define GCC_ABOVE_2_96 1
+#else
+#define GCC_ABOVE_2_96 0
+#endif
+
+#if !defined(__cplusplus) && \
+    (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3))
+#define GCC_ABOVE_3_3 1
+#else
+#define GCC_ABOVE_3_3 0
+#endif
+
+/* Maybe before gcc 2.95 too */
+#ifdef HWLOC_HAVE_ATTRIBUTE_UNUSED
+#define __HWLOC_HAVE_ATTRIBUTE_UNUSED HWLOC_HAVE_ATTRIBUTE_UNUSED 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_UNUSED (GXX_ABOVE_3_4 || GCC_ABOVE_2_95)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_UNUSED 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_UNUSED
+# define __hwloc_attribute_unused __attribute__((__unused__))
+#else
+# define __hwloc_attribute_unused
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_MALLOC
+#define __HWLOC_HAVE_ATTRIBUTE_MALLOC HWLOC_HAVE_ATTRIBUTE_MALLOC 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_MALLOC (GXX_ABOVE_3_4 || GCC_ABOVE_2_96)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_MALLOC 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_MALLOC
+# define __hwloc_attribute_malloc __attribute__((__malloc__))
+#else
+# define __hwloc_attribute_malloc
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_CONST
+#define __HWLOC_HAVE_ATTRIBUTE_CONST HWLOC_HAVE_ATTRIBUTE_CONST 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_CONST (GXX_ABOVE_3_4 || GCC_ABOVE_2_95)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_CONST 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_CONST
+# define __hwloc_attribute_const __attribute__((__const__))
+#else
+# define __hwloc_attribute_const
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_PURE
+#define __HWLOC_HAVE_ATTRIBUTE_PURE HWLOC_HAVE_ATTRIBUTE_PURE 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_PURE (GXX_ABOVE_3_4 || GCC_ABOVE_2_96)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_PURE 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_PURE
+# define __hwloc_attribute_pure __attribute__((__pure__))
+#else
+# define __hwloc_attribute_pure
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_DEPRECATED
+#define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED HWLOC_HAVE_ATTRIBUTE_DEPRECATED 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED (GXX_ABOVE_3_4 || GCC_ABOVE_3_3)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_DEPRECATED
+# define __hwloc_attribute_deprecated __attribute__((__deprecated__))
+#else
+# define __hwloc_attribute_deprecated
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+#define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS (GXX_ABOVE_3_4 || GCC_ABOVE_3_3)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+# define __hwloc_attribute_may_alias __attribute__((__may_alias__))
+#else
+# define __hwloc_attribute_may_alias
+#endif
+
+#ifdef HWLOC_C_HAVE_VISIBILITY
+# if HWLOC_C_HAVE_VISIBILITY
+#  define HWLOC_DECLSPEC __attribute__((__visibility__("default")))
+# else
+#  define HWLOC_DECLSPEC
+# endif
+#else
+# define HWLOC_DECLSPEC
+#endif
+
+/* Defined to 1 on Linux */
+#define HWLOC_LINUX_SYS 1
+
+/* Defined to 1 if the CPU_SET macro works */
+#define HWLOC_HAVE_CPU_SET 1
+
+/* Defined to 1 if you have the `windows.h' header. */
+/* #undef HWLOC_HAVE_WINDOWS_H */
+#define hwloc_pid_t pid_t
+#define hwloc_thread_t pthread_t
+
+#ifdef HWLOC_HAVE_WINDOWS_H
+
+#  include <windows.h>
+typedef DWORDLONG hwloc_uint64_t;
+
+#else /* HWLOC_HAVE_WINDOWS_H */
+
+#  ifdef hwloc_thread_t
+#    include <pthread.h>
+#  endif /* hwloc_thread_t */
+
+/* Defined to 1 if you have the <stdint.h> header file. */
+#  define HWLOC_HAVE_STDINT_H 1
+
+#  include <unistd.h>
+#  ifdef HWLOC_HAVE_STDINT_H
+#    include <stdint.h>
+#  endif
+typedef uint64_t hwloc_uint64_t;
+
+#endif /* HWLOC_HAVE_WINDOWS_H */
+
+/* Whether we need to re-define all the hwloc public symbols or not */
+#define HWLOC_SYM_TRANSFORM 1
+
+/* The hwloc symbol prefix */
+#define HWLOC_SYM_PREFIX likwid_
+
+/* The hwloc symbol prefix in all caps */
+#define HWLOC_SYM_PREFIX_CAPS LIKWID_
+
+#endif /* HWLOC_CONFIG_H */
diff --git a/ext/hwloc/include/hwloc/autogen/config.h.in b/ext/hwloc/include/hwloc/autogen/config.h.in
new file mode 100644
index 0000000..e101b0a
--- /dev/null
+++ b/ext/hwloc/include/hwloc/autogen/config.h.in
@@ -0,0 +1,201 @@
+/* -*- c -*-
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* The configuration file */
+
+#ifndef HWLOC_CONFIG_H
+#define HWLOC_CONFIG_H
+
+#if (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95))
+# define __hwloc_restrict __restrict
+#else
+# if __STDC_VERSION__ >= 199901L
+#  define __hwloc_restrict restrict
+# else
+#  define __hwloc_restrict
+# endif
+#endif
+
+/* Note that if we're compiling C++, then just use the "inline"
+   keyword, since it's part of C++ */
+#if defined(c_plusplus) || defined(__cplusplus)
+#  define __hwloc_inline inline
+#elif defined(_MSC_VER) || defined(__HP_cc)
+#  define __hwloc_inline __inline
+#else
+#  define __hwloc_inline __inline__
+#endif
+
+/*
+ * Note: this is public.  We can not assume anything from the compiler used
+ * by the application and thus the HWLOC_HAVE_* macros below are not
+ * fetched from the autoconf result here. We only automatically use a few
+ * well-known easy cases.
+ */
+
+/* Some handy constants to make the logic below a little more readable */
+#if defined(__cplusplus) && \
+    (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+#define GXX_ABOVE_3_4 1 /* C++ compiler reporting GNU C version >= 3.4 */
+#else
+#define GXX_ABOVE_3_4 0 /* not C++, or GNU C version below 3.4 */
+#endif /* GXX_ABOVE_3_4 */
+
+#if !defined(__cplusplus) && \
+    (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95))
+#define GCC_ABOVE_2_95 1
+#else
+#define GCC_ABOVE_2_95 0
+#endif
+
+#if !defined(__cplusplus) && \
+    (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96))
+#define GCC_ABOVE_2_96 1
+#else
+#define GCC_ABOVE_2_96 0
+#endif
+
+#if !defined(__cplusplus) && \
+    (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3))
+#define GCC_ABOVE_3_3 1
+#else
+#define GCC_ABOVE_3_3 0
+#endif
+
+/* Maybe before gcc 2.95 too */
+#ifdef HWLOC_HAVE_ATTRIBUTE_UNUSED
+#define __HWLOC_HAVE_ATTRIBUTE_UNUSED HWLOC_HAVE_ATTRIBUTE_UNUSED 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_UNUSED (GXX_ABOVE_3_4 || GCC_ABOVE_2_95)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_UNUSED 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_UNUSED
+# define __hwloc_attribute_unused __attribute__((__unused__))
+#else
+# define __hwloc_attribute_unused
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_MALLOC
+#define __HWLOC_HAVE_ATTRIBUTE_MALLOC HWLOC_HAVE_ATTRIBUTE_MALLOC 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_MALLOC (GXX_ABOVE_3_4 || GCC_ABOVE_2_96)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_MALLOC 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_MALLOC
+# define __hwloc_attribute_malloc __attribute__((__malloc__))
+#else
+# define __hwloc_attribute_malloc
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_CONST
+#define __HWLOC_HAVE_ATTRIBUTE_CONST HWLOC_HAVE_ATTRIBUTE_CONST 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_CONST (GXX_ABOVE_3_4 || GCC_ABOVE_2_95)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_CONST 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_CONST
+# define __hwloc_attribute_const __attribute__((__const__))
+#else
+# define __hwloc_attribute_const
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_PURE
+#define __HWLOC_HAVE_ATTRIBUTE_PURE HWLOC_HAVE_ATTRIBUTE_PURE 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_PURE (GXX_ABOVE_3_4 || GCC_ABOVE_2_96)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_PURE 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_PURE
+# define __hwloc_attribute_pure __attribute__((__pure__))
+#else
+# define __hwloc_attribute_pure
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_DEPRECATED
+#define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED HWLOC_HAVE_ATTRIBUTE_DEPRECATED 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED (GXX_ABOVE_3_4 || GCC_ABOVE_3_3)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_DEPRECATED
+# define __hwloc_attribute_deprecated __attribute__((__deprecated__))
+#else
+# define __hwloc_attribute_deprecated
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+#define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS (GXX_ABOVE_3_4 || GCC_ABOVE_3_3)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+# define __hwloc_attribute_may_alias __attribute__((__may_alias__))
+#else
+# define __hwloc_attribute_may_alias
+#endif
+
+#ifdef HWLOC_C_HAVE_VISIBILITY
+# if HWLOC_C_HAVE_VISIBILITY
+#  define HWLOC_DECLSPEC __attribute__((__visibility__("default")))
+# else
+#  define HWLOC_DECLSPEC
+# endif
+#else
+# define HWLOC_DECLSPEC
+#endif
+
+/* Defined to 1 on Linux */
+#undef HWLOC_LINUX_SYS
+
+/* Defined to 1 if the CPU_SET macro works */
+#undef HWLOC_HAVE_CPU_SET
+
+/* Defined to 1 if you have the `windows.h' header. */
+#undef HWLOC_HAVE_WINDOWS_H
+#undef hwloc_pid_t
+#undef hwloc_thread_t
+
+#ifdef HWLOC_HAVE_WINDOWS_H
+
+#  include <windows.h>
+typedef DWORDLONG hwloc_uint64_t;
+
+#else /* HWLOC_HAVE_WINDOWS_H */
+
+#  ifdef hwloc_thread_t
+#    include <pthread.h>
+#  endif /* hwloc_thread_t */
+
+/* Defined to 1 if you have the <stdint.h> header file. */
+#  undef HWLOC_HAVE_STDINT_H
+
+#  include <unistd.h>
+#  ifdef HWLOC_HAVE_STDINT_H
+#    include <stdint.h>
+#  endif
+typedef uint64_t hwloc_uint64_t;
+
+#endif /* HWLOC_HAVE_WINDOWS_H */
+
+/* Whether we need to re-define all the hwloc public symbols or not */
+#undef HWLOC_SYM_TRANSFORM
+
+/* The hwloc symbol prefix */
+#undef HWLOC_SYM_PREFIX
+
+/* The hwloc symbol prefix in all caps */
+#undef HWLOC_SYM_PREFIX_CAPS
+
+#endif /* HWLOC_CONFIG_H */
diff --git a/ext/hwloc/include/hwloc/autogen/stamp-h2 b/ext/hwloc/include/hwloc/autogen/stamp-h2
new file mode 100644
index 0000000..804e0ac
--- /dev/null
+++ b/ext/hwloc/include/hwloc/autogen/stamp-h2
@@ -0,0 +1 @@
+timestamp for include/hwloc/autogen/config.h
diff --git a/ext/hwloc/include/hwloc/bitmap.h b/ext/hwloc/include/hwloc/bitmap.h
new file mode 100644
index 0000000..bb18f65
--- /dev/null
+++ b/ext/hwloc/include/hwloc/bitmap.h
@@ -0,0 +1,359 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief The bitmap API, for use in hwloc itself.
+ */
+
+#ifndef HWLOC_BITMAP_H
+#define HWLOC_BITMAP_H
+
+#include <hwloc/autogen/config.h>
+#include <assert.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_bitmap The bitmap API
+ *
+ * The ::hwloc_bitmap_t type represents a set of objects, typically OS
+ * processors -- which may actually be hardware threads (represented
+ * by ::hwloc_cpuset_t, which is a typedef for ::hwloc_bitmap_t) -- or
+ * memory nodes (represented by ::hwloc_nodeset_t, which is also a
+ * typedef for ::hwloc_bitmap_t).
+ *
+ * <em>Both CPU and node sets are always indexed by OS physical number.</em>
+ *
+ * \note CPU sets and nodesets are described in \ref hwlocality_object_sets.
+ *
+ * A bitmap may be of infinite size.
+ *
+ * \note Several examples of using the bitmap API are available under the
+ * doc/examples/ directory in the source tree.
+ * Regression tests such as tests/hwloc_bitmap*.c also make intensive use
+ * of this API.
+ * @{
+ */
+
+
+/** \brief
+ * Set of bits represented as an opaque pointer to an internal bitmap.
+ */
+typedef struct hwloc_bitmap_s * hwloc_bitmap_t;
+/** \brief a non-modifiable ::hwloc_bitmap_t */
+typedef const struct hwloc_bitmap_s * hwloc_const_bitmap_t;
+
+
+/*
+ * Bitmap allocation, freeing and copying.
+ */
+
+/** \brief Allocate a new empty bitmap.
+ *
+ * \returns A valid bitmap or \c NULL.
+ *
+ * The bitmap should be freed by a corresponding call to
+ * hwloc_bitmap_free().
+ */
+HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_alloc(void) __hwloc_attribute_malloc;
+
+/** \brief Allocate a new full bitmap. */
+HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_alloc_full(void) __hwloc_attribute_malloc;
+
+/** \brief Free bitmap \p bitmap.
+ *
+ * If \p bitmap is \c NULL, no operation is performed.
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_free(hwloc_bitmap_t bitmap);
+
+/** \brief Duplicate bitmap \p bitmap by allocating a new bitmap and copying \p bitmap contents.
+ *
+ * If \p bitmap is \c NULL, \c NULL is returned.
+ */
+HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_dup(hwloc_const_bitmap_t bitmap) __hwloc_attribute_malloc;
+
+/** \brief Copy the contents of bitmap \p src into the already allocated bitmap \p dst */
+HWLOC_DECLSPEC void hwloc_bitmap_copy(hwloc_bitmap_t dst, hwloc_const_bitmap_t src);
+
+
+/*
+ * Bitmap/String Conversion
+ */
+
+/** \brief Stringify a bitmap.
+ *
+ * Up to \p buflen characters may be written in buffer \p buf.
+ *
+ * If \p buflen is 0, \p buf may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
+
+/** \brief Stringify a bitmap into a newly allocated string.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
+
+/** \brief Parse a bitmap string and stores it in bitmap \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string);
+
+/** \brief Stringify a bitmap in the list format.
+ *
+ * Lists are comma-separated indexes or ranges.
+ * Ranges are dash separated indexes.
+ * The last range may not have an ending index if the bitmap is infinite.
+ *
+ * Up to \p buflen characters may be written in buffer \p buf.
+ *
+ * If \p buflen is 0, \p buf may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_list_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
+
+/** \brief Stringify a bitmap into a newly allocated list string.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_list_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
+
+/** \brief Parse a list string and stores it in bitmap \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_list_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string);
+
+/** \brief Stringify a bitmap in the taskset-specific format.
+ *
+ * The taskset command manipulates bitmap strings that contain a single
+ * (possibly very long) hexadecimal number starting with 0x.
+ *
+ * Up to \p buflen characters may be written in buffer \p buf.
+ *
+ * If \p buflen is 0, \p buf may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_taskset_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
+
+/** \brief Stringify a bitmap into a newly allocated taskset-specific string.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_taskset_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
+
+/** \brief Parse a taskset-specific bitmap string and stores it in bitmap \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_taskset_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string);
+
+
+/*
+ * Building bitmaps.
+ */
+
+/** \brief Empty the bitmap \p bitmap */
+HWLOC_DECLSPEC void hwloc_bitmap_zero(hwloc_bitmap_t bitmap);
+
+/** \brief Fill bitmap \p bitmap with all possible indexes (even if those objects don't exist or are otherwise unavailable) */
+HWLOC_DECLSPEC void hwloc_bitmap_fill(hwloc_bitmap_t bitmap);
+
+/** \brief Empty the bitmap \p bitmap and add bit \p id */
+HWLOC_DECLSPEC void hwloc_bitmap_only(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Fill the bitmap \p bitmap and clear the index \p id */
+HWLOC_DECLSPEC void hwloc_bitmap_allbut(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Setup bitmap \p bitmap from unsigned long \p mask */
+HWLOC_DECLSPEC void hwloc_bitmap_from_ulong(hwloc_bitmap_t bitmap, unsigned long mask);
+
+/** \brief Setup bitmap \p bitmap from unsigned long \p mask used as \p i -th subset */
+HWLOC_DECLSPEC void hwloc_bitmap_from_ith_ulong(hwloc_bitmap_t bitmap, unsigned i, unsigned long mask);
+
+
+/*
+ * Modifying bitmaps.
+ */
+
+/** \brief Add index \p id in bitmap \p bitmap */
+HWLOC_DECLSPEC void hwloc_bitmap_set(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Add indexes from \p begin to \p end in bitmap \p bitmap.
+ *
+ * If \p end is \c -1, the range is infinite.
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_set_range(hwloc_bitmap_t bitmap, unsigned begin, int end);
+
+/** \brief Replace \p i -th subset of bitmap \p bitmap with unsigned long \p mask */
+HWLOC_DECLSPEC void hwloc_bitmap_set_ith_ulong(hwloc_bitmap_t bitmap, unsigned i, unsigned long mask);
+
+/** \brief Remove index \p id from bitmap \p bitmap */
+HWLOC_DECLSPEC void hwloc_bitmap_clr(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Remove indexes from \p begin to \p end in bitmap \p bitmap.
+ *
+ * If \p end is \c -1, the range is infinite.
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_clr_range(hwloc_bitmap_t bitmap, unsigned begin, int end);
+
+/** \brief Keep a single index among those set in bitmap \p bitmap
+ *
+ * May be useful before binding so that the process does not
+ * have a chance of migrating between multiple logical CPUs
+ * in the original mask.
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_singlify(hwloc_bitmap_t bitmap);
+
+
+/*
+ * Consulting bitmaps.
+ */
+
+/** \brief Convert the beginning part of bitmap \p bitmap into unsigned long \p mask */
+HWLOC_DECLSPEC unsigned long hwloc_bitmap_to_ulong(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Convert the \p i -th subset of bitmap \p bitmap into unsigned long mask */
+HWLOC_DECLSPEC unsigned long hwloc_bitmap_to_ith_ulong(hwloc_const_bitmap_t bitmap, unsigned i) __hwloc_attribute_pure;
+
+/** \brief Test whether index \p id is part of bitmap \p bitmap */
+HWLOC_DECLSPEC int hwloc_bitmap_isset(hwloc_const_bitmap_t bitmap, unsigned id) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p bitmap is empty */
+HWLOC_DECLSPEC int hwloc_bitmap_iszero(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p bitmap is completely full */
+HWLOC_DECLSPEC int hwloc_bitmap_isfull(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the first index (least significant bit) in bitmap \p bitmap
+ *
+ * \return -1 if no index is set.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_first(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the next index in bitmap \p bitmap which is after index \p prev
+ *
+ * If \p prev is -1, the first index is returned.
+ *
+ * \return -1 if no index with higher index is set in the bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_next(hwloc_const_bitmap_t bitmap, int prev) __hwloc_attribute_pure;
+
+/** \brief Compute the last index (most significant bit) in bitmap \p bitmap
+ *
+ * \return -1 if no index is set in the bitmap, or if the bitmap is infinite.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_last(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the "weight" of bitmap \p bitmap (i.e., number of
+ * indexes that are in the bitmap).
+ *
+ * \return the number of indexes that are in the bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_weight(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Loop macro iterating on bitmap \p bitmap
+ * \hideinitializer
+ *
+ * \p id is the loop variable; it should be an unsigned int.  The
+ * first iteration will set \p id to the lowest index in the bitmap.
+ * Successive iterations will iterate through, in order, all remaining
+ * indexes that are in the bitmap.  To be specific: each iteration will return a
+ * value for \p id such that hwloc_bitmap_isset(bitmap, id) is true.
+ *
+ * The assert prevents the loop from being infinite if the bitmap is infinite.
+ */
+#define hwloc_bitmap_foreach_begin(id, bitmap) \
+do { \
+        assert(hwloc_bitmap_weight(bitmap) != -1); \
+        for (id = hwloc_bitmap_first(bitmap); \
+             (unsigned) id != (unsigned) -1; \
+             id = hwloc_bitmap_next(bitmap, id)) { \
+/** \brief End of loop. Needs a terminating ';'.
+ * \hideinitializer
+ *
+ * \sa hwloc_bitmap_foreach_begin */
+#define hwloc_bitmap_foreach_end() \
+        } \
+} while (0)
+
+
+/*
+ * Combining bitmaps.
+ */
+
+/** \brief Or bitmaps \p bitmap1 and \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_or (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief And bitmaps \p bitmap1 and \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_and (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief And bitmap \p bitmap1 and the negation of \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_andnot (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief Xor bitmaps \p bitmap1 and \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_xor (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief Negate bitmap \p bitmap and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_not (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap);
+
+
+/*
+ * Comparing bitmaps.
+ */
+
+/** \brief Test whether bitmaps \p bitmap1 and \p bitmap2 intersect */
+HWLOC_DECLSPEC int hwloc_bitmap_intersects (hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p sub_bitmap is part of bitmap \p super_bitmap */
+HWLOC_DECLSPEC int hwloc_bitmap_isincluded (hwloc_const_bitmap_t sub_bitmap, hwloc_const_bitmap_t super_bitmap) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p bitmap1 is equal to bitmap \p bitmap2 */
+HWLOC_DECLSPEC int hwloc_bitmap_isequal (hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** \brief Compare bitmaps \p bitmap1 and \p bitmap2 using their lowest index.
+ *
+ * Smaller least significant bit is smaller.
+ * The empty bitmap is considered higher than anything.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_compare_first(hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** \brief Compare bitmaps \p bitmap1 and \p bitmap2 in lexicographic order.
+ *
+ * Lexicographic comparison of bitmaps, starting from their highest indexes.
+ * Compare last indexes first, then second, etc.
+ * The empty bitmap is considered lower than anything.
+ *
+ * \note This is different from the non-existing hwloc_bitmap_compare_last()
+ * which would only compare the highest index of each bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_compare(hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_BITMAP_H */
diff --git a/ext/hwloc/include/hwloc/cuda.h b/ext/hwloc/include/hwloc/cuda.h
new file mode 100644
index 0000000..a02d677
--- /dev/null
+++ b/ext/hwloc/include/hwloc/cuda.h
@@ -0,0 +1,224 @@
+/*
+ * Copyright © 2010-2015 Inria.  All rights reserved.
+ * Copyright © 2010-2011 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and the CUDA Driver API.
+ *
+ * Applications that use both hwloc and the CUDA Driver API may want to
+ * include this file so as to get topology information for CUDA devices.
+ *
+ */
+
+#ifndef HWLOC_CUDA_H
+#define HWLOC_CUDA_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc/helper.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#endif
+
+#include <cuda.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_cuda Interoperability with the CUDA Driver API
+ *
+ * This interface offers ways to retrieve topology information about
+ * CUDA devices when using the CUDA Driver API.
+ *
+ * @{
+ */
+
+/** \brief Return the domain, bus and device IDs of the CUDA device \p cudevice.
+ *
+ * Device \p cudevice must match the local machine.
+ * \return 0 on success, -1 with errno set to ENOSYS if an attribute query fails. */
+static __hwloc_inline int
+hwloc_cuda_get_device_pci_ids(hwloc_topology_t topology __hwloc_attribute_unused,
+			      CUdevice cudevice, int *domain, int *bus, int *dev)
+{
+  CUresult cres;
+
+#if CUDA_VERSION >= 4000
+  cres = cuDeviceGetAttribute(domain, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, cudevice);
+  if (cres != CUDA_SUCCESS) {
+    errno = ENOSYS;
+    return -1;
+  }
+#else /* CUDA_VERSION < 4000: the domain is not queried, 0 is reported instead */
+  *domain = 0;
+#endif /* CUDA_VERSION >= 4000 */
+  cres = cuDeviceGetAttribute(bus, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, cudevice);
+  if (cres != CUDA_SUCCESS) {
+    errno = ENOSYS;
+    return -1;
+  }
+  cres = cuDeviceGetAttribute(dev, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, cudevice);
+  if (cres != CUDA_SUCCESS) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  return 0;
+}
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to device \p cudevice.
+ *
+ * Store in \p set the CPU set describing the locality of the CUDA device \p cudevice.
+ * Topology \p topology and device \p cudevice must match the local machine.
+ * I/O devices detection and the CUDA component are not needed in the topology.
+ *
+ * The function only returns the locality of the device. If more information
+ * is needed, OS objects should be used instead, see
+ * hwloc_cuda_get_device_osdev() and hwloc_cuda_get_device_osdev_by_index().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux; other systems will simply get a full cpuset.
+ *
+ * \return 0 on success, or -1 on error (only possible in the Linux code path).
+ */
+static __hwloc_inline int
+hwloc_cuda_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+			     CUdevice cudevice, hwloc_cpuset_t set)
+{
+#ifdef HWLOC_LINUX_SYS
+  /* If we're on Linux, use the sysfs mechanism to get the local cpus */
+#define HWLOC_CUDA_DEVICE_SYSFS_PATH_MAX 128
+  char path[HWLOC_CUDA_DEVICE_SYSFS_PATH_MAX];
+  FILE *sysfile = NULL;
+  int domainid, busid, deviceid;
+
+  if (hwloc_cuda_get_device_pci_ids(topology, cudevice, &domainid, &busid, &deviceid))
+    return -1;
+
+  if (!hwloc_topology_is_thissystem(topology)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  snprintf(path, sizeof(path), "/sys/bus/pci/devices/%04x:%02x:%02x.0/local_cpus", domainid, busid, deviceid);
+  sysfile = fopen(path, "r");
+  if (!sysfile)
+    return -1;
+  /* Fall back to the complete cpuset if parsing fails (assumed negative return) or finds no CPU */
+  if (hwloc_linux_parse_cpumap_file(sysfile, set) < 0
+      || hwloc_bitmap_iszero(set))
+    hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+
+  fclose(sysfile);
+#else
+  /* Non-Linux systems simply get a full cpuset */
+  hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+  return 0;
+}
+
+/** \brief Get the hwloc PCI device object corresponding to the
+ * CUDA device \p cudevice.
+ *
+ * Return the PCI device object describing the CUDA device \p cudevice.
+ * Return NULL if there is none.
+ *
+ * Topology \p topology and device \p cudevice must match the local machine.
+ * I/O devices detection must be enabled in topology \p topology.
+ * The CUDA component is not needed in the topology.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_cuda_get_device_pcidev(hwloc_topology_t topology, CUdevice cudevice)
+{
+  int domain, bus, dev;
+
+  if (hwloc_cuda_get_device_pci_ids(topology, cudevice, &domain, &bus, &dev))
+    return NULL;
+
+  return hwloc_get_pcidev_by_busid(topology, domain, bus, dev, 0);
+}
+
+/** \brief Get the hwloc OS device object corresponding to CUDA device \p cudevice.
+ *
+ * Return the hwloc OS device object that describes the given
+ * CUDA device \p cudevice. Return NULL if there is none.
+ *
+ * Topology \p topology and device \p cudevice must match the local machine.
+ * I/O devices detection and the CUDA component must be enabled in the topology.
+ * If not, the locality of the object may still be found using
+ * hwloc_cuda_get_device_cpuset().
+ *
+ * \note The corresponding hwloc PCI device may be found by looking
+ * at the result parent pointer.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_cuda_get_device_osdev(hwloc_topology_t topology, CUdevice cudevice)
+{
+	hwloc_obj_t osdev = NULL;
+	int domain, bus, dev;
+
+	if (hwloc_cuda_get_device_pci_ids(topology, cudevice, &domain, &bus, &dev))
+		return NULL;
+
+	/* Scan every OS device; unnamed ones are skipped rather than passed to strncmp() */
+	while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+		hwloc_obj_t pcidev = osdev->parent;
+		if (!osdev->name || strncmp(osdev->name, "cuda", 4))
+			continue;
+		if (pcidev
+		    && pcidev->type == HWLOC_OBJ_PCI_DEVICE
+		    && (int) pcidev->attr->pcidev.domain == domain
+		    && (int) pcidev->attr->pcidev.bus == bus
+		    && (int) pcidev->attr->pcidev.dev == dev
+		    && pcidev->attr->pcidev.func == 0)
+			return osdev;
+	}
+
+	return NULL;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * CUDA device whose index is \p idx.
+ *
+ * Return the OS device object describing the CUDA device whose
+ * index is \p idx. Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the CUDA component must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ *
+ * \note This function is identical to hwloc_cudart_get_device_osdev_by_index().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_cuda_get_device_osdev_by_index(hwloc_topology_t topology, unsigned idx)
+{
+	hwloc_obj_t osdev = NULL;
+	while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+		if (HWLOC_OBJ_OSDEV_COPROC == osdev->attr->osdev.type
+		    && osdev->name
+		    && !strncmp("cuda", osdev->name, 4)
+		    && atoi(osdev->name + 4) == (int) idx)
+			return osdev;
+	}
+	return NULL;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_CUDA_H */
diff --git a/ext/hwloc/include/hwloc/cudart.h b/ext/hwloc/include/hwloc/cudart.h
new file mode 100644
index 0000000..759c3cf
--- /dev/null
+++ b/ext/hwloc/include/hwloc/cudart.h
@@ -0,0 +1,184 @@
+/*
+ * Copyright © 2010-2015 Inria.  All rights reserved.
+ * Copyright © 2010-2011 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and the CUDA Runtime API.
+ *
+ * Applications that use both hwloc and the CUDA Runtime API may want to
+ * include this file so as to get topology information for CUDA devices.
+ *
+ */
+
+#ifndef HWLOC_CUDART_H
+#define HWLOC_CUDART_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc/helper.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#endif
+
+#include <cuda.h> /* for CUDA_VERSION */
+#include <cuda_runtime_api.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_cudart Interoperability with the CUDA Runtime API
+ *
+ * This interface offers ways to retrieve topology information about
+ * CUDA devices when using the CUDA Runtime API.
+ *
+ * @{
+ */
+
+/** \brief Return the domain, bus and device IDs of the CUDA device whose index is \p idx.
+ *
+ * Device index \p idx must match the local machine.
+ * \return 0 on success, -1 with errno set to ENOSYS if the properties query fails. */
+static __hwloc_inline int
+hwloc_cudart_get_device_pci_ids(hwloc_topology_t topology __hwloc_attribute_unused,
+				int idx, int *domain, int *bus, int *dev)
+{
+  cudaError_t cerr;
+  struct cudaDeviceProp prop;
+
+  cerr = cudaGetDeviceProperties(&prop, idx);
+  if (cerr) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+#if CUDA_VERSION >= 4000
+  *domain = prop.pciDomainID;
+#else /* CUDA_VERSION < 4000: pciDomainID is not read, 0 is reported instead */
+  *domain = 0;
+#endif /* CUDA_VERSION >= 4000 */
+
+  *bus = prop.pciBusID;
+  *dev = prop.pciDeviceID;
+
+  return 0;
+}
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to device \p idx.
+ *
+ * Store in \p set the CPU set describing the locality of the CUDA device
+ * whose index is \p idx.
+ * Topology \p topology and device \p idx must match the local machine.
+ * I/O devices detection and the CUDA component are not needed in the topology.
+ *
+ * The function only returns the locality of the device. If more information
+ * is needed, use OS objects instead, see hwloc_cudart_get_device_osdev_by_index().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux; other systems will simply get a full cpuset.
+ *
+ * \return 0 on success, or -1 on error (only possible in the Linux code path).
+ */
+static __hwloc_inline int
+hwloc_cudart_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+			       int idx, hwloc_cpuset_t set)
+{
+#ifdef HWLOC_LINUX_SYS
+  /* If we're on Linux, use the sysfs mechanism to get the local cpus */
+#define HWLOC_CUDART_DEVICE_SYSFS_PATH_MAX 128
+  char path[HWLOC_CUDART_DEVICE_SYSFS_PATH_MAX];
+  FILE *sysfile = NULL;
+  int domain, bus, dev;
+
+  if (hwloc_cudart_get_device_pci_ids(topology, idx, &domain, &bus, &dev))
+    return -1;
+
+  if (!hwloc_topology_is_thissystem(topology)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  snprintf(path, sizeof(path), "/sys/bus/pci/devices/%04x:%02x:%02x.0/local_cpus", domain, bus, dev);
+  sysfile = fopen(path, "r");
+  if (!sysfile)
+    return -1;
+  /* Fall back to the complete cpuset if parsing fails (assumed negative return) or finds no CPU */
+  if (hwloc_linux_parse_cpumap_file(sysfile, set) < 0
+      || hwloc_bitmap_iszero(set))
+    hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+
+  fclose(sysfile);
+#else
+  /* Non-Linux systems simply get a full cpuset */
+  hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+  return 0;
+}
+
+/** \brief Get the hwloc PCI device object corresponding to the
+ * CUDA device whose index is \p idx.
+ *
+ * Return the PCI device object describing the CUDA device whose
+ * index is \p idx. Return NULL if there is none.
+ *
+ * Topology \p topology and device \p idx must match the local machine.
+ * I/O devices detection must be enabled in topology \p topology.
+ * The CUDA component is not needed in the topology.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_cudart_get_device_pcidev(hwloc_topology_t topology, int idx)
+{
+  int domain, bus, dev;
+
+  if (hwloc_cudart_get_device_pci_ids(topology, idx, &domain, &bus, &dev))
+    return NULL;
+
+  return hwloc_get_pcidev_by_busid(topology, domain, bus, dev, 0);
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * CUDA device whose index is \p idx.
+ *
+ * Return the OS device object describing the CUDA device whose
+ * index is \p idx. Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the CUDA component must be enabled in the topology.
+ * If not, the locality of the object may still be found using
+ * hwloc_cudart_get_device_cpuset().
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ *
+ * \note This function is identical to hwloc_cuda_get_device_osdev_by_index().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_cudart_get_device_osdev_by_index(hwloc_topology_t topology, unsigned idx)
+{
+	hwloc_obj_t osdev = NULL;
+	while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+		if (HWLOC_OBJ_OSDEV_COPROC == osdev->attr->osdev.type
+		    && osdev->name
+		    && !strncmp("cuda", osdev->name, 4)
+		    && atoi(osdev->name + 4) == (int) idx)
+			return osdev;
+	}
+	return NULL;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_CUDART_H */
diff --git a/ext/hwloc/include/hwloc/deprecated.h b/ext/hwloc/include/hwloc/deprecated.h
new file mode 100644
index 0000000..c4370b6
--- /dev/null
+++ b/ext/hwloc/include/hwloc/deprecated.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2010 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/**
+ * This file contains the inline code of functions declared in hwloc.h
+ */
+
+#ifndef HWLOC_DEPRECATED_H
+#define HWLOC_DEPRECATED_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* backward compat with v1.10 before Socket->Package renaming */
+#define HWLOC_OBJ_SOCKET HWLOC_OBJ_PACKAGE
+/* backward compat with v1.10 before Node->NUMANode clarification */
+#define HWLOC_OBJ_NODE HWLOC_OBJ_NUMANODE
+
+/** \brief Return an object type from the string
+ *
+ * \return -1 if unrecognized.
+ */
+HWLOC_DECLSPEC hwloc_obj_type_t hwloc_obj_type_of_string (const char * string) __hwloc_attribute_pure __hwloc_attribute_deprecated;
+
+/** \brief Stringify a given topology object into a human-readable form.
+ *
+ * \note This function is deprecated in favor of hwloc_obj_type_snprintf()
+ * and hwloc_obj_attr_snprintf() since it is not very flexible and
+ * only prints physical/OS indexes.
+ *
+ * Fill string \p string up to \p size characters with the description
+ * of topology object \p obj in topology \p topology.
+ *
+ * If \p verbose is set, a longer description is used. Otherwise a
+ * short description is used.
+ *
+ * \p indexprefix is used to prefix the \p os_index attribute number of
+ * the object in the description. If \c NULL, the \c # character is used.
+ *
+ * If \p size is 0, \p string may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_obj_snprintf(char * __hwloc_restrict string, size_t size,
+				      hwloc_topology_t topology, hwloc_obj_t obj,
+				      const char * __hwloc_restrict indexprefix, int verbose) __hwloc_attribute_deprecated;
+
+/** \brief Distribute \p n items over the topology under \p root
+ *
+ * Array \p set will be filled with \p n cpusets recursively distributed
+ * linearly over the topology under \p root, down to depth \p until (which can
+ * be INT_MAX to distribute down to the finest level).
+ *
+ * This is typically useful when an application wants to distribute \p n
+ * threads over a machine, giving each of them as much private cache as
+ * possible and keeping them locally in number order.
+ *
+ * The caller may typically want to also call hwloc_bitmap_singlify()
+ * before binding a thread so that it does not move at all.
+ *
+ * \note This function requires the \p root object to have a CPU set.
+ */
+static __hwloc_inline void
+hwloc_distribute(hwloc_topology_t topology, hwloc_obj_t root, hwloc_cpuset_t *set, unsigned n, unsigned until) __hwloc_attribute_deprecated;
+static __hwloc_inline void
+hwloc_distribute(hwloc_topology_t topology, hwloc_obj_t root, hwloc_cpuset_t *set, unsigned n, unsigned until)
+{
+  hwloc_distrib(topology, &root, 1, set, n, until, 0);
+}
+
+/** \brief Distribute \p n items over the topology under \p roots
+ *
+ * This is the same as hwloc_distribute, but takes an array of roots instead of
+ * just one root.
+ *
+ * \note This function requires the \p roots objects to have a CPU set.
+ */
+static __hwloc_inline void
+hwloc_distributev(hwloc_topology_t topology, hwloc_obj_t *roots, unsigned n_roots, hwloc_cpuset_t *set, unsigned n, unsigned until) __hwloc_attribute_deprecated;
+static __hwloc_inline void
+hwloc_distributev(hwloc_topology_t topology, hwloc_obj_t *roots, unsigned n_roots, hwloc_cpuset_t *set, unsigned n, unsigned until)
+{
+  hwloc_distrib(topology, roots, n_roots, set, n, until, 0);
+}
+
+/** \brief Insert a misc object by parent.
+ *
+ * Identical to hwloc_topology_insert_misc_object().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_topology_insert_misc_object_by_parent(hwloc_topology_t topology, hwloc_obj_t parent, const char *name) __hwloc_attribute_deprecated;
+static __hwloc_inline hwloc_obj_t
+hwloc_topology_insert_misc_object_by_parent(hwloc_topology_t topology, hwloc_obj_t parent, const char *name)
+{
+  return hwloc_topology_insert_misc_object(topology, parent, name);
+}
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_DEPRECATED_H */
diff --git a/ext/hwloc/include/hwloc/diff.h b/ext/hwloc/include/hwloc/diff.h
new file mode 100644
index 0000000..3f1beb1
--- /dev/null
+++ b/ext/hwloc/include/hwloc/diff.h
@@ -0,0 +1,299 @@
+/*
+ * Copyright © 2013-2014 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Topology differences.
+ */
+
+#ifndef HWLOC_DIFF_H
+#define HWLOC_DIFF_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#elif 0
+}
+#endif
+
+
+/** \defgroup hwlocality_diff Topology differences
+ *
+ * Applications that manipulate many similar topologies, for instance
+ * one for each node of a homogeneous cluster, may want to compress
+ * topologies to reduce the memory footprint.
+ *
+ * This file offers a way to manipulate the difference between topologies
+ * and export/import it to/from XML.
+ * Compression may therefore be achieved by storing one topology
+ * entirely while the others are only described by their differences
+ * with the former.
+ * The actual topology can be reconstructed when actually needed by
+ * applying the precomputed difference to the reference topology.
+ *
+ * This interface targets very similar nodes.
+ * Only very simple differences between topologies are actually
+ * supported, for instance a change in the memory size, the name
+ * of the object, or some info attribute.
+ * More complex differences such as adding or removing objects cannot
+ * be represented in the difference structures and therefore return
+ * errors.
+ *
+ * It means that there is no need to apply the difference when
+ * looking at the tree organization (how many levels, how many
+ * objects per level, what kind of objects, CPU and node sets, etc)
+ * and when binding to objects.
+ * However the difference must be applied when looking at object
+ * attributes such as the name, the memory size or info attributes.
+ *
+ * @{
+ */
+
+
+/** \brief Type of one object attribute difference.
+ */
+typedef enum hwloc_topology_diff_obj_attr_type_e {
+  /** \brief The object local memory is modified.
+   * The union is a hwloc_topology_diff_obj_attr_uint64_s
+   * (and the index field is ignored).
+   */
+  HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE,
+
+  /** \brief The object name is modified.
+   * The union is a hwloc_topology_diff_obj_attr_string_s
+   * (and the name field is ignored).
+   */
+  HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME,
+
+  /** \brief The value of an info attribute is modified.
+   * The union is a hwloc_topology_diff_obj_attr_string_s.
+   */
+  HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO
+} hwloc_topology_diff_obj_attr_type_t;
+
+/** \brief One object attribute difference.
+ */
+union hwloc_topology_diff_obj_attr_u {
+  struct hwloc_topology_diff_obj_attr_generic_s {
+    /* common header: every struct of this union must start with this field */
+    hwloc_topology_diff_obj_attr_type_t type;
+  } generic;
+
+  /** \brief Integer attribute modification with an optional index. */
+  struct hwloc_topology_diff_obj_attr_uint64_s {
+    /* used for storing integer attributes (type HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE) */
+    hwloc_topology_diff_obj_attr_type_t type;
+    hwloc_uint64_t index; /* not used for SIZE */
+    hwloc_uint64_t oldvalue;
+    hwloc_uint64_t newvalue;
+  } uint64;
+
+  /** \brief String attribute modification with an optional name */
+  struct hwloc_topology_diff_obj_attr_string_s {
+    /* used for storing name and info pairs (types HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME and _INFO) */
+    hwloc_topology_diff_obj_attr_type_t type;
+    char *name; /* not used for NAME */
+    char *oldvalue;
+    char *newvalue;
+  } string;
+};
+
+
+/** \brief Type of one element of a difference list.
+ */
+typedef enum hwloc_topology_diff_type_e {
+  /** \brief An object attribute was changed.
+   * The union is a hwloc_topology_diff_obj_attr_s.
+   */
+  HWLOC_TOPOLOGY_DIFF_OBJ_ATTR,
+
+  /** \brief The difference is too complex,
+   * it cannot be represented. The difference below
+   * this object has not been checked.
+   * hwloc_topology_diff_build() will return 1.
+   *
+   * The union is a hwloc_topology_diff_too_complex_s.
+   */
+  HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX
+} hwloc_topology_diff_type_t;
+
+/** \brief One element of a difference list between two topologies.
+ */
+typedef union hwloc_topology_diff_u {
+  struct hwloc_topology_diff_generic_s {
+    /* common header: every struct of this union must start with these two fields */
+    hwloc_topology_diff_type_t type;
+    union hwloc_topology_diff_u * next; /* pointer to the next element of the list, or NULL */
+  } generic;
+
+  /* A difference in an object attribute. */
+  struct hwloc_topology_diff_obj_attr_s {
+    hwloc_topology_diff_type_t type; /* must be HWLOC_TOPOLOGY_DIFF_OBJ_ATTR */
+    union hwloc_topology_diff_u * next;
+    /* The single object concerned (presumably its level depth and its index in that level), then its attribute difference */
+    unsigned obj_depth;
+    unsigned obj_index;
+    union hwloc_topology_diff_obj_attr_u diff;
+  } obj_attr;
+
+  /* A difference that is too complex. */
+  struct hwloc_topology_diff_too_complex_s {
+    hwloc_topology_diff_type_t type; /* must be HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX */
+    union hwloc_topology_diff_u * next;
+    /* Where we had to stop computing the diff in the first topology */
+    unsigned obj_depth;
+    unsigned obj_index;
+  } too_complex;
+} * hwloc_topology_diff_t; /* note: the typedef is a POINTER to the union, i.e. a list element/head chained through next */
+
+
+/** \brief Compute the difference between 2 topologies.
+ *
+ * The difference is stored as a list of hwloc_topology_diff_t entries
+ * starting at \p diff.
+ * It is computed by doing a depth-first traversal of both topology trees
+ * simultaneously.
+ *
+ * If the difference between 2 objects is too complex to be represented
+ * (for instance if some objects have different types, or different numbers
+ * of children), a special diff entry of type HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX
+ * is queued.
+ * The computation of the diff does not continue below these objects.
+ * So each such diff entry means that the difference between two subtrees
+ * could not be computed.
+ *
+ * \return 0 if the difference can be represented properly.
+ *
+ * \return 0 with \p diff pointing to NULL if there is no difference
+ * between the topologies.
+ *
+ * \return 1 if the difference is too complex (see above). Some entries in
+ * the list will be of type HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX.
+ *
+ * \return -1 on any other error.
+ *
+ * \note \p flags is currently not used. It should be 0.
+ *
+ * \note The output diff has to be freed with hwloc_topology_diff_destroy().
+ *
+ * \note The output diff can only be exported to XML or passed to
+ * hwloc_topology_diff_apply() if 0 was returned, i.e. if no entry of type
+ * HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX is listed.
+ *
+ * \note The output diff may be modified by removing some entries from
+ * the list. The removed entries should be freed by passing them
+ * to hwloc_topology_diff_destroy() (possibly as another list).
+*/
+HWLOC_DECLSPEC int hwloc_topology_diff_build(hwloc_topology_t topology, hwloc_topology_t newtopology, unsigned long flags, hwloc_topology_diff_t *diff);
+
+/** \brief Flags to be given as an OR'ed set to hwloc_topology_diff_apply().
+ */
+enum hwloc_topology_diff_apply_flags_e {
+  /** \brief Apply topology diff in reverse direction.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE = (1UL<<0)
+};
+
+/** \brief Apply a topology diff to an existing topology.
+ *
+ * \p flags is an OR'ed set of hwloc_topology_diff_apply_flags_e.
+ *
+ * The new topology is modified in place. hwloc_topology_dup()
+ * may be used to duplicate it before patching.
+ *
+ * If the difference cannot be applied entirely, all previous applied
+ * elements are unapplied before returning.
+ *
+ * \return 0 on success.
+ *
+ * \return -N if applying the difference failed while trying
+ * to apply the N-th part of the difference. For instance -1
+ * is returned if the very first difference element could not
+ * be applied.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_apply(hwloc_topology_t topology, hwloc_topology_diff_t diff, unsigned long flags);
+
+/** \brief Destroy a list of topology differences.
+ *
+ * \note The \p topology parameter must be a valid topology
+ * but it is not required that it is related to \p diff.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_destroy(hwloc_topology_t topology, hwloc_topology_diff_t diff);
+
+/** \brief Load a list of topology differences from a XML file.
+ *
+ * If not \c NULL, \p refname will be filled with the identifier
+ * string of the reference topology for the difference file,
+ * if any was specified in the XML file.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ *
+ * \note The \p topology parameter must be a valid topology
+ * but it is not required that it is related to \p diff.
+ *
+ * \note the pointer returned in refname should later be freed
+ * by the caller.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_load_xml(hwloc_topology_t topology, const char *xmlpath, hwloc_topology_diff_t *diff, char **refname);
+
+/** \brief Export a list of topology differences to a XML file.
+ *
+ * If not \c NULL, \p refname defines an identifier string
+ * for the reference topology which was used as a base when
+ * computing this difference.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ * This attribute is given back when reading the diff from XML.
+ *
+ * \note The \p topology parameter must be a valid topology
+ * but it is not required that it is related to \p diff.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_export_xml(hwloc_topology_t topology, hwloc_topology_diff_t diff, const char *refname, const char *xmlpath);
+
+/** \brief Load a list of topology differences from a XML buffer.
+ *
+ * If not \c NULL, \p refname will be filled with the identifier
+ * string of the reference topology for the difference file,
+ * if any was specified in the XML file.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ *
+ * \note The \p topology parameter must be a valid topology
+ * but it is not required that it is related to \p diff.
+ *
+ * \note the pointer returned in refname should later be freed
+ * by the caller.
+  */
+HWLOC_DECLSPEC int hwloc_topology_diff_load_xmlbuffer(hwloc_topology_t topology, const char *xmlbuffer, int buflen, hwloc_topology_diff_t *diff, char **refname);
+
+/** \brief Export a list of topology differences to a XML buffer.
+ *
+ * If not \c NULL, \p refname defines an identifier string
+ * for the reference topology which was used as a base when
+ * computing this difference.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ * This attribute is given back when reading the diff from XML.
+ *
+ * \note The XML buffer should later be freed with hwloc_free_xmlbuffer().
+ *
+ * \note The \p topology parameter must be a valid topology
+ * but it is not required that it is related to \p diff.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_export_xmlbuffer(hwloc_topology_t topology, hwloc_topology_diff_t diff, const char *refname, char **xmlbuffer, int *buflen);
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_DIFF_H */
diff --git a/ext/hwloc/include/hwloc/export.h b/ext/hwloc/include/hwloc/export.h
new file mode 100644
index 0000000..194ee6c
--- /dev/null
+++ b/ext/hwloc/include/hwloc/export.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Exporting Topologies to XML or to Synthetic strings.
+ */
+
+#ifndef HWLOC_EXPORT_H
+#define HWLOC_EXPORT_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#elif 0
+}
+#endif
+
+
+/** \defgroup hwlocality_xmlexport Exporting Topologies to XML
+ * @{
+ */
+
+/** \brief Export the topology into an XML file.
+ *
+ * This file may be loaded later through hwloc_topology_set_xml().
+ *
+ * \return -1 if a failure occurred.
+ *
+ * \note See also hwloc_topology_set_userdata_export_callback()
+ * for exporting application-specific object userdata.
+ *
+ * \note The topology-specific userdata pointer is ignored when exporting to XML.
+ *
+ * \note Only printable characters may be exported to XML string attributes.
+ * Any other character, especially any non-ASCII character, will be silently
+ * dropped.
+ *
+ * \note If \p xmlpath is "-", the XML output is sent to the standard output.
+ */
+HWLOC_DECLSPEC int hwloc_topology_export_xml(hwloc_topology_t topology, const char *xmlpath);
+
+/** \brief Export the topology into a newly-allocated XML memory buffer.
+ *
+ * \p xmlbuffer is allocated by the callee and should be freed with
+ * hwloc_free_xmlbuffer() later in the caller.
+ *
+ * This memory buffer may be loaded later through hwloc_topology_set_xmlbuffer().
+ *
+ * \return -1 if a failure occurred.
+ *
+ * \note See also hwloc_topology_set_userdata_export_callback()
+ * for exporting application-specific object userdata.
+ *
+ * \note The topology-specific userdata pointer is ignored when exporting to XML.
+ *
+ * \note Only printable characters may be exported to XML string attributes.
+ * Any other character, especially any non-ASCII character, will be silently
+ * dropped.
+ */
+HWLOC_DECLSPEC int hwloc_topology_export_xmlbuffer(hwloc_topology_t topology, char **xmlbuffer, int *buflen);
+
+/** \brief Free a buffer allocated by hwloc_topology_export_xmlbuffer() */
+HWLOC_DECLSPEC void hwloc_free_xmlbuffer(hwloc_topology_t topology, char *xmlbuffer);
+
+/** \brief Set the application-specific callback for exporting object userdata
+ *
+ * The object userdata pointer is not exported to XML by default because hwloc
+ * does not know what it contains.
+ *
+ * This function lets applications set \p export_cb to a callback function
+ * that converts this opaque userdata into an exportable string.
+ *
+ * \p export_cb is invoked during XML export for each object whose
+ * \p userdata pointer is not \c NULL.
+ * The callback should use hwloc_export_obj_userdata() or
+ * hwloc_export_obj_userdata_base64() to actually export
+ * something to XML (possibly multiple times per object).
+ *
+ * \p export_cb may be set to \c NULL if userdata should not be exported to XML.
+ *
+ * \note The topology-specific userdata pointer is ignored when exporting to XML.
+ */
+HWLOC_DECLSPEC void hwloc_topology_set_userdata_export_callback(hwloc_topology_t topology,
+								void (*export_cb)(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj));
+
+/** \brief Export some object userdata to XML
+ *
+ * This function may only be called from within the export() callback passed
+ * to hwloc_topology_set_userdata_export_callback().
+ * It may be invoked one or multiple times to export some userdata to XML.
+ * The \p buffer content of length \p length is stored with optional name
+ * \p name.
+ *
+ * When importing this XML file, the import() callback (if set) will be
+ * called exactly as many times as hwloc_export_obj_userdata() was called
+ * during export(). It will receive the corresponding \p name, \p buffer
+ * and \p length arguments.
+ *
+ * \p reserved, \p topology and \p obj must be the first three parameters
+ * that were given to the export callback.
+ *
+ * Only printable characters may be exported to XML string attributes.
+ * If a non-printable character is passed in \p name or \p buffer,
+ * the function returns -1 with errno set to EINVAL.
+ *
+ * If exporting binary data, the application should first encode into
+ * printable characters only (or use hwloc_export_obj_userdata_base64()).
+ * It should also take care of portability issues if the export may
+ * be reimported on a different architecture.
+ */
+HWLOC_DECLSPEC int hwloc_export_obj_userdata(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj, const char *name, const void *buffer, size_t length);
+
+/** \brief Encode and export some object userdata to XML
+ *
+ * This function is similar to hwloc_export_obj_userdata() but it encodes
+ * the input buffer into printable characters before exporting.
+ * On import, decoding is automatically performed before the data is given
+ * to the import() callback if any.
+ *
+ * This function may only be called from within the export() callback passed
+ * to hwloc_topology_set_userdata_export_callback().
+ *
+ * The function does not take care of portability issues if the export
+ * may be reimported on a different architecture.
+ */
+HWLOC_DECLSPEC int hwloc_export_obj_userdata_base64(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj, const char *name, const void *buffer, size_t length);
+
+/** \brief Set the application-specific callback for importing userdata
+ *
+ * On XML import, userdata is ignored by default because hwloc does not know
+ * how to store it in memory.
+ *
+ * This function lets applications set \p import_cb to a callback function
+ * that will get the XML-stored userdata and store it in the object as expected
+ * by the application.
+ *
+ * \p import_cb is called during hwloc_topology_load() as many times as
+ * hwloc_export_obj_userdata() was called during export. The topology
+ * is not entirely set up yet. Object attributes are ready to consult,
+ * but links between objects are not.
+ *
+ * \p import_cb may be \c NULL if userdata should be ignored during import.
+ *
+ * \note \p buffer contains \p length characters followed by a null byte ('\0').
+ *
+ * \note This function should be called before hwloc_topology_load().
+ *
+ * \note The topology-specific userdata pointer is ignored when importing from XML.
+ */
+HWLOC_DECLSPEC void hwloc_topology_set_userdata_import_callback(hwloc_topology_t topology,
+								void (*import_cb)(hwloc_topology_t topology, hwloc_obj_t obj, const char *name, const void *buffer, size_t length));
+
+/** @} */
+
+
+/** \defgroup hwlocality_syntheticexport Exporting Topologies to Synthetic
+ * @{
+ */
+
+/** \brief Flags for exporting synthetic topologies.
+ *
+ * Flags to be given as an OR'ed set to hwloc_topology_export_synthetic().
+ */
+enum hwloc_topology_export_synthetic_flags_e {
+ /** \brief Export extended types such as L2dcache as basic types such as Cache.
+  *
+  * This is required if loading the synthetic description with hwloc < 1.9.
+  * \hideinitializer
+  */
+ HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES = (1UL<<0),
+
+ /** \brief Do not export level attributes.
+  *
+  * Ignore level attributes such as memory/cache sizes or PU indexes.
+  * This is required if loading the synthetic description with hwloc < 1.10.
+  * \hideinitializer
+  */
+ HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS = (1UL<<1)
+};
+
+/** \brief Export the topology as a synthetic string.
+ *
+ * At most \p buflen characters will be written in \p buffer,
+ * including the terminating \0.
+ *
+ * This exported string may be given back to hwloc_topology_set_synthetic().
+ *
+ * \p flags is an OR'ed set of hwloc_topology_export_synthetic_flags_e.
+ *
+ * \return The number of characters that were written,
+ * not including the terminating \0.
+ *
+ * \return -1 if the topology could not be exported,
+ * for instance if it is not symmetric.
+ *
+ * \note I/O and Misc children are ignored, the synthetic string only
+ * describes normal children.
+ *
+ * \note A 1024-byte buffer should be large enough for exporting
+ * topologies in the vast majority of cases.
+ */
+  HWLOC_DECLSPEC int hwloc_topology_export_synthetic(hwloc_topology_t topology, char *buffer, size_t buflen, unsigned long flags);
+
+/** @} */
+
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_EXPORT_H */
diff --git a/ext/hwloc/include/hwloc/gl.h b/ext/hwloc/include/hwloc/gl.h
new file mode 100644
index 0000000..4b8b3f2
--- /dev/null
+++ b/ext/hwloc/include/hwloc/gl.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright © 2012 Blue Brain Project, EPFL. All rights reserved.
+ * Copyright © 2012-2013 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and OpenGL displays.
+ *
+ * Applications that use both hwloc and OpenGL may want to include
+ * this file so as to get topology information for OpenGL displays.
+ */
+
+#ifndef HWLOC_GL_H
+#define HWLOC_GL_H
+
+#include <hwloc.h>
+
+#include <stdio.h>
+#include <string.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_gl Interoperability with OpenGL displays
+ *
+ * This interface offers ways to retrieve topology information about
+ * OpenGL displays.
+ *
+ * Only the NVIDIA display locality information is currently available,
+ * using the NV-CONTROL X11 extension and the NVCtrl library.
+ *
+ * @{
+ */
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * OpenGL display given by port and device index.
+ *
+ * Return the OS device object describing the OpenGL display
+ * whose port (server) is \p port and device (screen) is \p device.
+ * Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the GL component must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_gl_get_display_osdev_by_port_device(hwloc_topology_t topology,
+					  unsigned port, unsigned device)
+{
+	hwloc_obj_t cur; /* OS device currently examined */
+	for (cur = hwloc_get_next_osdev(topology, NULL); cur != NULL;
+	     cur = hwloc_get_next_osdev(topology, cur)) {
+		unsigned p = (unsigned) -1, d = (unsigned) -1;
+		if (cur->attr->osdev.type != HWLOC_OBJ_OSDEV_GPU || !cur->name)
+			continue; /* only named GPU OS devices are candidates */
+		if (sscanf(cur->name, ":%u.%u", &p, &d) == 2 && p == port && d == device)
+			return cur;
+	}
+	errno = EINVAL; /* no GPU OS device is named ":<port>.<device>" */
+	return NULL;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * OpenGL display given by name.
+ *
+ * Return the OS device object describing the OpenGL display
+ * whose name is \p name, built as ":port.device" such as ":0.0" .
+ * Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the GL component must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_gl_get_display_osdev_by_name(hwloc_topology_t topology,
+				   const char *name)
+{
+	hwloc_obj_t cur = hwloc_get_next_osdev(topology, NULL);
+	for (; cur != NULL; cur = hwloc_get_next_osdev(topology, cur)) {
+		if (cur->attr->osdev.type != HWLOC_OBJ_OSDEV_GPU || !cur->name)
+			continue; /* only named GPU OS devices are candidates */
+		if (strcmp(name, cur->name) == 0)
+			return cur;
+	}
+	errno = EINVAL; /* no GPU OS device carries that exact name */
+	return NULL;
+}
+
+/** \brief Get the OpenGL display port and device corresponding
+ * to the given hwloc OS object.
+ *
+ * Store the OpenGL display port (server) in \p port and device (screen)
+ * in \p device for the hwloc OS device object \p osdev, and return \c 0.
+ * Return \c -1 with errno set to \c EINVAL if \p osdev is not a GPU OS device named ":port.device".
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the GL component must be enabled in the topology.
+ */
+static __hwloc_inline int
+hwloc_gl_get_display_by_osdev(hwloc_topology_t topology __hwloc_attribute_unused,
+			      hwloc_obj_t osdev, unsigned *port, unsigned *device)
+{
+	unsigned x = (unsigned) -1, y = (unsigned) -1;
+	if (HWLOC_OBJ_OSDEV_GPU == osdev->attr->osdev.type
+	    && osdev->name /* an unnamed device must not reach sscanf(), same guard as the osdev lookups above */
+	    && sscanf(osdev->name, ":%u.%u", &x, &y) == 2) {
+		*port = x;
+		*device = y;
+		return 0;
+	}
+	errno = EINVAL; /* not a GPU, no name, or name not of the form ":port.device" */
+	return -1;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_GL_H */
+
diff --git a/ext/hwloc/include/hwloc/glibc-sched.h b/ext/hwloc/include/hwloc/glibc-sched.h
new file mode 100644
index 0000000..1f9ba7c
--- /dev/null
+++ b/ext/hwloc/include/hwloc/glibc-sched.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2013 inria.  All rights reserved.
+ * Copyright © 2009-2011 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and glibc scheduling routines.
+ *
+ * Applications that use both hwloc and glibc scheduling routines such as
+ * sched_getaffinity() or pthread_attr_setaffinity_np() may want to include
+ * this file so as to ease conversion between their respective types.
+ */
+
+#ifndef HWLOC_GLIBC_SCHED_H
+#define HWLOC_GLIBC_SCHED_H
+
+#include <hwloc.h>
+#include <hwloc/helper.h>
+#include <assert.h>
+
+#if !defined _GNU_SOURCE || !defined _SCHED_H || (!defined CPU_SETSIZE && !defined sched_priority)
+#error Please make sure to include sched.h before including glibc-sched.h, and define _GNU_SOURCE before any inclusion of sched.h
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#ifdef HWLOC_HAVE_CPU_SET
+
+
+/** \defgroup hwlocality_glibc_sched Interoperability with glibc sched affinity
+ *
+ * This interface offers ways to convert between hwloc cpusets and glibc cpusets
+ * such as those manipulated by sched_getaffinity() or pthread_attr_setaffinity_np().
+ *
+ * \note Topology \p topology must match the current machine.
+ *
+ * @{
+ */
+
+
+/** \brief Convert hwloc CPU set \p hwlocset into glibc sched affinity CPU set \p schedset
+ *
+ * This function may be used before calling sched_setaffinity or any other function
+ * that takes a cpu_set_t as input parameter.
+ *
+ * \p schedsetsize should be sizeof(cpu_set_t) unless \p schedset was dynamically allocated with CPU_ALLOC
+ */
+static __hwloc_inline int
+hwloc_cpuset_to_glibc_sched_affinity(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t hwlocset,
+				    cpu_set_t *schedset, size_t schedsetsize)
+{
+  unsigned idx; /* index of the hwloc bit currently being copied */
+#ifndef CPU_ZERO_S
+  /* only the fixed-size macros exist: clear, then insist on the static size */
+  CPU_ZERO(schedset);
+  assert(schedsetsize == sizeof(cpu_set_t));
+  hwloc_bitmap_foreach_begin(idx, hwlocset)
+    CPU_SET(idx, schedset);
+  hwloc_bitmap_foreach_end();
+#else /* CPU_ZERO_S: the sized macros exist, honour the caller-provided size */
+  CPU_ZERO_S(schedsetsize, schedset);
+  hwloc_bitmap_foreach_begin(idx, hwlocset)
+    CPU_SET_S(idx, schedsetsize, schedset);
+  hwloc_bitmap_foreach_end();
+#endif /* CPU_ZERO_S */
+  return 0;
+}
+
+/** \brief Convert glibc sched affinity CPU set \p schedset into hwloc CPU set \p hwlocset
+ *
+ * This function may be used after calling sched_getaffinity or any other function
+ * that returns a cpu_set_t as output parameter.
+ *
+ * \p schedsetsize should be sizeof(cpu_set_t) unless \p schedset was dynamically allocated with CPU_ALLOC
+ */
+static __hwloc_inline int
+hwloc_cpuset_from_glibc_sched_affinity(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_cpuset_t hwlocset,
+                                       const cpu_set_t *schedset, size_t schedsetsize)
+{
+  int idx;
+#ifdef CPU_ZERO_S
+  int remaining;
+#endif
+  hwloc_bitmap_zero(hwlocset);
+#ifdef CPU_ZERO_S
+  /* scan upwards until every bit counted by CPU_COUNT_S() has been copied */
+  remaining = CPU_COUNT_S(schedsetsize, schedset);
+  for (idx = 0; remaining != 0; idx++) {
+    if (!CPU_ISSET_S(idx, schedsetsize, schedset))
+      continue;
+    hwloc_bitmap_set(hwlocset, idx);
+    remaining--;
+  }
+#else /* !CPU_ZERO_S */
+  /* sched.h does not support dynamic cpu_set_t (introduced in glibc 2.7),
+   * assume we have a very old interface without CPU_COUNT (added in 2.6)
+   */
+  assert(schedsetsize == sizeof(cpu_set_t));
+  for (idx = 0; idx < CPU_SETSIZE; idx++) {
+    if (CPU_ISSET(idx, schedset))
+      hwloc_bitmap_set(hwlocset, idx);
+  }
+#endif /* !CPU_ZERO_S */
+  return 0;
+}
+
+/** @} */
+
+
+#endif /* HWLOC_HAVE_CPU_SET */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_GLIBC_SCHED_H */
diff --git a/ext/hwloc/include/hwloc/helper.h b/ext/hwloc/include/hwloc/helper.h
new file mode 100644
index 0000000..883b87d
--- /dev/null
+++ b/ext/hwloc/include/hwloc/helper.h
@@ -0,0 +1,1249 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2010 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief High-level hwloc traversal helpers.
+ */
+
+#ifndef HWLOC_HELPER_H
+#define HWLOC_HELPER_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+#include <stdlib.h>
+#include <errno.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_helper_find_inside Finding Objects inside a CPU set
+ * @{
+ */
+
+/** \brief Get the first largest object included in the given cpuset \p set.
+ *
+ * \return the first object that is included in \p set and whose parent is not.
+ *
+ * This is convenient for iterating over all largest objects within a CPU set
+ * by doing a loop getting the first largest object and clearing its CPU set
+ * from the remaining CPU set.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_first_largest_obj_inside_cpuset(hwloc_topology_t topology, hwloc_const_cpuset_t set)
+{
+  hwloc_obj_t obj = hwloc_get_root_obj(topology);
+  if (!hwloc_bitmap_intersects(obj->cpuset, set))
+    return NULL;
+  while (!hwloc_bitmap_isincluded(obj->cpuset, set)) {
+    /* while the object intersects without being included, look at its children */
+    hwloc_obj_t child = obj->first_child;
+    while (child) {
+      if (hwloc_bitmap_intersects(child->cpuset, set))
+	break;
+      child = child->next_sibling;
+    }
+    if (!child)
+      /* no child intersects, return their father */
+      return obj;
+    /* found one intersecting child, look at its children */
+    obj = child;
+  }
+  /* obj is included, return it */
+  return obj;
+}
+
+/** \brief Get the set of largest objects covering exactly a given cpuset \p set
+ *
+ * \return the number of objects returned in \p objs.
+ */
+HWLOC_DECLSPEC int hwloc_get_largest_objs_inside_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+						 hwloc_obj_t * __hwloc_restrict objs, int max);
+
+/** \brief Return the next object at depth \p depth included in CPU set \p set.
+ *
+ * If \p prev is \c NULL, return the first object at depth \p depth
+ * included in \p set.  The next invocation should pass the previous
+ * return value in \p prev so as to obtain the next object in \p set.
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					   unsigned depth, hwloc_obj_t prev)
+{
+  hwloc_obj_t next = hwloc_get_next_obj_by_depth(topology, depth, prev);
+  if (!next)
+    return NULL;
+  while (next && !hwloc_bitmap_isincluded(next->cpuset, set))
+    next = next->next_cousin;
+  return next;
+}
+
+/** \brief Return the next object of type \p type included in CPU set \p set.
+ *
+ * If there are multiple or no depth for given type, return \c NULL
+ * and let the caller fallback to
+ * hwloc_get_next_obj_inside_cpuset_by_depth().
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					  hwloc_obj_type_t type, hwloc_obj_t prev)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return NULL;
+  return hwloc_get_next_obj_inside_cpuset_by_depth(topology, set, depth, prev);
+}
+
+/** \brief Return the (logically) \p idx -th object at depth \p depth included in CPU set \p set.
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+				      unsigned depth, unsigned idx) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+				      unsigned depth, unsigned idx)
+{
+  hwloc_obj_t obj = hwloc_get_obj_by_depth (topology, depth, 0);
+  unsigned count = 0;
+  if (!obj)
+    return NULL;
+  while (obj) {
+    if (hwloc_bitmap_isincluded(obj->cpuset, set)) {
+      if (count == idx)
+	return obj;
+      count++;
+    }
+    obj = obj->next_cousin;
+  }
+  return NULL;
+}
+
+/** \brief Return the \p idx -th object of type \p type included in CPU set \p set.
+ *
+ * If there are multiple or no depth for given type, return \c NULL
+ * and let the caller fallback to
+ * hwloc_get_obj_inside_cpuset_by_depth().
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+				     hwloc_obj_type_t type, unsigned idx) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+				     hwloc_obj_type_t type, unsigned idx)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return NULL;
+  return hwloc_get_obj_inside_cpuset_by_depth(topology, set, depth, idx);
+}
+
+/** \brief Return the number of objects at depth \p depth included in CPU set \p set.
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline unsigned
+hwloc_get_nbobjs_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					 unsigned depth) __hwloc_attribute_pure;
+static __hwloc_inline unsigned
+hwloc_get_nbobjs_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					 unsigned depth)
+{
+  hwloc_obj_t obj = hwloc_get_obj_by_depth (topology, depth, 0);
+  unsigned count = 0;
+  if (!obj)
+    return 0;
+  while (obj) {
+    if (hwloc_bitmap_isincluded(obj->cpuset, set))
+      count++;
+    obj = obj->next_cousin;
+  }
+  return count;
+}
+
+/** \brief Return the number of objects of type \p type included in CPU set \p set.
+ *
+ * If no object for that type exists inside CPU set \p set, 0 is
+ * returned.  If there are several levels with objects of that type
+ * inside CPU set \p set, -1 is returned.
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline int
+hwloc_get_nbobjs_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					hwloc_obj_type_t type) __hwloc_attribute_pure;
+static __hwloc_inline int
+hwloc_get_nbobjs_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					hwloc_obj_type_t type)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+    return 0;
+  if (depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return -1; /* FIXME: agregate nbobjs from different levels? */
+  return hwloc_get_nbobjs_inside_cpuset_by_depth(topology, set, depth);
+}
+
+/** \brief Return the logical index among the objects included in CPU set \p set.
+ *
+ * Consult all objects in the same level as \p obj and inside CPU set \p set
+ * in the logical order, and return the index of \p obj within them.
+ * If \p set covers the entire topology, this is the logical index of \p obj.
+ * Otherwise, this is similar to a logical index within the part of the topology
+ * defined by CPU set \p set.
+ *
+ * \note This function cannot work if obj does not have CPU sets (I/O objects).
+ */
+static __hwloc_inline int
+hwloc_get_obj_index_inside_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+				   hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline int
+hwloc_get_obj_index_inside_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+				   hwloc_obj_t obj)
+{
+  int idx = 0;
+  if (!hwloc_bitmap_isincluded(obj->cpuset, set))
+    return -1;
+  /* count how many objects are inside the cpuset on the way from us to the beginning of the level */
+  while ((obj = obj->prev_cousin) != NULL)
+    if (hwloc_bitmap_isincluded(obj->cpuset, set))
+      idx++;
+  return idx;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_find_covering Finding Objects covering at least CPU set
+ * @{
+ */
+
+/** \brief Get the child covering at least CPU set \p set.
+ *
+ * \return \c NULL if no child matches or if \p set is empty.
+ *
+ * \note This function cannot work if parent does not have a CPU set (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_child_covering_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+				hwloc_obj_t parent) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_child_covering_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+				hwloc_obj_t parent)
+{
+  hwloc_obj_t child;
+  if (hwloc_bitmap_iszero(set))
+    return NULL;
+  child = parent->first_child;
+  while (child) {
+    if (child->cpuset && hwloc_bitmap_isincluded(set, child->cpuset))
+      return child;
+    child = child->next_sibling;
+  }
+  return NULL;
+}
+
+/** \brief Get the lowest object covering at least CPU set \p set
+ *
+ * \return \c NULL if no object matches or if \p set is empty.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set)
+{
+  struct hwloc_obj *current = hwloc_get_root_obj(topology);
+  if (hwloc_bitmap_iszero(set) || !hwloc_bitmap_isincluded(set, current->cpuset))
+    return NULL;
+  while (1) {
+    hwloc_obj_t child = hwloc_get_child_covering_cpuset(topology, set, current);
+    if (!child)
+      return current;
+    current = child;
+  }
+}
+
+/** \brief Iterate through same-depth objects covering at least CPU set \p set
+ *
+ * If object \p prev is \c NULL, return the first object at depth \p
+ * depth covering at least part of CPU set \p set.  The next
+ * invocation should pass the previous return value in \p prev so as
+ * to obtain the next object covering at least another part of \p set.
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_covering_cpuset_by_depth(hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					    unsigned depth, hwloc_obj_t prev)
+{
+  hwloc_obj_t next = hwloc_get_next_obj_by_depth(topology, depth, prev);
+  if (!next)
+    return NULL;
+  while (next && !hwloc_bitmap_intersects(set, next->cpuset))
+    next = next->next_cousin;
+  return next;
+}
+
+/** \brief Iterate through same-type objects covering at least CPU set \p set
+ *
+ * If object \p prev is \c NULL, return the first object of type \p
+ * type covering at least part of CPU set \p set.  The next invocation
+ * should pass the previous return value in \p prev so as to obtain
+ * the next object of type \p type covering at least another part of
+ * \p set.
+ *
+ * If there are no or multiple depths for type \p type, \c NULL is returned.
+ * The caller may fallback to hwloc_get_next_obj_covering_cpuset_by_depth()
+ * for each depth.
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_covering_cpuset_by_type(hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					   hwloc_obj_type_t type, hwloc_obj_t prev)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return NULL;
+  return hwloc_get_next_obj_covering_cpuset_by_depth(topology, set, depth, prev);
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_ancestors Looking at Ancestor and Child Objects
+ * @{
+ *
+ * Be sure to see the figure in \ref termsanddefs that shows a
+ * complete topology tree, including depths, child/sibling/cousin
+ * relationships, and an example of an asymmetric topology where one
+ * package has fewer caches than its peers.
+ */
+
+/** \brief Returns the ancestor object of \p obj at depth \p depth. */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_depth (hwloc_topology_t topology __hwloc_attribute_unused, unsigned depth, hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_depth (hwloc_topology_t topology __hwloc_attribute_unused, unsigned depth, hwloc_obj_t obj)
+{
+  hwloc_obj_t ancestor = obj;
+  if (obj->depth < depth)
+    return NULL;
+  while (ancestor && ancestor->depth > depth)
+    ancestor = ancestor->parent;
+  return ancestor;
+}
+
+/** \brief Returns the ancestor object of \p obj with type \p type. */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_type (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_type_t type, hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_type (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_type_t type, hwloc_obj_t obj)
+{
+  hwloc_obj_t ancestor = obj->parent;
+  while (ancestor && ancestor->type != type)
+    ancestor = ancestor->parent;
+  return ancestor;
+}
+
+/** \brief Returns the common ancestor object of objects \p obj1 and \p obj2 */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_common_ancestor_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj1, hwloc_obj_t obj2) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_common_ancestor_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+  /* the loop isn't so easy since intermediate ancestors may have
+   * different depth, causing us to alternate between using obj1->parent
+   * and obj2->parent. Also, even if at some point we find ancestors of
+   * the same depth, their ancestors may have different depth again.
+   */
+  while (obj1 != obj2) {
+    while (obj1->depth > obj2->depth)
+      obj1 = obj1->parent;
+    while (obj2->depth > obj1->depth)
+      obj2 = obj2->parent;
+    if (obj1 != obj2 && obj1->depth == obj2->depth) {
+      obj1 = obj1->parent;
+      obj2 = obj2->parent;
+    }
+  }
+  return obj1;
+}
+
+/** \brief Returns true if \p obj is inside the subtree beginning with ancestor object \p subtree_root.
+ *
+ * \note This function cannot work if \p obj and \p subtree_root objects do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline int
+hwloc_obj_is_in_subtree (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj, hwloc_obj_t subtree_root) __hwloc_attribute_pure;
+static __hwloc_inline int
+hwloc_obj_is_in_subtree (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj, hwloc_obj_t subtree_root)
+{
+  return obj->cpuset && subtree_root->cpuset && hwloc_bitmap_isincluded(obj->cpuset, subtree_root->cpuset);
+}
+
+/** \brief Return the next child.
+ *
+ * Return the next child among the normal children list, then among the I/O
+ * children list, then among the Misc children list.
+ *
+ * If \p prev is \c NULL, return the first child.
+ *
+ * Return \c NULL when there is no next child.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_child (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t parent, hwloc_obj_t prev)
+{
+  hwloc_obj_t obj;
+  int state = 0;
+  if (prev) {
+    if (prev->type == HWLOC_OBJ_MISC)
+      state = 2;
+    else if (prev->type == HWLOC_OBJ_BRIDGE || prev->type == HWLOC_OBJ_PCI_DEVICE || prev->type == HWLOC_OBJ_OS_DEVICE)
+      state = 1;
+    obj = prev->next_sibling;
+  } else {
+    obj = parent->first_child;
+  }
+  if (!obj && state == 0) {
+    obj = parent->io_first_child;
+    state = 1;
+  }
+  if (!obj && state == 1) {
+    obj = parent->misc_first_child;
+    state = 2;
+  }
+  return obj;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_find_cache Looking at Cache Objects
+ * @{
+ */
+
+/** \brief Find the depth of cache objects matching cache depth and type.
+ *
+ * Return the depth of the topology level that contains cache objects
+ * whose attributes match \p cachelevel and \p cachetype. This function
+ * intends to disambiguate the case where hwloc_get_type_depth() returns
+ * \p HWLOC_TYPE_DEPTH_MULTIPLE.
+ *
+ * If no cache level matches, \p HWLOC_TYPE_DEPTH_UNKNOWN is returned.
+ *
+ * If \p cachetype is \p HWLOC_OBJ_CACHE_UNIFIED, the depth of the
+ * unique matching unified cache level is returned.
+ *
+ * If \p cachetype is \p HWLOC_OBJ_CACHE_DATA or \p HWLOC_OBJ_CACHE_INSTRUCTION,
+ * either a matching cache, or a unified cache is returned.
+ *
+ * If \p cachetype is \c -1, it is ignored and multiple levels may
+ * match. The function returns either the depth of a uniquely matching
+ * level or \p HWLOC_TYPE_DEPTH_MULTIPLE.
+ */
+static __hwloc_inline int
+hwloc_get_cache_type_depth (hwloc_topology_t topology,
+			    unsigned cachelevel, hwloc_obj_cache_type_t cachetype)
+{
+  int depth;
+  int found = HWLOC_TYPE_DEPTH_UNKNOWN;
+  for (depth=0; ; depth++) {
+    hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, 0);
+    if (!obj)
+      break;
+    if (obj->type != HWLOC_OBJ_CACHE || obj->attr->cache.depth != cachelevel)
+      /* doesn't match, try next depth */
+      continue;
+    if (cachetype == (hwloc_obj_cache_type_t) -1) {
+      if (found != HWLOC_TYPE_DEPTH_UNKNOWN) {
+	/* second match, return MULTIPLE */
+        return HWLOC_TYPE_DEPTH_MULTIPLE;
+      }
+      /* first match, mark it as found */
+      found = depth;
+      continue;
+    }
+    if (obj->attr->cache.type == cachetype || obj->attr->cache.type == HWLOC_OBJ_CACHE_UNIFIED)
+      /* exact match (either unified is alone, or we match instruction or data), return immediately */
+      return depth;
+  }
+  /* went to the bottom, return what we found */
+  return found;
+}
+
+/** \brief Get the first cache covering a cpuset \p set
+ *
+ * \return \c NULL if no cache matches.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_cache_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_cache_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set)
+{
+  hwloc_obj_t current = hwloc_get_obj_covering_cpuset(topology, set);
+  while (current) {
+    if (current->type == HWLOC_OBJ_CACHE)
+      return current;
+    current = current->parent;
+  }
+  return NULL;
+}
+
+/** \brief Get the first cache shared between an object and somebody else.
+ *
+ * \return \c NULL if no cache matches or if an invalid object is given.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_shared_cache_covering_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_shared_cache_covering_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj)
+{
+  hwloc_obj_t current = obj->parent;
+  if (!obj->cpuset)
+    return NULL;
+  while (current) {
+    if (!hwloc_bitmap_isequal(current->cpuset, obj->cpuset)
+        && current->type == HWLOC_OBJ_CACHE)
+      return current;
+    current = current->parent;
+  }
+  return NULL;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_find_misc Finding objects, miscellaneous helpers
+ * @{
+ *
+ * Be sure to see the figure in \ref termsanddefs that shows a
+ * complete topology tree, including depths, child/sibling/cousin
+ * relationships, and an example of an asymmetric topology where one
+ * package has fewer caches than its peers.
+ */
+
+/** \brief Returns the object of type ::HWLOC_OBJ_PU with \p os_index.
+ *
+ * This function is useful for converting a CPU set into the PU
+ * objects it contains.
+ * When retrieving the current binding (e.g. with hwloc_get_cpubind()),
+ * one may iterate over the bits of the resulting CPU set with
+ * hwloc_bitmap_foreach_begin(), and find the corresponding PUs
+ * with this function.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pu_obj_by_os_index(hwloc_topology_t topology, unsigned os_index) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pu_obj_by_os_index(hwloc_topology_t topology, unsigned os_index)
+{
+  hwloc_obj_t obj = NULL;
+  while ((obj = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PU, obj)) != NULL)
+    if (obj->os_index == os_index)
+      return obj;
+  return NULL;
+}
+
+/** \brief Returns the object of type ::HWLOC_OBJ_NUMANODE with \p os_index.
+ *
+ * This function is useful for converting a nodeset into the NUMA node
+ * objects it contains.
+ * When retrieving the current binding (e.g. with hwloc_get_membind_nodeset()),
+ * one may iterate over the bits of the resulting nodeset with
+ * hwloc_bitmap_foreach_begin(), and find the corresponding NUMA nodes
+ * with this function.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_numanode_obj_by_os_index(hwloc_topology_t topology, unsigned os_index) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_numanode_obj_by_os_index(hwloc_topology_t topology, unsigned os_index)
+{
+  hwloc_obj_t obj = NULL;
+  while ((obj = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, obj)) != NULL)
+    if (obj->os_index == os_index)
+      return obj;
+  return NULL;
+}
+
+/** \brief Do a depth-first traversal of the topology to find and sort
+ *
+ * all objects that are at the same depth as \p src.
+ * Report in \p objs up to \p max physically closest ones to \p src.
+ *
+ * \return the number of objects returned in \p objs.
+ *
+ * \return 0 if \p src is an I/O object.
+ *
+ * \note This function requires the \p src object to have a CPU set.
+ */
+/* TODO: rather provide an iterator? Provide a way to know how much should be allocated? By returning the total number of objects instead? */
+HWLOC_DECLSPEC unsigned hwloc_get_closest_objs (hwloc_topology_t topology, hwloc_obj_t src, hwloc_obj_t * __hwloc_restrict objs, unsigned max);
+
+/** \brief Find an object below another object, both specified by types and indexes.
+ *
+ * Start from the top system object and find object of type \p type1
+ * and logical index \p idx1.  Then look below this object and find another
+ * object of type \p type2 and logical index \p idx2.  Indexes are specified
+ * within the parent, not within the entire system.
+ *
+ * For instance, if type1 is PACKAGE, idx1 is 2, type2 is CORE and idx2
+ * is 3, return the fourth core object below the third package.
+ *
+ * \note This function requires these objects to have a CPU set.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_below_by_type (hwloc_topology_t topology,
+			     hwloc_obj_type_t type1, unsigned idx1,
+			     hwloc_obj_type_t type2, unsigned idx2) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_below_by_type (hwloc_topology_t topology,
+			     hwloc_obj_type_t type1, unsigned idx1,
+			     hwloc_obj_type_t type2, unsigned idx2)
+{
+  hwloc_obj_t obj;
+  obj = hwloc_get_obj_by_type (topology, type1, idx1);
+  if (!obj)
+    return NULL;
+  return hwloc_get_obj_inside_cpuset_by_type(topology, obj->cpuset, type2, idx2);
+}
+
+/** \brief Find an object below a chain of objects specified by types and indexes.
+ *
+ * This is a generalized version of hwloc_get_obj_below_by_type().
+ *
+ * Arrays \p typev and \p idxv must contain \p nr types and indexes.
+ *
+ * Start from the top system object and walk the arrays \p typev and \p idxv.
+ * For each type and logical index couple in the arrays, look under the previously found
+ * object to find the index-th object of the given type.
+ * Indexes are specified within the parent, not within the entire system.
+ *
+ * For instance, if nr is 3, typev contains NODE, PACKAGE and CORE,
+ * and idxv contains 0, 1 and 2, return the third core object below
+ * the second package below the first NUMA node.
+ *
+ * \note This function requires all these objects and the root object
+ * to have a CPU set.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_below_array_by_type (hwloc_topology_t topology, int nr, hwloc_obj_type_t *typev, unsigned *idxv) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_below_array_by_type (hwloc_topology_t topology, int nr, hwloc_obj_type_t *typev, unsigned *idxv)
+{
+  hwloc_obj_t obj = hwloc_get_root_obj(topology);
+  int i;
+  for(i=0; i<nr; i++) {
+    if (!obj)
+      return NULL;
+    obj = hwloc_get_obj_inside_cpuset_by_type(topology, obj->cpuset, typev[i], idxv[i]);
+  }
+  return obj;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_distribute Distributing items over a topology
+ * @{
+ */
+
+/** \brief Flags to be given to hwloc_distrib().
+ */
+enum hwloc_distrib_flags_e {
+  /** \brief Distrib in reverse order, starting from the last objects.
+   * \hideinitializer
+   */
+  HWLOC_DISTRIB_FLAG_REVERSE = (1UL<<0)
+};
+
+/** \brief Distribute \p n items over the topology under \p roots
+ *
+ * Array \p set will be filled with \p n cpusets recursively distributed
+ * linearly over the topology under objects \p roots, down to depth \p until
+ * (which can be INT_MAX to distribute down to the finest level).
+ *
+ * \p n_roots is usually 1 and \p roots only contains the topology root object
+ * so as to distribute over the entire topology.
+ *
+ * This is typically useful when an application wants to distribute \p n
+ * threads over a machine, giving each of them as much private cache as
+ * possible and keeping them locally in number order.
+ *
+ * The caller may typically want to also call hwloc_bitmap_singlify()
+ * before binding a thread so that it does not move at all.
+ *
+ * \p flags should be 0 or an OR'ed set of ::hwloc_distrib_flags_e.
+ *
+ * \note This function requires the \p roots objects to have a CPU set.
+ *
+ * \note This function replaces the now deprecated hwloc_distribute()
+ * and hwloc_distributev() functions.
+ */
+static __hwloc_inline int
+hwloc_distrib(hwloc_topology_t topology,
+	      hwloc_obj_t *roots, unsigned n_roots,
+	      hwloc_cpuset_t *set,
+	      unsigned n,
+	      unsigned until, unsigned long flags)
+{
+  unsigned i;
+  unsigned tot_weight;
+  unsigned given, givenweight;
+  hwloc_cpuset_t *cpusetp = set;
+
+  if (flags & ~HWLOC_DISTRIB_FLAG_REVERSE) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  tot_weight = 0;
+  for (i = 0; i < n_roots; i++)
+    tot_weight += hwloc_bitmap_weight(roots[i]->cpuset);
+
+  for (i = 0, given = 0, givenweight = 0; i < n_roots; i++) {
+    unsigned chunk, weight;
+    hwloc_obj_t root = roots[flags & HWLOC_DISTRIB_FLAG_REVERSE ? n_roots-1-i : i];
+    hwloc_cpuset_t cpuset = root->cpuset;
+    weight = hwloc_bitmap_weight(cpuset);
+    if (!weight)
+      continue;
+    /* Give to root a chunk proportional to its weight.
+     * If previous chunks got rounded-up, we may get a bit less. */
+    chunk = (( (givenweight+weight) * n  + tot_weight-1) / tot_weight)
+          - ((  givenweight         * n  + tot_weight-1) / tot_weight);
+    if (!root->arity || chunk <= 1 || root->depth >= until) {
+      /* We can't split any more, put everything there.  */
+      if (chunk) {
+	/* Fill cpusets with ours */
+	unsigned j;
+	for (j=0; j < chunk; j++)
+	  cpusetp[j] = hwloc_bitmap_dup(cpuset);
+      } else {
+	/* We got no chunk, just merge our cpuset to a previous one
+	 * (the first chunk cannot be empty)
+	 * so that this root doesn't get ignored.
+	 */
+	assert(given);
+	hwloc_bitmap_or(cpusetp[-1], cpusetp[-1], cpuset);
+      }
+    } else {
+      /* Still more to distribute, recurse into children */
+      hwloc_distrib(topology, root->children, root->arity, cpusetp, chunk, until, flags);
+    }
+    cpusetp += chunk;
+    given += chunk;
+    givenweight += weight;
+  }
+
+  return 0;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_topology_sets CPU and node sets of entire topologies
+ * @{
+ */
+/** \brief Get complete CPU set
+ *
+ * \return the complete CPU set of logical processors of the system.
+ *
+ * \note The returned cpuset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+static __hwloc_inline hwloc_const_cpuset_t
+hwloc_topology_get_complete_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_const_cpuset_t
+hwloc_topology_get_complete_cpuset(hwloc_topology_t topology)
+{
+  return hwloc_get_root_obj(topology)->complete_cpuset;
+}
+
+/** \brief Get topology CPU set
+ *
+ * \return the CPU set of logical processors of the system for which hwloc
+ * provides topology information. This is equivalent to the cpuset of the
+ * system object.
+ *
+ * \note The returned cpuset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+static __hwloc_inline hwloc_const_cpuset_t
+hwloc_topology_get_topology_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_const_cpuset_t
+hwloc_topology_get_topology_cpuset(hwloc_topology_t topology)
+{
+  return hwloc_get_root_obj(topology)->cpuset;
+}
+
+/** \brief Get allowed CPU set
+ *
+ * \return the CPU set of allowed logical processors of the system.
+ *
+ * \note The returned cpuset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+static __hwloc_inline hwloc_const_cpuset_t
+hwloc_topology_get_allowed_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_const_cpuset_t
+hwloc_topology_get_allowed_cpuset(hwloc_topology_t topology)
+{
+  return hwloc_get_root_obj(topology)->allowed_cpuset;
+}
+
+/** \brief Get complete node set
+ *
+ * \return the complete node set of memory of the system.
+ *
+ * \note The returned nodeset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+static __hwloc_inline hwloc_const_nodeset_t
+hwloc_topology_get_complete_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_const_nodeset_t
+hwloc_topology_get_complete_nodeset(hwloc_topology_t topology)
+{
+  return hwloc_get_root_obj(topology)->complete_nodeset;
+}
+
+/** \brief Get topology node set
+ *
+ * \return the node set of memory of the system for which hwloc
+ * provides topology information. This is equivalent to the nodeset of the
+ * system object.
+ *
+ * \note The returned nodeset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+static __hwloc_inline hwloc_const_nodeset_t
+hwloc_topology_get_topology_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_const_nodeset_t
+hwloc_topology_get_topology_nodeset(hwloc_topology_t topology)
+{
+  return hwloc_get_root_obj(topology)->nodeset;
+}
+
+/** \brief Get allowed node set
+ *
+ * \return the node set of allowed memory of the system.
+ *
+ * \note The returned nodeset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+static __hwloc_inline hwloc_const_nodeset_t
+hwloc_topology_get_allowed_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_const_nodeset_t
+hwloc_topology_get_allowed_nodeset(hwloc_topology_t topology)
+{
+  return hwloc_get_root_obj(topology)->allowed_nodeset;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_nodeset_convert Converting between CPU sets and node sets
+ *
+ * There are two semantics for converting cpusets to nodesets depending on how
+ * non-NUMA machines are handled.
+ *
+ * When manipulating nodesets for memory binding, non-NUMA machines should be
+ * considered as having a single NUMA node. The standard conversion routines
+ * below should be used so that marking the first bit of the nodeset means
+ * that memory should be bound to a non-NUMA whole machine.
+ *
+ * When manipulating nodesets as an actual list of NUMA nodes without any
+ * need to handle memory binding on non-NUMA machines, the strict conversion
+ * routines may be used instead.
+ * @{
+ */
+
+/** \brief Convert a CPU set into a NUMA node set and handle non-NUMA cases
+ *
+ * If some NUMA nodes have no CPUs at all, this function never sets their
+ * indexes in the output node set, even if a full CPU set is given in input.
+ *
+ * If the topology contains no NUMA nodes, the machine is considered
+ * as a single memory node: \p nodeset is emptied when \p _cpuset is empty
+ * and entirely filled otherwise.
+ */
+static __hwloc_inline void
+hwloc_cpuset_to_nodeset(hwloc_topology_t topology, hwloc_const_cpuset_t _cpuset, hwloc_nodeset_t nodeset)
+{
+	int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+	hwloc_obj_t obj = NULL;
+	hwloc_bitmap_zero(nodeset);
+	if (depth == HWLOC_TYPE_DEPTH_UNKNOWN && !hwloc_bitmap_iszero(_cpuset))
+		hwloc_bitmap_fill(nodeset); /* no NUMA level: a non-empty cpuset maps to the whole single-node machine */
+	while (depth != HWLOC_TYPE_DEPTH_UNKNOWN && (obj = hwloc_get_next_obj_covering_cpuset_by_depth(topology, _cpuset, depth, obj)) != NULL)
+		hwloc_bitmap_set(nodeset, obj->os_index);
+}
+
+/** \brief Convert a CPU set into a NUMA node set without handling non-NUMA cases
+ *
+ * This is the strict variant of ::hwloc_cpuset_to_nodeset. It does not fix
+ * non-NUMA cases. If the topology contains some NUMA nodes, behave exactly
+ * the same. However, if the topology contains no NUMA nodes, return an empty
+ * nodeset.
+ */
+static __hwloc_inline void
+hwloc_cpuset_to_nodeset_strict(struct hwloc_topology *topology, hwloc_const_cpuset_t _cpuset, hwloc_nodeset_t nodeset)
+{
+	int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+	hwloc_obj_t obj = NULL;
+	hwloc_bitmap_zero(nodeset);
+	/* without any NUMA level the nodeset simply stays empty, as the strict contract requires */
+	while (depth != HWLOC_TYPE_DEPTH_UNKNOWN && (obj = hwloc_get_next_obj_covering_cpuset_by_depth(topology, _cpuset, depth, obj)) != NULL)
+		hwloc_bitmap_set(nodeset, obj->os_index);
+}
+
+/** \brief Convert a NUMA node set into a CPU set and handle non-NUMA cases
+ *
+ * If the topology contains no NUMA nodes, the machine is considered
+ * as a single memory node, and the following behavior is used:
+ * If \p nodeset is empty, \p _cpuset will be emptied as well.
+ * Otherwise \p _cpuset will be entirely filled.
+ * This is useful for manipulating memory binding sets.
+ */
+static __hwloc_inline void
+hwloc_cpuset_from_nodeset(hwloc_topology_t topology, hwloc_cpuset_t _cpuset, hwloc_const_nodeset_t nodeset)
+{
+	int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+	hwloc_obj_t obj = NULL;
+	hwloc_bitmap_zero(_cpuset);
+	if (depth == HWLOC_TYPE_DEPTH_UNKNOWN && !hwloc_bitmap_iszero(nodeset))
+		hwloc_bitmap_fill(_cpuset); /* no NUMA level: a non-empty nodeset maps to the whole single-node machine */
+	while (depth != HWLOC_TYPE_DEPTH_UNKNOWN && (obj = hwloc_get_next_obj_by_depth(topology, depth, obj)) != NULL)
+		if (hwloc_bitmap_isset(nodeset, obj->os_index))
+			/* no need to check obj->cpuset because objects in levels always have a cpuset */
+			hwloc_bitmap_or(_cpuset, _cpuset, obj->cpuset);
+}
+
+/** \brief Convert a NUMA node set into a CPU set without handling non-NUMA cases
+ *
+ * This is the strict variant of ::hwloc_cpuset_from_nodeset. It does not fix
+ * non-NUMA cases. If the topology contains some NUMA nodes, behave exactly
+ * the same. However, if the topology contains no NUMA nodes, return an empty
+ * cpuset.
+ */
+static __hwloc_inline void
+hwloc_cpuset_from_nodeset_strict(struct hwloc_topology *topology, hwloc_cpuset_t _cpuset, hwloc_const_nodeset_t nodeset)
+{
+	int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+	hwloc_obj_t obj = NULL;
+	hwloc_bitmap_zero(_cpuset);
+	/* without any NUMA level the cpuset simply stays empty, as the strict contract requires */
+	while (depth != HWLOC_TYPE_DEPTH_UNKNOWN && (obj = hwloc_get_next_obj_by_depth(topology, depth, obj)) != NULL)
+		if (hwloc_bitmap_isset(nodeset, obj->os_index))
+			/* no need to check obj->cpuset because objects in levels always have a cpuset */
+			hwloc_bitmap_or(_cpuset, _cpuset, obj->cpuset);
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_distances Manipulating Distances
+ * @{
+ */
+
+/** \brief Get the distances between all objects at the given depth.
+ *
+ * Look through the distance matrices attached to the root object and pick
+ * the first one whose relative depth is \p depth.
+ *
+ * \return a distances structure containing a matrix with all distances
+ * between all objects at the given depth, or \c NULL if no such matrix exists.
+ * Slot i+nbobjs*j contains the distance from the object of logical index i
+ * to the object of logical index j.
+ *
+ * \note This function only returns matrices covering the whole topology,
+ * without any unknown distance value. Matrices of lower objects are not
+ * reported here since they cover only part of the machine.
+ *
+ * The returned structure belongs to the hwloc library. The caller should
+ * not modify or free it.
+ */
+static __hwloc_inline const struct hwloc_distances_s *
+hwloc_get_whole_distance_matrix_by_depth(hwloc_topology_t topology, unsigned depth)
+{
+  hwloc_obj_t top = hwloc_get_root_obj(topology);
+  unsigned idx = 0;
+  while (idx < top->distances_count) {
+    if (top->distances[idx]->relative_depth == depth)
+      return top->distances[idx];
+    idx++;
+  }
+  return NULL;
+}
+
+/** \brief Get the distances between all objects of a given type.
+ *
+ * The depth of \p type is looked up first; a negative depth yields
+ * \c NULL, otherwise the lookup is delegated to
+ * hwloc_get_whole_distance_matrix_by_depth().
+ *
+ * \return a distances structure containing a matrix with all distances
+ * between all objects of the given type, or \c NULL if no such distance
+ * matrix exists.
+ *
+ * Slot i+nbobjs*j contains the distance from the object of logical index i
+ * to the object of logical index j.
+ *
+ * \note This function only returns matrices covering the whole topology,
+ * without any unknown distance value. Matrices of lower objects are not
+ * reported here since they cover only part of the machine.
+ *
+ * The returned structure belongs to the hwloc library. The caller should
+ * not modify or free it.
+ */
+static __hwloc_inline const struct hwloc_distances_s *
+hwloc_get_whole_distance_matrix_by_type(hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+  int type_depth = hwloc_get_type_depth(topology, type);
+  /* a negative depth cannot designate a level, so there is nothing to look up */
+  return type_depth < 0 ? NULL : hwloc_get_whole_distance_matrix_by_depth(topology, type_depth);
+}
+
+/** \brief Get distances for the given depth and covering some objects
+ *
+ * Return a distance matrix that describes depth \p depth and covers at
+ * least object \p obj and all its children.
+ *
+ * The search starts at \p obj and climbs through its parents; at each
+ * object, the first matrix with a non-zero \c nbobjs whose relative depth
+ * equals \p depth minus the object depth is returned. \c NULL is returned
+ * if \p obj has no cpuset or if no such matrix is found.
+ *
+ * When looking for the distance between some objects, a common ancestor should
+ * be passed in \p obj.
+ * \p firstp is set to logical index of the first object described by the matrix.
+ * The returned structure belongs to the hwloc library. The caller should
+ * not modify or free it.
+ */
+static __hwloc_inline const struct hwloc_distances_s *
+hwloc_get_distance_matrix_covering_obj_by_depth(hwloc_topology_t topology,
+						hwloc_obj_t obj, unsigned depth,
+						unsigned *firstp)
+{
+  hwloc_obj_t cur;
+  unsigned k;
+  if (!obj->cpuset)
+    return NULL;
+  for (cur = obj; cur != NULL; cur = cur->parent)
+    for (k = 0; k < cur->distances_count; k++)
+      if (cur->distances[k]->relative_depth == depth - cur->depth && cur->distances[k]->nbobjs) {
+	*firstp = hwloc_get_next_obj_inside_cpuset_by_depth(topology, cur->cpuset, depth, NULL)->logical_index;
+	return cur->distances[k];
+      }
+  return NULL;
+}
+
+/** \brief Get the latency in both directions between two objects.
+ *
+ * Both objects must be at the same depth (otherwise errno is set to EINVAL).
+ * Ancestor objects are looked at from the bottom to the top until one of them
+ * contains a distance matrix that matches the objects exactly.
+ *
+ * \p latency gets the value from object \p obj1 to \p obj2, while
+ * \p reverse_latency gets the reverse-direction value, which
+ * may be different on some architectures.
+ * \return 0 on success, -1 (errno set to ENOSYS) if no ancestor contains a matching latency matrix.
+ */
+static __hwloc_inline int
+hwloc_get_latency(hwloc_topology_t topology,
+		   hwloc_obj_t obj1, hwloc_obj_t obj2,
+		   float *latency, float *reverse_latency)
+{
+  const struct hwloc_distances_s *matrix;
+  const float *values;
+  hwloc_obj_t common;
+  unsigned base, width, from, to;
+  /* latencies are only looked up between objects of the same depth */
+  if (obj1->depth != obj2->depth) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  common = hwloc_get_common_ancestor_obj(topology, obj1, obj2);
+  matrix = hwloc_get_distance_matrix_covering_obj_by_depth(topology, common, obj1->depth, &base);
+  if (!matrix || !matrix->latency) {
+    errno = ENOSYS;
+    return -1;
+  }
+  values = matrix->latency;
+  width = matrix->nbobjs;
+  from = obj1->logical_index - base; /* indexes are relative to the first object of the matrix */
+  to = obj2->logical_index - base;
+  *latency = values[from*width+to];
+  *reverse_latency = values[to*width+from];
+  return 0;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_advanced_io Finding I/O objects
+ * @{
+ */
+
+/** \brief Get the first non-I/O ancestor object.
+ *
+ * Starting from the I/O object \p ioobj itself, climb the parent links until
+ * an object that has a cpuset is met and return it (\c NULL if there is none).
+ * This regular object may then be used for binding because its locality is the same as \p ioobj.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_non_io_ancestor_obj(hwloc_topology_t topology __hwloc_attribute_unused,
+			      hwloc_obj_t ioobj)
+{
+  hwloc_obj_t cur;
+  for (cur = ioobj; cur != NULL; cur = cur->parent)
+    if (cur->cpuset)
+      break;
+  return cur;
+}
+
+/** \brief Get the next PCI device in the system.
+ * \return the PCI device that follows \p prev, or the first PCI device if \p prev is \c NULL.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_pcidev(hwloc_topology_t topology, hwloc_obj_t prev)
+{
+  hwloc_obj_t next_dev = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PCI_DEVICE, prev);
+  return next_dev;
+}
+
+/** \brief Find the PCI device object matching the PCI bus id
+ * given domain, bus device and function PCI bus id.
+ * \return the first PCI device whose four fields all match, or \c NULL if none does.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pcidev_by_busid(hwloc_topology_t topology,
+			  unsigned domain, unsigned bus, unsigned dev, unsigned func)
+{
+  hwloc_obj_t pci;
+  for (pci = hwloc_get_next_pcidev(topology, NULL); pci != NULL; pci = hwloc_get_next_pcidev(topology, pci)) {
+    if (pci->attr->pcidev.domain != domain || pci->attr->pcidev.bus != bus)
+      continue;
+    if (pci->attr->pcidev.dev == dev && pci->attr->pcidev.func == func)
+      break;
+  }
+  return pci;
+}
+
+/** \brief Find the PCI device object matching the PCI bus id
+ * given as a string xxxx:yy:zz.t or yy:zz.t (hexadecimal fields; the domain is 0 in the short form).
+ * \return the matching object or \c NULL; errno is set to EINVAL when \p busid matches neither form.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pcidev_by_busidstring(hwloc_topology_t topology, const char *busid)
+{
+  unsigned domain, bus, dev, func;
+
+  if (sscanf(busid, "%x:%x.%x", &bus, &dev, &func) == 3) {
+    domain = 0; /* short form: default domain */
+  } else if (sscanf(busid, "%x:%x:%x.%x", &domain, &bus, &dev, &func) != 4) {
+    errno = EINVAL;
+    return NULL;
+  }
+  return hwloc_get_pcidev_by_busid(topology, domain, bus, dev, func);
+}
+
+/** \brief Get the next OS device in the system.
+ * \return the OS device that follows \p prev, or the first OS device if \p prev is \c NULL.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_osdev(hwloc_topology_t topology, hwloc_obj_t prev)
+{
+  hwloc_obj_t next_dev = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_OS_DEVICE, prev);
+  return next_dev;
+}
+
+/** \brief Get the next bridge in the system.
+ * \return the bridge that follows \p prev, or the first bridge if \p prev is \c NULL.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_bridge(hwloc_topology_t topology, hwloc_obj_t prev)
+{
+  hwloc_obj_t next_bridge = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_BRIDGE, prev);
+  return next_bridge;
+}
+
+/* \brief Checks whether a given bridge covers a given PCI bus: it must be a bridge with a PCI downstream
+ * side in domain \p domain, and \p bus must lie within its [secondary_bus, subordinate_bus] range. */
+static __hwloc_inline int
+hwloc_bridge_covers_pcibus(hwloc_obj_t bridge,
+			   unsigned domain, unsigned bus)
+{
+  if (bridge->type != HWLOC_OBJ_BRIDGE
+      || bridge->attr->bridge.downstream_type != HWLOC_OBJ_BRIDGE_PCI
+      || bridge->attr->bridge.downstream.pci.domain != domain)
+    return 0;
+  return bridge->attr->bridge.downstream.pci.secondary_bus <= bus && bridge->attr->bridge.downstream.pci.subordinate_bus >= bus;
+}
+
+/** \brief Find the hostbridge that covers the given PCI bus.
+ *
+ * The first bridge covering bus \p bus of domain \p domain is returned (\c NULL if there is none).
+ * This is useful for finding the locality of a bus because it is the hostbridge parent cpuset.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_hostbridge_by_pcibus(hwloc_topology_t topology,
+			       unsigned domain, unsigned bus)
+{
+  hwloc_obj_t br = NULL;
+  do {
+    br = hwloc_get_next_bridge(topology, br);
+  } while (br != NULL && !hwloc_bridge_covers_pcibus(br, domain, bus));
+  if (br != NULL) {
+    /* found bridge covering this pcibus, make sure it's a hostbridge */
+    assert(br->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST);
+    assert(br->parent->type != HWLOC_OBJ_BRIDGE);
+    assert(br->parent->cpuset);
+  }
+  return br;
+}
+
+/** @} */
+
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_HELPER_H */
diff --git a/ext/hwloc/include/hwloc/inlines.h b/ext/hwloc/include/hwloc/inlines.h
new file mode 100644
index 0000000..7281750
--- /dev/null
+++ b/ext/hwloc/include/hwloc/inlines.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2013 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2010 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/**
+ * This file contains the inline code of functions declared in hwloc.h
+ */
+
+#ifndef HWLOC_INLINES_H
+#define HWLOC_INLINES_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+#include <stdlib.h>
+#include <errno.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Depth of \p type when that type has a known depth. Otherwise scan levels from the PU level toward depth 0 until
+ * hwloc_compare_types() of the level type against \p type is negative, and return that level's depth plus one. */
+static __hwloc_inline int
+hwloc_get_type_or_below_depth (hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+  int level = hwloc_get_type_depth(topology, type);
+  if (level != HWLOC_TYPE_DEPTH_UNKNOWN)
+    return level;
+
+  /* find the highest existing level with type order >= */
+  level = hwloc_get_type_depth(topology, HWLOC_OBJ_PU);
+  while (hwloc_compare_types(hwloc_get_depth_type(topology, level), type) >= 0)
+    level--;
+  /* The loop is expected to stop, as there is always a SYSTEM level with lower order and known depth. */
+  return level+1;
+}
+
+/* Depth of \p type when that type has a known depth. Otherwise scan levels from depth 0 with increasing depth until
+ * hwloc_compare_types() of the level type against \p type is positive, and return that level's depth minus one. */
+static __hwloc_inline int
+hwloc_get_type_or_above_depth (hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+  int level = hwloc_get_type_depth(topology, type);
+  if (level != HWLOC_TYPE_DEPTH_UNKNOWN)
+    return level;
+
+  /* find the lowest existing level with type order <= */
+  level = 0;
+  while (hwloc_compare_types(hwloc_get_depth_type(topology, level), type) <= 0)
+    level++;
+  /* The loop is expected to stop, as there is always a PU level with higher order and known depth. */
+  return level-1;
+}
+
+/* Number of objects of type \p type: 0 when its depth is HWLOC_TYPE_DEPTH_UNKNOWN, -1 when it is
+ * HWLOC_TYPE_DEPTH_MULTIPLE, otherwise the result of hwloc_get_nbobjs_by_depth() at that depth. */
+static __hwloc_inline int
+hwloc_get_nbobjs_by_type (hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+  int level = hwloc_get_type_depth(topology, type);
+  if (level == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return -1; /* FIXME: aggregate nbobjs from different levels? */
+  return level == HWLOC_TYPE_DEPTH_UNKNOWN ? 0 : hwloc_get_nbobjs_by_depth(topology, level);
+}
+
+/* Result of hwloc_get_obj_by_depth() for index \p idx at the depth of \p type, or NULL when
+ * that depth is HWLOC_TYPE_DEPTH_UNKNOWN or HWLOC_TYPE_DEPTH_MULTIPLE. */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type, unsigned idx)
+{
+  int level = hwloc_get_type_depth(topology, type);
+  if (level == HWLOC_TYPE_DEPTH_UNKNOWN || level == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return NULL;
+  return hwloc_get_obj_by_depth(topology, level, idx);
+}
+
+/* Iterate over the objects at \p depth: object 0 of that depth when \p prev is NULL, else the
+ * next cousin of \p prev (NULL when \p prev is not at \p depth). */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_depth (hwloc_topology_t topology, unsigned depth, hwloc_obj_t prev)
+{
+  if (prev == NULL)
+    return hwloc_get_obj_by_depth (topology, depth, 0);
+  return prev->depth == depth ? prev->next_cousin : NULL;
+}
+
+/* Same iteration as hwloc_get_next_obj_by_depth() at the depth of \p type; NULL when that depth is unknown or multiple. */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type, hwloc_obj_t prev)
+{
+  int level = hwloc_get_type_depth(topology, type);
+  if (level != HWLOC_TYPE_DEPTH_UNKNOWN && level != HWLOC_TYPE_DEPTH_MULTIPLE)
+    return hwloc_get_next_obj_by_depth (topology, level, prev);
+  return NULL;
+}
+
+static __hwloc_inline hwloc_obj_t hwloc_get_root_obj (hwloc_topology_t topology)
+{
+  const unsigned root_depth = 0, first_idx = 0; /* the root is object 0 of depth 0 */
+  return hwloc_get_obj_by_depth (topology, root_depth, first_idx);
+}
+
+/* Value of the first info entry of \p obj whose name equals \p name (compared with strcmp), or NULL if none. */
+static __hwloc_inline const char * hwloc_obj_get_info_by_name(hwloc_obj_t obj, const char *name)
+{
+  unsigned k = 0;
+  for( ; k<obj->infos_count; k++)
+    if (strcmp(obj->infos[k].name, name) == 0)
+      return obj->infos[k].value;
+  return NULL;
+}
+
+/* Try hwloc_alloc_membind_nodeset() first; if it fails, call hwloc_set_membind_nodeset() (result ignored) and use hwloc_alloc(). */
+static __hwloc_inline void *
+hwloc_alloc_membind_policy_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  void *buf = hwloc_alloc_membind_nodeset(topology, len, nodeset, policy, flags);
+  if (!buf) {
+    hwloc_set_membind_nodeset(topology, nodeset, policy, flags);
+    buf = hwloc_alloc(topology, len);
+    if (buf != NULL && policy != HWLOC_MEMBIND_FIRSTTOUCH)
+      memset(buf, 0, len); /* Enforce the binding by touching the data */
+  }
+  return buf;
+}
+
+/* Try hwloc_alloc_membind() first; if it fails, call hwloc_set_membind() (result ignored) and use hwloc_alloc(). */
+static __hwloc_inline void *
+hwloc_alloc_membind_policy(hwloc_topology_t topology, size_t len, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+{
+  void *buf = hwloc_alloc_membind(topology, len, set, policy, flags);
+  if (!buf) {
+    hwloc_set_membind(topology, set, policy, flags);
+    buf = hwloc_alloc(topology, len);
+    if (buf != NULL && policy != HWLOC_MEMBIND_FIRSTTOUCH)
+      memset(buf, 0, len); /* Enforce the binding by touching the data */
+  }
+  return buf;
+}
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_INLINES_H */
diff --git a/ext/hwloc/include/hwloc/intel-mic.h b/ext/hwloc/include/hwloc/intel-mic.h
new file mode 100644
index 0000000..d58237b
--- /dev/null
+++ b/ext/hwloc/include/hwloc/intel-mic.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright © 2013 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and Intel Xeon Phi (MIC).
+ *
+ * Applications that use both hwloc and Intel Xeon Phi (MIC) may want to
+ * include this file so as to get topology information for MIC devices.
+ */
+
+#ifndef HWLOC_INTEL_MIC_H
+#define HWLOC_INTEL_MIC_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc/helper.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#include <dirent.h>
+#include <string.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_intel_mic Interoperability with Intel Xeon Phi (MIC)
+ *
+ * This interface offers ways to retrieve topology information about
+ * Intel Xeon Phi (MIC) devices.
+ *
+ * @{
+ */
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to MIC device whose index is \p idx.
+ *
+ * Topology \p topology and device index \p idx must match the local machine.
+ * I/O devices detection is not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_intel_mic_get_device_osdev_by_index().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux; other systems will simply get a full cpuset. On Linux, the complete
+ * cpuset is also stored when sysfs shows no pci_* entry or an empty cpumap.
+ * \return 0 on success, -1 if the topology is not this system (errno EINVAL) or a sysfs path cannot be opened.
+ */
+static __hwloc_inline int
+hwloc_intel_mic_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+				  int idx __hwloc_attribute_unused,
+				  hwloc_cpuset_t set)
+{
+#ifdef HWLOC_LINUX_SYS
+	/* If we're on Linux, use the sysfs mechanism to get the local cpus */
+#define HWLOC_INTEL_MIC_DEVICE_SYSFS_PATH_MAX 128
+	char path[HWLOC_INTEL_MIC_DEVICE_SYSFS_PATH_MAX];
+	DIR *sysdir = NULL;
+	FILE *sysfile = NULL;
+	struct dirent *dirent;
+	unsigned pcibus, pcidev, pcifunc;
+	int found = 0; /* becomes 1 once the local_cpus file of a pci_* entry has been parsed */
+
+	if (!hwloc_topology_is_thissystem(topology)) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	sprintf(path, "/sys/class/mic/mic%d", idx);
+	sysdir = opendir(path);
+	if (!sysdir)
+		return -1;
+
+	while (!found && (dirent = readdir(sysdir)) != NULL) {
+		if (sscanf(dirent->d_name, "pci_%02x:%02x.%02x", &pcibus, &pcidev, &pcifunc) != 3)
+			continue;
+		sprintf(path, "/sys/class/mic/mic%d/pci_%02x:%02x.%02x/local_cpus", idx, pcibus, pcidev, pcifunc);
+		sysfile = fopen(path, "r");
+		if (!sysfile) {
+			closedir(sysdir);
+			return -1;
+		}
+		hwloc_linux_parse_cpumap_file(sysfile, set);
+		fclose(sysfile);
+		found = 1;
+	}
+	closedir(sysdir);
+
+	/* no pci_* entry or empty map: never return success with \p set left unset, use the complete cpuset */
+	if (!found || hwloc_bitmap_iszero(set))
+		hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#else
+	/* Non-Linux systems simply get a full cpuset */
+	hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+	return 0;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * MIC device for the given index.
+ *
+ * Walk the OS devices of \p topology and return the first co-processor
+ * device whose name starts with "mic" and whose number, parsed by atoi()
+ * right after that prefix, equals \p idx. Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_intel_mic_get_device_osdev_by_index(hwloc_topology_t topology,
+					  unsigned idx)
+{
+	hwloc_obj_t dev;
+	for (dev = hwloc_get_next_osdev(topology, NULL); dev != NULL; dev = hwloc_get_next_osdev(topology, dev)) {
+		if (dev->attr->osdev.type != HWLOC_OBJ_OSDEV_COPROC || !dev->name || strncmp("mic", dev->name, 3) != 0)
+			continue;
+		if (atoi(dev->name + 3) == (int) idx)
+			break;
+	}
+	return dev;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_INTEL_MIC_H */
diff --git a/ext/hwloc/include/hwloc/linux-libnuma.h b/ext/hwloc/include/hwloc/linux-libnuma.h
new file mode 100644
index 0000000..0ce2591
--- /dev/null
+++ b/ext/hwloc/include/hwloc/linux-libnuma.h
@@ -0,0 +1,273 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2010, 2012 Université Bordeaux
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and Linux libnuma.
+ *
+ * Applications that use both Linux libnuma and hwloc may want to
+ * include this file so as to ease conversion between their respective types.
+*/
+
+#ifndef HWLOC_LINUX_LIBNUMA_H
+#define HWLOC_LINUX_LIBNUMA_H
+
+#include <hwloc.h>
+#include <numa.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_linux_libnuma_ulongs Interoperability with Linux libnuma unsigned long masks
+ *
+ * This interface helps converting between Linux libnuma unsigned long masks
+ * and hwloc cpusets and nodesets.
+ *
+ * \note Topology \p topology must match the current machine.
+ *
+ * \note The behavior of libnuma is undefined if the kernel is not NUMA-aware
+ * (when CONFIG_NUMA is not set in the kernel configuration).
+ * This helper and libnuma may thus not be strictly compatible in this case,
+ * which may be detected by checking whether numa_available() returns -1.
+ *
+ * @{
+ */
+
+
+/** \brief Convert hwloc CPU set \p cpuset into the array of unsigned long \p mask
+ *
+ * \p mask is the array of unsigned long that will be filled.
+ * On input, \p maxnode contains the maximal node number that may be stored in
+ * \p mask; it is rounded up to a whole number of unsigned long, which are all cleared first.
+ * On output, \p maxnode is set to the maximal node number that was found, plus one.
+ *
+ * This function may be used before calling set_mempolicy, mbind, migrate_pages or any other
+ * function that takes an array of unsigned long and a maximal node number as input parameter.
+ */
+static __hwloc_inline int
+hwloc_cpuset_to_linux_libnuma_ulongs(hwloc_topology_t topology, hwloc_const_cpuset_t cpuset,
+				    unsigned long *mask, unsigned long *maxnode)
+{
+  const size_t ulong_bits = 8*sizeof(*mask);
+  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  unsigned long highest = (unsigned long) -1; /* marker for "no bit stored yet" */
+  hwloc_obj_t node = NULL;
+
+  /* round-up to the next ulong and clear all bytes */
+  *maxnode = (*maxnode + ulong_bits - 1) & ~(ulong_bits - 1);
+  memset(mask, 0, *maxnode/8);
+
+  while ((node = hwloc_get_next_obj_covering_cpuset_by_depth(topology, cpuset, depth, node)) != NULL) {
+    if (node->os_index < *maxnode) {
+      mask[node->os_index/ulong_bits] |= 1UL << (node->os_index % ulong_bits);
+      if (highest == (unsigned long) -1 || highest < node->os_index)
+        highest = node->os_index;
+    }
+  }
+  *maxnode = highest+1;
+  return 0;
+}
+
+/** \brief Convert hwloc NUMA node set \p nodeset into the array of unsigned long \p mask
+ *
+ * \p mask is the array of unsigned long that will be filled.
+ * On input, \p maxnode contains the maximal node number that may be stored in
+ * \p mask; it is rounded up to a whole number of unsigned long, which are all cleared first.
+ * On output, \p maxnode is set to the maximal node number that was found, plus one.
+ *
+ * This function may be used before calling set_mempolicy, mbind, migrate_pages or any other
+ * function that takes an array of unsigned long and a maximal node number as input parameter.
+ */
+static __hwloc_inline int
+hwloc_nodeset_to_linux_libnuma_ulongs(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset,
+				      unsigned long *mask, unsigned long *maxnode)
+{
+  const size_t ulong_bits = 8*sizeof(*mask);
+  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  unsigned long highest = (unsigned long) -1; /* marker for "no bit stored yet" */
+  hwloc_obj_t node = NULL;
+
+  /* round-up to the next ulong and clear all bytes */
+  *maxnode = (*maxnode + ulong_bits - 1) & ~(ulong_bits - 1);
+  memset(mask, 0, *maxnode/8);
+
+  while ((node = hwloc_get_next_obj_by_depth(topology, depth, node)) != NULL) {
+    /* keep only the nodes that fit in the mask and are set in \p nodeset */
+    if (node->os_index < *maxnode && hwloc_bitmap_isset(nodeset, node->os_index)) {
+      mask[node->os_index/ulong_bits] |= 1UL << (node->os_index % ulong_bits);
+      if (highest == (unsigned long) -1 || highest < node->os_index)
+        highest = node->os_index;
+    }
+  }
+
+  *maxnode = highest+1;
+  return 0;
+}
+
+/** \brief Convert the array of unsigned long \p mask into hwloc CPU set
+ *
+ * \p mask is an array of unsigned long that will be read.
+ * \p maxnode contains the maximal node number that may be read in \p mask.
+ * \p cpuset is cleared, then receives the cpuset of every NUMA node whose bit is set in \p mask.
+ *
+ * This function may be used after calling get_mempolicy or any other function that takes an array
+ * of unsigned long as output parameter (and possibly a maximal node number as input parameter).
+ */
+static __hwloc_inline int
+hwloc_cpuset_from_linux_libnuma_ulongs(hwloc_topology_t topology, hwloc_cpuset_t cpuset,
+				      const unsigned long *mask, unsigned long maxnode)
+{
+  const size_t ulong_bits = 8*sizeof(*mask);
+  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  hwloc_obj_t node = NULL;
+  hwloc_bitmap_zero(cpuset);
+  while ((node = hwloc_get_next_obj_by_depth(topology, depth, node)) != NULL)
+    if (node->os_index < maxnode && ((mask[node->os_index/ulong_bits] >> (node->os_index % ulong_bits)) & 1UL))
+      hwloc_bitmap_or(cpuset, cpuset, node->cpuset);
+  return 0;
+}
+
+/** \brief Convert the array of unsigned long \p mask into hwloc NUMA node set
+ *
+ * \p mask is an array of unsigned long that will be read.
+ * \p maxnode contains the maximal node number that may be read in \p mask.
+ * \p nodeset is cleared, then gets the OS index of every NUMA node whose bit is set in \p mask.
+ *
+ * This function may be used after calling get_mempolicy or any other function that takes an array
+ * of unsigned long as output parameter (and possibly a maximal node number as input parameter).
+ */
+static __hwloc_inline int
+hwloc_nodeset_from_linux_libnuma_ulongs(hwloc_topology_t topology, hwloc_nodeset_t nodeset,
+					const unsigned long *mask, unsigned long maxnode)
+{
+  const size_t ulong_bits = 8*sizeof(*mask);
+  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  hwloc_obj_t node = NULL;
+  hwloc_bitmap_zero(nodeset);
+  while ((node = hwloc_get_next_obj_by_depth(topology, depth, node)) != NULL)
+    if (node->os_index < maxnode && ((mask[node->os_index/ulong_bits] >> (node->os_index % ulong_bits)) & 1UL))
+      hwloc_bitmap_set(nodeset, node->os_index);
+  return 0;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_linux_libnuma_bitmask Interoperability with Linux libnuma bitmask
+ *
+ * This interface helps converting between Linux libnuma bitmasks
+ * and hwloc cpusets and nodesets.
+ *
+ * \note Topology \p topology must match the current machine.
+ *
+ * \note The behavior of libnuma is undefined if the kernel is not NUMA-aware
+ * (when CONFIG_NUMA is not set in the kernel configuration).
+ * This helper and libnuma may thus not be strictly compatible in this case,
+ * which may be detected by checking whether numa_available() returns -1.
+ *
+ * @{
+ */
+
+
+/** \brief Convert hwloc CPU set \p cpuset into the returned libnuma bitmask
+ *
+ * Only the NUMA nodes covering \p cpuset that have local memory get their bit set.
+ * The returned bitmask should later be freed with numa_bitmask_free.
+ * This function may be used before calling many numa_ functions
+ * that use a struct bitmask as an input parameter.
+ *
+ * \return newly allocated struct bitmask, or \c NULL if the allocation failed.
+ */
+static __hwloc_inline struct bitmask *
+hwloc_cpuset_to_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_const_cpuset_t cpuset) __hwloc_attribute_malloc;
+static __hwloc_inline struct bitmask *
+hwloc_cpuset_to_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_const_cpuset_t cpuset)
+{
+  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  struct bitmask *result = numa_allocate_cpumask(); /* NOTE(review): CPU-sized mask holding node indexes - confirm intended */
+  hwloc_obj_t node = NULL;
+  while (result != NULL && (node = hwloc_get_next_obj_covering_cpuset_by_depth(topology, cpuset, depth, node)) != NULL) {
+    if (!node->memory.local_memory)
+      continue; /* nodes without local memory are left out of the mask */
+    numa_bitmask_setbit(result, node->os_index);
+  }
+  return result;
+}
+
+/** \brief Convert hwloc NUMA node set \p nodeset into the returned libnuma bitmask
+ *
+ * Only the NUMA nodes set in \p nodeset that have local memory get their bit set.
+ * The returned bitmask should later be freed with numa_bitmask_free.
+ * This function may be used before calling many numa_ functions
+ * that use a struct bitmask as an input parameter.
+ *
+ * \return newly allocated struct bitmask, or \c NULL if the allocation failed.
+ */
+static __hwloc_inline struct bitmask *
+hwloc_nodeset_to_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset) __hwloc_attribute_malloc;
+static __hwloc_inline struct bitmask *
+hwloc_nodeset_to_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset)
+{
+  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  struct bitmask *result = numa_allocate_cpumask(); /* NOTE(review): CPU-sized mask holding node indexes - confirm intended */
+  hwloc_obj_t node = NULL;
+  while (result != NULL && (node = hwloc_get_next_obj_by_depth(topology, depth, node)) != NULL) {
+    if (!hwloc_bitmap_isset(nodeset, node->os_index) || !node->memory.local_memory)
+      continue; /* skip nodes outside \p nodeset and nodes without local memory */
+    numa_bitmask_setbit(result, node->os_index);
+  }
+  return result;
+}
+
+/** \brief Convert libnuma bitmask \p bitmask into hwloc CPU set \p cpuset
+ *
+ * \p cpuset is cleared, then receives the cpuset of every NUMA node whose OS index is set in \p bitmask.
+ * This function may be used after calling many numa_ functions that use a struct bitmask as an output parameter.
+ */
+static __hwloc_inline int
+hwloc_cpuset_from_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_cpuset_t cpuset,
+					const struct bitmask *bitmask)
+{
+  hwloc_obj_t node;
+  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  hwloc_bitmap_zero(cpuset);
+  for (node = hwloc_get_next_obj_by_depth(topology, depth, NULL); node != NULL; node = hwloc_get_next_obj_by_depth(topology, depth, node))
+    if (numa_bitmask_isbitset(bitmask, node->os_index))
+      hwloc_bitmap_or(cpuset, cpuset, node->cpuset);
+  return 0;
+}
+
+/** \brief Convert libnuma bitmask \p bitmask into hwloc NUMA node set \p nodeset
+ *
+ * \p nodeset is cleared, then gets the OS index of every NUMA node whose OS index is set in \p bitmask.
+ * This function may be used after calling many numa_ functions that use a struct bitmask as an output parameter.
+ */
+static __hwloc_inline int
+hwloc_nodeset_from_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_nodeset_t nodeset,
+					 const struct bitmask *bitmask)
+{
+  hwloc_obj_t node;
+  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  hwloc_bitmap_zero(nodeset);
+  for (node = hwloc_get_next_obj_by_depth(topology, depth, NULL); node != NULL; node = hwloc_get_next_obj_by_depth(topology, depth, node))
+    if (numa_bitmask_isbitset(bitmask, node->os_index))
+      hwloc_bitmap_set(nodeset, node->os_index);
+  return 0;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_LINUX_LIBNUMA_H */
diff --git a/ext/hwloc/include/hwloc/linux.h b/ext/hwloc/include/hwloc/linux.h
new file mode 100644
index 0000000..4ddc900
--- /dev/null
+++ b/ext/hwloc/include/hwloc/linux.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2013 Inria.  All rights reserved.
+ * Copyright © 2009-2011 Université Bordeaux
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and Linux.
+ *
+ * Applications that use hwloc on Linux may want to include this file
+ * if using some low-level Linux features.
+ */
+
+#ifndef HWLOC_LINUX_H
+#define HWLOC_LINUX_H
+
+#include <hwloc.h>
+#include <stdio.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_linux Linux-specific helpers
+ *
+ * This includes helpers for manipulating Linux kernel cpumap files, and hwloc
+ * equivalents of the Linux sched_setaffinity and sched_getaffinity system calls.
+ *
+ * @{
+ */
+
+/** \brief Convert a linux kernel cpumap file \p file into hwloc CPU set.
+ *
+ * Might be used when reading CPU set from sysfs attributes such as topology
+ * and caches for processors, or local_cpus for devices.
+ */
+HWLOC_DECLSPEC int hwloc_linux_parse_cpumap_file(FILE *file, hwloc_cpuset_t set);
+
+/** \brief Bind a thread \p tid on cpus given in cpuset \p set
+ *
+ * The behavior is exactly the same as the Linux sched_setaffinity system call,
+ * but uses a hwloc cpuset.
+ *
+ * \note This is equivalent to calling hwloc_set_proc_cpubind() with
+ * HWLOC_CPUBIND_THREAD as flags.
+ */
+HWLOC_DECLSPEC int hwloc_linux_set_tid_cpubind(hwloc_topology_t topology, pid_t tid, hwloc_const_cpuset_t set);
+
+/** \brief Get the current binding of thread \p tid
+ *
+ * The behavior is exactly the same as the Linux sched_getaffinity system call,
+ * but uses a hwloc cpuset.
+ *
+ * \note This is equivalent to calling hwloc_get_proc_cpubind() with
+ * HWLOC_CPUBIND_THREAD as flags.
+ */
+HWLOC_DECLSPEC int hwloc_linux_get_tid_cpubind(hwloc_topology_t topology, pid_t tid, hwloc_cpuset_t set);
+
+/** \brief Get the last physical CPU where thread \p tid ran.
+ *
+ * \note This is equivalent to calling hwloc_get_proc_last_cpu_location() with
+ * HWLOC_CPUBIND_THREAD as flags.
+ */
+HWLOC_DECLSPEC int hwloc_linux_get_tid_last_cpu_location(hwloc_topology_t topology, pid_t tid, hwloc_bitmap_t set);
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_LINUX_H */
diff --git a/ext/hwloc/include/hwloc/myriexpress.h b/ext/hwloc/include/hwloc/myriexpress.h
new file mode 100644
index 0000000..68ff88f
--- /dev/null
+++ b/ext/hwloc/include/hwloc/myriexpress.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright © 2010-2014 Inria.  All rights reserved.
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and Myrinet Express.
+ *
+ * Applications that use both hwloc and Myrinet Express verbs may want to
+ * include this file so as to get topology information for Myrinet hardware.
+ *
+ */
+
+#ifndef HWLOC_MYRIEXPRESS_H
+#define HWLOC_MYRIEXPRESS_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+
+#include <myriexpress.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_myriexpress Interoperability with Myrinet Express
+ *
+ * This interface offers ways to retrieve topology information about
+ * Myrinet Express hardware.
+ *
+ * @{
+ */
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to the MX board \p id.
+ *
+ * Return the CPU set describing the locality of the Myrinet Express
+ * board whose index is \p id.
+ *
+ * Topology \p topology and device \p id must match the local machine.
+ * I/O devices detection is not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * No additional information about the device is available.
+ */
+static __hwloc_inline int
+hwloc_mx_board_get_device_cpuset(hwloc_topology_t topology,
+				 unsigned id, hwloc_cpuset_t set)
+{
+  uint32_t board = id, numanode;
+  hwloc_obj_t cur = NULL;
+
+  /* MX is only queried when the topology describes the machine we run on */
+  if (!hwloc_topology_is_thissystem(topology)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* MX_NUMA_NODE query: board index in, NUMA node OS index out */
+  if (mx_get_info(NULL, MX_NUMA_NODE, &board, sizeof(board), &numanode, sizeof(numanode)) != MX_SUCCESS) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* (uint32_t) -1 skips the lookup and goes straight to the fallback */
+  if (numanode != (uint32_t) -1)
+    while ((cur = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, cur)) != NULL)
+      if (cur->os_index == numanode) {
+        hwloc_bitmap_copy(set, cur->cpuset);
+        return 0;
+      }
+
+  /* no matching NUMA node: fall back to the full topology cpuset */
+  hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+  return 0;
+}
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to the MX endpoint \p endpoint.
+ *
+ * Return the CPU set describing the locality of the Myrinet Express
+ * board that runs the MX endpoint \p endpoint.
+ *
+ * Topology \p topology and endpoint \p endpoint must match the local machine.
+ * I/O devices detection is not needed in the topology.
+ *
+ * The function only returns the locality of the endpoint.
+ * No additional information about the endpoint or device is available.
+ */
+static __hwloc_inline int
+hwloc_mx_endpoint_get_device_cpuset(hwloc_topology_t topology,
+				    mx_endpoint_t endpoint, hwloc_cpuset_t set)
+{
+  mx_endpoint_addr_t addr;
+  uint64_t nic_id;
+  uint32_t endpoint_id, board;
+
+  /*
+   * Resolve the endpoint step by step:
+   *   endpoint -> endpoint address -> NIC id -> board number.
+   * The calls are chained with || so that they run in this order and
+   * the first one that does not return MX_SUCCESS stops the chain;
+   * every failure is reported the same way.
+   */
+  if (mx_get_endpoint_addr(endpoint, &addr) != MX_SUCCESS
+      || mx_decompose_endpoint_addr(addr, &nic_id, &endpoint_id) != MX_SUCCESS
+      || mx_nic_id_to_board_number(nic_id, &board) != MX_SUCCESS) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* the board-based helper computes the actual locality */
+  return hwloc_mx_board_get_device_cpuset(topology, board, set);
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_MYRIEXPRESS_H */
diff --git a/ext/hwloc/include/hwloc/nvml.h b/ext/hwloc/include/hwloc/nvml.h
new file mode 100644
index 0000000..462b332
--- /dev/null
+++ b/ext/hwloc/include/hwloc/nvml.h
@@ -0,0 +1,176 @@
+/*
+ * Copyright © 2012-2013 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and the NVIDIA Management Library.
+ *
+ * Applications that use both hwloc and the NVIDIA Management Library may want to
+ * include this file so as to get topology information for NVML devices.
+ */
+
+#ifndef HWLOC_NVML_H
+#define HWLOC_NVML_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc/helper.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#endif
+
+#include <nvml.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_nvml Interoperability with the NVIDIA Management Library
+ *
+ * This interface offers ways to retrieve topology information about
+ * devices managed by the NVIDIA Management Library (NVML).
+ *
+ * @{
+ */
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to NVML device \p device.
+ *
+ * Return the CPU set describing the locality of the NVML device \p device.
+ *
+ * Topology \p topology and device \p device must match the local machine.
+ * I/O devices detection and the NVML component are not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_nvml_get_device_osdev()
+ * and hwloc_nvml_get_device_osdev_by_index().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux; other systems will simply get a full cpuset.
+ */
+static __hwloc_inline int
+hwloc_nvml_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+			     nvmlDevice_t device, hwloc_cpuset_t set)
+{
+#ifdef HWLOC_LINUX_SYS
+  /* On Linux, read the local_cpus sysfs attribute of the PCI device reported by NVML */
+#define HWLOC_NVML_DEVICE_SYSFS_PATH_MAX 128
+  char path[HWLOC_NVML_DEVICE_SYSFS_PATH_MAX];
+  FILE *sysfile = NULL;
+  nvmlReturn_t nvres;
+  nvmlPciInfo_t pci;
+
+  if (!hwloc_topology_is_thissystem(topology)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  nvres = nvmlDeviceGetPciInfo(device, &pci);
+  if (NVML_SUCCESS != nvres) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  sprintf(path, "/sys/bus/pci/devices/%04x:%02x:%02x.0/local_cpus", pci.domain, pci.bus, pci.device);
+  sysfile = fopen(path, "r");
+  if (!sysfile)
+    return -1; /* errno is not touched here; whatever fopen() reported stays */
+  /* A failed parse is treated like an empty mask: fall back to the complete cpuset */
+  if (hwloc_linux_parse_cpumap_file(sysfile, set) < 0
+      || hwloc_bitmap_iszero(set))
+    hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+
+  fclose(sysfile);
+#else
+  /* Non-Linux systems simply get a full cpuset */
+  hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+  return 0;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * NVML device whose index is \p idx.
+ *
+ * Return the OS device object describing the NVML device whose
+ * index is \p idx. Returns NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the NVML component must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_nvml_get_device_osdev_by_index(hwloc_topology_t topology, unsigned idx)
+{
+	hwloc_obj_t dev;
+	/* return the first GPU OS device named "nvml" followed by a number equal to idx */
+	for (dev = hwloc_get_next_osdev(topology, NULL); dev != NULL; dev = hwloc_get_next_osdev(topology, dev)) {
+		if (HWLOC_OBJ_OSDEV_GPU != dev->attr->osdev.type || !dev->name)
+			continue;
+		if (0 == strncmp("nvml", dev->name, 4) && (int) idx == atoi(dev->name + 4))
+			return dev;
+	}
+	return NULL;
+}
+
+/** \brief Get the hwloc OS device object corresponding to NVML device \p device.
+ *
+ * Return the hwloc OS device object that describes the given
+ * NVML device \p device. Return NULL if there is none.
+ *
+ * Topology \p topology and device \p device must match the local machine.
+ * I/O devices detection and the NVML component must be enabled in the topology.
+ * If not, the locality of the object may still be found using
+ * hwloc_nvml_get_device_cpuset().
+ *
+ * \note The corresponding hwloc PCI device may be found by looking
+ * at the result parent pointer.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_nvml_get_device_osdev(hwloc_topology_t topology, nvmlDevice_t device)
+{
+	hwloc_obj_t osdev = NULL;
+	nvmlReturn_t nvres;
+	nvmlPciInfo_t pci;
+
+	if (!hwloc_topology_is_thissystem(topology)) {
+		errno = EINVAL;
+		return NULL;
+	}
+
+	nvres = nvmlDeviceGetPciInfo(device, &pci);
+	if (NVML_SUCCESS != nvres)
+		return NULL;
+
+	while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+		hwloc_obj_t pcidev = osdev->parent;
+		/* skip unnamed OS devices too: strncmp() must never be handed a NULL name */
+		if (!osdev->name || strncmp(osdev->name, "nvml", 4))
+			continue;
+		if (pcidev
+		    && pcidev->type == HWLOC_OBJ_PCI_DEVICE
+		    && pcidev->attr->pcidev.domain == pci.domain
+		    && pcidev->attr->pcidev.bus == pci.bus
+		    && pcidev->attr->pcidev.dev == pci.device
+		    && pcidev->attr->pcidev.func == 0)
+			return osdev;
+	}
+
+	return NULL;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_NVML_H */
diff --git a/ext/hwloc/include/hwloc/opencl.h b/ext/hwloc/include/hwloc/opencl.h
new file mode 100644
index 0000000..0301ad9
--- /dev/null
+++ b/ext/hwloc/include/hwloc/opencl.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright © 2012-2013 Inria.  All rights reserved.
+ * Copyright © 2013 Université Bordeaux.  All right reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and the OpenCL interface.
+ *
+ * Applications that use both hwloc and OpenCL may want to
+ * include this file so as to get topology information for OpenCL devices.
+ */
+
+#ifndef HWLOC_OPENCL_H
+#define HWLOC_OPENCL_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc/helper.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#endif
+
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+
+#include <stdio.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_opencl Interoperability with OpenCL
+ *
+ * This interface offers ways to retrieve topology information about
+ * OpenCL devices.
+ *
+ * Only the AMD OpenCL interface currently offers useful locality information
+ * about its devices.
+ *
+ * @{
+ */
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to OpenCL device \p device.
+ *
+ * Return the CPU set describing the locality of the OpenCL device \p device.
+ *
+ * Topology \p topology and device \p device must match the local machine.
+ * I/O devices detection and the OpenCL component are not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_opencl_get_device_osdev()
+ * and hwloc_opencl_get_device_osdev_by_index().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux with the AMD OpenCL implementation; other systems will simply
+ * get a full cpuset.
+ */
+static __hwloc_inline int
+hwloc_opencl_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+			       cl_device_id device __hwloc_attribute_unused,
+			       hwloc_cpuset_t set)
+{
+#if (defined HWLOC_LINUX_SYS) && (defined CL_DEVICE_TOPOLOGY_AMD)
+	/* If we're on Linux + AMD OpenCL, use the AMD extension + the sysfs mechanism to get the local cpus */
+#define HWLOC_OPENCL_DEVICE_SYSFS_PATH_MAX 128
+	char path[HWLOC_OPENCL_DEVICE_SYSFS_PATH_MAX];
+	FILE *sysfile = NULL;
+	cl_device_topology_amd amdtopo;
+	cl_int clret;
+
+	if (!hwloc_topology_is_thissystem(topology)) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	clret = clGetDeviceInfo(device, CL_DEVICE_TOPOLOGY_AMD, sizeof(amdtopo), &amdtopo, NULL);
+	if (CL_SUCCESS != clret) {
+		hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+		return 0;
+	}
+	if (CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD != amdtopo.raw.type) {
+		hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+		return 0;
+	}
+	/* only bus/device/function come from the AMD topology; the PCI domain is hard-coded to 0000 */
+	sprintf(path, "/sys/bus/pci/devices/0000:%02x:%02x.%01x/local_cpus", amdtopo.pcie.bus, amdtopo.pcie.device, amdtopo.pcie.function);
+	sysfile = fopen(path, "r");
+	if (!sysfile)
+		return -1;
+	/* A failed parse is treated like an empty mask: fall back to the complete cpuset */
+	if (hwloc_linux_parse_cpumap_file(sysfile, set) < 0
+	    || hwloc_bitmap_iszero(set))
+		hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+
+	fclose(sysfile);
+#else
+	/* Non-Linux + AMD OpenCL systems simply get a full cpuset */
+	hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+  return 0;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * OpenCL device for the given indexes.
+ *
+ * Return the OS device object describing the OpenCL device
+ * whose platform index is \p platform_index,
+ * and whose device index within this platform is \p device_index.
+ * Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the OpenCL component must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_opencl_get_device_osdev_by_index(hwloc_topology_t topology,
+				       unsigned platform_index, unsigned device_index)
+{
+	unsigned parsed_platform = (unsigned) -1, parsed_device = (unsigned) -1;
+	hwloc_obj_t dev;
+	for (dev = hwloc_get_next_osdev(topology, NULL); dev != NULL; dev = hwloc_get_next_osdev(topology, dev)) {
+		if (HWLOC_OBJ_OSDEV_COPROC != dev->attr->osdev.type || !dev->name)
+			continue;
+		if (2 == sscanf(dev->name, "opencl%ud%u", &parsed_platform, &parsed_device) /* both indexes parsed */
+		    && parsed_platform == platform_index && parsed_device == device_index)
+			return dev;
+	}
+	return NULL;
+}
+
+/** \brief Get the hwloc OS device object corresponding to OpenCL device \p device.
+ *
+ * Return the hwloc OS device object that describes the given
+ * OpenCL device \p device. Return NULL if there is none.
+ *
+ * Topology \p topology and device \p device must match the local machine.
+ * I/O devices detection and the OpenCL component must be enabled in the topology.
+ * If not, the locality of the object may still be found using
+ * hwloc_opencl_get_device_cpuset().
+ *
+ * \note The corresponding hwloc PCI device may be found by looking
+ * at the result parent pointer.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_opencl_get_device_osdev(hwloc_topology_t topology __hwloc_attribute_unused,
+			      cl_device_id device __hwloc_attribute_unused)
+{
+#ifdef CL_DEVICE_TOPOLOGY_AMD
+	hwloc_obj_t osdev = NULL;
+	cl_device_topology_amd amdtopo;
+	cl_int clret;
+
+	clret = clGetDeviceInfo(device, CL_DEVICE_TOPOLOGY_AMD, sizeof(amdtopo), &amdtopo, NULL);
+	if (CL_SUCCESS != clret) {
+		errno = EINVAL;
+		return NULL;
+	}
+	if (CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD != amdtopo.raw.type) {
+		errno = EINVAL;
+		return NULL;
+	}
+
+	while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+		hwloc_obj_t pcidev = osdev->parent;
+		/* skip unnamed OS devices too: strncmp() must never be handed a NULL name */
+		if (!osdev->name || strncmp(osdev->name, "opencl", 6))
+			continue;
+		if (pcidev
+		    && pcidev->type == HWLOC_OBJ_PCI_DEVICE
+		    && pcidev->attr->pcidev.domain == 0
+		    && pcidev->attr->pcidev.bus == amdtopo.pcie.bus
+		    && pcidev->attr->pcidev.dev == amdtopo.pcie.device
+		    && pcidev->attr->pcidev.func == amdtopo.pcie.function)
+			return osdev;
+	}
+
+	return NULL;
+#else
+	return NULL;
+#endif
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_OPENCL_H */
diff --git a/ext/hwloc/include/hwloc/openfabrics-verbs.h b/ext/hwloc/include/hwloc/openfabrics-verbs.h
new file mode 100644
index 0000000..c6b8533
--- /dev/null
+++ b/ext/hwloc/include/hwloc/openfabrics-verbs.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2013 Inria.  All rights reserved.
+ * Copyright © 2009-2010 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and OpenFabrics
+ * verbs.
+ *
+ * Applications that use both hwloc and OpenFabrics verbs may want to
+ * include this file so as to get topology information for OpenFabrics
+ * hardware.
+ *
+ */
+
+#ifndef HWLOC_OPENFABRICS_VERBS_H
+#define HWLOC_OPENFABRICS_VERBS_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#endif
+
+#include <infiniband/verbs.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_openfabrics Interoperability with OpenFabrics
+ *
+ * This interface offers ways to retrieve topology information about
+ * OpenFabrics devices.
+ *
+ * @{
+ */
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to device \p ibdev.
+ *
+ * Return the CPU set describing the locality of the OpenFabrics
+ * device \p ibdev.
+ *
+ * Topology \p topology and device \p ibdev must match the local machine.
+ * I/O devices detection is not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_ibv_get_device_osdev()
+ * and hwloc_ibv_get_device_osdev_by_name().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux; other systems will simply get a full cpuset.
+ */
+static __hwloc_inline int
+hwloc_ibv_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+			    struct ibv_device *ibdev, hwloc_cpuset_t set)
+{
+#ifdef HWLOC_LINUX_SYS
+  /* If we're on Linux, use the verbs-provided sysfs mechanism to
+     get the local cpus */
+#define HWLOC_OPENFABRICS_VERBS_SYSFS_PATH_MAX 128
+  char path[HWLOC_OPENFABRICS_VERBS_SYSFS_PATH_MAX];
+  FILE *sysfile = NULL;
+
+  if (!hwloc_topology_is_thissystem(topology)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  sprintf(path, "/sys/class/infiniband/%s/device/local_cpus",
+	  ibv_get_device_name(ibdev));
+  sysfile = fopen(path, "r");
+  if (!sysfile)
+    return -1;
+  /* A failed parse is treated like an empty mask: fall back to the complete cpuset */
+  if (hwloc_linux_parse_cpumap_file(sysfile, set) < 0
+      || hwloc_bitmap_iszero(set))
+    hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+
+  fclose(sysfile);
+#else
+  /* Non-Linux systems simply get a full cpuset */
+  hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+  return 0;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the OpenFabrics
+ * device named \p ibname.
+ *
+ * Return the OS device object describing the OpenFabrics device whose
+ * name is \p ibname. Returns NULL if there is none.
+ * The name \p ibname is usually obtained from ibv_get_device_name().
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_ibv_get_device_osdev_by_name(hwloc_topology_t topology,
+				   const char *ibname)
+{
+	hwloc_obj_t dev;
+	/* first OpenFabrics OS device whose (non-NULL) name equals ibname wins */
+	for (dev = hwloc_get_next_osdev(topology, NULL); dev != NULL; dev = hwloc_get_next_osdev(topology, dev)) {
+		if (HWLOC_OBJ_OSDEV_OPENFABRICS == dev->attr->osdev.type && dev->name && 0 == strcmp(ibname, dev->name))
+			return dev;
+	}
+	return NULL;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the OpenFabrics
+ * device \p ibdev.
+ *
+ * Return the OS device object describing the OpenFabrics device \p ibdev.
+ * Returns NULL if there is none.
+ *
+ * Topology \p topology and device \p ibdev must match the local machine.
+ * I/O devices detection must be enabled in the topology.
+ * If not, the locality of the object may still be found using
+ * hwloc_ibv_get_device_cpuset().
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_ibv_get_device_osdev(hwloc_topology_t topology,
+			   struct ibv_device *ibdev)
+{
+	/* the verbs device name is only looked up when the topology is this system */
+	if (hwloc_topology_is_thissystem(topology))
+		return hwloc_ibv_get_device_osdev_by_name(topology, ibv_get_device_name(ibdev));
+	errno = EINVAL;
+	return NULL;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_OPENFABRICS_VERBS_H */
diff --git a/ext/hwloc/include/hwloc/plugins.h b/ext/hwloc/include/hwloc/plugins.h
new file mode 100644
index 0000000..7fc794d
--- /dev/null
+++ b/ext/hwloc/include/hwloc/plugins.h
@@ -0,0 +1,433 @@
+/*
+ * Copyright © 2013-2015 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#ifndef HWLOC_PLUGINS_H
+#define HWLOC_PLUGINS_H
+
+/** \file
+ * \brief Public interface for building hwloc plugins.
+ */
+
+struct hwloc_backend;
+
+#include <hwloc.h>
+#ifdef HWLOC_INSIDE_PLUGIN
+/* needed for hwloc_plugin_check_namespace() */
+#include <ltdl.h>
+#endif
+
+
+
+/** \defgroup hwlocality_disc_components Components and Plugins: Discovery components
+ * @{
+ */
+
+/** \brief Discovery component type */
+typedef enum hwloc_disc_component_type_e {
+  /** \brief CPU-only discovery through the OS, or generic no-OS support.
+   * \hideinitializer */
+  HWLOC_DISC_COMPONENT_TYPE_CPU = (1<<0),
+
+  /** \brief xml or synthetic,
+   * platform-specific components such as bgq.
+   * Anything that discovers CPU and everything else.
+   * No misc backend is expected to complement a global component.
+   * \hideinitializer */
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL = (1<<1),
+
+  /** \brief OpenCL, Cuda, etc.
+   * \hideinitializer */
+  HWLOC_DISC_COMPONENT_TYPE_MISC = (1<<2)
+} hwloc_disc_component_type_t;
+
+/** \brief Discovery component structure
+ *
+ * This is the major kind of components, taking care of the discovery.
+ * They are registered by generic components, either statically-built or as plugins.
+ */
+struct hwloc_disc_component {
+  /** \brief Discovery component type */
+  hwloc_disc_component_type_t type;
+
+  /** \brief Component name.
+   * If this component is built as a plugin, this name does not have to match the plugin filename.
+   */
+  const char *name;
+
+  /** \brief Component types to exclude, as an OR'ed set of HWLOC_DISC_COMPONENT_TYPE_*.
+   *
+   * For a GLOBAL component, this usually includes all other types (~0).
+   *
+   * Other components only exclude types that may bring conflicting
+   * topology information. MISC components should likely not be excluded
+   * since they usually bring non-primary additional information.
+   */
+  unsigned excludes;
+
+  /** \brief Instantiate callback that creates a backend from the component.
+   * Parameters data1, data2, data3 are NULL except for components
+   * that have special enabling routines such as hwloc_topology_set_xml(). */
+  struct hwloc_backend * (*instantiate)(struct hwloc_disc_component *component, const void *data1, const void *data2, const void *data3);
+
+  /** \brief Component priority.
+   * Used to sort topology->components, higher priority first.
+   * Also used to decide between two components with the same name.
+   *
+   * Usual values are
+   * 50 for native OS (or platform) components,
+   * 45 for x86,
+   * 40 for no-OS fallback,
+   * 30 for global components (xml, synthetic),
+   * 20 for pci,
+   * 10 for other misc components (opencl etc.).
+   */
+  unsigned priority;
+
+  /** \private Used internally to list components by priority on topology->components
+   * (the component structure is usually read-only,
+   *  the core copies it before using this field for queueing)
+   */
+  struct hwloc_disc_component * next;
+};
+
+/** @} */
+
+
+
+
+/** \defgroup hwlocality_disc_backends Components and Plugins: Discovery backends
+ * @{
+ */
+
+/** \brief Discovery backend structure
+ *
+ * A backend is the instantiation of a discovery component.
+ * When a component gets enabled for a topology,
+ * its instantiate() callback creates a backend.
+ *
+ * hwloc_backend_alloc() initializes all fields to default values
+ * that the component may change (except "component" and "next")
+ * before enabling the backend with hwloc_backend_enable().
+ */
+struct hwloc_backend {
+  /** \private Reserved for the core, set by hwloc_backend_alloc() */
+  struct hwloc_disc_component * component;
+  /** \private Reserved for the core, set by hwloc_backend_enable() */
+  struct hwloc_topology * topology;
+  /** \private Reserved for the core. Set to 1 if forced through envvar, 0 otherwise. */
+  int envvar_forced;
+  /** \private Reserved for the core. Used internally to list backends on topology->backends. */
+  struct hwloc_backend * next;
+
+  /** \brief Backend flags, as an OR'ed set of HWLOC_BACKEND_FLAG_* */
+  unsigned long flags;
+
+  /** \brief Backend-specific 'is_thissystem' property.
+   * Set to 0 or 1 if the backend should enforce the thissystem flag when it gets enabled.
+   * Set to -1 if the backend doesn't care (default). */
+  int is_thissystem;
+
+  /** \brief Backend private data, or NULL if none. */
+  void * private_data;
+  /** \brief Callback for freeing the private_data.
+   * May be NULL.
+   */
+  void (*disable)(struct hwloc_backend *backend);
+
+  /** \brief Main discovery callback.
+   * Returns > 0 if it modified the topology tree, -1 on error, 0 otherwise.
+   * May be NULL if type is HWLOC_DISC_COMPONENT_TYPE_MISC. */
+  int (*discover)(struct hwloc_backend *backend);
+
+  /** \brief Callback used by the PCI backend to retrieve the locality of a PCI object from the OS/cpu backend.
+   * May be NULL. */
+  int (*get_obj_cpuset)(struct hwloc_backend *backend, struct hwloc_backend *caller, struct hwloc_obj *obj, hwloc_bitmap_t cpuset);
+
+  /** \brief Callback called by backends to notify this backend that a new object was added.
+   * Returns > 0 if it modified the topology tree, 0 otherwise.
+   * May be NULL. */
+  int (*notify_new_object)(struct hwloc_backend *backend, struct hwloc_backend *caller, struct hwloc_obj *obj);
+};
+
+/** \brief Backend flags */
+enum hwloc_backend_flag_e {
+  /** \brief Levels should be reconnected before this backend's discover() callback is used.
+   * \hideinitializer */
+  HWLOC_BACKEND_FLAG_NEED_LEVELS = (1UL<<0)
+};
+
+/** \brief Allocate a backend structure, set good default values, initialize backend->component and topology, etc.
+ * The caller will then modify whatever needed, and call hwloc_backend_enable().
+ */
+HWLOC_DECLSPEC struct hwloc_backend * hwloc_backend_alloc(struct hwloc_disc_component *component);
+
+/** \brief Enable a previously allocated and setup backend. */
+HWLOC_DECLSPEC int hwloc_backend_enable(struct hwloc_topology *topology, struct hwloc_backend *backend);
+
+/** \brief Used by backends discovery callbacks to request locality information from others.
+ *
+ * Traverse the list of enabled backends until one has a
+ * get_obj_cpuset() method, and call it.
+ */
+HWLOC_DECLSPEC int hwloc_backends_get_obj_cpuset(struct hwloc_backend *caller, struct hwloc_obj *obj, hwloc_bitmap_t cpuset);
+
+/** \brief Used by backends discovery callbacks to notify other
+ * backends of new objects.
+ *
+ * Traverse the list of enabled backends (all but caller) and invoke
+ * their notify_new_object() method to notify them that a new object
+ * just got added to the topology.
+ *
+ * Currently only used for notifying of new PCI device objects.
+ */
+HWLOC_DECLSPEC int hwloc_backends_notify_new_object(struct hwloc_backend *caller, struct hwloc_obj *obj);
+
+/** @} */
+
+
+
+
+/** \defgroup hwlocality_generic_components Components and Plugins: Generic components
+ * @{
+ */
+
+/** \brief Generic component type */
+typedef enum hwloc_component_type_e {
+  /** \brief Discovery component: the data field must point to a struct hwloc_disc_component. */
+  HWLOC_COMPONENT_TYPE_DISC,
+
+  /** \brief XML component: the data field must point to a struct hwloc_xml_component. */
+  HWLOC_COMPONENT_TYPE_XML
+} hwloc_component_type_t;
+
+/** \brief Generic component structure
+ *
+ * Generic components structure, either statically listed by configure in static-components.h
+ * or dynamically loaded as a plugin.
+ */
+struct hwloc_component {
+  /** \brief Component ABI version, set to HWLOC_COMPONENT_ABI */
+  unsigned abi;
+
+  /** \brief Process-wide component initialization callback.
+   *
+   * This optional callback is called when the component is registered
+   * to the hwloc core (after loading the plugin).
+   *
+   * When the component is built as a plugin, this callback
+   * should call hwloc_plugin_check_namespace()
+   * and return a negative error code on error.
+   *
+   * \p flags is always 0 for now.
+   *
+   * \return 0 on success, or a negative code on error.
+   *
+   * \note If the component uses ltdl for loading its own plugins,
+   * it should load/unload them only in init() and finalize(),
+   * to avoid race conditions with hwloc's use of ltdl.
+   */
+  int (*init)(unsigned long flags);
+
+  /** \brief Process-wide component termination callback.
+   *
+   * This optional callback is called after unregistering the component
+   * from the hwloc core (before unloading the plugin).
+   *
+   * \p flags is always 0 for now.
+   *
+   * \note If the component uses ltdl for loading its own plugins,
+   * it should load/unload them only in init() and finalize(),
+   * to avoid race conditions with hwloc's use of ltdl.
+   */
+  void (*finalize)(unsigned long flags);
+
+  /** \brief Component type */
+  hwloc_component_type_t type;
+
+  /** \brief Component flags, unused for now */
+  unsigned long flags;
+
+  /** \brief Component data, pointing to a struct hwloc_disc_component or struct hwloc_xml_component. */
+  void * data;
+};
+
+/** @} */
+
+
+
+
+/** \defgroup hwlocality_components_core_funcs Components and Plugins: Core functions to be used by components
+ * @{
+ */
+
+/** \brief Add an object to the topology.
+ *
+ * It is sorted along the tree of other objects according to the inclusion of
+ * cpusets, to eventually be added as a child of the smallest object including
+ * this object.
+ *
+ * If the cpuset is empty, the type of the object (and maybe some attributes)
+ * must be enough to find where to insert the object. This is especially true
+ * for NUMA nodes with memory and no CPUs.
+ *
+ * The given object should not have children.
+ *
+ * This shall only be called before levels are built.
+ *
+ * In case of error, hwloc_report_os_error() is called.
+ *
+ * Returns the object on success.
+ * Returns NULL and frees obj on error.
+ * Returns another object and frees obj if it was merged with an identical pre-existing object.
+ */
+HWLOC_DECLSPEC struct hwloc_obj *hwloc_insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t obj);
+
+/** \brief Type of error callbacks during object insertion */
+typedef void (*hwloc_report_error_t)(const char * msg, int line);
+/** \brief Report an insertion error from a backend */
+HWLOC_DECLSPEC void hwloc_report_os_error(const char * msg, int line);
+/** \brief Check whether insertion errors are hidden */
+HWLOC_DECLSPEC int hwloc_hide_errors(void);
+
+/** \brief Add an object to the topology and specify which error callback to use.
+ *
+ * Aside from the error callback selection, this function is identical to hwloc_insert_object_by_cpuset()
+ */
+HWLOC_DECLSPEC struct hwloc_obj *hwloc__insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t obj, hwloc_report_error_t report_error);
+
+/** \brief Insert an object somewhere in the topology.
+ *
+ * It is added as the last child of the given parent.
+ * The cpuset is completely ignored, so strange objects such as I/O devices should
+ * preferably be inserted with this.
+ *
+ * When used for "normal" children with cpusets (when importing from XML
+ * or when duplicating a topology), the caller should make sure that:
+ * - children are inserted in order,
+ * - children cpusets do not intersect.
+ *
+ * The given object may have normal, I/O or Misc children, as long as they are in order as well.
+ * These children must have valid parent and next_sibling pointers.
+ */
+HWLOC_DECLSPEC void hwloc_insert_object_by_parent(struct hwloc_topology *topology, hwloc_obj_t parent, hwloc_obj_t obj);
+
+/** \brief Allocate and zero-initialize an object of the given type and physical index; returns NULL if allocation fails */
+static __hwloc_inline struct hwloc_obj *
+hwloc_alloc_setup_object(hwloc_obj_type_t type, signed os_index)
+{
+  struct hwloc_obj *obj = calloc(1, sizeof(*obj)); /* zeroed, like the former malloc()+memset() */
+  if (!obj) return NULL; /* out of memory: nothing to clean up yet */
+  obj->type = type;
+  obj->os_index = os_index;
+  obj->attr = calloc(1, sizeof(*obj->attr)); /* zeroed attribute storage */
+  if (!obj->attr) { free(obj); return NULL; } /* do not leak the half-built object */
+  /* do not allocate the cpuset here, let the caller do it */
+  return obj;
+}
+
+/** \brief Setup object cpusets/nodesets by OR'ing its children.
+ *
+ * Used when adding an object late in the topology.
+ * Will update the new object by OR'ing all its new children sets.
+ *
+ * Used when PCI backend adds a hostbridge parent, when distances
+ * add a new Group, etc.
+ */
+HWLOC_DECLSPEC int hwloc_obj_add_children_sets(hwloc_obj_t obj);
+
+/** \brief Make sure that plugins can lookup core symbols.
+ *
+ * This is a sanity check to avoid lazy-lookup failures when libhwloc
+ * is loaded within a plugin, and later tries to load its own plugins.
+ * This may fail (and abort the program) if libhwloc symbols are in a
+ * private namespace.
+ *
+ * \return 0 on success, or when the check cannot be performed.
+ * \return -1 if \p symbol cannot be found, i.e. the plugin cannot be used. The caller
+ * plugin init() callback should return a negative error code as well.
+ *
+ * Plugins should call this function in their init() callback to avoid
+ * later crashes if lazy symbol resolution is used by the upper layer that
+ * loaded hwloc (e.g. OpenCL implementations using dlopen with RTLD_LAZY).
+ *
+ * \note The build system must define HWLOC_INSIDE_PLUGIN if and only if
+ * building the caller as a plugin.
+ *
+ * \note This function should remain inline so plugins can call it even
+ * when they cannot find libhwloc symbols.
+ */
+static __hwloc_inline int
+hwloc_plugin_check_namespace(const char *pluginname __hwloc_attribute_unused, const char *symbol __hwloc_attribute_unused)
+{
+#ifdef HWLOC_INSIDE_PLUGIN
+  lt_dlhandle handle;
+  void *sym;
+  handle = lt_dlopen(NULL);
+  if (!handle)
+    /* cannot check, assume things will work */
+    return 0;
+  sym = lt_dlsym(handle, symbol);
+  lt_dlclose(handle);
+  if (!sym) {
+    static int verboseenv_checked = 0;
+    static int verboseenv_value = 0;
+    if (!verboseenv_checked) {
+      const char *verboseenv = getenv("HWLOC_PLUGINS_VERBOSE");
+      verboseenv_value = verboseenv ? atoi(verboseenv) : 0; /* getenv() returns NULL when unset; atoi(NULL) is undefined */
+      verboseenv_checked = 1;
+    }
+    if (verboseenv_value)
+      fprintf(stderr, "Plugin `%s' disabling itself because it cannot find the `%s' core symbol.\n",
+	      pluginname, symbol);
+    return -1;
+  }
+#endif /* HWLOC_INSIDE_PLUGIN */
+  return 0;
+}
+
+/** @} */
+
+
+
+
+/** \defgroup hwlocality_components_pci_funcs Components and Plugins: PCI functions to be used by components
+ * @{
+ */
+
+/** \brief Insert a list of PCI devices and bridges in the backend topology.
+ *
+ * Insert a list of objects (either PCI device or bridges) starting at first_obj
+ * (linked by next_sibling in the topology, and ending with NULL).
+ * Objects are placed under the right bridges, and the remaining upstream bridges
+ * are then inserted in the topology by calling the get_obj_cpuset() callback to
+ * find their locality.
+ */
+HWLOC_DECLSPEC int hwloc_insert_pci_device_list(struct hwloc_backend *backend, struct hwloc_obj *first_obj);
+
+/** \brief Return the offset of the given capability in the PCI config space buffer
+ *
+ * This function requires a 256-byte config space. Unknown/unavailable bytes should be set to 0xff.
+ */
+HWLOC_DECLSPEC unsigned hwloc_pci_find_cap(const unsigned char *config, unsigned cap);
+
+/** \brief Fill linkspeed by reading the PCI config space where PCI_CAP_ID_EXP is at position offset.
+ *
+ * Needs 20 bytes of EXP capability block starting at offset in the config space
+ * for registers up to link status.
+ */
+HWLOC_DECLSPEC int hwloc_pci_find_linkspeed(const unsigned char *config, unsigned offset, float *linkspeed);
+
+/** \brief Modify the PCI device object into a bridge and fill its attribute if a bridge is found in the PCI config space.
+ *
+ * This function requires 64 bytes of common configuration header at the beginning of config.
+ */
+HWLOC_DECLSPEC int hwloc_pci_prepare_bridge(hwloc_obj_t obj, const unsigned char *config);
+
+/** @} */
+
+
+
+
+#endif /* HWLOC_PLUGINS_H */
diff --git a/ext/hwloc/include/hwloc/rename.h b/ext/hwloc/include/hwloc/rename.h
new file mode 100644
index 0000000..2684e71
--- /dev/null
+++ b/ext/hwloc/include/hwloc/rename.h
@@ -0,0 +1,651 @@
+/*
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * Copyright © 2010-2015 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#ifndef HWLOC_RENAME_H
+#define HWLOC_RENAME_H
+
+#include <hwloc/autogen/config.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Only enact these defines if we're actually renaming the symbols
+   (i.e., avoid trying to have no-op defines if we're *not*
+   renaming). */
+
+#if HWLOC_SYM_TRANSFORM
+
+/* Use a preprocessor two-step so that HWLOC_SYM_PREFIX (or
+   HWLOC_SYM_PREFIX_CAPS) is macro-expanded before HWLOC_MUNGE_NAME2 pastes it
+   onto the name. HWLOC_NAME and HWLOC_NAME_CAPS are the 2 renaming macros. */
+
+#define HWLOC_MUNGE_NAME(a, b) HWLOC_MUNGE_NAME2(a, b)
+#define HWLOC_MUNGE_NAME2(a, b) a ## b
+#define HWLOC_NAME(name) HWLOC_MUNGE_NAME(HWLOC_SYM_PREFIX, hwloc_ ## name)
+#define HWLOC_NAME_CAPS(name) HWLOC_MUNGE_NAME(HWLOC_SYM_PREFIX_CAPS, hwloc_ ## name)
+
+/* Now define all the "real" names to be the prefixed names.  This
+   allows us to use the real names throughout the code base (i.e.,
+   "hwloc_<foo>"); the preprocessor will adjust to have the prefixed
+   name under the covers. */
+
+/* Names from hwloc.h */
+
+#define hwloc_get_api_version HWLOC_NAME(get_api_version)
+
+#define hwloc_topology HWLOC_NAME(topology)
+#define hwloc_topology_t HWLOC_NAME(topology_t)
+
+#define hwloc_cpuset_t HWLOC_NAME(cpuset_t)
+#define hwloc_const_cpuset_t HWLOC_NAME(const_cpuset_t)
+#define hwloc_nodeset_t HWLOC_NAME(nodeset_t)
+#define hwloc_const_nodeset_t HWLOC_NAME(const_nodeset_t)
+
+#define HWLOC_OBJ_SYSTEM HWLOC_NAME_CAPS(OBJ_SYSTEM)
+#define HWLOC_OBJ_MACHINE HWLOC_NAME_CAPS(OBJ_MACHINE)
+#define HWLOC_OBJ_NUMANODE HWLOC_NAME_CAPS(OBJ_NUMANODE)
+#define HWLOC_OBJ_PACKAGE HWLOC_NAME_CAPS(OBJ_PACKAGE)
+#define HWLOC_OBJ_CACHE HWLOC_NAME_CAPS(OBJ_CACHE)
+#define HWLOC_OBJ_CORE HWLOC_NAME_CAPS(OBJ_CORE)
+#define HWLOC_OBJ_PU HWLOC_NAME_CAPS(OBJ_PU)
+#define HWLOC_OBJ_MISC HWLOC_NAME_CAPS(OBJ_MISC)
+#define HWLOC_OBJ_GROUP HWLOC_NAME_CAPS(OBJ_GROUP)
+#define HWLOC_OBJ_BRIDGE HWLOC_NAME_CAPS(OBJ_BRIDGE)
+#define HWLOC_OBJ_PCI_DEVICE HWLOC_NAME_CAPS(OBJ_PCI_DEVICE)
+#define HWLOC_OBJ_OS_DEVICE HWLOC_NAME_CAPS(OBJ_OS_DEVICE)
+#define HWLOC_OBJ_TYPE_MAX HWLOC_NAME_CAPS(OBJ_TYPE_MAX)
+#define hwloc_obj_type_t HWLOC_NAME(obj_type_t)
+
+#define hwloc_obj_cache_type_e HWLOC_NAME(obj_cache_type_e)
+#define hwloc_obj_cache_type_t HWLOC_NAME(obj_cache_type_t)
+#define HWLOC_OBJ_CACHE_UNIFIED HWLOC_NAME_CAPS(OBJ_CACHE_UNIFIED)
+#define HWLOC_OBJ_CACHE_DATA HWLOC_NAME_CAPS(OBJ_CACHE_DATA)
+#define HWLOC_OBJ_CACHE_INSTRUCTION HWLOC_NAME_CAPS(OBJ_CACHE_INSTRUCTION)
+
+#define hwloc_obj_bridge_type_e HWLOC_NAME(obj_bridge_type_e)
+#define hwloc_obj_bridge_type_t HWLOC_NAME(obj_bridge_type_t)
+#define HWLOC_OBJ_BRIDGE_HOST HWLOC_NAME_CAPS(OBJ_BRIDGE_HOST)
+#define HWLOC_OBJ_BRIDGE_PCI HWLOC_NAME_CAPS(OBJ_BRIDGE_PCI)
+
+#define hwloc_obj_osdev_type_e HWLOC_NAME(obj_osdev_type_e)
+#define hwloc_obj_osdev_type_t HWLOC_NAME(obj_osdev_type_t)
+#define HWLOC_OBJ_OSDEV_BLOCK HWLOC_NAME_CAPS(OBJ_OSDEV_BLOCK)
+#define HWLOC_OBJ_OSDEV_GPU HWLOC_NAME_CAPS(OBJ_OSDEV_GPU)
+#define HWLOC_OBJ_OSDEV_NETWORK HWLOC_NAME_CAPS(OBJ_OSDEV_NETWORK)
+#define HWLOC_OBJ_OSDEV_OPENFABRICS HWLOC_NAME_CAPS(OBJ_OSDEV_OPENFABRICS)
+#define HWLOC_OBJ_OSDEV_DMA HWLOC_NAME_CAPS(OBJ_OSDEV_DMA)
+#define HWLOC_OBJ_OSDEV_COPROC HWLOC_NAME_CAPS(OBJ_OSDEV_COPROC)
+
+#define hwloc_compare_types HWLOC_NAME(compare_types)
+
+#define hwloc_compare_types_e HWLOC_NAME(compare_types_e)
+#define HWLOC_TYPE_UNORDERED HWLOC_NAME_CAPS(TYPE_UNORDERED)
+
+#define hwloc_obj_memory_s HWLOC_NAME(obj_memory_s)
+#define hwloc_obj_memory_page_type_s HWLOC_NAME(obj_memory_page_type_s)
+
+#define hwloc_obj HWLOC_NAME(obj)
+#define hwloc_obj_t HWLOC_NAME(obj_t)
+
+#define hwloc_distances_s HWLOC_NAME(distances_s)
+#define hwloc_obj_info_s HWLOC_NAME(obj_info_s)
+
+#define hwloc_obj_attr_u HWLOC_NAME(obj_attr_u)
+#define hwloc_cache_attr_s HWLOC_NAME(cache_attr_s)
+#define hwloc_group_attr_s HWLOC_NAME(group_attr_s)
+#define hwloc_pcidev_attr_s HWLOC_NAME(pcidev_attr_s)
+#define hwloc_bridge_attr_s HWLOC_NAME(bridge_attr_s)
+#define hwloc_osdev_attr_s HWLOC_NAME(osdev_attr_s)
+
+#define hwloc_topology_init HWLOC_NAME(topology_init)
+#define hwloc_topology_load HWLOC_NAME(topology_load)
+#define hwloc_topology_destroy HWLOC_NAME(topology_destroy)
+#define hwloc_topology_dup HWLOC_NAME(topology_dup)
+#define hwloc_topology_check HWLOC_NAME(topology_check)
+
+#define hwloc_topology_flags_e HWLOC_NAME(topology_flags_e)
+
+#define HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM HWLOC_NAME_CAPS(TOPOLOGY_FLAG_WHOLE_SYSTEM)
+#define HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM HWLOC_NAME_CAPS(TOPOLOGY_FLAG_IS_THISSYSTEM)
+#define HWLOC_TOPOLOGY_FLAG_IO_DEVICES HWLOC_NAME_CAPS(TOPOLOGY_FLAG_IO_DEVICES)
+#define HWLOC_TOPOLOGY_FLAG_IO_BRIDGES HWLOC_NAME_CAPS(TOPOLOGY_FLAG_IO_BRIDGES)
+#define HWLOC_TOPOLOGY_FLAG_WHOLE_IO HWLOC_NAME_CAPS(TOPOLOGY_FLAG_WHOLE_IO)
+#define HWLOC_TOPOLOGY_FLAG_ICACHES HWLOC_NAME_CAPS(TOPOLOGY_FLAG_ICACHES)
+
+#define hwloc_topology_set_pid HWLOC_NAME(topology_set_pid)
+#define hwloc_topology_set_synthetic HWLOC_NAME(topology_set_synthetic)
+#define hwloc_topology_set_xml HWLOC_NAME(topology_set_xml)
+#define hwloc_topology_set_xmlbuffer HWLOC_NAME(topology_set_xmlbuffer)
+
+#define hwloc_topology_set_flags HWLOC_NAME(topology_set_flags)
+#define hwloc_topology_is_thissystem HWLOC_NAME(topology_is_thissystem)
+#define hwloc_topology_get_flags HWLOC_NAME(topology_get_flags)
+#define hwloc_topology_discovery_support HWLOC_NAME(topology_discovery_support)
+#define hwloc_topology_cpubind_support HWLOC_NAME(topology_cpubind_support)
+#define hwloc_topology_membind_support HWLOC_NAME(topology_membind_support)
+#define hwloc_topology_support HWLOC_NAME(topology_support)
+#define hwloc_topology_get_support HWLOC_NAME(topology_get_support)
+#define hwloc_topology_ignore_type HWLOC_NAME(topology_ignore_type)
+#define hwloc_topology_ignore_type_keep_structure HWLOC_NAME(topology_ignore_type_keep_structure)
+#define hwloc_topology_ignore_all_keep_structure HWLOC_NAME(topology_ignore_all_keep_structure)
+#define hwloc_topology_set_distance_matrix HWLOC_NAME(topology_set_distance_matrix)
+#define hwloc_topology_set_userdata HWLOC_NAME(topology_set_userdata)
+#define hwloc_topology_get_userdata HWLOC_NAME(topology_get_userdata)
+
+#define hwloc_restrict_flags_e HWLOC_NAME(restrict_flags_e)
+#define HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES HWLOC_NAME_CAPS(RESTRICT_FLAG_ADAPT_DISTANCES)
+#define HWLOC_RESTRICT_FLAG_ADAPT_MISC HWLOC_NAME_CAPS(RESTRICT_FLAG_ADAPT_MISC)
+#define HWLOC_RESTRICT_FLAG_ADAPT_IO HWLOC_NAME_CAPS(RESTRICT_FLAG_ADAPT_IO)
+#define hwloc_topology_restrict HWLOC_NAME(topology_restrict)
+
+#define hwloc_topology_insert_misc_object HWLOC_NAME(topology_insert_misc_object)
+#define hwloc_topology_alloc_group_object HWLOC_NAME(topology_alloc_group_object)
+#define hwloc_topology_insert_group_object HWLOC_NAME(topology_insert_group_object)
+#define hwloc_obj_add_other_obj_sets HWLOC_NAME(obj_add_other_obj_sets)
+
+#define hwloc_topology_get_depth HWLOC_NAME(topology_get_depth)
+#define hwloc_get_type_depth HWLOC_NAME(get_type_depth)
+
+#define hwloc_get_type_depth_e HWLOC_NAME(get_type_depth_e)
+#define HWLOC_TYPE_DEPTH_UNKNOWN HWLOC_NAME_CAPS(TYPE_DEPTH_UNKNOWN)
+#define HWLOC_TYPE_DEPTH_MULTIPLE HWLOC_NAME_CAPS(TYPE_DEPTH_MULTIPLE)
+#define HWLOC_TYPE_DEPTH_BRIDGE HWLOC_NAME_CAPS(TYPE_DEPTH_BRIDGE)
+#define HWLOC_TYPE_DEPTH_PCI_DEVICE HWLOC_NAME_CAPS(TYPE_DEPTH_PCI_DEVICE)
+#define HWLOC_TYPE_DEPTH_OS_DEVICE HWLOC_NAME_CAPS(TYPE_DEPTH_OS_DEVICE)
+#define HWLOC_TYPE_DEPTH_MISC HWLOC_NAME_CAPS(TYPE_DEPTH_MISC)
+
+#define hwloc_get_depth_type HWLOC_NAME(get_depth_type)
+#define hwloc_get_nbobjs_by_depth HWLOC_NAME(get_nbobjs_by_depth)
+#define hwloc_get_nbobjs_by_type HWLOC_NAME(get_nbobjs_by_type)
+
+#define hwloc_get_obj_by_depth HWLOC_NAME(get_obj_by_depth )
+#define hwloc_get_obj_by_type HWLOC_NAME(get_obj_by_type )
+
+#define hwloc_obj_type_string HWLOC_NAME(obj_type_string )
+#define hwloc_obj_type_sscanf HWLOC_NAME(obj_type_sscanf)
+#define hwloc_obj_type_snprintf HWLOC_NAME(obj_type_snprintf )
+#define hwloc_obj_attr_snprintf HWLOC_NAME(obj_attr_snprintf )
+#define hwloc_obj_cpuset_snprintf HWLOC_NAME(obj_cpuset_snprintf)
+#define hwloc_obj_get_info_by_name HWLOC_NAME(obj_get_info_by_name)
+#define hwloc_obj_add_info HWLOC_NAME(obj_add_info)
+
+#define HWLOC_CPUBIND_PROCESS HWLOC_NAME_CAPS(CPUBIND_PROCESS)
+#define HWLOC_CPUBIND_THREAD HWLOC_NAME_CAPS(CPUBIND_THREAD)
+#define HWLOC_CPUBIND_STRICT HWLOC_NAME_CAPS(CPUBIND_STRICT)
+#define HWLOC_CPUBIND_NOMEMBIND HWLOC_NAME_CAPS(CPUBIND_NOMEMBIND)
+
+#define hwloc_cpubind_flags_t HWLOC_NAME(cpubind_flags_t)
+
+#define hwloc_set_cpubind HWLOC_NAME(set_cpubind)
+#define hwloc_get_cpubind HWLOC_NAME(get_cpubind)
+#define hwloc_set_proc_cpubind HWLOC_NAME(set_proc_cpubind)
+#define hwloc_get_proc_cpubind HWLOC_NAME(get_proc_cpubind)
+#define hwloc_set_thread_cpubind HWLOC_NAME(set_thread_cpubind)
+#define hwloc_get_thread_cpubind HWLOC_NAME(get_thread_cpubind)
+
+#define hwloc_get_last_cpu_location HWLOC_NAME(get_last_cpu_location)
+#define hwloc_get_proc_last_cpu_location HWLOC_NAME(get_proc_last_cpu_location)
+
+#define HWLOC_MEMBIND_DEFAULT HWLOC_NAME_CAPS(MEMBIND_DEFAULT)
+#define HWLOC_MEMBIND_FIRSTTOUCH HWLOC_NAME_CAPS(MEMBIND_FIRSTTOUCH)
+#define HWLOC_MEMBIND_BIND HWLOC_NAME_CAPS(MEMBIND_BIND)
+#define HWLOC_MEMBIND_INTERLEAVE HWLOC_NAME_CAPS(MEMBIND_INTERLEAVE)
+#define HWLOC_MEMBIND_REPLICATE HWLOC_NAME_CAPS(MEMBIND_REPLICATE)
+#define HWLOC_MEMBIND_NEXTTOUCH HWLOC_NAME_CAPS(MEMBIND_NEXTTOUCH)
+#define HWLOC_MEMBIND_MIXED HWLOC_NAME_CAPS(MEMBIND_MIXED)
+
+#define hwloc_membind_policy_t HWLOC_NAME(membind_policy_t)
+
+#define HWLOC_MEMBIND_PROCESS HWLOC_NAME_CAPS(MEMBIND_PROCESS)
+#define HWLOC_MEMBIND_THREAD HWLOC_NAME_CAPS(MEMBIND_THREAD)
+#define HWLOC_MEMBIND_STRICT HWLOC_NAME_CAPS(MEMBIND_STRICT)
+#define HWLOC_MEMBIND_MIGRATE HWLOC_NAME_CAPS(MEMBIND_MIGRATE)
+#define HWLOC_MEMBIND_NOCPUBIND HWLOC_NAME_CAPS(MEMBIND_NOCPUBIND)
+
+#define hwloc_membind_flags_t HWLOC_NAME(membind_flags_t)
+
+#define hwloc_set_membind_nodeset HWLOC_NAME(set_membind_nodeset)
+#define hwloc_set_membind HWLOC_NAME(set_membind)
+#define hwloc_get_membind_nodeset HWLOC_NAME(get_membind_nodeset)
+#define hwloc_get_membind HWLOC_NAME(get_membind)
+#define hwloc_set_proc_membind_nodeset HWLOC_NAME(set_proc_membind_nodeset)
+#define hwloc_set_proc_membind HWLOC_NAME(set_proc_membind)
+#define hwloc_get_proc_membind_nodeset HWLOC_NAME(get_proc_membind_nodeset)
+#define hwloc_get_proc_membind HWLOC_NAME(get_proc_membind)
+#define hwloc_set_area_membind_nodeset HWLOC_NAME(set_area_membind_nodeset)
+#define hwloc_set_area_membind HWLOC_NAME(set_area_membind)
+#define hwloc_get_area_membind_nodeset HWLOC_NAME(get_area_membind_nodeset)
+#define hwloc_get_area_membind HWLOC_NAME(get_area_membind)
+#define hwloc_alloc_membind_nodeset HWLOC_NAME(alloc_membind_nodeset)
+#define hwloc_alloc_membind HWLOC_NAME(alloc_membind)
+#define hwloc_alloc HWLOC_NAME(alloc)
+#define hwloc_free HWLOC_NAME(free)
+
+#define hwloc_get_non_io_ancestor_obj HWLOC_NAME(get_non_io_ancestor_obj)
+#define hwloc_get_next_pcidev HWLOC_NAME(get_next_pcidev)
+#define hwloc_get_pcidev_by_busid HWLOC_NAME(get_pcidev_by_busid)
+#define hwloc_get_pcidev_by_busidstring HWLOC_NAME(get_pcidev_by_busidstring)
+#define hwloc_get_next_osdev HWLOC_NAME(get_next_osdev)
+#define hwloc_get_next_bridge HWLOC_NAME(get_next_bridge)
+#define hwloc_bridge_covers_pcibus HWLOC_NAME(bridge_covers_pcibus)
+#define hwloc_get_hostbridge_by_pcibus HWLOC_NAME(get_hostbridge_by_pcibus)
+
+/* hwloc/bitmap.h */
+
+#define hwloc_bitmap_s HWLOC_NAME(bitmap_s)
+#define hwloc_bitmap_t HWLOC_NAME(bitmap_t)
+#define hwloc_const_bitmap_t HWLOC_NAME(const_bitmap_t)
+
+#define hwloc_bitmap_alloc HWLOC_NAME(bitmap_alloc)
+#define hwloc_bitmap_alloc_full HWLOC_NAME(bitmap_alloc_full)
+#define hwloc_bitmap_free HWLOC_NAME(bitmap_free)
+#define hwloc_bitmap_dup HWLOC_NAME(bitmap_dup)
+#define hwloc_bitmap_copy HWLOC_NAME(bitmap_copy)
+#define hwloc_bitmap_snprintf HWLOC_NAME(bitmap_snprintf)
+#define hwloc_bitmap_asprintf HWLOC_NAME(bitmap_asprintf)
+#define hwloc_bitmap_sscanf HWLOC_NAME(bitmap_sscanf)
+#define hwloc_bitmap_list_snprintf HWLOC_NAME(bitmap_list_snprintf)
+#define hwloc_bitmap_list_asprintf HWLOC_NAME(bitmap_list_asprintf)
+#define hwloc_bitmap_list_sscanf HWLOC_NAME(bitmap_list_sscanf)
+#define hwloc_bitmap_taskset_snprintf HWLOC_NAME(bitmap_taskset_snprintf)
+#define hwloc_bitmap_taskset_asprintf HWLOC_NAME(bitmap_taskset_asprintf)
+#define hwloc_bitmap_taskset_sscanf HWLOC_NAME(bitmap_taskset_sscanf)
+#define hwloc_bitmap_zero HWLOC_NAME(bitmap_zero)
+#define hwloc_bitmap_fill HWLOC_NAME(bitmap_fill)
+#define hwloc_bitmap_from_ulong HWLOC_NAME(bitmap_from_ulong)
+
+#define hwloc_bitmap_from_ith_ulong HWLOC_NAME(bitmap_from_ith_ulong)
+#define hwloc_bitmap_to_ulong HWLOC_NAME(bitmap_to_ulong)
+#define hwloc_bitmap_to_ith_ulong HWLOC_NAME(bitmap_to_ith_ulong)
+#define hwloc_bitmap_only HWLOC_NAME(bitmap_only)
+#define hwloc_bitmap_allbut HWLOC_NAME(bitmap_allbut)
+#define hwloc_bitmap_set HWLOC_NAME(bitmap_set)
+#define hwloc_bitmap_set_range HWLOC_NAME(bitmap_set_range)
+#define hwloc_bitmap_set_ith_ulong HWLOC_NAME(bitmap_set_ith_ulong)
+#define hwloc_bitmap_clr HWLOC_NAME(bitmap_clr)
+#define hwloc_bitmap_clr_range HWLOC_NAME(bitmap_clr_range)
+#define hwloc_bitmap_isset HWLOC_NAME(bitmap_isset)
+#define hwloc_bitmap_iszero HWLOC_NAME(bitmap_iszero)
+#define hwloc_bitmap_isfull HWLOC_NAME(bitmap_isfull)
+#define hwloc_bitmap_isequal HWLOC_NAME(bitmap_isequal)
+#define hwloc_bitmap_intersects HWLOC_NAME(bitmap_intersects)
+#define hwloc_bitmap_isincluded HWLOC_NAME(bitmap_isincluded)
+#define hwloc_bitmap_or HWLOC_NAME(bitmap_or)
+#define hwloc_bitmap_and HWLOC_NAME(bitmap_and)
+#define hwloc_bitmap_andnot HWLOC_NAME(bitmap_andnot)
+#define hwloc_bitmap_xor HWLOC_NAME(bitmap_xor)
+#define hwloc_bitmap_not HWLOC_NAME(bitmap_not)
+#define hwloc_bitmap_first HWLOC_NAME(bitmap_first)
+#define hwloc_bitmap_last HWLOC_NAME(bitmap_last)
+#define hwloc_bitmap_next HWLOC_NAME(bitmap_next)
+#define hwloc_bitmap_singlify HWLOC_NAME(bitmap_singlify)
+#define hwloc_bitmap_compare_first HWLOC_NAME(bitmap_compare_first)
+#define hwloc_bitmap_compare HWLOC_NAME(bitmap_compare)
+#define hwloc_bitmap_weight HWLOC_NAME(bitmap_weight)
+
+/* hwloc/helper.h */
+
+#define hwloc_get_type_or_below_depth HWLOC_NAME(get_type_or_below_depth)
+#define hwloc_get_type_or_above_depth HWLOC_NAME(get_type_or_above_depth)
+#define hwloc_get_root_obj HWLOC_NAME(get_root_obj)
+#define hwloc_get_ancestor_obj_by_depth HWLOC_NAME(get_ancestor_obj_by_depth)
+#define hwloc_get_ancestor_obj_by_type HWLOC_NAME(get_ancestor_obj_by_type)
+#define hwloc_get_next_obj_by_depth HWLOC_NAME(get_next_obj_by_depth)
+#define hwloc_get_next_obj_by_type HWLOC_NAME(get_next_obj_by_type)
+#define hwloc_get_pu_obj_by_os_index HWLOC_NAME(get_pu_obj_by_os_index)
+#define hwloc_get_numanode_obj_by_os_index HWLOC_NAME(get_numanode_obj_by_os_index)
+#define hwloc_get_next_child HWLOC_NAME(get_next_child)
+#define hwloc_get_common_ancestor_obj HWLOC_NAME(get_common_ancestor_obj)
+#define hwloc_obj_is_in_subtree HWLOC_NAME(obj_is_in_subtree)
+#define hwloc_get_first_largest_obj_inside_cpuset HWLOC_NAME(get_first_largest_obj_inside_cpuset)
+#define hwloc_get_largest_objs_inside_cpuset HWLOC_NAME(get_largest_objs_inside_cpuset)
+#define hwloc_get_next_obj_inside_cpuset_by_depth HWLOC_NAME(get_next_obj_inside_cpuset_by_depth)
+#define hwloc_get_next_obj_inside_cpuset_by_type HWLOC_NAME(get_next_obj_inside_cpuset_by_type)
+#define hwloc_get_obj_inside_cpuset_by_depth HWLOC_NAME(get_obj_inside_cpuset_by_depth)
+#define hwloc_get_obj_inside_cpuset_by_type HWLOC_NAME(get_obj_inside_cpuset_by_type)
+#define hwloc_get_nbobjs_inside_cpuset_by_depth HWLOC_NAME(get_nbobjs_inside_cpuset_by_depth)
+#define hwloc_get_nbobjs_inside_cpuset_by_type HWLOC_NAME(get_nbobjs_inside_cpuset_by_type)
+#define hwloc_get_obj_index_inside_cpuset HWLOC_NAME(get_obj_index_inside_cpuset)
+#define hwloc_get_child_covering_cpuset HWLOC_NAME(get_child_covering_cpuset)
+#define hwloc_get_obj_covering_cpuset HWLOC_NAME(get_obj_covering_cpuset)
+#define hwloc_get_next_obj_covering_cpuset_by_depth HWLOC_NAME(get_next_obj_covering_cpuset_by_depth)
+#define hwloc_get_next_obj_covering_cpuset_by_type HWLOC_NAME(get_next_obj_covering_cpuset_by_type)
+#define hwloc_get_cache_type_depth HWLOC_NAME(get_cache_type_depth)
+#define hwloc_get_cache_covering_cpuset HWLOC_NAME(get_cache_covering_cpuset)
+#define hwloc_get_shared_cache_covering_obj HWLOC_NAME(get_shared_cache_covering_obj)
+#define hwloc_get_closest_objs HWLOC_NAME(get_closest_objs)
+#define hwloc_get_obj_below_by_type HWLOC_NAME(get_obj_below_by_type)
+#define hwloc_get_obj_below_array_by_type HWLOC_NAME(get_obj_below_array_by_type)
+#define hwloc_distrib_flags_e HWLOC_NAME(distrib_flags_e)
+#define HWLOC_DISTRIB_FLAG_REVERSE HWLOC_NAME_CAPS(DISTRIB_FLAG_REVERSE)
+#define hwloc_distrib HWLOC_NAME(distrib)
+#define hwloc_alloc_membind_policy HWLOC_NAME(alloc_membind_policy)
+#define hwloc_alloc_membind_policy_nodeset HWLOC_NAME(alloc_membind_policy_nodeset)
+#define hwloc_topology_get_complete_cpuset HWLOC_NAME(topology_get_complete_cpuset)
+#define hwloc_topology_get_topology_cpuset HWLOC_NAME(topology_get_topology_cpuset)
+#define hwloc_topology_get_allowed_cpuset HWLOC_NAME(topology_get_allowed_cpuset)
+#define hwloc_topology_get_complete_nodeset HWLOC_NAME(topology_get_complete_nodeset)
+#define hwloc_topology_get_topology_nodeset HWLOC_NAME(topology_get_topology_nodeset)
+#define hwloc_topology_get_allowed_nodeset HWLOC_NAME(topology_get_allowed_nodeset)
+#define hwloc_cpuset_to_nodeset HWLOC_NAME(cpuset_to_nodeset)
+#define hwloc_cpuset_to_nodeset_strict HWLOC_NAME(cpuset_to_nodeset_strict)
+#define hwloc_cpuset_from_nodeset HWLOC_NAME(cpuset_from_nodeset)
+#define hwloc_cpuset_from_nodeset_strict HWLOC_NAME(cpuset_from_nodeset_strict)
+#define hwloc_get_whole_distance_matrix_by_depth HWLOC_NAME(get_whole_distance_matrix_by_depth)
+#define hwloc_get_whole_distance_matrix_by_type HWLOC_NAME(get_whole_distance_matrix_by_type)
+#define hwloc_get_distance_matrix_covering_obj_by_depth HWLOC_NAME(get_distance_matrix_covering_obj_by_depth)
+#define hwloc_get_latency HWLOC_NAME(get_latency)
+
+/* export.h */
+
+#define hwloc_topology_export_xml HWLOC_NAME(topology_export_xml)
+#define hwloc_topology_export_xmlbuffer HWLOC_NAME(topology_export_xmlbuffer)
+#define hwloc_free_xmlbuffer HWLOC_NAME(free_xmlbuffer)
+#define hwloc_topology_set_userdata_export_callback HWLOC_NAME(topology_set_userdata_export_callback)
+#define hwloc_export_obj_userdata HWLOC_NAME(export_obj_userdata)
+#define hwloc_export_obj_userdata_base64 HWLOC_NAME(export_obj_userdata_base64)
+#define hwloc_topology_set_userdata_import_callback HWLOC_NAME(topology_set_userdata_import_callback)
+
+#define hwloc_topology_export_synthetic_flags_e HWLOC_NAME(topology_export_synthetic_flags_e)
+#define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES)
+#define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)
+#define hwloc_topology_export_synthetic HWLOC_NAME(topology_export_synthetic)
+
+/* diff.h */
+
+#define hwloc_topology_diff_obj_attr_type_e HWLOC_NAME(topology_diff_obj_attr_type_e)
+#define hwloc_topology_diff_obj_attr_type_t HWLOC_NAME(topology_diff_obj_attr_type_t)
+#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR_SIZE)
+#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR_NAME)
+#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR_INFO)
+#define hwloc_topology_diff_obj_attr_u HWLOC_NAME(topology_diff_obj_attr_u)
+#define hwloc_topology_diff_obj_attr_generic_s HWLOC_NAME(topology_diff_obj_attr_generic_s)
+#define hwloc_topology_diff_obj_attr_uint64_s HWLOC_NAME(topology_diff_obj_attr_uint64_s)
+#define hwloc_topology_diff_obj_attr_string_s HWLOC_NAME(topology_diff_obj_attr_string_s)
+#define hwloc_topology_diff_type_e HWLOC_NAME(topology_diff_type_e)
+#define hwloc_topology_diff_type_t HWLOC_NAME(topology_diff_type_t)
+#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR)
+#define HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX HWLOC_NAME_CAPS(TOPOLOGY_DIFF_TOO_COMPLEX)
+#define hwloc_topology_diff_u HWLOC_NAME(topology_diff_u)
+#define hwloc_topology_diff_t HWLOC_NAME(topology_diff_t)
+#define hwloc_topology_diff_generic_s HWLOC_NAME(topology_diff_generic_s)
+#define hwloc_topology_diff_obj_attr_s HWLOC_NAME(topology_diff_obj_attr_s)
+#define hwloc_topology_diff_too_complex_s HWLOC_NAME(topology_diff_too_complex_s)
+#define hwloc_topology_diff_build HWLOC_NAME(topology_diff_build)
+#define hwloc_topology_diff_apply_flags_e HWLOC_NAME(topology_diff_apply_flags_e)
+#define HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE HWLOC_NAME_CAPS(TOPOLOGY_DIFF_APPLY_REVERSE)
+#define hwloc_topology_diff_apply HWLOC_NAME(topology_diff_apply)
+#define hwloc_topology_diff_destroy HWLOC_NAME(topology_diff_destroy)
+#define hwloc_topology_diff_load_xml HWLOC_NAME(topology_diff_load_xml)
+#define hwloc_topology_diff_export_xml HWLOC_NAME(topology_diff_export_xml)
+#define hwloc_topology_diff_load_xmlbuffer HWLOC_NAME(topology_diff_load_xmlbuffer)
+#define hwloc_topology_diff_export_xmlbuffer HWLOC_NAME(topology_diff_export_xmlbuffer)
+
+/* glibc-sched.h */
+
+#define hwloc_cpuset_to_glibc_sched_affinity HWLOC_NAME(cpuset_to_glibc_sched_affinity)
+#define hwloc_cpuset_from_glibc_sched_affinity HWLOC_NAME(cpuset_from_glibc_sched_affinity)
+
+/* linux-libnuma.h */
+
+#define hwloc_cpuset_to_linux_libnuma_ulongs HWLOC_NAME(cpuset_to_linux_libnuma_ulongs)
+#define hwloc_nodeset_to_linux_libnuma_ulongs HWLOC_NAME(nodeset_to_linux_libnuma_ulongs)
+#define hwloc_cpuset_from_linux_libnuma_ulongs HWLOC_NAME(cpuset_from_linux_libnuma_ulongs)
+#define hwloc_nodeset_from_linux_libnuma_ulongs HWLOC_NAME(nodeset_from_linux_libnuma_ulongs)
+#define hwloc_cpuset_to_linux_libnuma_bitmask HWLOC_NAME(cpuset_to_linux_libnuma_bitmask)
+#define hwloc_nodeset_to_linux_libnuma_bitmask HWLOC_NAME(nodeset_to_linux_libnuma_bitmask)
+#define hwloc_cpuset_from_linux_libnuma_bitmask HWLOC_NAME(cpuset_from_linux_libnuma_bitmask)
+#define hwloc_nodeset_from_linux_libnuma_bitmask HWLOC_NAME(nodeset_from_linux_libnuma_bitmask)
+
+/* linux.h */
+
+#define hwloc_linux_parse_cpumap_file HWLOC_NAME(linux_parse_cpumap_file)
+#define hwloc_linux_set_tid_cpubind HWLOC_NAME(linux_set_tid_cpubind)
+#define hwloc_linux_get_tid_cpubind HWLOC_NAME(linux_get_tid_cpubind)
+#define hwloc_linux_get_tid_last_cpu_location HWLOC_NAME(linux_get_tid_last_cpu_location)
+
+/* openfabrics-verbs.h */
+
+#define hwloc_ibv_get_device_cpuset HWLOC_NAME(ibv_get_device_cpuset)
+#define hwloc_ibv_get_device_osdev HWLOC_NAME(ibv_get_device_osdev)
+#define hwloc_ibv_get_device_osdev_by_name HWLOC_NAME(ibv_get_device_osdev_by_name)
+
+/* myriexpress.h */
+
+#define hwloc_mx_board_get_device_cpuset HWLOC_NAME(mx_board_get_device_cpuset)
+#define hwloc_mx_endpoint_get_device_cpuset HWLOC_NAME(mx_endpoint_get_device_cpuset)
+
+/* intel-mic.h */
+
+#define hwloc_intel_mic_get_device_cpuset HWLOC_NAME(intel_mic_get_device_cpuset)
+#define hwloc_intel_mic_get_device_osdev_by_index HWLOC_NAME(intel_mic_get_device_osdev_by_index)
+
+/* opencl.h */
+
+#define hwloc_opencl_get_device_cpuset HWLOC_NAME(opencl_get_device_cpuset)
+#define hwloc_opencl_get_device_osdev HWLOC_NAME(opencl_get_device_osdev)
+#define hwloc_opencl_get_device_osdev_by_index HWLOC_NAME(opencl_get_device_osdev_by_index)
+
+/* cuda.h */
+
+#define hwloc_cuda_get_device_pci_ids HWLOC_NAME(cuda_get_device_pci_ids)
+#define hwloc_cuda_get_device_cpuset HWLOC_NAME(cuda_get_device_cpuset)
+#define hwloc_cuda_get_device_pcidev HWLOC_NAME(cuda_get_device_pcidev)
+#define hwloc_cuda_get_device_osdev HWLOC_NAME(cuda_get_device_osdev)
+#define hwloc_cuda_get_device_osdev_by_index HWLOC_NAME(cuda_get_device_osdev_by_index)
+
+/* cudart.h */
+
+#define hwloc_cudart_get_device_pci_ids HWLOC_NAME(cudart_get_device_pci_ids)
+#define hwloc_cudart_get_device_cpuset HWLOC_NAME(cudart_get_device_cpuset)
+#define hwloc_cudart_get_device_pcidev HWLOC_NAME(cudart_get_device_pcidev)
+#define hwloc_cudart_get_device_osdev_by_index HWLOC_NAME(cudart_get_device_osdev_by_index)
+
+/* nvml.h */
+
+#define hwloc_nvml_get_device_cpuset HWLOC_NAME(nvml_get_device_cpuset)
+#define hwloc_nvml_get_device_osdev HWLOC_NAME(nvml_get_device_osdev)
+#define hwloc_nvml_get_device_osdev_by_index HWLOC_NAME(nvml_get_device_osdev_by_index)
+
+/* gl.h */
+
+#define hwloc_gl_get_display_osdev_by_port_device HWLOC_NAME(gl_get_display_osdev_by_port_device)
+#define hwloc_gl_get_display_osdev_by_name HWLOC_NAME(gl_get_display_osdev_by_name)
+#define hwloc_gl_get_display_by_osdev HWLOC_NAME(gl_get_display_by_osdev)
+
+/* hwloc/plugins.h */
+
+#define hwloc_disc_component_type_e HWLOC_NAME(disc_component_type_e)
+#define HWLOC_DISC_COMPONENT_TYPE_CPU HWLOC_NAME_CAPS(DISC_COMPONENT_TYPE_CPU)
+#define HWLOC_DISC_COMPONENT_TYPE_GLOBAL HWLOC_NAME_CAPS(DISC_COMPONENT_TYPE_GLOBAL)
+#define HWLOC_DISC_COMPONENT_TYPE_MISC HWLOC_NAME_CAPS(DISC_COMPONENT_TYPE_MISC)
+#define hwloc_disc_component_type_t HWLOC_NAME(disc_component_type_t)
+#define hwloc_disc_component HWLOC_NAME(disc_component)
+
+#define hwloc_backend HWLOC_NAME(backend)
+#define hwloc_backend_flag_e HWLOC_NAME(backend_flag_e)
+#define HWLOC_BACKEND_FLAG_NEED_LEVELS HWLOC_NAME_CAPS(BACKEND_FLAG_NEED_LEVELS)
+
+#define hwloc_backend_alloc HWLOC_NAME(backend_alloc)
+#define hwloc_backend_enable HWLOC_NAME(backend_enable)
+#define hwloc_backends_get_obj_cpuset HWLOC_NAME(backends_get_obj_cpuset)
+#define hwloc_backends_notify_new_object HWLOC_NAME(backends_notify_new_object)
+
+#define hwloc_component_type_e HWLOC_NAME(component_type_e)
+#define HWLOC_COMPONENT_TYPE_DISC HWLOC_NAME_CAPS(COMPONENT_TYPE_DISC)
+#define HWLOC_COMPONENT_TYPE_XML HWLOC_NAME_CAPS(COMPONENT_TYPE_XML)
+#define hwloc_component_type_t HWLOC_NAME(component_type_t)
+#define hwloc_component HWLOC_NAME(component)
+
+#define hwloc_plugin_check_namespace HWLOC_NAME(plugin_check_namespace)
+
+#define hwloc_insert_object_by_cpuset HWLOC_NAME(insert_object_by_cpuset)
+#define hwloc_report_error_t HWLOC_NAME(report_error_t)
+#define hwloc_report_os_error HWLOC_NAME(report_os_error)
+#define hwloc_hide_errors HWLOC_NAME(hide_errors)
+#define hwloc__insert_object_by_cpuset HWLOC_NAME(_insert_object_by_cpuset)
+#define hwloc_insert_object_by_parent HWLOC_NAME(insert_object_by_parent)
+#define hwloc_alloc_setup_object HWLOC_NAME(alloc_setup_object)
+#define hwloc_obj_add_children_sets HWLOC_NAME(add_children_sets)
+
+#define hwloc_insert_pci_device_list HWLOC_NAME(insert_pci_device_list)
+#define hwloc_pci_find_cap HWLOC_NAME(pci_find_cap)
+#define hwloc_pci_find_linkspeed HWLOC_NAME(pci_find_linkspeed)
+#define hwloc_pci_prepare_bridge HWLOC_NAME(pci_prepare_bridge)
+
+/* hwloc/deprecated.h */
+
+#define hwloc_obj_type_of_string HWLOC_NAME(obj_type_of_string )
+#define hwloc_obj_snprintf HWLOC_NAME(obj_snprintf)
+#define hwloc_distributev HWLOC_NAME(distributev)
+#define hwloc_distribute HWLOC_NAME(distribute)
+#define hwloc_topology_insert_misc_object_by_parent HWLOC_NAME(topology_insert_misc_object_by_parent)
+
+/* private/debug.h */
+
+#define hwloc_debug HWLOC_NAME(debug)
+
+/* private/misc.h */
+
+#define hwloc_snprintf HWLOC_NAME(snprintf)
+#define hwloc_namecoloncmp HWLOC_NAME(namecoloncmp)
+#define hwloc_ffsl_manual HWLOC_NAME(ffsl_manual)
+#define hwloc_ffs32 HWLOC_NAME(ffs32)
+#define hwloc_ffsl_from_ffs32 HWLOC_NAME(ffsl_from_ffs32)
+#define hwloc_flsl_manual HWLOC_NAME(flsl_manual)
+#define hwloc_fls32 HWLOC_NAME(fls32)
+#define hwloc_flsl_from_fls32 HWLOC_NAME(flsl_from_fls32)
+#define hwloc_weight_long HWLOC_NAME(weight_long)
+#define hwloc_strncasecmp HWLOC_NAME(strncasecmp)
+
+/* private/cpuid-x86.h */
+
+#define hwloc_have_x86_cpuid HWLOC_NAME(have_x86_cpuid)
+#define hwloc_x86_cpuid HWLOC_NAME(x86_cpuid)
+
+/* private/xml.h */
+
+#define hwloc__xml_verbose HWLOC_NAME(_xml_verbose)
+
+#define hwloc__xml_import_state_s HWLOC_NAME(_xml_import_state_s)
+#define hwloc__xml_import_state_t HWLOC_NAME(_xml_import_state_t)
+#define hwloc__xml_import_diff HWLOC_NAME(_xml_import_diff)
+#define hwloc_xml_backend_data_s HWLOC_NAME(xml_backend_data_s)
+#define hwloc__xml_export_state_s HWLOC_NAME(_xml_export_state_s)
+#define hwloc__xml_export_state_t HWLOC_NAME(_xml_export_state_t)
+#define hwloc__xml_export_object HWLOC_NAME(_xml_export_object)
+#define hwloc__xml_export_diff HWLOC_NAME(_xml_export_diff)
+
+#define hwloc_xml_callbacks HWLOC_NAME(xml_callbacks)
+#define hwloc_xml_component HWLOC_NAME(xml_component)
+#define hwloc_xml_callbacks_register HWLOC_NAME(xml_callbacks_register)
+#define hwloc_xml_callbacks_reset HWLOC_NAME(xml_callbacks_reset)
+
+/* private/components.h */
+
+#define hwloc_disc_component_force_enable HWLOC_NAME(disc_component_force_enable)
+#define hwloc_disc_components_enable_others HWLOC_NAME(disc_components_instantiate_others)
+
+#define hwloc_backends_disable_all HWLOC_NAME(backends_disable_all)
+#define hwloc_backends_is_thissystem HWLOC_NAME(backends_is_thissystem)
+
+#define hwloc_components_init HWLOC_NAME(components_init)
+#define hwloc_components_destroy_all HWLOC_NAME(components_destroy_all)
+
+/* private/private.h */
+
+#define hwloc_ignore_type_e HWLOC_NAME(ignore_type_e)
+
+#define HWLOC_IGNORE_TYPE_NEVER HWLOC_NAME_CAPS(IGNORE_TYPE_NEVER)
+#define HWLOC_IGNORE_TYPE_KEEP_STRUCTURE HWLOC_NAME_CAPS(IGNORE_TYPE_KEEP_STRUCTURE)
+#define HWLOC_IGNORE_TYPE_ALWAYS HWLOC_NAME_CAPS(IGNORE_TYPE_ALWAYS)
+
+#define hwloc_os_distances_s HWLOC_NAME(os_distances_s)
+
+#define hwloc_xml_imported_distances_s HWLOC_NAME(xml_imported_distances_s)
+
+#define hwloc_alloc_obj_cpusets HWLOC_NAME(alloc_obj_cpusets)
+#define hwloc_setup_pu_level HWLOC_NAME(setup_pu_level)
+#define hwloc_get_sysctlbyname HWLOC_NAME(get_sysctlbyname)
+#define hwloc_get_sysctl HWLOC_NAME(get_sysctl)
+#define hwloc_fallback_nbprocessors HWLOC_NAME(fallback_nbprocessors)
+#define hwloc_connect_children HWLOC_NAME(connect_children)
+#define hwloc_connect_levels HWLOC_NAME(connect_levels)
+
+#define hwloc__object_cpusets_compare_first HWLOC_NAME(_object_cpusets_compare_first)
+#define hwloc__reorder_children HWLOC_NAME(_reorder_children)
+
+#define hwloc_topology_setup_defaults HWLOC_NAME(topology_setup_defaults)
+#define hwloc_topology_clear HWLOC_NAME(topology_clear)
+
+#define hwloc__add_info HWLOC_NAME(_add_info)
+#define hwloc__find_info_slot HWLOC_NAME(_find_info_slot)
+#define hwloc__move_infos HWLOC_NAME(_move_infos)
+#define hwloc__free_infos HWLOC_NAME(_free_infos)
+
+#define hwloc_binding_hooks HWLOC_NAME(binding_hooks)
+#define hwloc_set_native_binding_hooks HWLOC_NAME(set_native_binding_hooks)
+#define hwloc_set_binding_hooks HWLOC_NAME(set_binding_hooks)
+
+#define hwloc_set_linuxfs_hooks HWLOC_NAME(set_linuxfs_hooks)
+#define hwloc_set_bgq_hooks HWLOC_NAME(set_bgq_hooks)
+#define hwloc_set_solaris_hooks HWLOC_NAME(set_solaris_hooks)
+#define hwloc_set_aix_hooks HWLOC_NAME(set_aix_hooks)
+#define hwloc_set_osf_hooks HWLOC_NAME(set_osf_hooks)
+#define hwloc_set_windows_hooks HWLOC_NAME(set_windows_hooks)
+#define hwloc_set_darwin_hooks HWLOC_NAME(set_darwin_hooks)
+#define hwloc_set_freebsd_hooks HWLOC_NAME(set_freebsd_hooks)
+#define hwloc_set_netbsd_hooks HWLOC_NAME(set_netbsd_hooks)
+#define hwloc_set_hpux_hooks HWLOC_NAME(set_hpux_hooks)
+
+#define hwloc_add_uname_info HWLOC_NAME(add_uname_info)
+#define hwloc_free_unlinked_object HWLOC_NAME(free_unlinked_object)
+#define hwloc__duplicate_objects HWLOC_NAME(_duplicate_objects)
+
+#define hwloc_alloc_heap HWLOC_NAME(alloc_heap)
+#define hwloc_alloc_mmap HWLOC_NAME(alloc_mmap)
+#define hwloc_free_heap HWLOC_NAME(free_heap)
+#define hwloc_free_mmap HWLOC_NAME(free_mmap)
+#define hwloc_alloc_or_fail HWLOC_NAME(alloc_or_fail)
+
+#define hwloc_distances_init HWLOC_NAME(distances_init)
+#define hwloc_distances_destroy HWLOC_NAME(distances_destroy)
+#define hwloc_distances_set HWLOC_NAME(distances_set)
+#define hwloc_distances_set_from_env HWLOC_NAME(distances_set_from_env)
+#define hwloc_distances_restrict_os HWLOC_NAME(distances_restrict_os)
+#define hwloc_distances_restrict HWLOC_NAME(distances_restrict)
+#define hwloc_distances_finalize_os HWLOC_NAME(distances_finalize_os)
+#define hwloc_distances_finalize_logical HWLOC_NAME(distances_finalize_logical)
+#define hwloc_clear_object_distances HWLOC_NAME(clear_object_distances)
+#define hwloc_clear_object_distances_one HWLOC_NAME(clear_object_distances_one)
+#define hwloc_group_by_distances HWLOC_NAME(group_by_distances)
+
+#define hwloc_encode_to_base64 HWLOC_NAME(encode_to_base64)
+#define hwloc_decode_from_base64 HWLOC_NAME(decode_from_base64)
+
+#define hwloc_obj_add_info_nodup HWLOC_NAME(obj_add_info_nodup)
+
+#define hwloc_progname HWLOC_NAME(progname)
+
+#define hwloc_bitmap_compare_inclusion HWLOC_NAME(bitmap_compare_inclusion)
+
+/* private/solaris-chiptype.h */
+
+#define hwloc_solaris_get_chip_type HWLOC_NAME(solaris_get_chip_type)
+#define hwloc_solaris_get_chip_model HWLOC_NAME(solaris_get_chip_model)
+
+#endif /* HWLOC_SYM_TRANSFORM */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_RENAME_H */
diff --git a/ext/hwloc/include/numa.h b/ext/hwloc/include/numa.h
new file mode 100644
index 0000000..1dbc137
--- /dev/null
+++ b/ext/hwloc/include/numa.h
@@ -0,0 +1,468 @@
+/* Copyright (C) 2003,2004 Andi Kleen, SuSE Labs.
+
+   libnuma is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; version
+   2.1.
+
+   libnuma is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should find a copy of v2.1 of the GNU Lesser General Public License
+   somewhere on your Linux system; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifndef _NUMA_H
+#define _NUMA_H 1
+
+/* allow an application to test for the current programming interface: */
+#define LIBNUMA_API_VERSION 2
+
+/* Simple NUMA policy library */
+
+#include <stddef.h>
+#include <string.h>
+#include <sys/types.h>
+#include <stdlib.h>
+
+#if defined(__x86_64__) || defined(__i386__)
+#define NUMA_NUM_NODES  128
+#else
+#define NUMA_NUM_NODES  2048
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct { /* fixed-size node bit map holding NUMA_NUM_NODES bits */
+        unsigned long n[NUMA_NUM_NODES/(sizeof(unsigned long)*8)]; /* sizeof(unsigned long)*8 bits per array element */
+} nodemask_t;
+
+struct bitmask { /* bit map described by a bit count plus a pointer to its words */
+	unsigned long size; /* number of bits in the map */
+	unsigned long *maskp; /* words holding the bits; the inline wrappers in this header point it at caller-provided storage */
+};
+
+/* operations on struct bitmask */
+int numa_bitmask_isbitset(const struct bitmask *, unsigned int);
+struct bitmask *numa_bitmask_setall(struct bitmask *);
+struct bitmask *numa_bitmask_clearall(struct bitmask *);
+struct bitmask *numa_bitmask_setbit(struct bitmask *, unsigned int);
+struct bitmask *numa_bitmask_clearbit(struct bitmask *, unsigned int);
+unsigned int numa_bitmask_nbytes(struct bitmask *);
+struct bitmask *numa_bitmask_alloc(unsigned int);
+void numa_bitmask_free(struct bitmask *);
+int numa_bitmask_equal(const struct bitmask *, const struct bitmask *);
+void copy_nodemask_to_bitmask(nodemask_t *, struct bitmask *);
+void copy_bitmask_to_nodemask(struct bitmask *, nodemask_t *);
+void copy_bitmask_to_bitmask(struct bitmask *, struct bitmask *);
+
+/* compatibility for codes that used them: */
+
+static inline void nodemask_zero(nodemask_t *mask)
+{
+	struct bitmask view; /* temporary view spanning the whole nodemask_t, handed to numa_bitmask_clearall() */
+
+	view.size = 8 * sizeof(nodemask_t);
+	view.maskp = (unsigned long *)mask;
+	numa_bitmask_clearall(&view);
+}
+
+static inline void nodemask_zero_compat(nodemask_t *mask)
+{
+	struct bitmask whole; /* non-owning descriptor for all sizeof(nodemask_t)*8 bits of *mask */
+
+	whole.size = 8 * sizeof(nodemask_t);
+	whole.maskp = (unsigned long *)mask;
+	numa_bitmask_clearall(&whole);
+}
+
+static inline void nodemask_set_compat(nodemask_t *mask, int node)
+{
+	if ((unsigned)node < NUMA_NUM_NODES) /* set bit `node`; negative or too-large nodes are ignored instead of writing outside n[] */
+		mask->n[node / (8*sizeof(unsigned long))] |= 1UL << (node % (8*sizeof(unsigned long)));
+}
+
+static inline void nodemask_clr_compat(nodemask_t *mask, int node)
+{
+	if ((unsigned)node < NUMA_NUM_NODES) /* clear bit `node`; negative or too-large nodes are ignored instead of writing outside n[] */
+		mask->n[node / (8*sizeof(unsigned long))] &= ~(1UL << (node % (8*sizeof(unsigned long))));
+}
+
+static inline int nodemask_isset_compat(const nodemask_t *mask, int node)
+{
+	/* Yields 1 when bit `node` is set in *mask, 0 when it is clear or `node` is out of range. */
+	const size_t bits_per_word = 8*sizeof(unsigned long);
+
+	if ((unsigned)node >= NUMA_NUM_NODES)
+		return 0;
+	return ((mask->n[node / bits_per_word] >> (node % bits_per_word)) & 1UL) ? 1 : 0;
+}
+
+static inline int nodemask_equal(const nodemask_t *a, const nodemask_t *b)
+{
+	const unsigned long nbits = sizeof(nodemask_t) * 8; /* both views span the full nodemask_t */
+	struct bitmask lhs, rhs; /* the casts drop const because maskp is not const-qualified */
+
+	lhs.size = nbits;
+	lhs.maskp = (unsigned long *)a;
+	rhs.size = nbits;
+	rhs.maskp = (unsigned long *)b;
+
+	return numa_bitmask_equal(&lhs, &rhs); /* result of numa_bitmask_equal() is passed through unchanged */
+}
+
+static inline int nodemask_equal_compat(const nodemask_t *a, const nodemask_t *b)
+{
+	struct bitmask first, second; /* views over a and b, each sizeof(nodemask_t)*8 bits wide */
+
+	first.size = 8 * sizeof(nodemask_t);
+	first.maskp = (unsigned long *)a;
+
+	second.size = 8 * sizeof(nodemask_t);
+	second.maskp = (unsigned long *)b;
+
+	return numa_bitmask_equal(&first, &second); /* comparison itself is delegated to numa_bitmask_equal() */
+}
+
+/* NUMA support available. If this returns a negative value, all other functions
+   in this library are undefined. */
+int numa_available(void);
+
+/* Basic NUMA state */
+
+/* Get max available node */
+int numa_max_node(void);
+int numa_max_possible_node(void);
+/* Return preferred node */
+int numa_preferred(void);
+
+/* Return node size and free memory */
+long long numa_node_size64(int node, long long *freep);
+long numa_node_size(int node, long *freep);
+
+int numa_pagesize(void);
+
+/* Set with all nodes from which the calling process may allocate memory.
+   Only valid after numa_available. */
+extern struct bitmask *numa_all_nodes_ptr;
+
+/* Set with all nodes the kernel has exposed to userspace */
+extern struct bitmask *numa_nodes_ptr;
+
+/* For source compatibility */
+extern nodemask_t numa_all_nodes;
+
+/* Set with all cpus. */
+extern struct bitmask *numa_all_cpus_ptr;
+
+/* Set with no nodes */
+extern struct bitmask *numa_no_nodes_ptr;
+
+/* Source compatibility */
+extern nodemask_t numa_no_nodes;
+
+/* Only run and allocate memory from a specific set of nodes. */
+void numa_bind(struct bitmask *nodes);
+
+/* Set the NUMA node interleaving mask. 0 to turn off interleaving */
+void numa_set_interleave_mask(struct bitmask *nodemask);
+
+/* Return the current interleaving mask */
+struct bitmask *numa_get_interleave_mask(void);
+
+/* allocate a bitmask big enough for all nodes */
+struct bitmask *numa_allocate_nodemask(void);
+
+static inline void numa_free_nodemask(struct bitmask *b)
+{
+	numa_bitmask_free(b); /* thin alias: forwards b unchanged to numa_bitmask_free() */
+}
+
+/* Some node to preferably allocate memory from for task. */
+void numa_set_preferred(int node);
+
+/* Set local memory allocation policy for task */
+void numa_set_localalloc(void);
+
+/* Only allocate memory from the nodes set in mask. 0 to turn off */
+void numa_set_membind(struct bitmask *nodemask);
+
+/* Return current membind */
+struct bitmask *numa_get_membind(void);
+
+/* Return allowed memories [nodes] */
+struct bitmask *numa_get_mems_allowed(void);
+
+int numa_get_interleave_node(void);
+
+/* NUMA memory allocation. These functions always round to page size
+   and are relatively slow. */
+
+/* Alloc memory page interleaved on nodes in mask */
+void *numa_alloc_interleaved_subset(size_t size, struct bitmask *nodemask);
+/* Alloc memory page interleaved on all nodes. */
+void *numa_alloc_interleaved(size_t size);
+/* Alloc memory located on node */
+void *numa_alloc_onnode(size_t size, int node);
+/* Alloc memory on local node */
+void *numa_alloc_local(size_t size);
+/* Allocation with current policy */
+void *numa_alloc(size_t size);
+/* Change the size of a memory area preserving the memory policy */
+void *numa_realloc(void *old_addr, size_t old_size, size_t new_size);
+/* Free memory allocated by the functions above */
+void numa_free(void *mem, size_t size);
+
+/* Low level functions, primarily for shared memory. All memory
+   processed by these must not be touched yet */
+
+/* Interleave a memory area. */
+void numa_interleave_memory(void *mem, size_t size, struct bitmask *mask);
+
+/* Allocate a memory area on a specific node. */
+void numa_tonode_memory(void *start, size_t size, int node);
+
+/* Allocate memory on a mask of nodes. */
+void numa_tonodemask_memory(void *mem, size_t size, struct bitmask *mask);
+
+/* Allocate a memory area on the current node. */
+void numa_setlocal_memory(void *start, size_t size);
+
+/* Allocate memory area with current memory policy */
+void numa_police_memory(void *start, size_t size);
+
+/* Run current task only on nodes in mask */
+int numa_run_on_node_mask(struct bitmask *mask);
+/* Run current task only on node */
+int numa_run_on_node(int node);
+/* Return current mask of nodes the task can run on */
+struct bitmask * numa_get_run_node_mask(void);
+
+/* When strict is set, fail the allocation if memory cannot be allocated on the target node(s). */
+void numa_set_bind_policy(int strict);
+
+/* Fail when existing memory has incompatible policy */
+void numa_set_strict(int flag);
+
+/* maximum nodes (size of kernel nodemask_t) */
+int numa_num_possible_nodes(void);
+
+/* maximum cpus (size of kernel cpumask_t) */
+int numa_num_possible_cpus(void);
+
+/* nodes in the system */
+int numa_num_configured_nodes(void);
+
+/* maximum cpus */
+int numa_num_configured_cpus(void);
+
+/* maximum cpus allowed to current task */
+int numa_num_task_cpus(void);
+int numa_num_thread_cpus(void); /* backward compatibility */
+
+/* maximum nodes allowed to current task */
+int numa_num_task_nodes(void);
+int numa_num_thread_nodes(void); /* backward compatibility */
+
+/* allocate a bitmask the size of the kernel cpumask_t */
+struct bitmask *numa_allocate_cpumask(void);
+
+static inline void numa_free_cpumask(struct bitmask *b)
+{
+	numa_bitmask_free(b); /* thin alias: forwards b unchanged to numa_bitmask_free() */
+}
+
+/* Convert node to CPU mask. -1/errno on failure, otherwise 0. */
+int numa_node_to_cpus(int, struct bitmask *);
+
+/* report the node of the specified cpu. -1/errno on invalid cpu. */
+int numa_node_of_cpu(int cpu);
+
+/* Report distance of node1 from node2. 0 on error.*/
+int numa_distance(int node1, int node2);
+
+/* Error handling. */
+/* This is an internal function in libnuma that can be overridden by a user
+   program. Default is to print an error to stderr and exit if numa_exit_on_error
+   is true. */
+void numa_error(char *where);
+
+/* When true exit the program when a NUMA system call (except numa_available)
+   fails */
+extern int numa_exit_on_error;
+/* Warning function. Can also be overridden. Default is to print on stderr
+   once. */
+void numa_warn(int num, char *fmt, ...);
+
+/* When true exit the program on a numa_warn() call */
+extern int numa_exit_on_warn;
+
+int numa_migrate_pages(int pid, struct bitmask *from, struct bitmask *to);
+
+int numa_move_pages(int pid, unsigned long count, void **pages,
+		const int *nodes, int *status, int flags);
+
+int numa_sched_getaffinity(pid_t, struct bitmask *);
+int numa_sched_setaffinity(pid_t, struct bitmask *);
+
+/* Convert an ascii list of nodes to a bitmask */
+struct bitmask *numa_parse_nodestring(char *);
+
+/* Convert an ascii list of cpu to a bitmask */
+struct bitmask *numa_parse_cpustring(char *);
+
+/*
+ * The following functions are for source code compatibility
+ * with releases prior to version 2.
+ * Such codes should be compiled with NUMA_VERSION1_COMPATIBILITY defined.
+ */
+
+static inline void numa_set_interleave_mask_compat(nodemask_t *nodemask)
+{
+	struct bitmask view; /* temporary view over *nodemask, passed on to numa_set_interleave_mask() */
+
+	view.size = 8 * sizeof(nodemask_t);
+	view.maskp = (unsigned long *)nodemask;
+	numa_set_interleave_mask(&view);
+}
+
+static inline nodemask_t numa_get_interleave_mask_compat(void)
+{
+	/* Copies the bitmask from numa_get_interleave_mask() into a nodemask_t, frees the bitmask, returns the copy. */
+	nodemask_t mask;
+	struct bitmask *tp = numa_get_interleave_mask();
+
+	copy_bitmask_to_nodemask(tp, &mask);
+	numa_bitmask_free(tp);
+	return mask;
+}
+
+static inline void numa_bind_compat(nodemask_t *mask)
+{
+	/* Copies *mask into a bitmask from numa_allocate_nodemask(), calls numa_bind() with it, then frees it. */
+	struct bitmask *nodes = numa_allocate_nodemask();
+
+	copy_nodemask_to_bitmask(mask, nodes);
+	numa_bind(nodes);
+	numa_bitmask_free(nodes);
+}
+
+static inline void numa_set_membind_compat(nodemask_t *mask)
+{
+	struct bitmask view; /* temporary view over *mask, passed on to numa_set_membind() */
+
+	view.size = 8 * sizeof(nodemask_t);
+	view.maskp = (unsigned long *)mask;
+	numa_set_membind(&view);
+}
+
+static inline nodemask_t numa_get_membind_compat(void)
+{
+	/* Copies the bitmask from numa_get_membind() into a nodemask_t, frees the bitmask, returns the copy. */
+	nodemask_t mask;
+	struct bitmask *tp = numa_get_membind();
+
+	copy_bitmask_to_nodemask(tp, &mask);
+	numa_bitmask_free(tp);
+	return mask;
+}
+
+static inline void *numa_alloc_interleaved_subset_compat(size_t size,
+					const nodemask_t *mask)
+{
+	struct bitmask nodes; /* non-owning view; the cast drops const because maskp is not const-qualified */
+
+	nodes.size = 8 * sizeof(nodemask_t);
+	nodes.maskp = (unsigned long *)mask;
+	return numa_alloc_interleaved_subset(size, &nodes);
+}
+
+static inline int numa_run_on_node_mask_compat(const nodemask_t *mask)
+{
+	struct bitmask nodes; /* non-owning view; the cast drops const because maskp is not const-qualified */
+
+	nodes.size = 8 * sizeof(nodemask_t);
+	nodes.maskp = (unsigned long *)mask;
+	return numa_run_on_node_mask(&nodes);
+}
+
+static inline nodemask_t numa_get_run_node_mask_compat(void)
+{
+	/* Copies the bitmask from numa_get_run_node_mask() into a nodemask_t, frees the bitmask, returns the copy. */
+	nodemask_t mask;
+	struct bitmask *tp = numa_get_run_node_mask();
+
+	copy_bitmask_to_nodemask(tp, &mask);
+	numa_bitmask_free(tp);
+	return mask;
+}
+
+static inline void numa_interleave_memory_compat(void *mem, size_t size,
+						const nodemask_t *mask)
+{
+	struct bitmask nodes; /* non-owning view; the cast drops const because maskp is not const-qualified */
+
+	nodes.size = 8 * sizeof(nodemask_t);
+	nodes.maskp = (unsigned long *)mask;
+	numa_interleave_memory(mem, size, &nodes);
+}
+
+static inline void numa_tonodemask_memory_compat(void *mem, size_t size,
+						const nodemask_t *mask)
+{
+	struct bitmask nodes; /* non-owning view; the cast drops const because maskp is not const-qualified */
+
+	nodes.size = 8 * sizeof(nodemask_t);
+	nodes.maskp = (unsigned long *)mask;
+	numa_tonodemask_memory(mem, size, &nodes);
+}
+
+static inline int numa_sched_getaffinity_compat(pid_t pid, unsigned len,
+						unsigned long *mask)
+{
+	struct bitmask tmp; /* view over the caller's buffer; len is scaled by 8 to give the bit count */
+
+	tmp.maskp = mask;
+	tmp.size = (unsigned long)len * 8; /* widen before multiplying so the product is not computed in unsigned int */
+	return numa_sched_getaffinity(pid, &tmp);
+}
+
+static inline int numa_sched_setaffinity_compat(pid_t pid, unsigned len,
+						unsigned long *mask)
+{
+	struct bitmask tmp; /* view over the caller's buffer; len is scaled by 8 to give the bit count */
+
+	tmp.maskp = mask;
+	tmp.size = (unsigned long)len * 8; /* widen before multiplying so the product is not computed in unsigned int */
+	return numa_sched_setaffinity(pid, &tmp);
+}
+
+static inline int numa_node_to_cpus_compat(int node, unsigned long *buffer,
+							int buffer_len)
+{
+	struct bitmask tmp; /* view over the caller's buffer; buffer_len is scaled by 8 to give the bit count */
+
+	tmp.maskp = buffer;
+	tmp.size = (unsigned long)buffer_len * 8; /* widen before multiplying to avoid signed int overflow */
+	return numa_node_to_cpus(node, &tmp);
+}
+
+/* end of version 1 compatibility functions */
+
+/*
+ * To compile an application that uses libnuma version 1:
+ *   add -DNUMA_VERSION1_COMPATIBILITY to your Makefile's CFLAGS
+ */
+#ifdef NUMA_VERSION1_COMPATIBILITY
+#include <numacompat1.h>
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/ext/hwloc/include/pci/config.h b/ext/hwloc/include/pci/config.h
new file mode 100644
index 0000000..beecb1d
--- /dev/null
+++ b/ext/hwloc/include/pci/config.h
@@ -0,0 +1,16 @@
+#define PCI_CONFIG_H /* NOTE(review): defined unconditionally, there is no #ifndef guard around this file -- confirm that is intended */
+#define PCI_ARCH_X86_64 /* NOTE(review): architecture and OS are fixed here rather than detected; presumably generated configure output -- confirm for other targets */
+#define PCI_OS_LINUX
+#define PCI_HAVE_PM_LINUX_SYSFS
+#define PCI_HAVE_PM_LINUX_PROC
+#define PCI_HAVE_LINUX_BYTEORDER_H
+#define PCI_PATH_PROC_BUS_PCI "/proc/bus/pci"
+#define PCI_PATH_SYS_BUS_PCI "/sys/bus/pci"
+#define PCI_HAVE_PM_INTEL_CONF
+#define PCI_HAVE_64BIT_ADDRESS
+#define PCI_HAVE_PM_DUMP
+#define PCI_COMPRESSED_IDS
+#define PCI_IDS "pci.ids.gz"
+#define PCI_PATH_IDS_DIR "/usr/share/misc"
+#define PCI_USE_DNS
+#define PCI_ID_DOMAIN "pci.id.ucw.cz"
diff --git a/ext/hwloc/include/pci/header.h b/ext/hwloc/include/pci/header.h
new file mode 100644
index 0000000..d481f27
--- /dev/null
+++ b/ext/hwloc/include/pci/header.h
@@ -0,0 +1,1195 @@
+/*
+ *	The PCI Library -- PCI Header Structure (based on <linux/pci.h>)
+ *
+ *	Copyright (c) 1997--2010 Martin Mares <mj at ucw.cz>
+ *
+ *	Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+/*
+ * Under PCI, each device has 256 bytes of configuration address space,
+ * of which the first 64 bytes are standardized as follows:
+ */
+#define PCI_VENDOR_ID		0x00	/* 16 bits */
+#define PCI_DEVICE_ID		0x02	/* 16 bits */
+#define PCI_COMMAND		0x04	/* 16 bits */
+#define  PCI_COMMAND_IO		0x1	/* Enable response in I/O space */
+#define  PCI_COMMAND_MEMORY	0x2	/* Enable response in Memory space */
+#define  PCI_COMMAND_MASTER	0x4	/* Enable bus mastering */
+#define  PCI_COMMAND_SPECIAL	0x8	/* Enable response to special cycles */
+#define  PCI_COMMAND_INVALIDATE	0x10	/* Use memory write and invalidate */
+#define  PCI_COMMAND_VGA_PALETTE 0x20	/* Enable palette snooping */
+#define  PCI_COMMAND_PARITY	0x40	/* Enable parity checking */
+#define  PCI_COMMAND_WAIT 	0x80	/* Enable address/data stepping */
+#define  PCI_COMMAND_SERR	0x100	/* Enable SERR */
+#define  PCI_COMMAND_FAST_BACK	0x200	/* Enable back-to-back writes */
+#define  PCI_COMMAND_DISABLE_INTx	0x400	/* PCIE: Disable INTx interrupts */
+
+#define PCI_STATUS		0x06	/* 16 bits */
+#define  PCI_STATUS_INTx	0x08	/* PCIE: INTx interrupt pending */
+#define  PCI_STATUS_CAP_LIST	0x10	/* Support Capability List */
+#define  PCI_STATUS_66MHZ	0x20	/* Support 66 Mhz PCI 2.1 bus */
+#define  PCI_STATUS_UDF		0x40	/* Support User Definable Features [obsolete] */
+#define  PCI_STATUS_FAST_BACK	0x80	/* Accept fast-back to back */
+#define  PCI_STATUS_PARITY	0x100	/* Detected parity error */
+#define  PCI_STATUS_DEVSEL_MASK	0x600	/* DEVSEL timing */
+#define  PCI_STATUS_DEVSEL_FAST	0x000
+#define  PCI_STATUS_DEVSEL_MEDIUM 0x200
+#define  PCI_STATUS_DEVSEL_SLOW 0x400
+#define  PCI_STATUS_SIG_TARGET_ABORT 0x800 /* Set on target abort */
+#define  PCI_STATUS_REC_TARGET_ABORT 0x1000 /* Master ack of " */
+#define  PCI_STATUS_REC_MASTER_ABORT 0x2000 /* Set on master abort */
+#define  PCI_STATUS_SIG_SYSTEM_ERROR 0x4000 /* Set when we drive SERR */
+#define  PCI_STATUS_DETECTED_PARITY 0x8000 /* Set on parity error */
+
+#define PCI_CLASS_REVISION	0x08	/* High 24 bits are class, low 8
+					   revision */
+#define PCI_REVISION_ID         0x08    /* Revision ID */
+#define PCI_CLASS_PROG          0x09    /* Reg. Level Programming Interface */
+#define PCI_CLASS_DEVICE        0x0a    /* Device class */
+
+#define PCI_CACHE_LINE_SIZE	0x0c	/* 8 bits */
+#define PCI_LATENCY_TIMER	0x0d	/* 8 bits */
+#define PCI_HEADER_TYPE		0x0e	/* 8 bits */
+#define  PCI_HEADER_TYPE_NORMAL	0
+#define  PCI_HEADER_TYPE_BRIDGE 1
+#define  PCI_HEADER_TYPE_CARDBUS 2
+
+#define PCI_BIST		0x0f	/* 8 bits */
+#define PCI_BIST_CODE_MASK	0x0f	/* Return result */
+#define PCI_BIST_START		0x40	/* 1 to start BIST, 2 secs or less */
+#define PCI_BIST_CAPABLE	0x80	/* 1 if BIST capable */
+
+/*
+ * Base addresses specify locations in memory or I/O space.
+ * Decoded size can be determined by writing a value of
+ * 0xffffffff to the register, and reading it back.  Only
+ * 1 bits are decoded.
+ */
+#define PCI_BASE_ADDRESS_0	0x10	/* 32 bits */
+#define PCI_BASE_ADDRESS_1	0x14	/* 32 bits [htype 0,1 only] */
+#define PCI_BASE_ADDRESS_2	0x18	/* 32 bits [htype 0 only] */
+#define PCI_BASE_ADDRESS_3	0x1c	/* 32 bits */
+#define PCI_BASE_ADDRESS_4	0x20	/* 32 bits */
+#define PCI_BASE_ADDRESS_5	0x24	/* 32 bits */
+#define  PCI_BASE_ADDRESS_SPACE	0x01	/* 0 = memory, 1 = I/O */
+#define  PCI_BASE_ADDRESS_SPACE_IO 0x01
+#define  PCI_BASE_ADDRESS_SPACE_MEMORY 0x00
+#define  PCI_BASE_ADDRESS_MEM_TYPE_MASK 0x06
+#define  PCI_BASE_ADDRESS_MEM_TYPE_32	0x00	/* 32 bit address */
+#define  PCI_BASE_ADDRESS_MEM_TYPE_1M	0x02	/* Below 1M [obsolete] */
+#define  PCI_BASE_ADDRESS_MEM_TYPE_64	0x04	/* 64 bit address */
+#define  PCI_BASE_ADDRESS_MEM_PREFETCH	0x08	/* prefetchable? */
+#define  PCI_BASE_ADDRESS_MEM_MASK	(~(pciaddr_t)0x0f)
+#define  PCI_BASE_ADDRESS_IO_MASK	(~(pciaddr_t)0x03)
+/* bit 1 is reserved if address_space = 1 */
+
+/* Header type 0 (normal devices) */
+#define PCI_CARDBUS_CIS		0x28
+#define PCI_SUBSYSTEM_VENDOR_ID	0x2c
+#define PCI_SUBSYSTEM_ID	0x2e
+#define PCI_ROM_ADDRESS		0x30	/* Bits 31..11 are address, 10..1 reserved */
+#define  PCI_ROM_ADDRESS_ENABLE	0x01
+#define PCI_ROM_ADDRESS_MASK	(~(pciaddr_t)0x7ff)
+
+#define PCI_CAPABILITY_LIST	0x34	/* Offset of first capability list entry */
+
+/* 0x35-0x3b are reserved */
+#define PCI_INTERRUPT_LINE	0x3c	/* 8 bits */
+#define PCI_INTERRUPT_PIN	0x3d	/* 8 bits */
+#define PCI_MIN_GNT		0x3e	/* 8 bits */
+#define PCI_MAX_LAT		0x3f	/* 8 bits */
+
+/* Header type 1 (PCI-to-PCI bridges) */
+#define PCI_PRIMARY_BUS		0x18	/* Primary bus number */
+#define PCI_SECONDARY_BUS	0x19	/* Secondary bus number */
+#define PCI_SUBORDINATE_BUS	0x1a	/* Highest bus number behind the bridge */
+#define PCI_SEC_LATENCY_TIMER	0x1b	/* Latency timer for secondary interface */
+#define PCI_IO_BASE		0x1c	/* I/O range behind the bridge */
+#define PCI_IO_LIMIT		0x1d
+#define  PCI_IO_RANGE_TYPE_MASK	0x0f	/* I/O bridging type */
+#define  PCI_IO_RANGE_TYPE_16	0x00
+#define  PCI_IO_RANGE_TYPE_32	0x01
+#define  PCI_IO_RANGE_MASK	~0x0f
+#define PCI_SEC_STATUS		0x1e	/* Secondary status register */
+#define PCI_MEMORY_BASE		0x20	/* Memory range behind */
+#define PCI_MEMORY_LIMIT	0x22
+#define  PCI_MEMORY_RANGE_TYPE_MASK 0x0f
+#define  PCI_MEMORY_RANGE_MASK	~0x0f
+#define PCI_PREF_MEMORY_BASE	0x24	/* Prefetchable memory range behind */
+#define PCI_PREF_MEMORY_LIMIT	0x26
+#define  PCI_PREF_RANGE_TYPE_MASK 0x0f
+#define  PCI_PREF_RANGE_TYPE_32	0x00
+#define  PCI_PREF_RANGE_TYPE_64	0x01
+#define  PCI_PREF_RANGE_MASK	~0x0f
+#define PCI_PREF_BASE_UPPER32	0x28	/* Upper half of prefetchable memory range */
+#define PCI_PREF_LIMIT_UPPER32	0x2c
+#define PCI_IO_BASE_UPPER16	0x30	/* Upper half of I/O addresses */
+#define PCI_IO_LIMIT_UPPER16	0x32
+/* 0x34 same as for htype 0 */
+/* 0x35-0x3b is reserved */
+#define PCI_ROM_ADDRESS1	0x38	/* Same as PCI_ROM_ADDRESS, but for htype 1 */
+/* 0x3c-0x3d are same as for htype 0 */
+#define PCI_BRIDGE_CONTROL	0x3e
+#define  PCI_BRIDGE_CTL_PARITY	0x01	/* Enable parity detection on secondary interface */
+#define  PCI_BRIDGE_CTL_SERR	0x02	/* The same for SERR forwarding */
+#define  PCI_BRIDGE_CTL_NO_ISA	0x04	/* Disable bridging of ISA ports */
+#define  PCI_BRIDGE_CTL_VGA	0x08	/* Forward VGA addresses */
+#define  PCI_BRIDGE_CTL_MASTER_ABORT 0x20  /* Report master aborts */
+#define  PCI_BRIDGE_CTL_BUS_RESET 0x40	/* Secondary bus reset */
+#define  PCI_BRIDGE_CTL_FAST_BACK 0x80	/* Fast Back2Back enabled on secondary interface */
+#define  PCI_BRIDGE_CTL_PRI_DISCARD_TIMER 0x100		/* PCI-X? */
+#define  PCI_BRIDGE_CTL_SEC_DISCARD_TIMER 0x200		/* PCI-X? */
+#define  PCI_BRIDGE_CTL_DISCARD_TIMER_STATUS 0x400	/* PCI-X? */
+#define  PCI_BRIDGE_CTL_DISCARD_TIMER_SERR_EN 0x800	/* PCI-X? */
+
+/* Header type 2 (CardBus bridges) */
+/* 0x14-0x15 reserved */
+#define PCI_CB_SEC_STATUS	0x16	/* Secondary status */
+#define PCI_CB_PRIMARY_BUS	0x18	/* PCI bus number */
+#define PCI_CB_CARD_BUS		0x19	/* CardBus bus number */
+#define PCI_CB_SUBORDINATE_BUS	0x1a	/* Subordinate bus number */
+#define PCI_CB_LATENCY_TIMER	0x1b	/* CardBus latency timer */
+#define PCI_CB_MEMORY_BASE_0	0x1c
+#define PCI_CB_MEMORY_LIMIT_0	0x20
+#define PCI_CB_MEMORY_BASE_1	0x24
+#define PCI_CB_MEMORY_LIMIT_1	0x28
+#define PCI_CB_IO_BASE_0	0x2c
+#define PCI_CB_IO_BASE_0_HI	0x2e
+#define PCI_CB_IO_LIMIT_0	0x30
+#define PCI_CB_IO_LIMIT_0_HI	0x32
+#define PCI_CB_IO_BASE_1	0x34
+#define PCI_CB_IO_BASE_1_HI	0x36
+#define PCI_CB_IO_LIMIT_1	0x38
+#define PCI_CB_IO_LIMIT_1_HI	0x3a
+#define  PCI_CB_IO_RANGE_MASK	~0x03
+/* 0x3c-0x3d are same as for htype 0 */
+#define PCI_CB_BRIDGE_CONTROL	0x3e
+#define  PCI_CB_BRIDGE_CTL_PARITY	0x01	/* Similar to standard bridge control register */
+#define  PCI_CB_BRIDGE_CTL_SERR		0x02
+#define  PCI_CB_BRIDGE_CTL_ISA		0x04
+#define  PCI_CB_BRIDGE_CTL_VGA		0x08
+#define  PCI_CB_BRIDGE_CTL_MASTER_ABORT	0x20
+#define  PCI_CB_BRIDGE_CTL_CB_RESET	0x40	/* CardBus reset */
+#define  PCI_CB_BRIDGE_CTL_16BIT_INT	0x80	/* Enable interrupt for 16-bit cards */
+#define  PCI_CB_BRIDGE_CTL_PREFETCH_MEM0 0x100	/* Prefetch enable for both memory regions */
+#define  PCI_CB_BRIDGE_CTL_PREFETCH_MEM1 0x200
+#define  PCI_CB_BRIDGE_CTL_POST_WRITES	0x400
+#define PCI_CB_SUBSYSTEM_VENDOR_ID 0x40
+#define PCI_CB_SUBSYSTEM_ID	0x42
+#define PCI_CB_LEGACY_MODE_BASE	0x44	/* 16-bit PC Card legacy mode base address (ExCa) */
+/* 0x48-0x7f reserved */
+
+/* Capability lists */
+
+#define PCI_CAP_LIST_ID		0	/* Capability ID */
+#define  PCI_CAP_ID_PM		0x01	/* Power Management */
+#define  PCI_CAP_ID_AGP		0x02	/* Accelerated Graphics Port */
+#define  PCI_CAP_ID_VPD		0x03	/* Vital Product Data */
+#define  PCI_CAP_ID_SLOTID	0x04	/* Slot Identification */
+#define  PCI_CAP_ID_MSI		0x05	/* Message Signaled Interrupts */
+#define  PCI_CAP_ID_CHSWP	0x06	/* CompactPCI HotSwap */
+#define  PCI_CAP_ID_PCIX        0x07    /* PCI-X */
+#define  PCI_CAP_ID_HT          0x08    /* HyperTransport */
+#define  PCI_CAP_ID_VNDR	0x09	/* Vendor specific */
+#define  PCI_CAP_ID_DBG		0x0A	/* Debug port */
+#define  PCI_CAP_ID_CCRC	0x0B	/* CompactPCI Central Resource Control */
+#define  PCI_CAP_ID_HOTPLUG	0x0C	/* PCI hot-plug */
+#define  PCI_CAP_ID_SSVID	0x0D	/* Bridge subsystem vendor/device ID */
+#define  PCI_CAP_ID_AGP3	0x0E	/* AGP 8x */
+#define  PCI_CAP_ID_SECURE	0x0F	/* Secure device (?) */
+#define  PCI_CAP_ID_EXP		0x10	/* PCI Express */
+#define  PCI_CAP_ID_MSIX	0x11	/* MSI-X */
+#define  PCI_CAP_ID_SATA	0x12	/* Serial-ATA HBA */
+#define  PCI_CAP_ID_AF		0x13	/* Advanced features of PCI devices integrated in PCIe root cplx */
+#define PCI_CAP_LIST_NEXT	1	/* Next capability in the list */
+#define PCI_CAP_FLAGS		2	/* Capability defined flags (16 bits) */
+#define PCI_CAP_SIZEOF		4
+
+/* Capabilities residing in the PCI Express extended configuration space */
+
+#define PCI_EXT_CAP_ID_AER	0x01	/* Advanced Error Reporting */
+#define PCI_EXT_CAP_ID_VC	0x02	/* Virtual Channel */
+#define PCI_EXT_CAP_ID_DSN	0x03	/* Device Serial Number */
+#define PCI_EXT_CAP_ID_PB	0x04	/* Power Budgeting */
+#define PCI_EXT_CAP_ID_RCLINK	0x05	/* Root Complex Link Declaration */
+#define PCI_EXT_CAP_ID_RCILINK	0x06	/* Root Complex Internal Link Declaration */
+#define PCI_EXT_CAP_ID_RCECOLL	0x07	/* Root Complex Event Collector */
+#define PCI_EXT_CAP_ID_MFVC	0x08	/* Multi-Function Virtual Channel */
+#define PCI_EXT_CAP_ID_VC2	0x09	/* Virtual Channel (2nd ID) */
+#define PCI_EXT_CAP_ID_RBCB	0x0a	/* Root Bridge Control Block */
+#define PCI_EXT_CAP_ID_VNDR	0x0b	/* Vendor specific */
+#define PCI_EXT_CAP_ID_ACS	0x0d	/* Access Controls */
+#define PCI_EXT_CAP_ID_ARI	0x0e	/* Alternative Routing-ID Interpretation */
+#define PCI_EXT_CAP_ID_ATS	0x0f	/* Address Translation Service */
+#define PCI_EXT_CAP_ID_SRIOV	0x10	/* Single Root I/O Virtualization */
+#define PCI_EXT_CAP_ID_TPH	0x17	/* Transaction processing hints */
+#define PCI_EXT_CAP_ID_LTR	0x18	/* Latency Tolerance Reporting */
+
+/*** Definitions of capabilities ***/
+
+/* Power Management Registers */
+
+#define  PCI_PM_CAP_VER_MASK	0x0007	/* Version (2=PM1.1) */
+#define  PCI_PM_CAP_PME_CLOCK	0x0008	/* Clock required for PME generation */
+#define  PCI_PM_CAP_DSI		0x0020	/* Device specific initialization required */
+#define  PCI_PM_CAP_AUX_C_MASK	0x01c0	/* Maximum aux current required in D3cold */
+#define  PCI_PM_CAP_D1		0x0200	/* D1 power state support */
+#define  PCI_PM_CAP_D2		0x0400	/* D2 power state support */
+#define  PCI_PM_CAP_PME_D0	0x0800	/* PME can be asserted from D0 */
+#define  PCI_PM_CAP_PME_D1	0x1000	/* PME can be asserted from D1 */
+#define  PCI_PM_CAP_PME_D2	0x2000	/* PME can be asserted from D2 */
+#define  PCI_PM_CAP_PME_D3_HOT	0x4000	/* PME can be asserted from D3hot */
+#define  PCI_PM_CAP_PME_D3_COLD	0x8000	/* PME can be asserted from D3cold */
+#define PCI_PM_CTRL		4	/* PM control and status register */
+#define  PCI_PM_CTRL_STATE_MASK	0x0003	/* Current power state (D0 to D3) */
+#define  PCI_PM_CTRL_NO_SOFT_RST	0x0008	/* No Soft Reset from D3hot to D0 */
+#define  PCI_PM_CTRL_PME_ENABLE	0x0100	/* PME pin enable */
+#define  PCI_PM_CTRL_DATA_SEL_MASK	0x1e00	/* PM table data index */
+#define  PCI_PM_CTRL_DATA_SCALE_MASK	0x6000	/* PM table data scaling factor */
+#define  PCI_PM_CTRL_PME_STATUS	0x8000	/* PME pin status */
+#define PCI_PM_PPB_EXTENSIONS	6	/* PPB support extensions */
+#define  PCI_PM_PPB_B2_B3	0x40	/* If bridge enters D3hot, bus enters: 0=B3, 1=B2 */
+#define  PCI_PM_BPCC_ENABLE	0x80	/* Secondary bus is power managed */
+#define PCI_PM_DATA_REGISTER	7	/* PM table contents read here */
+#define PCI_PM_SIZEOF		8
+
+/* AGP registers */
+
+#define PCI_AGP_VERSION		2	/* BCD version number */
+#define PCI_AGP_RFU		3	/* Rest of capability flags */
+#define PCI_AGP_STATUS		4	/* Status register */
+#define  PCI_AGP_STATUS_RQ_MASK	0xff000000	/* Maximum number of requests - 1 */
+#define  PCI_AGP_STATUS_ISOCH	0x10000	/* Isochronous transactions supported */
+#define  PCI_AGP_STATUS_ARQSZ_MASK	0xe000	/* log2(optimum async req size in bytes) - 4 */
+#define  PCI_AGP_STATUS_CAL_MASK	0x1c00	/* Calibration cycle timing */
+#define  PCI_AGP_STATUS_SBA	0x0200	/* Sideband addressing supported */
+#define  PCI_AGP_STATUS_ITA_COH	0x0100	/* In-aperture accesses always coherent */
+#define  PCI_AGP_STATUS_GART64	0x0080	/* 64-bit GART entries supported */
+#define  PCI_AGP_STATUS_HTRANS	0x0040	/* If 0, core logic can xlate host CPU accesses thru aperture */
+#define  PCI_AGP_STATUS_64BIT	0x0020	/* 64-bit addressing cycles supported */
+#define  PCI_AGP_STATUS_FW	0x0010	/* Fast write transfers supported */
+#define  PCI_AGP_STATUS_AGP3	0x0008	/* AGP3 mode supported */
+#define  PCI_AGP_STATUS_RATE4	0x0004	/* 4x transfer rate supported (RFU in AGP3 mode) */
+#define  PCI_AGP_STATUS_RATE2	0x0002	/* 2x transfer rate supported (8x in AGP3 mode) */
+#define  PCI_AGP_STATUS_RATE1	0x0001	/* 1x transfer rate supported (4x in AGP3 mode) */
+#define PCI_AGP_COMMAND		8	/* Control register */
+#define  PCI_AGP_COMMAND_RQ_MASK 0xff000000  /* Master: Maximum number of requests */
+#define  PCI_AGP_COMMAND_ARQSZ_MASK	0xe000	/* log2(optimum async req size in bytes) - 4 */
+#define  PCI_AGP_COMMAND_CAL_MASK	0x1c00	/* Calibration cycle timing */
+#define  PCI_AGP_COMMAND_SBA	0x0200	/* Sideband addressing enabled */
+#define  PCI_AGP_COMMAND_AGP	0x0100	/* Allow processing of AGP transactions */
+#define  PCI_AGP_COMMAND_GART64	0x0080	/* 64-bit GART entries enabled */
+#define  PCI_AGP_COMMAND_64BIT	0x0020 	/* Allow generation of 64-bit addr cycles */
+#define  PCI_AGP_COMMAND_FW	0x0010 	/* Enable FW transfers */
+#define  PCI_AGP_COMMAND_RATE4	0x0004	/* Use 4x rate (RFU in AGP3 mode) */
+#define  PCI_AGP_COMMAND_RATE2	0x0002	/* Use 2x rate (8x in AGP3 mode) */
+#define  PCI_AGP_COMMAND_RATE1	0x0001	/* Use 1x rate (4x in AGP3 mode) */
+#define PCI_AGP_SIZEOF		12
+
+/* Vital Product Data */
+
+#define PCI_VPD_ADDR		2	/* Address to access (15 bits!) */
+#define  PCI_VPD_ADDR_MASK	0x7fff	/* Address mask */
+#define  PCI_VPD_ADDR_F		0x8000	/* Write 0, 1 indicates completion */
+#define PCI_VPD_DATA		4	/* 32-bits of data returned here */
+
+/* Slot Identification */
+
+#define PCI_SID_ESR		2	/* Expansion Slot Register */
+#define  PCI_SID_ESR_NSLOTS	0x1f	/* Number of expansion slots available */
+#define  PCI_SID_ESR_FIC	0x20	/* First In Chassis Flag */
+#define PCI_SID_CHASSIS_NR	3	/* Chassis Number */
+
+/* Message Signaled Interrupts registers */
+
+#define PCI_MSI_FLAGS		2	/* Various flags */
+#define  PCI_MSI_FLAGS_MASK_BIT	0x100	/* interrupt masking & reporting supported */
+#define  PCI_MSI_FLAGS_64BIT	0x080	/* 64-bit addresses allowed */
+#define  PCI_MSI_FLAGS_QSIZE	0x070	/* Message queue size configured */
+#define  PCI_MSI_FLAGS_QMASK	0x00e	/* Maximum queue size available */
+#define  PCI_MSI_FLAGS_ENABLE	0x001	/* MSI feature enabled */
+#define PCI_MSI_RFU		3	/* Rest of capability flags */
+#define PCI_MSI_ADDRESS_LO	4	/* Lower 32 bits */
+#define PCI_MSI_ADDRESS_HI	8	/* Upper 32 bits (if PCI_MSI_FLAGS_64BIT set) */
+#define PCI_MSI_DATA_32		8	/* 16 bits of data for 32-bit devices */
+#define PCI_MSI_DATA_64		12	/* 16 bits of data for 64-bit devices */
+#define PCI_MSI_MASK_BIT_32	12	/* per-vector masking for 32-bit devices */
+#define PCI_MSI_MASK_BIT_64	16	/* per-vector masking for 64-bit devices */
+#define PCI_MSI_PENDING_32	16	/* per-vector interrupt pending for 32-bit devices */
+#define PCI_MSI_PENDING_64	20	/* per-vector interrupt pending for 64-bit devices */
+
+/* PCI-X */
+#define PCI_PCIX_COMMAND                                                2 /* Command register offset */
+#define PCI_PCIX_COMMAND_DPERE                                     0x0001 /* Data Parity Error Recover Enable */
+#define PCI_PCIX_COMMAND_ERO                                       0x0002 /* Enable Relaxed Ordering */
+#define PCI_PCIX_COMMAND_MAX_MEM_READ_BYTE_COUNT                   0x000c /* Maximum Memory Read Byte Count */
+#define PCI_PCIX_COMMAND_MAX_OUTSTANDING_SPLIT_TRANS               0x0070
+#define PCI_PCIX_COMMAND_RESERVED                                   0xf80
+#define PCI_PCIX_STATUS                                                 4 /* Status register offset */
+#define PCI_PCIX_STATUS_FUNCTION                               0x00000007
+#define PCI_PCIX_STATUS_DEVICE                                 0x000000f8
+#define PCI_PCIX_STATUS_BUS                                    0x0000ff00
+#define PCI_PCIX_STATUS_64BIT                                  0x00010000
+#define PCI_PCIX_STATUS_133MHZ                                 0x00020000
+#define PCI_PCIX_STATUS_SC_DISCARDED                           0x00040000 /* Split Completion Discarded */
+#define PCI_PCIX_STATUS_UNEXPECTED_SC                          0x00080000 /* Unexpected Split Completion */
+#define PCI_PCIX_STATUS_DEVICE_COMPLEXITY                      0x00100000 /* 0 = simple device, 1 = bridge device */
+#define PCI_PCIX_STATUS_DESIGNED_MAX_MEM_READ_BYTE_COUNT       0x00600000 /* 0 = 512 bytes, 1 = 1024, 2 = 2048, 3 = 4096 */
+#define PCI_PCIX_STATUS_DESIGNED_MAX_OUTSTANDING_SPLIT_TRANS   0x03800000
+#define PCI_PCIX_STATUS_DESIGNED_MAX_CUMULATIVE_READ_SIZE      0x1c000000
+#define PCI_PCIX_STATUS_RCVD_SC_ERR_MESS                       0x20000000 /* Received Split Completion Error Message */
+#define PCI_PCIX_STATUS_266MHZ				       0x40000000 /* 266 MHz capable */
+#define PCI_PCIX_STATUS_533MHZ				       0x80000000 /* 533 MHz capable */
+#define PCI_PCIX_SIZEOF		4
+
+/* PCI-X Bridges */
+#define PCI_PCIX_BRIDGE_SEC_STATUS                                      2 /* Secondary bus status register offset */
+#define PCI_PCIX_BRIDGE_SEC_STATUS_64BIT                           0x0001
+#define PCI_PCIX_BRIDGE_SEC_STATUS_133MHZ                          0x0002
+#define PCI_PCIX_BRIDGE_SEC_STATUS_SC_DISCARDED                    0x0004 /* Split Completion Discarded on secondary bus */
+#define PCI_PCIX_BRIDGE_SEC_STATUS_UNEXPECTED_SC                   0x0008 /* Unexpected Split Completion on secondary bus */
+#define PCI_PCIX_BRIDGE_SEC_STATUS_SC_OVERRUN                      0x0010 /* Split Completion Overrun on secondary bus */
+#define PCI_PCIX_BRIDGE_SEC_STATUS_SPLIT_REQUEST_DELAYED           0x0020
+#define PCI_PCIX_BRIDGE_SEC_STATUS_CLOCK_FREQ                      0x01c0
+#define PCI_PCIX_BRIDGE_SEC_STATUS_RESERVED                        0xfe00
+#define PCI_PCIX_BRIDGE_STATUS                                          4 /* Primary bus status register offset */
+#define PCI_PCIX_BRIDGE_STATUS_FUNCTION                        0x00000007
+#define PCI_PCIX_BRIDGE_STATUS_DEVICE                          0x000000f8
+#define PCI_PCIX_BRIDGE_STATUS_BUS                             0x0000ff00
+#define PCI_PCIX_BRIDGE_STATUS_64BIT                           0x00010000
+#define PCI_PCIX_BRIDGE_STATUS_133MHZ                          0x00020000
+#define PCI_PCIX_BRIDGE_STATUS_SC_DISCARDED                    0x00040000 /* Split Completion Discarded */
+#define PCI_PCIX_BRIDGE_STATUS_UNEXPECTED_SC                   0x00080000 /* Unexpected Split Completion */
+#define PCI_PCIX_BRIDGE_STATUS_SC_OVERRUN                      0x00100000 /* Split Completion Overrun */
+#define PCI_PCIX_BRIDGE_STATUS_SPLIT_REQUEST_DELAYED           0x00200000
+#define PCI_PCIX_BRIDGE_STATUS_RESERVED                        0xffc00000
+#define PCI_PCIX_BRIDGE_UPSTREAM_SPLIT_TRANS_CTRL                       8 /* Upstream Split Transaction Register offset */
+#define PCI_PCIX_BRIDGE_DOWNSTREAM_SPLIT_TRANS_CTRL                    12 /* Downstream Split Transaction Register offset */
+#define PCI_PCIX_BRIDGE_STR_CAPACITY                           0x0000ffff
+#define PCI_PCIX_BRIDGE_STR_COMMITMENT_LIMIT                   0xffff0000
+#define PCI_PCIX_BRIDGE_SIZEOF 12
+
+/* HyperTransport (as of spec rev. 2.00) */
+#define PCI_HT_CMD		2	/* Command Register */
+#define  PCI_HT_CMD_TYP_HI	0xe000	/* Capability Type high part */
+#define  PCI_HT_CMD_TYP_HI_PRI	0x0000	/* Slave or Primary Interface */
+#define  PCI_HT_CMD_TYP_HI_SEC	0x2000	/* Host or Secondary Interface */
+#define  PCI_HT_CMD_TYP		0xf800	/* Capability Type */
+#define  PCI_HT_CMD_TYP_SW	0x4000	/* Switch */
+#define  PCI_HT_CMD_TYP_IDC	0x8000	/* Interrupt Discovery and Configuration */
+#define  PCI_HT_CMD_TYP_RID	0x8800	/* Revision ID */
+#define  PCI_HT_CMD_TYP_UIDC	0x9000	/* UnitID Clumping */
+#define  PCI_HT_CMD_TYP_ECSA	0x9800	/* Extended Configuration Space Access */
+#define  PCI_HT_CMD_TYP_AM	0xa000	/* Address Mapping */
+#define  PCI_HT_CMD_TYP_MSIM	0xa800	/* MSI Mapping */
+#define  PCI_HT_CMD_TYP_DR	0xb000	/* DirectRoute */
+#define  PCI_HT_CMD_TYP_VCS	0xb800	/* VCSet */
+#define  PCI_HT_CMD_TYP_RM	0xc000	/* Retry Mode */
+#define  PCI_HT_CMD_TYP_X86	0xc800	/* X86 (reserved) */
+
+					/* Link Control Register */
+#define  PCI_HT_LCTR_CFLE	0x0002	/* CRC Flood Enable */
+#define  PCI_HT_LCTR_CST	0x0004	/* CRC Start Test */
+#define  PCI_HT_LCTR_CFE	0x0008	/* CRC Force Error */
+#define  PCI_HT_LCTR_LKFAIL	0x0010	/* Link Failure */
+#define  PCI_HT_LCTR_INIT	0x0020	/* Initialization Complete */
+#define  PCI_HT_LCTR_EOC	0x0040	/* End of Chain */
+#define  PCI_HT_LCTR_TXO	0x0080	/* Transmitter Off */
+#define  PCI_HT_LCTR_CRCERR	0x0f00	/* CRC Error */
+#define  PCI_HT_LCTR_ISOCEN	0x1000	/* Isochronous Flow Control Enable */
+#define  PCI_HT_LCTR_LSEN	0x2000	/* LDTSTOP# Tristate Enable */
+#define  PCI_HT_LCTR_EXTCTL	0x4000	/* Extended CTL Time */
+#define  PCI_HT_LCTR_64B	0x8000	/* 64-bit Addressing Enable */
+
+					/* Link Configuration Register */
+#define  PCI_HT_LCNF_MLWI	0x0007	/* Max Link Width In */
+#define  PCI_HT_LCNF_LW_8B	0x0	/* Link Width 8 bits */
+#define  PCI_HT_LCNF_LW_16B	0x1	/* Link Width 16 bits */
+#define  PCI_HT_LCNF_LW_32B	0x3	/* Link Width 32 bits */
+#define  PCI_HT_LCNF_LW_2B	0x4	/* Link Width 2 bits */
+#define  PCI_HT_LCNF_LW_4B	0x5	/* Link Width 4 bits */
+#define  PCI_HT_LCNF_LW_NC	0x7	/* Link physically not connected */
+#define  PCI_HT_LCNF_DFI	0x0008	/* Doubleword Flow Control In */
+#define  PCI_HT_LCNF_MLWO	0x0070	/* Max Link Width Out */
+#define  PCI_HT_LCNF_DFO	0x0080	/* Doubleword Flow Control Out */
+#define  PCI_HT_LCNF_LWI	0x0700	/* Link Width In */
+#define  PCI_HT_LCNF_DFIE	0x0800	/* Doubleword Flow Control In Enable */
+#define  PCI_HT_LCNF_LWO	0x7000	/* Link Width Out */
+#define  PCI_HT_LCNF_DFOE	0x8000	/* Doubleword Flow Control Out Enable */
+
+					/* Revision ID Register */
+#define  PCI_HT_RID_MIN		0x1f	/* Minor Revision */
+#define  PCI_HT_RID_MAJ		0xe0	/* Major Revision */
+
+					/* Link Frequency/Error Register */
+#define  PCI_HT_LFRER_FREQ	0x0f	/* Transmitter Clock Frequency */
+#define  PCI_HT_LFRER_200	0x00	/* 200MHz */
+#define  PCI_HT_LFRER_300	0x01	/* 300MHz */
+#define  PCI_HT_LFRER_400	0x02	/* 400MHz */
+#define  PCI_HT_LFRER_500	0x03	/* 500MHz */
+#define  PCI_HT_LFRER_600	0x04	/* 600MHz */
+#define  PCI_HT_LFRER_800	0x05	/* 800MHz */
+#define  PCI_HT_LFRER_1000	0x06	/* 1.0GHz */
+#define  PCI_HT_LFRER_1200	0x07	/* 1.2GHz */
+#define  PCI_HT_LFRER_1400	0x08	/* 1.4GHz */
+#define  PCI_HT_LFRER_1600	0x09	/* 1.6GHz */
+#define  PCI_HT_LFRER_VEND	0x0f	/* Vendor-Specific */
+#define  PCI_HT_LFRER_ERR	0xf0	/* Link Error */
+#define  PCI_HT_LFRER_PROT	0x10	/* Protocol Error */
+#define  PCI_HT_LFRER_OV	0x20	/* Overflow Error */
+#define  PCI_HT_LFRER_EOC	0x40	/* End of Chain Error */
+#define  PCI_HT_LFRER_CTLT	0x80	/* CTL Timeout */
+
+					/* Link Frequency Capability Register */
+#define  PCI_HT_LFCAP_200	0x0001	/* 200MHz */
+#define  PCI_HT_LFCAP_300	0x0002	/* 300MHz */
+#define  PCI_HT_LFCAP_400	0x0004	/* 400MHz */
+#define  PCI_HT_LFCAP_500	0x0008	/* 500MHz */
+#define  PCI_HT_LFCAP_600	0x0010	/* 600MHz */
+#define  PCI_HT_LFCAP_800	0x0020	/* 800MHz */
+#define  PCI_HT_LFCAP_1000	0x0040	/* 1.0GHz */
+#define  PCI_HT_LFCAP_1200	0x0080	/* 1.2GHz */
+#define  PCI_HT_LFCAP_1400	0x0100	/* 1.4GHz */
+#define  PCI_HT_LFCAP_1600	0x0200	/* 1.6GHz */
+#define  PCI_HT_LFCAP_VEND	0x8000	/* Vendor-Specific */
+
+					/* Feature Register */
+#define  PCI_HT_FTR_ISOCFC	0x0001	/* Isochronous Flow Control Mode */
+#define  PCI_HT_FTR_LDTSTOP	0x0002	/* LDTSTOP# Supported */
+#define  PCI_HT_FTR_CRCTM	0x0004	/* CRC Test Mode */
+#define  PCI_HT_FTR_ECTLT	0x0008	/* Extended CTL Time Required */
+#define  PCI_HT_FTR_64BA	0x0010	/* 64-bit Addressing */
+#define  PCI_HT_FTR_UIDRD	0x0020	/* UnitID Reorder Disable */
+
+					/* Error Handling Register */
+#define  PCI_HT_EH_PFLE		0x0001	/* Protocol Error Flood Enable */
+#define  PCI_HT_EH_OFLE		0x0002	/* Overflow Error Flood Enable */
+#define  PCI_HT_EH_PFE		0x0004	/* Protocol Error Fatal Enable */
+#define  PCI_HT_EH_OFE		0x0008	/* Overflow Error Fatal Enable */
+#define  PCI_HT_EH_EOCFE	0x0010	/* End of Chain Error Fatal Enable */
+#define  PCI_HT_EH_RFE		0x0020	/* Response Error Fatal Enable */
+#define  PCI_HT_EH_CRCFE	0x0040	/* CRC Error Fatal Enable */
+#define  PCI_HT_EH_SERRFE	0x0080	/* System Error Fatal Enable */
+#define  PCI_HT_EH_CF		0x0100	/* Chain Fail */
+#define  PCI_HT_EH_RE		0x0200	/* Response Error */
+#define  PCI_HT_EH_PNFE		0x0400	/* Protocol Error Nonfatal Enable */
+#define  PCI_HT_EH_ONFE		0x0800	/* Overflow Error Nonfatal Enable */
+#define  PCI_HT_EH_EOCNFE	0x1000	/* End of Chain Error Nonfatal Enable */
+#define  PCI_HT_EH_RNFE		0x2000	/* Response Error Nonfatal Enable */
+#define  PCI_HT_EH_CRCNFE	0x4000	/* CRC Error Nonfatal Enable */
+#define  PCI_HT_EH_SERRNFE	0x8000	/* System Error Nonfatal Enable */
+
+/* HyperTransport: Slave or Primary Interface */
+#define PCI_HT_PRI_CMD		2	/* Command Register */
+#define  PCI_HT_PRI_CMD_BUID	0x001f	/* Base UnitID */
+#define  PCI_HT_PRI_CMD_UC	0x03e0	/* Unit Count */
+#define  PCI_HT_PRI_CMD_MH	0x0400	/* Master Host */
+#define  PCI_HT_PRI_CMD_DD	0x0800	/* Default Direction */
+#define  PCI_HT_PRI_CMD_DUL	0x1000	/* Drop on Uninitialized Link */
+
+#define PCI_HT_PRI_LCTR0	4	/* Link Control 0 Register */
+#define PCI_HT_PRI_LCNF0	6	/* Link Config 0 Register */
+#define PCI_HT_PRI_LCTR1	8	/* Link Control 1 Register */
+#define PCI_HT_PRI_LCNF1	10	/* Link Config 1 Register */
+#define PCI_HT_PRI_RID		12	/* Revision ID Register */
+#define PCI_HT_PRI_LFRER0	13	/* Link Frequency/Error 0 Register */
+#define PCI_HT_PRI_LFCAP0	14	/* Link Frequency Capability 0 Register */
+#define PCI_HT_PRI_FTR		16	/* Feature Register */
+#define PCI_HT_PRI_LFRER1	17	/* Link Frequency/Error 1 Register */
+#define PCI_HT_PRI_LFCAP1	18	/* Link Frequency Capability 1 Register */
+#define PCI_HT_PRI_ES		20	/* Enumeration Scratchpad Register */
+#define PCI_HT_PRI_EH		22	/* Error Handling Register */
+#define PCI_HT_PRI_MBU		24	/* Memory Base Upper Register */
+#define PCI_HT_PRI_MLU		25	/* Memory Limit Upper Register */
+#define PCI_HT_PRI_BN		26	/* Bus Number Register */
+#define PCI_HT_PRI_SIZEOF	28
+
+/* HyperTransport: Host or Secondary Interface */
+#define PCI_HT_SEC_CMD		2	/* Command Register */
+#define  PCI_HT_SEC_CMD_WR	0x0001	/* Warm Reset */
+#define  PCI_HT_SEC_CMD_DE	0x0002	/* Double-Ended */
+#define  PCI_HT_SEC_CMD_DN	0x007c	/* Device Number (bits 6:2, the gap between DE and CS) */
+#define  PCI_HT_SEC_CMD_CS	0x0080	/* Chain Side */
+#define  PCI_HT_SEC_CMD_HH	0x0100	/* Host Hide */
+#define  PCI_HT_SEC_CMD_AS	0x0400	/* Act as Slave */
+#define  PCI_HT_SEC_CMD_HIECE	0x0800	/* Host Inbound End of Chain Error */
+#define  PCI_HT_SEC_CMD_DUL	0x1000	/* Drop on Uninitialized Link */
+
+#define PCI_HT_SEC_LCTR		4	/* Link Control Register */
+#define PCI_HT_SEC_LCNF		6	/* Link Config Register */
+#define PCI_HT_SEC_RID		8	/* Revision ID Register */
+#define PCI_HT_SEC_LFRER	9	/* Link Frequency/Error Register */
+#define PCI_HT_SEC_LFCAP	10	/* Link Frequency Capability Register */
+#define PCI_HT_SEC_FTR		12	/* Feature Register */
+#define  PCI_HT_SEC_FTR_EXTRS	0x0100	/* Extended Register Set */
+#define  PCI_HT_SEC_FTR_UCNFE	0x0200	/* Upstream Configuration Enable */
+#define PCI_HT_SEC_ES		16	/* Enumeration Scratchpad Register */
+#define PCI_HT_SEC_EH		18	/* Error Handling Register */
+#define PCI_HT_SEC_MBU		20	/* Memory Base Upper Register */
+#define PCI_HT_SEC_MLU		21	/* Memory Limit Upper Register */
+#define PCI_HT_SEC_SIZEOF	24
+
+/* HyperTransport: Switch */
+#define PCI_HT_SW_CMD		2	/* Switch Command Register */
+#define  PCI_HT_SW_CMD_VIBERR	0x0080	/* VIB Error */
+#define  PCI_HT_SW_CMD_VIBFL	0x0100	/* VIB Flood */
+#define  PCI_HT_SW_CMD_VIBFT	0x0200	/* VIB Fatal */
+#define  PCI_HT_SW_CMD_VIBNFT	0x0400	/* VIB Nonfatal */
+#define PCI_HT_SW_PMASK		4	/* Partition Mask Register */
+#define PCI_HT_SW_SWINF		8	/* Switch Info Register */
+#define  PCI_HT_SW_SWINF_DP	0x0000001f /* Default Port */
+#define  PCI_HT_SW_SWINF_EN	0x00000020 /* Enable Decode */
+#define  PCI_HT_SW_SWINF_CR	0x00000040 /* Cold Reset */
+#define  PCI_HT_SW_SWINF_PCIDX	0x00000f00 /* Performance Counter Index */
+#define  PCI_HT_SW_SWINF_BLRIDX	0x0003f000 /* Base/Limit Range Index */
+#define  PCI_HT_SW_SWINF_SBIDX	0x00002000 /* Secondary Base Range Index */
+#define  PCI_HT_SW_SWINF_HP	0x00040000 /* Hot Plug */
+#define  PCI_HT_SW_SWINF_HIDE	0x00080000 /* Hide Port */
+#define PCI_HT_SW_PCD		12	/* Performance Counter Data Register */
+#define PCI_HT_SW_BLRD		16	/* Base/Limit Range Data Register */
+#define PCI_HT_SW_SBD		20	/* Secondary Base Data Register */
+#define PCI_HT_SW_SIZEOF	24
+
+					/* Counter indices */
+#define  PCI_HT_SW_PC_PCR	0x0	/* Posted Command Receive */
+#define  PCI_HT_SW_PC_NPCR	0x1	/* Nonposted Command Receive */
+#define  PCI_HT_SW_PC_RCR	0x2	/* Response Command Receive */
+#define  PCI_HT_SW_PC_PDWR	0x3	/* Posted DW Receive */
+#define  PCI_HT_SW_PC_NPDWR	0x4	/* Nonposted DW Receive */
+#define  PCI_HT_SW_PC_RDWR	0x5	/* Response DW Receive */
+#define  PCI_HT_SW_PC_PCT	0x6	/* Posted Command Transmit */
+#define  PCI_HT_SW_PC_NPCT	0x7	/* Nonposted Command Transmit */
+#define  PCI_HT_SW_PC_RCT	0x8	/* Response Command Transmit */
+#define  PCI_HT_SW_PC_PDWT	0x9	/* Posted DW Transmit */
+#define  PCI_HT_SW_PC_NPDWT	0xa	/* Nonposted DW Transmit */
+#define  PCI_HT_SW_PC_RDWT	0xb	/* Response DW Transmit */
+
+					/* Base/Limit Range indices */
+#define  PCI_HT_SW_BLR_BASE0_LO	0x0	/* Base 0[31:1], Enable */
+#define  PCI_HT_SW_BLR_BASE0_HI	0x1	/* Base 0 Upper */
+#define  PCI_HT_SW_BLR_LIM0_LO	0x2	/* Limit 0 Lower */
+#define  PCI_HT_SW_BLR_LIM0_HI	0x3	/* Limit 0 Upper */
+
+					/* Secondary Base indices */
+#define  PCI_HT_SW_SB_LO	0x0	/* Secondary Base[31:1], Enable */
+#define  PCI_HT_SW_S0_HI	0x1	/* Secondary Base Upper */
+
+/* HyperTransport: Interrupt Discovery and Configuration */
+#define PCI_HT_IDC_IDX		2	/* Index Register */
+#define PCI_HT_IDC_DATA		4	/* Data Register */
+#define PCI_HT_IDC_SIZEOF	8
+
+					/* Register indices */
+#define  PCI_HT_IDC_IDX_LINT	0x01	/* Last Interrupt Register */
+#define   PCI_HT_IDC_LINT	0x00ff0000 /* Last interrupt definition */
+#define  PCI_HT_IDC_IDX_IDR	0x10	/* Interrupt Definition Registers */
+					/* Low part (at index) */
+#define   PCI_HT_IDC_IDR_MASK	0x00000001 /* Mask (bit 0) */
+#define   PCI_HT_IDC_IDR_POL	0x00000002 /* Polarity (bit 1) */
+#define   PCI_HT_IDC_IDR_II_2	0x0000001c /* IntrInfo[4:2]: Message Type */
+#define   PCI_HT_IDC_IDR_II_5	0x00000020 /* IntrInfo[5]: Request EOI */
+#define   PCI_HT_IDC_IDR_II_6	0x00ffffc0 /* IntrInfo[23:6] */
+#define   PCI_HT_IDC_IDR_II_24	0xff000000 /* IntrInfo[31:24] */
+					/* High part (at index + 1) */
+#define   PCI_HT_IDC_IDR_II_32	0x00ffffff /* IntrInfo[55:32] */
+#define   PCI_HT_IDC_IDR_PASSPW	0x40000000 /* PassPW setting for messages */
+#define   PCI_HT_IDC_IDR_WEOI	0x80000000 /* Waiting for EOI */
+
+/* HyperTransport: Revision ID */
+#define PCI_HT_RID_RID		2	/* Revision Register */
+#define PCI_HT_RID_SIZEOF	4
+
+/* HyperTransport: UnitID Clumping */
+#define PCI_HT_UIDC_CS		4	/* Clumping Support Register */
+#define PCI_HT_UIDC_CE		8	/* Clumping Enable Register */
+#define PCI_HT_UIDC_SIZEOF	12
+
+/* HyperTransport: Extended Configuration Space Access */
+#define PCI_HT_ECSA_ADDR	4	/* Configuration Address Register */
+#define  PCI_HT_ECSA_ADDR_REG	0x00000ffc /* Register */
+#define  PCI_HT_ECSA_ADDR_FUN	0x00007000 /* Function */
+#define  PCI_HT_ECSA_ADDR_DEV	0x000f8000 /* Device (bits 19:15, between Function and Bus Number) */
+#define  PCI_HT_ECSA_ADDR_BUS	0x0ff00000 /* Bus Number */
+#define  PCI_HT_ECSA_ADDR_TYPE	0x10000000 /* Access Type */
+#define PCI_HT_ECSA_DATA	8	/* Configuration Data Register */
+#define PCI_HT_ECSA_SIZEOF	12
+
+/* HyperTransport: Address Mapping */
+#define PCI_HT_AM_CMD		2	/* Command Register */
+#define  PCI_HT_AM_CMD_NDMA	0x000f	/* Number of DMA Mappings */
+#define  PCI_HT_AM_CMD_IOSIZ	0x01f0	/* I/O Size */
+#define  PCI_HT_AM_CMD_MT	0x0600	/* Map Type */
+#define  PCI_HT_AM_CMD_MT_40B	0x0000	/* 40-bit */
+#define  PCI_HT_AM_CMD_MT_64B	0x0200	/* 64-bit */
+
+					/* Window Control Register bits */
+#define  PCI_HT_AM_SBW_CTR_COMP	0x1	/* Compat */
+#define  PCI_HT_AM_SBW_CTR_NCOH	0x2	/* NonCoherent */
+#define  PCI_HT_AM_SBW_CTR_ISOC	0x4	/* Isochronous */
+#define  PCI_HT_AM_SBW_CTR_EN	0x8	/* Enable */
+
+/* HyperTransport: 40-bit Address Mapping */
+#define PCI_HT_AM40_SBNPW	4	/* Secondary Bus Non-Prefetchable Window Register */
+#define  PCI_HT_AM40_SBW_BASE	0x000fffff /* Window Base */
+#define  PCI_HT_AM40_SBW_CTR	0xf0000000 /* Window Control */
+#define PCI_HT_AM40_SBPW	8	/* Secondary Bus Prefetchable Window Register */
+#define PCI_HT_AM40_DMA_PBASE0	12	/* DMA Window Primary Base 0 Register */
+#define PCI_HT_AM40_DMA_CTR0	15	/* DMA Window Control 0 Register */
+#define  PCI_HT_AM40_DMA_CTR_CTR 0xf0	/* Window Control */
+#define PCI_HT_AM40_DMA_SLIM0	16	/* DMA Window Secondary Limit 0 Register */
+#define PCI_HT_AM40_DMA_SBASE0	18	/* DMA Window Secondary Base 0 Register */
+#define PCI_HT_AM40_SIZEOF	12	/* size is variable: 12 + 8 * NDMA */
+
+/* HyperTransport: 64-bit Address Mapping */
+#define PCI_HT_AM64_IDX		4	/* Index Register */
+#define PCI_HT_AM64_DATA_LO	8	/* Data Lower Register */
+#define PCI_HT_AM64_DATA_HI	12	/* Data Upper Register */
+#define PCI_HT_AM64_SIZEOF	16
+
+					/* Register indices */
+#define  PCI_HT_AM64_IDX_SBNPW	0x00	/* Secondary Bus Non-Prefetchable Window Register */
+#define   PCI_HT_AM64_W_BASE_LO	0xfff00000 /* Window Base Lower */
+#define   PCI_HT_AM64_W_CTR	0x0000000f /* Window Control */
+#define  PCI_HT_AM64_IDX_SBPW	0x01	/* Secondary Bus Prefetchable Window Register */
+#define   PCI_HT_AM64_IDX_PBNPW	0x02	/* Primary Bus Non-Prefetchable Window Register */
+#define   PCI_HT_AM64_IDX_DMAPB0 0x04	/* DMA Window Primary Base 0 Register */
+#define   PCI_HT_AM64_IDX_DMASB0 0x05	/* DMA Window Secondary Base 0 Register */
+#define   PCI_HT_AM64_IDX_DMASL0 0x06	/* DMA Window Secondary Limit 0 Register */
+
+/* HyperTransport: MSI Mapping */
+#define PCI_HT_MSIM_CMD		2	/* Command Register */
+#define  PCI_HT_MSIM_CMD_EN	0x0001	/* Mapping Active */
+#define  PCI_HT_MSIM_CMD_FIXD	0x0002	/* MSI Mapping Address Fixed */
+#define PCI_HT_MSIM_ADDR_LO	4	/* MSI Mapping Address Lower Register */
+#define PCI_HT_MSIM_ADDR_HI	8	/* MSI Mapping Address Upper Register */
+#define PCI_HT_MSIM_SIZEOF	12
+
+/* HyperTransport: DirectRoute */
+#define PCI_HT_DR_CMD		2	/* Command Register */
+#define  PCI_HT_DR_CMD_NDRS	0x000f	/* Number of DirectRoute Spaces */
+#define  PCI_HT_DR_CMD_IDX	0x01f0	/* Index */
+#define PCI_HT_DR_EN		4	/* Enable Vector Register */
+#define PCI_HT_DR_DATA		8	/* Data Register */
+#define PCI_HT_DR_SIZEOF	12
+
+					/* Register indices */
+#define  PCI_HT_DR_IDX_BASE_LO	0x00	/* DirectRoute Base Lower Register */
+#define   PCI_HT_DR_OTNRD	0x00000001 /* Opposite to Normal Request Direction */
+#define   PCI_HT_DR_BL_LO	0xffffff00 /* Base/Limit Lower */
+#define  PCI_HT_DR_IDX_BASE_HI	0x01	/* DirectRoute Base Upper Register */
+#define  PCI_HT_DR_IDX_LIMIT_LO	0x02	/* DirectRoute Limit Lower Register */
+#define  PCI_HT_DR_IDX_LIMIT_HI	0x03	/* DirectRoute Limit Upper Register */
+
+/* HyperTransport: VCSet */
+#define PCI_HT_VCS_SUP		4	/* VCSets Supported Register */
+#define PCI_HT_VCS_L1EN		5	/* Link 1 VCSets Enabled Register */
+#define PCI_HT_VCS_L0EN		6	/* Link 0 VCSets Enabled Register */
+#define PCI_HT_VCS_SBD		8	/* Stream Bucket Depth Register */
+#define PCI_HT_VCS_SINT		9	/* Stream Interval Register */
+#define PCI_HT_VCS_SSUP		10	/* Number of Streaming VCs Supported Register */
+#define  PCI_HT_VCS_SSUP_0	0x00	/* Streaming VC 0 */
+#define  PCI_HT_VCS_SSUP_3	0x01	/* Streaming VCs 0-3 */
+#define  PCI_HT_VCS_SSUP_15	0x02	/* Streaming VCs 0-15 */
+#define PCI_HT_VCS_NFCBD	12	/* Non-FC Bucket Depth Register */
+#define PCI_HT_VCS_NFCINT	13	/* Non-FC Bucket Interval Register */
+#define PCI_HT_VCS_SIZEOF	16
+
+/* HyperTransport: Retry Mode */
+#define PCI_HT_RM_CTR0		4	/* Control 0 Register */
+#define  PCI_HT_RM_CTR_LRETEN	0x01	/* Link Retry Enable */
+#define  PCI_HT_RM_CTR_FSER	0x02	/* Force Single Error */
+#define  PCI_HT_RM_CTR_ROLNEN	0x04	/* Rollover Nonfatal Enable */
+#define  PCI_HT_RM_CTR_FSS	0x08	/* Force Single Stomp */
+#define  PCI_HT_RM_CTR_RETNEN	0x10	/* Retry Nonfatal Enable */
+#define  PCI_HT_RM_CTR_RETFEN	0x20	/* Retry Fatal Enable */
+#define  PCI_HT_RM_CTR_AA	0xc0	/* Allowed Attempts */
+#define PCI_HT_RM_STS0		5	/* Status 0 Register */
+#define  PCI_HT_RM_STS_RETSNT	0x01	/* Retry Sent */
+#define  PCI_HT_RM_STS_CNTROL	0x02	/* Count Rollover */
+#define  PCI_HT_RM_STS_SRCV	0x04	/* Stomp Received */
+#define PCI_HT_RM_CTR1		6	/* Control 1 Register */
+#define PCI_HT_RM_STS1		7	/* Status 1 Register */
+#define PCI_HT_RM_CNT0		8	/* Retry Count 0 Register */
+#define PCI_HT_RM_CNT1		10	/* Retry Count 1 Register */
+#define PCI_HT_RM_SIZEOF	12
+
+/* Vendor-Specific Capability (see PCI_EVNDR_xxx for the PCIe version) */
+#define PCI_VNDR_LENGTH		2	/* Length byte */
+
+/* PCI Express */
+#define PCI_EXP_FLAGS		0x2	/* Capabilities register */
+#define PCI_EXP_FLAGS_VERS	0x000f	/* Capability version */
+#define PCI_EXP_FLAGS_TYPE	0x00f0	/* Device/Port type */
+#define  PCI_EXP_TYPE_ENDPOINT	0x0	/* Express Endpoint */
+#define  PCI_EXP_TYPE_LEG_END	0x1	/* Legacy Endpoint */
+#define  PCI_EXP_TYPE_ROOT_PORT 0x4	/* Root Port */
+#define  PCI_EXP_TYPE_UPSTREAM	0x5	/* Upstream Port */
+#define  PCI_EXP_TYPE_DOWNSTREAM 0x6	/* Downstream Port */
+#define  PCI_EXP_TYPE_PCI_BRIDGE 0x7	/* PCI/PCI-X Bridge */
+#define  PCI_EXP_TYPE_PCIE_BRIDGE 0x8	/* PCI/PCI-X to PCIE Bridge */
+#define  PCI_EXP_TYPE_ROOT_INT_EP 0x9	/* Root Complex Integrated Endpoint */
+#define  PCI_EXP_TYPE_ROOT_EC 0xa	/* Root Complex Event Collector */
+#define PCI_EXP_FLAGS_SLOT	0x0100	/* Slot implemented */
+#define PCI_EXP_FLAGS_IRQ	0x3e00	/* Interrupt message number */
+#define PCI_EXP_DEVCAP		0x4	/* Device capabilities */
+#define  PCI_EXP_DEVCAP_PAYLOAD	0x07	/* Max_Payload_Size */
+#define  PCI_EXP_DEVCAP_PHANTOM	0x18	/* Phantom functions */
+#define  PCI_EXP_DEVCAP_EXT_TAG	0x20	/* Extended tags */
+#define  PCI_EXP_DEVCAP_L0S	0x1c0	/* L0s Acceptable Latency */
+#define  PCI_EXP_DEVCAP_L1	0xe00	/* L1 Acceptable Latency */
+#define  PCI_EXP_DEVCAP_ATN_BUT	0x1000	/* Attention Button Present */
+#define  PCI_EXP_DEVCAP_ATN_IND	0x2000	/* Attention Indicator Present */
+#define  PCI_EXP_DEVCAP_PWR_IND	0x4000	/* Power Indicator Present */
+#define  PCI_EXP_DEVCAP_RBE	0x8000	/* Role-Based Error Reporting */
+#define  PCI_EXP_DEVCAP_PWR_VAL	0x3fc0000 /* Slot Power Limit Value */
+#define  PCI_EXP_DEVCAP_PWR_SCL	0xc000000 /* Slot Power Limit Scale */
+#define  PCI_EXP_DEVCAP_FLRESET	0x10000000 /* Function-Level Reset */
+#define PCI_EXP_DEVCTL		0x8	/* Device Control */
+#define  PCI_EXP_DEVCTL_CERE	0x0001	/* Correctable Error Reporting En. */
+#define  PCI_EXP_DEVCTL_NFERE	0x0002	/* Non-Fatal Error Reporting Enable */
+#define  PCI_EXP_DEVCTL_FERE	0x0004	/* Fatal Error Reporting Enable */
+#define  PCI_EXP_DEVCTL_URRE	0x0008	/* Unsupported Request Reporting En. */
+#define  PCI_EXP_DEVCTL_RELAXED	0x0010	/* Enable Relaxed Ordering */
+#define  PCI_EXP_DEVCTL_PAYLOAD	0x00e0	/* Max_Payload_Size */
+#define  PCI_EXP_DEVCTL_EXT_TAG	0x0100	/* Extended Tag Field Enable */
+#define  PCI_EXP_DEVCTL_PHANTOM	0x0200	/* Phantom Functions Enable */
+#define  PCI_EXP_DEVCTL_AUX_PME	0x0400	/* Auxiliary Power PM Enable */
+#define  PCI_EXP_DEVCTL_NOSNOOP	0x0800	/* Enable No Snoop */
+#define  PCI_EXP_DEVCTL_READRQ	0x7000	/* Max_Read_Request_Size */
+#define  PCI_EXP_DEVCTL_BCRE	0x8000	/* Bridge Configuration Retry Enable */
+#define  PCI_EXP_DEVCTL_FLRESET	0x8000	/* Function-Level Reset [bit shared with BCRE] */
+#define PCI_EXP_DEVSTA		0xa	/* Device Status */
+#define  PCI_EXP_DEVSTA_CED	0x01	/* Correctable Error Detected */
+#define  PCI_EXP_DEVSTA_NFED	0x02	/* Non-Fatal Error Detected */
+#define  PCI_EXP_DEVSTA_FED	0x04	/* Fatal Error Detected */
+#define  PCI_EXP_DEVSTA_URD	0x08	/* Unsupported Request Detected */
+#define  PCI_EXP_DEVSTA_AUXPD	0x10	/* AUX Power Detected */
+#define  PCI_EXP_DEVSTA_TRPND	0x20	/* Transactions Pending */
+#define PCI_EXP_LNKCAP		0xc	/* Link Capabilities */
+#define  PCI_EXP_LNKCAP_SPEED	0x0000f	/* Maximum Link Speed */
+#define  PCI_EXP_LNKCAP_WIDTH	0x003f0	/* Maximum Link Width */
+#define  PCI_EXP_LNKCAP_ASPM	0x00c00	/* Active State Power Management */
+#define  PCI_EXP_LNKCAP_L0S	0x07000	/* L0s Acceptable Latency */
+#define  PCI_EXP_LNKCAP_L1	0x38000	/* L1 Acceptable Latency */
+#define  PCI_EXP_LNKCAP_CLOCKPM	0x40000	/* Clock Power Management */
+#define  PCI_EXP_LNKCAP_SURPRISE 0x80000 /* Surprise Down Error Reporting */
+#define  PCI_EXP_LNKCAP_DLLA	0x100000 /* Data Link Layer Active Reporting */
+#define  PCI_EXP_LNKCAP_LBNC	0x200000 /* Link Bandwidth Notification Capability */
+#define  PCI_EXP_LNKCAP_PORT	0xff000000 /* Port Number */
+#define PCI_EXP_LNKCTL		0x10	/* Link Control */
+#define  PCI_EXP_LNKCTL_ASPM	0x0003	/* ASPM Control */
+#define  PCI_EXP_LNKCTL_RCB	0x0008	/* Read Completion Boundary */
+#define  PCI_EXP_LNKCTL_DISABLE	0x0010	/* Link Disable */
+#define  PCI_EXP_LNKCTL_RETRAIN	0x0020	/* Retrain Link */
+#define  PCI_EXP_LNKCTL_CLOCK	0x0040	/* Common Clock Configuration */
+#define  PCI_EXP_LNKCTL_XSYNCH	0x0080	/* Extended Synch */
+#define  PCI_EXP_LNKCTL_CLOCKPM	0x0100	/* Clock Power Management */
+#define  PCI_EXP_LNKCTL_HWAUTWD	0x0200	/* Hardware Autonomous Width Disable */
+#define  PCI_EXP_LNKCTL_BWMIE	0x0400	/* Bandwidth Mgmt Interrupt Enable */
+#define  PCI_EXP_LNKCTL_AUTBWIE	0x0800	/* Autonomous Bandwidth Mgmt Interrupt Enable */
+#define PCI_EXP_LNKSTA		0x12	/* Link Status */
+#define  PCI_EXP_LNKSTA_SPEED	0x000f	/* Negotiated Link Speed */
+#define  PCI_EXP_LNKSTA_WIDTH	0x03f0	/* Negotiated Link Width */
+#define  PCI_EXP_LNKSTA_TR_ERR	0x0400	/* Training Error (obsolete) */
+#define  PCI_EXP_LNKSTA_TRAIN	0x0800	/* Link Training */
+#define  PCI_EXP_LNKSTA_SL_CLK	0x1000	/* Slot Clock Configuration */
+#define  PCI_EXP_LNKSTA_DL_ACT	0x2000	/* Data Link Layer in DL_Active State */
+#define  PCI_EXP_LNKSTA_BWMGMT	0x4000	/* Bandwidth Mgmt Status */
+#define  PCI_EXP_LNKSTA_AUTBW	0x8000	/* Autonomous Bandwidth Mgmt Status */
+#define PCI_EXP_SLTCAP		0x14	/* Slot Capabilities */
+#define  PCI_EXP_SLTCAP_ATNB	0x0001	/* Attention Button Present */
+#define  PCI_EXP_SLTCAP_PWRC	0x0002	/* Power Controller Present */
+#define  PCI_EXP_SLTCAP_MRL	0x0004	/* MRL Sensor Present */
+#define  PCI_EXP_SLTCAP_ATNI	0x0008	/* Attention Indicator Present */
+#define  PCI_EXP_SLTCAP_PWRI	0x0010	/* Power Indicator Present */
+#define  PCI_EXP_SLTCAP_HPS	0x0020	/* Hot-Plug Surprise */
+#define  PCI_EXP_SLTCAP_HPC	0x0040	/* Hot-Plug Capable */
+#define  PCI_EXP_SLTCAP_PWR_VAL	0x00007f80 /* Slot Power Limit Value */
+#define  PCI_EXP_SLTCAP_PWR_SCL	0x00018000 /* Slot Power Limit Scale */
+#define  PCI_EXP_SLTCAP_INTERLOCK 0x020000 /* Electromechanical Interlock Present */
+#define  PCI_EXP_SLTCAP_NOCMDCOMP 0x040000 /* No Command Completed Support */
+#define  PCI_EXP_SLTCAP_PSN	0xfff80000 /* Physical Slot Number */
+#define PCI_EXP_SLTCTL		0x18	/* Slot Control */
+#define  PCI_EXP_SLTCTL_ATNB	0x0001	/* Attention Button Pressed Enable */
+#define  PCI_EXP_SLTCTL_PWRF	0x0002	/* Power Fault Detected Enable */
+#define  PCI_EXP_SLTCTL_MRLS	0x0004	/* MRL Sensor Changed Enable */
+#define  PCI_EXP_SLTCTL_PRSD	0x0008	/* Presence Detect Changed Enable */
+#define  PCI_EXP_SLTCTL_CMDC	0x0010	/* Command Completed Interrupt Enable */
+#define  PCI_EXP_SLTCTL_HPIE	0x0020	/* Hot-Plug Interrupt Enable */
+#define  PCI_EXP_SLTCTL_ATNI	0x00c0	/* Attention Indicator Control */
+#define  PCI_EXP_SLTCTL_PWRI	0x0300	/* Power Indicator Control */
+#define  PCI_EXP_SLTCTL_PWRC	0x0400	/* Power Controller Control */
+#define  PCI_EXP_SLTCTL_INTERLOCK 0x0800 /* Electromechanical Interlock Control */
+#define  PCI_EXP_SLTCTL_LLCHG	0x1000	/* Data Link Layer State Changed Enable */
+#define PCI_EXP_SLTSTA		0x1a	/* Slot Status */
+#define  PCI_EXP_SLTSTA_ATNB	0x0001	/* Attention Button Pressed */
+#define  PCI_EXP_SLTSTA_PWRF	0x0002	/* Power Fault Detected */
+#define  PCI_EXP_SLTSTA_MRLS	0x0004	/* MRL Sensor Changed */
+#define  PCI_EXP_SLTSTA_PRSD	0x0008	/* Presence Detect Changed */
+#define  PCI_EXP_SLTSTA_CMDC	0x0010	/* Command Completed */
+#define  PCI_EXP_SLTSTA_MRL_ST	0x0020	/* MRL Sensor State */
+#define  PCI_EXP_SLTSTA_PRES	0x0040	/* Presence Detect State */
+#define  PCI_EXP_SLTSTA_INTERLOCK 0x0080 /* Electromechanical Interlock Status */
+#define  PCI_EXP_SLTSTA_LLCHG	0x0100	/* Data Link Layer State Changed */
+#define PCI_EXP_RTCTL		0x1c	/* Root Control */
+#define  PCI_EXP_RTCTL_SECEE	0x0001	/* System Error on Correctable Error */
+#define  PCI_EXP_RTCTL_SENFEE	0x0002	/* System Error on Non-Fatal Error */
+#define  PCI_EXP_RTCTL_SEFEE	0x0004	/* System Error on Fatal Error */
+#define  PCI_EXP_RTCTL_PMEIE	0x0008	/* PME Interrupt Enable */
+#define  PCI_EXP_RTCTL_CRSVIS	0x0010	/* Configuration Request Retry Status Visible to SW */
+#define PCI_EXP_RTCAP		0x1e	/* Root Capabilities */
+#define  PCI_EXP_RTCAP_CRSVIS	0x0001	/* CRS Software Visibility capability: bit 0 of Root Capabilities (0x0010 is only the enable bit in Root Control) */
+#define PCI_EXP_RTSTA		0x20	/* Root Status */
+#define  PCI_EXP_RTSTA_PME_REQID   0x0000ffff /* PME Requester ID */
+#define  PCI_EXP_RTSTA_PME_STATUS  0x00010000 /* PME Status */
+#define  PCI_EXP_RTSTA_PME_PENDING 0x00020000 /* PME is Pending */
+#define PCI_EXP_DEVCAP2			0x24	/* Device Capabilities 2 */
+#define PCI_EXP_DEVCTL2			0x28	/* Device Control 2 */
+#define  PCI_EXP_DEV2_TIMEOUT_RANGE(x)	((x) & 0xf) /* Completion Timeout Ranges Supported (low 4 bits of x) */
+#define  PCI_EXP_DEV2_TIMEOUT_VALUE(x)	((x) & 0xf) /* Completion Timeout Value (low 4 bits of x) */
+#define  PCI_EXP_DEV2_TIMEOUT_DIS	0x0010	/* Completion Timeout Disable Supported */
+#define  PCI_EXP_DEV2_ARI		0x0020	/* ARI Forwarding */
+#define PCI_EXP_DEVSTA2			0x2a	/* Device Status 2 */
+#define PCI_EXP_LNKCAP2			0x2c	/* Link Capabilities 2 */
+#define PCI_EXP_LNKCTL2			0x30	/* Link Control 2 */
+#define  PCI_EXP_LNKCTL2_SPEED(x)	((x) & 0xf) /* Target Link Speed */
+#define  PCI_EXP_LNKCTL2_CMPLNC		0x0010	/* Enter Compliance */
+#define  PCI_EXP_LNKCTL2_SPEED_DIS	0x0020	/* Hardware Autonomous Speed Disable */
+#define  PCI_EXP_LNKCTL2_DEEMPHASIS(x)	(((x) >> 6) & 1) /* Selectable De-emphasis */
+#define  PCI_EXP_LNKCTL2_MARGIN(x)	(((x) >> 7) & 7) /* Transmit Margin */
+#define  PCI_EXP_LNKCTL2_MOD_CMPLNC	0x0400	/* Enter Modified Compliance */
+#define  PCI_EXP_LNKCTL2_CMPLNC_SOS	0x0800	/* Compliance SOS */
+#define  PCI_EXP_LNKCTL2_COM_DEEMPHASIS(x) (((x) >> 12) & 1) /* Compliance De-emphasis */
+#define PCI_EXP_LNKSTA2			0x32	/* Link Status 2 */
+#define  PCI_EXP_LINKSTA2_DEEMPHASIS(x)	((x) & 1)	/* Current De-emphasis Level (note the LINKSTA2 spelling, unlike PCI_EXP_LNKSTA2) */
+#define PCI_EXP_SLTCAP2			0x34	/* Slot Capabilities 2 */
+#define PCI_EXP_SLTCTL2			0x38	/* Slot Control 2 */
+#define PCI_EXP_SLTSTA2			0x3a	/* Slot Status 2 */
+
+/* MSI-X */
+#define  PCI_MSIX_ENABLE	0x8000
+#define  PCI_MSIX_MASK		0x4000
+#define  PCI_MSIX_TABSIZE	0x07ff
+#define PCI_MSIX_TABLE		4
+#define PCI_MSIX_PBA		8
+#define  PCI_MSIX_BIR		0x7
+
+/* Subsystem vendor/device ID for PCI bridges */
+#define PCI_SSVID_VENDOR	4
+#define PCI_SSVID_DEVICE	6
+
+/* PCI Advanced Features */
+#define PCI_AF_CAP		3
+#define  PCI_AF_CAP_TP		0x01
+#define  PCI_AF_CAP_FLR		0x02
+#define PCI_AF_CTRL		4
+#define  PCI_AF_CTRL_FLR	0x01
+#define PCI_AF_STATUS		5
+#define  PCI_AF_STATUS_TP	0x01
+
+/* SATA Host Bus Adapter */
+#define PCI_SATA_HBA_BARS	4
+#define PCI_SATA_HBA_REG0	8
+
+/*** Definitions of extended capabilities ***/
+
+/* Advanced Error Reporting */
+#define PCI_ERR_UNCOR_STATUS	4	/* Uncorrectable Error Status */
+#define  PCI_ERR_UNC_TRAIN	0x00000001	/* Undefined in PCIe rev1.1 & 2.0 spec */
+#define  PCI_ERR_UNC_DLP	0x00000010	/* Data Link Protocol */
+#define  PCI_ERR_UNC_SDES	0x00000020	/* Surprise Down Error */
+#define  PCI_ERR_UNC_POISON_TLP	0x00001000	/* Poisoned TLP */
+#define  PCI_ERR_UNC_FCP	0x00002000	/* Flow Control Protocol */
+#define  PCI_ERR_UNC_COMP_TIME	0x00004000	/* Completion Timeout */
+#define  PCI_ERR_UNC_COMP_ABORT	0x00008000	/* Completer Abort */
+#define  PCI_ERR_UNC_UNX_COMP	0x00010000	/* Unexpected Completion */
+#define  PCI_ERR_UNC_RX_OVER	0x00020000	/* Receiver Overflow */
+#define  PCI_ERR_UNC_MALF_TLP	0x00040000	/* Malformed TLP */
+#define  PCI_ERR_UNC_ECRC	0x00080000	/* ECRC Error Status */
+#define  PCI_ERR_UNC_UNSUP	0x00100000	/* Unsupported Request */
+#define  PCI_ERR_UNC_ACS_VIOL	0x00200000	/* ACS Violation */
+#define PCI_ERR_UNCOR_MASK	8	/* Uncorrectable Error Mask */
+	/* Same bits as above */
+#define PCI_ERR_UNCOR_SEVER	12	/* Uncorrectable Error Severity */
+	/* Same bits as above */
+#define PCI_ERR_COR_STATUS	16	/* Correctable Error Status */
+#define  PCI_ERR_COR_RCVR	0x00000001	/* Receiver Error Status */
+#define  PCI_ERR_COR_BAD_TLP	0x00000040	/* Bad TLP Status */
+#define  PCI_ERR_COR_BAD_DLLP	0x00000080	/* Bad DLLP Status */
+#define  PCI_ERR_COR_REP_ROLL	0x00000100	/* REPLAY_NUM Rollover */
+#define  PCI_ERR_COR_REP_TIMER	0x00001000	/* Replay Timer Timeout */
+#define  PCI_ERR_COR_REP_ANFE	0x00002000	/* Advisory Non-Fatal Error */
+#define PCI_ERR_COR_MASK	20	/* Correctable Error Mask */
+	/* Same bits as above */
+#define PCI_ERR_CAP		24	/* Advanced Error Capabilities */
+#define  PCI_ERR_CAP_FEP(x)	((x) & 31)	/* First Error Pointer */
+#define  PCI_ERR_CAP_ECRC_GENC	0x00000020	/* ECRC Generation Capable */
+#define  PCI_ERR_CAP_ECRC_GENE	0x00000040	/* ECRC Generation Enable */
+#define  PCI_ERR_CAP_ECRC_CHKC	0x00000080	/* ECRC Check Capable */
+#define  PCI_ERR_CAP_ECRC_CHKE	0x00000100	/* ECRC Check Enable */
+#define PCI_ERR_HEADER_LOG	28	/* Header Log Register (16 bytes) */
+#define PCI_ERR_ROOT_COMMAND	44	/* Root Error Command */
+#define PCI_ERR_ROOT_STATUS	48
+#define PCI_ERR_ROOT_COR_SRC	52
+#define PCI_ERR_ROOT_SRC	54
+
+/* Virtual Channel */
+#define PCI_VC_PORT_REG1	4
+#define PCI_VC_PORT_REG2	8
+#define PCI_VC_PORT_CTRL	12
+#define PCI_VC_PORT_STATUS	14
+#define PCI_VC_RES_CAP		16
+#define PCI_VC_RES_CTRL		20
+#define PCI_VC_RES_STATUS	26
+
+/* Power Budgeting */
+#define PCI_PWR_DSR		4	/* Data Select Register */
+#define PCI_PWR_DATA		8	/* Data Register */
+#define  PCI_PWR_DATA_BASE(x)	((x) & 0xff)	    /* Base Power */
+#define  PCI_PWR_DATA_SCALE(x)	(((x) >> 8) & 3)    /* Data Scale */
+#define  PCI_PWR_DATA_PM_SUB(x)	(((x) >> 10) & 7)   /* PM Sub State */
+#define  PCI_PWR_DATA_PM_STATE(x) (((x) >> 13) & 3) /* PM State */
+#define  PCI_PWR_DATA_TYPE(x)	(((x) >> 15) & 7)   /* Type */
+#define  PCI_PWR_DATA_RAIL(x)	(((x) >> 18) & 7)   /* Power Rail */
+#define PCI_PWR_CAP		12	/* Capability */
+#define  PCI_PWR_CAP_BUDGET(x)	((x) & 1)	/* Included in system budget */
+
+/* Root Complex Link */
+#define PCI_RCLINK_ESD		4	/* Element Self Description */
+#define PCI_RCLINK_LINK1	16	/* First Link Entry */
+#define  PCI_RCLINK_LINK_DESC	0	/* Link Entry: Description */
+#define  PCI_RCLINK_LINK_ADDR	8	/* Link Entry: Address (64-bit) */
+#define  PCI_RCLINK_LINK_SIZE	16	/* Link Entry: sizeof */
+
+/* PCIe Vendor-Specific Capability */
+#define PCI_EVNDR_HEADER	4	/* Vendor-Specific Header */
+#define PCI_EVNDR_REGISTERS	8	/* Vendor-Specific Registers */
+
+/* Access Control Services */
+#define PCI_ACS_CAP		0x04	/* ACS Capability Register */
+#define PCI_ACS_CAP_VALID	0x0001	/* ACS Source Validation */
+#define PCI_ACS_CAP_BLOCK	0x0002	/* ACS Translation Blocking */
+#define PCI_ACS_CAP_REQ_RED	0x0004	/* ACS P2P Request Redirect */
+#define PCI_ACS_CAP_CMPLT_RED	0x0008	/* ACS P2P Completion Redirect */
+#define PCI_ACS_CAP_FORWARD	0x0010	/* ACS Upstream Forwarding */
+#define PCI_ACS_CAP_EGRESS	0x0020	/* ACS P2P Egress Control */
+#define PCI_ACS_CAP_TRANS	0x0040	/* ACS Direct Translated P2P */
+#define PCI_ACS_CAP_VECTOR(x)	(((x) >> 8) & 0xff) /* Egress Control Vector Size */
+#define PCI_ACS_CTRL		0x06	/* ACS Control Register */
+#define PCI_ACS_CTRL_VALID	0x0001	/* ACS Source Validation Enable */
+#define PCI_ACS_CTRL_BLOCK	0x0002	/* ACS Translation Blocking Enable */
+#define PCI_ACS_CTRL_REQ_RED	0x0004	/* ACS P2P Request Redirect Enable */
+#define PCI_ACS_CTRL_CMPLT_RED	0x0008	/* ACS P2P Completion Redirect Enable */
+#define PCI_ACS_CTRL_FORWARD	0x0010	/* ACS Upstream Forwarding Enable */
+#define PCI_ACS_CTRL_EGRESS	0x0020	/* ACS P2P Egress Control Enable */
+#define PCI_ACS_CTRL_TRANS	0x0040	/* ACS Direct Translated P2P Enable */
+#define PCI_ACS_EGRESS_CTRL	0x08	/* Egress Control Vector */
+
+/* Alternative Routing-ID Interpretation */
+#define PCI_ARI_CAP		0x04	/* ARI Capability Register */
+#define  PCI_ARI_CAP_MFVC	0x0001	/* MFVC Function Groups Capability */
+#define  PCI_ARI_CAP_ACS	0x0002	/* ACS Function Groups Capability */
+#define  PCI_ARI_CAP_NFN(x)	(((x) >> 8) & 0xff) /* Next Function Number */
+#define PCI_ARI_CTRL		0x06	/* ARI Control Register */
+#define  PCI_ARI_CTRL_MFVC	0x0001	/* MFVC Function Groups Enable */
+#define  PCI_ARI_CTRL_ACS	0x0002	/* ACS Function Groups Enable */
+#define  PCI_ARI_CTRL_FG(x)	(((x) >> 4) & 7) /* Function Group */
+
+/* Address Translation Service */
+#define PCI_ATS_CAP		0x04	/* ATS Capability Register */
+#define  PCI_ATS_CAP_IQD(x)	((x) & 0x1f) /* Invalidate Queue Depth */
+#define PCI_ATS_CTRL		0x06	/* ATS Control Register */
+#define  PCI_ATS_CTRL_STU(x)	((x) & 0x1f) /* Smallest Translation Unit */
+#define  PCI_ATS_CTRL_ENABLE	0x8000	/* ATS Enable */
+
+/* Single Root I/O Virtualization */
+#define PCI_IOV_CAP		0x04	/* SR-IOV Capability Register */
+#define  PCI_IOV_CAP_VFM	0x00000001 /* VF Migration Capable */
+#define  PCI_IOV_CAP_IMN(x)	((x) >> 21) /* VF Migration Interrupt Message Number */
+#define PCI_IOV_CTRL		0x08	/* SR-IOV Control Register */
+#define  PCI_IOV_CTRL_VFE	0x0001	/* VF Enable */
+#define  PCI_IOV_CTRL_VFME	0x0002	/* VF Migration Enable */
+#define  PCI_IOV_CTRL_VFMIE	0x0004	/* VF Migration Interrupt Enable */
+#define  PCI_IOV_CTRL_MSE	0x0008	/* VF MSE */
+#define  PCI_IOV_CTRL_ARI	0x0010	/* ARI Capable Hierarchy */
+#define PCI_IOV_STATUS		0x0a	/* SR-IOV Status Register */
+#define  PCI_IOV_STATUS_MS	0x0001	/* VF Migration Status */
+#define PCI_IOV_INITIALVF	0x0c	/* Number of VFs that are initially associated */
+#define PCI_IOV_TOTALVF		0x0e	/* Maximum number of VFs that could be associated */
+#define PCI_IOV_NUMVF		0x10	/* Number of VFs that are available */
+#define PCI_IOV_FDL		0x12	/* Function Dependency Link */
+#define PCI_IOV_OFFSET		0x14	/* First VF Offset */
+#define PCI_IOV_STRIDE		0x16	/* Routing ID offset from one VF to the next one */
+#define PCI_IOV_DID		0x1a	/* VF Device ID */
+#define PCI_IOV_SUPPS		0x1c	/* Supported Page Sizes */
+#define PCI_IOV_SYSPS		0x20	/* System Page Size */
+#define PCI_IOV_BAR_BASE	0x24	/* VF BAR0, VF BAR1, ... VF BAR5 */
+#define PCI_IOV_NUM_BAR		6	/* Number of VF BARs */
+#define PCI_IOV_MSAO		0x3c	/* VF Migration State Array Offset */
+#define PCI_IOV_MSA_BIR(x)	((x) & 7) /* VF Migration State BIR */
+#define PCI_IOV_MSA_OFFSET(x)	((x) & 0xfffffff8) /* VF Migration State Offset */
+
+/* Transaction Processing Hints */
+#define PCI_TPH_CAPABILITIES	4
+#define   PCI_TPH_INTVEC_SUP	(1<<1)	/* Supports interrupt vector mode */
+#define   PCI_TPH_DEV_SUP      	(1<<2)	/* Device specific mode supported */
+#define   PCI_TPH_EXT_REQ_SUP	(1<<8)	/* Supports extended requests */
+#define   PCI_TPH_ST_LOC_MASK	(3<<9)	/* Steering table location bits */
+#define     PCI_TPH_ST_NONE	(0<<9)	/* No steering table */
+#define     PCI_TPH_ST_CAP	(1<<9)	/* Steering table in TPH cap */
+#define     PCI_TPH_ST_MSIX	(2<<9)	/* Steering table in MSI-X table */
+#define   PCI_TPH_ST_SIZE_SHIFT	(16)	/* Encoded as size - 1 */
+
+/* Latency Tolerance Reporting */
+#define PCI_LTR_MAX_SNOOP	4	/* 16 bit value */
+#define   PCI_LTR_VALUE_MASK	(0x3ff)
+#define   PCI_LTR_SCALE_SHIFT	(10)
+#define   PCI_LTR_SCALE_MASK	(7)
+#define PCI_LTR_MAX_NOSNOOP	6	/* 16 bit value */
+
+/*
+ * The PCI interface treats multi-function devices as independent
+ * devices.  The slot/function address of each device is encoded
+ * in a single byte as follows:
+ *
+ *	7:3 = slot
+ *	2:0 = function
+ */
+#define PCI_DEVFN(slot,func)	((((slot) & 0x1f) << 3) | ((func) & 0x07))	/* Pack slot (5 bits) and function (3 bits) into one devfn byte */
+#define PCI_SLOT(devfn)		(((devfn) >> 3) & 0x1f)	/* Extract the slot: bits 7:3 of devfn */
+#define PCI_FUNC(devfn)		((devfn) & 0x07)	/* Extract the function: bits 2:0 of devfn */
+
+/* Device classes and subclasses */
+
+#define PCI_CLASS_NOT_DEFINED		0x0000
+#define PCI_CLASS_NOT_DEFINED_VGA	0x0001
+
+#define PCI_BASE_CLASS_STORAGE		0x01
+#define PCI_CLASS_STORAGE_SCSI		0x0100
+#define PCI_CLASS_STORAGE_IDE		0x0101
+#define PCI_CLASS_STORAGE_FLOPPY	0x0102
+#define PCI_CLASS_STORAGE_IPI		0x0103
+#define PCI_CLASS_STORAGE_RAID		0x0104
+#define PCI_CLASS_STORAGE_ATA		0x0105
+#define PCI_CLASS_STORAGE_SATA		0x0106
+#define PCI_CLASS_STORAGE_SAS		0x0107
+#define PCI_CLASS_STORAGE_OTHER		0x0180
+
+#define PCI_BASE_CLASS_NETWORK		0x02
+#define PCI_CLASS_NETWORK_ETHERNET	0x0200
+#define PCI_CLASS_NETWORK_TOKEN_RING	0x0201
+#define PCI_CLASS_NETWORK_FDDI		0x0202
+#define PCI_CLASS_NETWORK_ATM		0x0203
+#define PCI_CLASS_NETWORK_ISDN		0x0204
+#define PCI_CLASS_NETWORK_OTHER		0x0280
+
+#define PCI_BASE_CLASS_DISPLAY		0x03
+#define PCI_CLASS_DISPLAY_VGA		0x0300
+#define PCI_CLASS_DISPLAY_XGA		0x0301
+#define PCI_CLASS_DISPLAY_3D		0x0302
+#define PCI_CLASS_DISPLAY_OTHER		0x0380
+
+#define PCI_BASE_CLASS_MULTIMEDIA	0x04
+#define PCI_CLASS_MULTIMEDIA_VIDEO	0x0400
+#define PCI_CLASS_MULTIMEDIA_AUDIO	0x0401
+#define PCI_CLASS_MULTIMEDIA_PHONE	0x0402
+#define PCI_CLASS_MULTIMEDIA_AUDIO_DEV	0x0403
+#define PCI_CLASS_MULTIMEDIA_OTHER	0x0480
+
+#define PCI_BASE_CLASS_MEMORY		0x05
+#define  PCI_CLASS_MEMORY_RAM		0x0500
+#define  PCI_CLASS_MEMORY_FLASH		0x0501
+#define  PCI_CLASS_MEMORY_OTHER		0x0580
+
+#define PCI_BASE_CLASS_BRIDGE		0x06
+#define  PCI_CLASS_BRIDGE_HOST		0x0600
+#define  PCI_CLASS_BRIDGE_ISA		0x0601
+#define  PCI_CLASS_BRIDGE_EISA		0x0602
+#define  PCI_CLASS_BRIDGE_MC		0x0603
+#define  PCI_CLASS_BRIDGE_PCI		0x0604
+#define  PCI_CLASS_BRIDGE_PCMCIA	0x0605
+#define  PCI_CLASS_BRIDGE_NUBUS		0x0606
+#define  PCI_CLASS_BRIDGE_CARDBUS	0x0607
+#define  PCI_CLASS_BRIDGE_RACEWAY	0x0608
+#define  PCI_CLASS_BRIDGE_PCI_SEMI	0x0609
+#define  PCI_CLASS_BRIDGE_IB_TO_PCI	0x060a
+#define  PCI_CLASS_BRIDGE_OTHER		0x0680
+
+#define PCI_BASE_CLASS_COMMUNICATION	0x07
+#define PCI_CLASS_COMMUNICATION_SERIAL	0x0700
+#define PCI_CLASS_COMMUNICATION_PARALLEL 0x0701
+#define PCI_CLASS_COMMUNICATION_MSERIAL	0x0702
+#define PCI_CLASS_COMMUNICATION_MODEM	0x0703
+#define PCI_CLASS_COMMUNICATION_OTHER	0x0780
+
+#define PCI_BASE_CLASS_SYSTEM		0x08
+#define PCI_CLASS_SYSTEM_PIC		0x0800
+#define PCI_CLASS_SYSTEM_DMA		0x0801
+#define PCI_CLASS_SYSTEM_TIMER		0x0802
+#define PCI_CLASS_SYSTEM_RTC		0x0803
+#define PCI_CLASS_SYSTEM_PCI_HOTPLUG	0x0804
+#define PCI_CLASS_SYSTEM_OTHER		0x0880
+
+#define PCI_BASE_CLASS_INPUT		0x09
+#define PCI_CLASS_INPUT_KEYBOARD	0x0900
+#define PCI_CLASS_INPUT_PEN		0x0901
+#define PCI_CLASS_INPUT_MOUSE		0x0902
+#define PCI_CLASS_INPUT_SCANNER		0x0903
+#define PCI_CLASS_INPUT_GAMEPORT	0x0904
+#define PCI_CLASS_INPUT_OTHER		0x0980
+
+#define PCI_BASE_CLASS_DOCKING		0x0a
+#define PCI_CLASS_DOCKING_GENERIC	0x0a00
+#define PCI_CLASS_DOCKING_OTHER		0x0a80
+
+#define PCI_BASE_CLASS_PROCESSOR	0x0b
+#define PCI_CLASS_PROCESSOR_386		0x0b00
+#define PCI_CLASS_PROCESSOR_486		0x0b01
+#define PCI_CLASS_PROCESSOR_PENTIUM	0x0b02
+#define PCI_CLASS_PROCESSOR_ALPHA	0x0b10
+#define PCI_CLASS_PROCESSOR_POWERPC	0x0b20
+#define PCI_CLASS_PROCESSOR_MIPS	0x0b30
+#define PCI_CLASS_PROCESSOR_CO		0x0b40
+
+#define PCI_BASE_CLASS_SERIAL		0x0c
+#define PCI_CLASS_SERIAL_FIREWIRE	0x0c00
+#define PCI_CLASS_SERIAL_ACCESS		0x0c01
+#define PCI_CLASS_SERIAL_SSA		0x0c02
+#define PCI_CLASS_SERIAL_USB		0x0c03
+#define PCI_CLASS_SERIAL_FIBER		0x0c04
+#define PCI_CLASS_SERIAL_SMBUS		0x0c05
+#define PCI_CLASS_SERIAL_INFINIBAND	0x0c06
+
+#define PCI_BASE_CLASS_WIRELESS		0x0d
+#define PCI_CLASS_WIRELESS_IRDA		0x0d00
+#define PCI_CLASS_WIRELESS_CONSUMER_IR	0x0d01
+#define PCI_CLASS_WIRELESS_RF		0x0d10
+#define PCI_CLASS_WIRELESS_OTHER	0x0d80
+
+#define PCI_BASE_CLASS_INTELLIGENT	0x0e
+#define PCI_CLASS_INTELLIGENT_I2O	0x0e00
+
+#define PCI_BASE_CLASS_SATELLITE	0x0f
+#define PCI_CLASS_SATELLITE_TV		0x0f00
+#define PCI_CLASS_SATELLITE_AUDIO	0x0f01
+#define PCI_CLASS_SATELLITE_VOICE	0x0f03
+#define PCI_CLASS_SATELLITE_DATA	0x0f04
+
+#define PCI_BASE_CLASS_CRYPT		0x10
+#define PCI_CLASS_CRYPT_NETWORK		0x1000
+#define PCI_CLASS_CRYPT_ENTERTAINMENT	0x1010
+#define PCI_CLASS_CRYPT_OTHER		0x1080
+
+#define PCI_BASE_CLASS_SIGNAL		0x11
+#define PCI_CLASS_SIGNAL_DPIO		0x1100
+#define PCI_CLASS_SIGNAL_PERF_CTR	0x1101
+#define PCI_CLASS_SIGNAL_SYNCHRONIZER	0x1110
+#define PCI_CLASS_SIGNAL_OTHER		0x1180
+
+#define PCI_CLASS_OTHERS		0xff
+
+/* Several ID's we need in the library */
+
+#define PCI_VENDOR_ID_INTEL		0x8086
+#define PCI_VENDOR_ID_COMPAQ		0x0e11
diff --git a/ext/hwloc/include/pci/pci.h b/ext/hwloc/include/pci/pci.h
new file mode 100644
index 0000000..7a5a6b8
--- /dev/null
+++ b/ext/hwloc/include/pci/pci.h
@@ -0,0 +1,240 @@
+/*
+ *	The PCI Library
+ *
+ *	Copyright (c) 1997--2009 Martin Mares <mj at ucw.cz>
+ *
+ *	Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#ifndef _PCI_LIB_H
+#define _PCI_LIB_H
+
+#ifndef PCI_CONFIG_H
+#include "config.h"
+#endif
+
+#include "header.h"
+#include "types.h"
+
+#define PCI_LIB_VERSION 0x030100
+
+#ifndef PCI_ABI
+#define PCI_ABI
+#endif
+
+/*
+ *	PCI Access Structure
+ */
+
+struct pci_methods;
+
+enum pci_access_type {
+  /* Known access methods, remember to update access.c as well */
+  PCI_ACCESS_AUTO,			/* Autodetection */
+  PCI_ACCESS_SYS_BUS_PCI,		/* Linux /sys/bus/pci */
+  PCI_ACCESS_PROC_BUS_PCI,		/* Linux /proc/bus/pci */
+  PCI_ACCESS_I386_TYPE1,		/* i386 ports, type 1 */
+  PCI_ACCESS_I386_TYPE2,		/* i386 ports, type 2 */
+  PCI_ACCESS_FBSD_DEVICE,		/* FreeBSD /dev/pci */
+  PCI_ACCESS_AIX_DEVICE,		/* /dev/pci0, /dev/bus0, etc. */
+  PCI_ACCESS_NBSD_LIBPCI,		/* NetBSD libpci */
+  PCI_ACCESS_OBSD_DEVICE,		/* OpenBSD /dev/pci */
+  PCI_ACCESS_DUMP,			/* Dump file */
+  PCI_ACCESS_MAX			/* Number of access methods listed above (one past the last valid value) */
+};
+
+struct pci_access {
+  /* Options you can change: */
+  unsigned int method;			/* Access method (presumably an enum pci_access_type value -- confirm) */
+  int writeable;			/* Open in read/write mode */
+  int buscentric;			/* Bus-centric view of the world */
+
+  char *id_file_name;			/* Name of ID list file (use pci_set_name_list_path()) */
+  int free_id_name;			/* Set if id_file_name is malloced */
+  int numeric_ids;			/* Enforce PCI_LOOKUP_NUMERIC (>1 => PCI_LOOKUP_MIXED) */
+
+  unsigned int id_lookup_mode;		/* pci_lookup_mode flags which are set automatically */
+					/* Default: PCI_LOOKUP_CACHE */
+
+  int debugging;			/* Turn on debugging messages */
+
+  /* Functions you can override: */
+  void (*error)(char *msg, ...) PCI_PRINTF(1,2);	/* Write error message and quit */
+  void (*warning)(char *msg, ...) PCI_PRINTF(1,2);	/* Write a warning message */
+  void (*debug)(char *msg, ...) PCI_PRINTF(1,2);	/* Write a debugging message */
+
+  struct pci_dev *devices;		/* Devices found on this bus (chained through pci_dev.next) */
+
+  /* Fields used internally: */
+  struct pci_methods *methods;		/* Opaque here: struct pci_methods is only forward-declared in this header */
+  struct pci_param *params;		/* Named parameters; traverse with pci_walk_params() */
+  struct id_entry **id_hash;		/* names.c */
+  struct id_bucket *current_id_bucket;	/* NOTE(review): presumably names.c state like id_hash -- confirm */
+  int id_load_failed;			/* NOTE(review): presumably set once loading the ID list has failed -- confirm in names.c */
+  int id_cache_status;			/* 0=not read, 1=read, 2=dirty */
+  int fd;				/* proc/sys: fd for config space */
+  int fd_rw;				/* proc/sys: fd opened read-write */
+  int fd_pos;				/* proc/sys: current position */
+  int fd_vpd;				/* sys: fd for VPD */
+  struct pci_dev *cached_dev;		/* proc/sys: device the fds are for */
+};
+
+/* Initialize PCI access */
+struct pci_access *pci_alloc(void) PCI_ABI;
+void pci_init(struct pci_access *) PCI_ABI;
+void pci_cleanup(struct pci_access *) PCI_ABI;
+
+/* Scanning of devices */
+void pci_scan_bus(struct pci_access *acc) PCI_ABI;
+struct pci_dev *pci_get_dev(struct pci_access *acc, int domain, int bus, int dev, int func) PCI_ABI; /* Raw access to specified device */
+void pci_free_dev(struct pci_dev *) PCI_ABI;
+
+/* Names of access methods */
+int pci_lookup_method(char *name) PCI_ABI;	/* Returns -1 if not found */
+char *pci_get_method_name(int index) PCI_ABI;	/* Returns "" if unavailable, NULL if index out of range */
+
+/*
+ *	Named parameters
+ */
+
+struct pci_param {
+  struct pci_param *next;		/* Please use pci_walk_params() for traversing the list */
+  char *param;				/* Name of the parameter */
+  char *value;				/* Value of the parameter */
+  int value_malloced;			/* used internally */
+  char *help;				/* Explanation of the parameter */
+};
+
+char *pci_get_param(struct pci_access *acc, char *param) PCI_ABI;
+int pci_set_param(struct pci_access *acc, char *param, char *value) PCI_ABI;	/* 0 on success, -1 if no such parameter */
+/* To traverse the list, call pci_walk_params repeatedly, first with prev=NULL, and do not modify the parameters during traversal. */
+struct pci_param *pci_walk_params(struct pci_access *acc, struct pci_param *prev) PCI_ABI;
+
+/*
+ *	Devices
+ */
+
+struct pci_dev {
+  struct pci_dev *next;			/* Next device in the chain */
+  u16 domain;				/* PCI domain (host bridge) */
+  u8 bus, dev, func;			/* Bus inside domain, device and function */
+
+  /* These fields are set by pci_fill_info() */
+  int known_fields;			/* Set of info fields already known (presumably PCI_FILL_xxx bits -- confirm) */
+  u16 vendor_id, device_id;		/* Identity of the device */
+  u16 device_class;			/* PCI device class */
+  int irq;				/* IRQ number */
+  pciaddr_t base_addr[6];		/* Base addresses including flags in lower bits */
+  pciaddr_t size[6];			/* Region sizes */
+  pciaddr_t rom_base_addr;		/* Expansion ROM base address */
+  pciaddr_t rom_size;			/* Expansion ROM size */
+  struct pci_cap *first_cap;		/* List of capabilities */
+  char *phy_slot;			/* Physical slot */
+
+  /* Fields used internally: */
+  struct pci_access *access;		/* NOTE(review): presumably the pci_access this device belongs to -- confirm */
+  struct pci_methods *methods;		/* Opaque here: struct pci_methods is only forward-declared in this header */
+  u8 *cache;				/* Cached config registers */
+  int cache_len;			/* NOTE(review): presumably the length of cache in bytes (cf. pci_setup_cache()) -- confirm */
+  int hdrtype;				/* Cached low 7 bits of header type, -1 if unknown */
+  void *aux;				/* Auxiliary data */
+};
+
+#define PCI_ADDR_IO_MASK (~(pciaddr_t) 0x3)
+#define PCI_ADDR_MEM_MASK (~(pciaddr_t) 0xf)
+#define PCI_ADDR_FLAG_MASK 0xf
+
+u8 pci_read_byte(struct pci_dev *, int pos) PCI_ABI; /* Access to configuration space */
+u16 pci_read_word(struct pci_dev *, int pos) PCI_ABI;
+u32 pci_read_long(struct pci_dev *, int pos) PCI_ABI;
+int pci_read_block(struct pci_dev *, int pos, u8 *buf, int len) PCI_ABI;
+int pci_read_vpd(struct pci_dev *d, int pos, u8 *buf, int len) PCI_ABI;
+int pci_write_byte(struct pci_dev *, int pos, u8 data) PCI_ABI;
+int pci_write_word(struct pci_dev *, int pos, u16 data) PCI_ABI;
+int pci_write_long(struct pci_dev *, int pos, u32 data) PCI_ABI;
+int pci_write_block(struct pci_dev *, int pos, u8 *buf, int len) PCI_ABI;
+
+int pci_fill_info(struct pci_dev *, int flags) PCI_ABI; /* Fill in device information */
+
+#define PCI_FILL_IDENT		1
+#define PCI_FILL_IRQ		2
+#define PCI_FILL_BASES		4
+#define PCI_FILL_ROM_BASE	8
+#define PCI_FILL_SIZES		16
+#define PCI_FILL_CLASS		32
+#define PCI_FILL_CAPS		64
+#define PCI_FILL_EXT_CAPS	128
+#define PCI_FILL_PHYS_SLOT	256
+#define PCI_FILL_RESCAN		0x10000
+
+void pci_setup_cache(struct pci_dev *, u8 *cache, int len) PCI_ABI;
+
+/*
+ *	Capabilities
+ */
+
+struct pci_cap {
+  struct pci_cap *next;
+  u16 id;				/* PCI_CAP_ID_xxx */
+  u16 type;				/* PCI_CAP_xxx */
+  unsigned int addr;			/* Position in the config space */
+};
+
+#define PCI_CAP_NORMAL		1	/* Traditional PCI capabilities */
+#define PCI_CAP_EXTENDED	2	/* PCIe extended capabilities */
+
+struct pci_cap *pci_find_cap(struct pci_dev *, unsigned int id, unsigned int type) PCI_ABI;
+
+/*
+ *	Filters
+ */
+
+struct pci_filter {
+  int domain, bus, slot, func;			/* -1 = ANY */
+  int vendor, device;				/* NOTE(review): presumably -1 = ANY here as well -- confirm in filter.c */
+};
+
+void pci_filter_init(struct pci_access *, struct pci_filter *) PCI_ABI;
+char *pci_filter_parse_slot(struct pci_filter *, char *) PCI_ABI;
+char *pci_filter_parse_id(struct pci_filter *, char *) PCI_ABI;
+int pci_filter_match(struct pci_filter *, struct pci_dev *) PCI_ABI;
+
+/*
+ *	Conversion of PCI ID's to names (according to the pci.ids file)
+ *
+ *	Call pci_lookup_name() to identify different types of ID's:
+ *
+ *	VENDOR				(vendorID) -> vendor
+ *	DEVICE				(vendorID, deviceID) -> device
+ *	VENDOR | DEVICE			(vendorID, deviceID) -> combined vendor and device
+ *	SUBSYSTEM | VENDOR		(subvendorID) -> subsystem vendor
+ *	SUBSYSTEM | DEVICE		(vendorID, deviceID, subvendorID, subdevID) -> subsystem device
+ *	SUBSYSTEM | VENDOR | DEVICE	(vendorID, deviceID, subvendorID, subdevID) -> combined subsystem v+d
+ *	SUBSYSTEM | ...			(-1, -1, subvendorID, subdevID) -> generic subsystem
+ *	CLASS				(classID) -> class
+ *	PROGIF				(classID, progif) -> programming interface
+ */
+
+char *pci_lookup_name(struct pci_access *a, char *buf, int size, int flags, ...) PCI_ABI;
+
+int pci_load_name_list(struct pci_access *a) PCI_ABI;	/* Called automatically by pci_lookup_*() when needed; returns success */
+void pci_free_name_list(struct pci_access *a) PCI_ABI;	/* Called automatically by pci_cleanup() */
+void pci_set_name_list_path(struct pci_access *a, char *name, int to_be_freed) PCI_ABI;
+void pci_id_cache_flush(struct pci_access *a) PCI_ABI;
+
+enum pci_lookup_mode {
+  PCI_LOOKUP_VENDOR = 1,		/* Vendor name (args: vendorID) */
+  PCI_LOOKUP_DEVICE = 2,		/* Device name (args: vendorID, deviceID) */
+  PCI_LOOKUP_CLASS = 4,			/* Device class (args: classID) */
+  PCI_LOOKUP_SUBSYSTEM = 8,		/* Subsystem modifier, OR-ed with VENDOR and/or DEVICE (see the table above pci_lookup_name()) */
+  PCI_LOOKUP_PROGIF = 16,		/* Programming interface (args: classID, prog_if) */
+  PCI_LOOKUP_NUMERIC = 0x10000,		/* Want only formatted numbers; default if access->numeric_ids is set */
+  PCI_LOOKUP_NO_NUMBERS = 0x20000,	/* Return NULL if not found in the database; default is to print numerically */
+  PCI_LOOKUP_MIXED = 0x40000,		/* Include both numbers and names */
+  PCI_LOOKUP_NETWORK = 0x80000,		/* Try to resolve unknown ID's by DNS */
+  PCI_LOOKUP_SKIP_LOCAL = 0x100000,	/* Do not consult local database */
+  PCI_LOOKUP_CACHE = 0x200000,		/* Consult the local cache before using DNS */
+  PCI_LOOKUP_REFRESH_CACHE = 0x400000,	/* Forget all previously cached entries, but still allow updating the cache */
+};
+
+#endif
diff --git a/ext/hwloc/include/pci/types.h b/ext/hwloc/include/pci/types.h
new file mode 100644
index 0000000..4d23e69
--- /dev/null
+++ b/ext/hwloc/include/pci/types.h
@@ -0,0 +1,65 @@
+/*
+ *	The PCI Library -- Types and Format Strings
+ *
+ *	Copyright (c) 1997--2008 Martin Mares <mj at ucw.cz>
+ *
+ *	Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#include <sys/types.h>
+
+#ifndef PCI_HAVE_Uxx_TYPES
+
+#ifdef PCI_OS_WINDOWS
+#include <windef.h>
+typedef BYTE u8;
+typedef WORD u16;
+typedef DWORD u32;
+#elif defined(PCI_HAVE_STDINT_H) || (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
+#include <stdint.h>
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+#else
+typedef u_int8_t u8;
+typedef u_int16_t u16;
+typedef u_int32_t u32;
+#endif
+
+#ifdef PCI_HAVE_64BIT_ADDRESS
+#include <limits.h>
+#if ULONG_MAX > 0xffffffff
+typedef unsigned long u64;
+#define PCI_U64_FMT "l"
+#else
+typedef unsigned long long u64;
+#define PCI_U64_FMT "ll"
+#endif
+#endif
+
+#endif	/* PCI_HAVE_Uxx_TYPES */
+
+#ifdef PCI_HAVE_64BIT_ADDRESS
+typedef u64 pciaddr_t;
+#define PCIADDR_T_FMT "%08" PCI_U64_FMT "x"
+#define PCIADDR_PORT_FMT "%04" PCI_U64_FMT "x"
+#else
+typedef u32 pciaddr_t;
+#define PCIADDR_T_FMT "%08x"
+#define PCIADDR_PORT_FMT "%04x"
+#endif
+
+#ifdef PCI_ARCH_SPARC64
+/* On sparc64 Linux the kernel reports remapped port addresses and IRQ numbers */
+#undef PCIADDR_PORT_FMT
+#define PCIADDR_PORT_FMT PCIADDR_T_FMT
+#define PCIIRQ_FMT "%08x"
+#else
+#define PCIIRQ_FMT "%d"
+#endif
+
+#if defined(__GNUC__) && __GNUC__ > 2
+#define PCI_PRINTF(x,y) __attribute__((format(printf, x, y)))
+#else
+#define PCI_PRINTF(x,y)
+#endif
diff --git a/ext/hwloc/include/private/autogen/README.txt b/ext/hwloc/include/private/autogen/README.txt
new file mode 100644
index 0000000..17f7f60
--- /dev/null
+++ b/ext/hwloc/include/private/autogen/README.txt
@@ -0,0 +1,3 @@
+This directory needs to exist in the repo so that the Autotools can
+generate a file here.  We have put a token file in this directory so
+that git doesn't ignore the empty directory in the repository.
diff --git a/ext/hwloc/include/private/autogen/config.h b/ext/hwloc/include/private/autogen/config.h
new file mode 100644
index 0000000..966fa78
--- /dev/null
+++ b/ext/hwloc/include/private/autogen/config.h
@@ -0,0 +1,772 @@
+/* include/private/autogen/config.h.  Generated from config.h.in by configure.  */
+/* include/private/autogen/config.h.in.  Generated from configure.ac by autoheader.  */
+
+/* -*- c -*-
+ *
+ * Copyright © 2009, 2011, 2012 CNRS, inria., Université Bordeaux  All rights reserved.
+ * Copyright © 2009-2014 Cisco Systems, Inc.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ *
+ * This file is automatically generated by configure.  Edits will be lost
+ * the next time you run configure!
+ */
+
+#ifndef HWLOC_CONFIGURE_H
+#define HWLOC_CONFIGURE_H
+
+
+/* Define to 1 if gcc's __atomic builtins are available */
+/* #undef HAVE_ATOMIC_BUILTINS */
+
+/* Define to 1 if the system has the type `CACHE_DESCRIPTOR'. */
+/* #undef HAVE_CACHE_DESCRIPTOR */
+
+/* Define to 1 if the system has the type `CACHE_RELATIONSHIP'. */
+/* #undef HAVE_CACHE_RELATIONSHIP */
+
+/* Define to 1 if you have the `close' function. */
+#define HAVE_CLOSE 1
+
+/* Define to 1 if you have the `clz' function. */
+/* #undef HAVE_CLZ */
+
+/* Define to 1 if you have the `clzl' function. */
+/* #undef HAVE_CLZL */
+
+/* Define to 1 if you have the <CL/cl_ext.h> header file. */
+/* #undef HAVE_CL_CL_EXT_H */
+
+/* Define to 1 if you have the `cpuset_setaffinity' function. */
+/* #undef HAVE_CPUSET_SETAFFINITY */
+
+/* Define to 1 if you have the `cpuset_setid' function. */
+/* #undef HAVE_CPUSET_SETID */
+
+/* Define to 1 if you have the <ctype.h> header file. */
+#define HAVE_CTYPE_H 1
+
+/* Define to 1 if we have -lcuda */
+/* #undef HAVE_CUDA */
+
+/* Define to 1 if you have the <cuda.h> header file. */
+/* #undef HAVE_CUDA_H */
+
+/* Define to 1 if you have the <cuda_runtime_api.h> header file. */
+/* #undef HAVE_CUDA_RUNTIME_API_H */
+
+/* Define to 1 if you have the declaration of `CL_DEVICE_TOPOLOGY_AMD', and to
+   0 if you don't. */
+/* #undef HAVE_DECL_CL_DEVICE_TOPOLOGY_AMD */
+
+/* Define to 1 if you have the declaration of `CTL_HW', and to 0 if you don't.
+   */
+#define HAVE_DECL_CTL_HW 0
+
+/* Define to 1 if you have the declaration of `fabsf', and to 0 if you don't.
+   */
+#define HAVE_DECL_FABSF 1
+
+/* Define to 1 if you have the declaration of `getexecname', and to 0 if you
+   don't. */
+#define HAVE_DECL_GETEXECNAME 0
+
+/* Define to 1 if you have the declaration of `GetModuleFileName', and to 0 if
+   you don't. */
+#define HAVE_DECL_GETMODULEFILENAME 0
+
+/* Define to 1 if you have the declaration of `getprogname', and to 0 if you
+   don't. */
+#define HAVE_DECL_GETPROGNAME 0
+
+/* Define to 1 if you have the declaration of `HW_NCPU', and to 0 if you
+   don't. */
+#define HAVE_DECL_HW_NCPU 0
+
+/* Define to 1 if you have the declaration of
+   `nvmlDeviceGetMaxPcieLinkGeneration', and to 0 if you don't. */
+/* #undef HAVE_DECL_NVMLDEVICEGETMAXPCIELINKGENERATION */
+
+/* Define to 1 if you have the declaration of `pthread_getaffinity_np', and to
+   0 if you don't. */
+#define HAVE_DECL_PTHREAD_GETAFFINITY_NP 1
+
+/* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to
+   0 if you don't. */
+#define HAVE_DECL_PTHREAD_SETAFFINITY_NP 1
+
+/* Define to 1 if you have the declaration of `strtoull', and to 0 if you
+   don't. */
+#define HAVE_DECL_STRTOULL 1
+
+/* Define to 1 if you have the declaration of `_SC_LARGE_PAGESIZE', and to 0
+   if you don't. */
+#define HAVE_DECL__SC_LARGE_PAGESIZE 0
+
+/* Define to 1 if you have the declaration of `_SC_NPROCESSORS_CONF', and to 0
+   if you don't. */
+#define HAVE_DECL__SC_NPROCESSORS_CONF 1
+
+/* Define to 1 if you have the declaration of `_SC_NPROCESSORS_ONLN', and to 0
+   if you don't. */
+#define HAVE_DECL__SC_NPROCESSORS_ONLN 1
+
+/* Define to 1 if you have the declaration of `_SC_NPROC_CONF', and to 0 if
+   you don't. */
+#define HAVE_DECL__SC_NPROC_CONF 0
+
+/* Define to 1 if you have the declaration of `_SC_NPROC_ONLN', and to 0 if
+   you don't. */
+#define HAVE_DECL__SC_NPROC_ONLN 0
+
+/* Define to 1 if you have the declaration of `_SC_PAGESIZE', and to 0 if you
+   don't. */
+#define HAVE_DECL__SC_PAGESIZE 1
+
+/* Define to 1 if you have the declaration of `_SC_PAGE_SIZE', and to 0 if you
+   don't. */
+#define HAVE_DECL__SC_PAGE_SIZE 1
+
+/* Define to 1 if you have the <dirent.h> header file. */
+#define HAVE_DIRENT_H 1
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#define HAVE_DLFCN_H 1
+
+/* Define to 1 if you have the <endian.h> header file. */
+#define HAVE_ENDIAN_H 1
+
+/* Define to 1 if you have the <fcntl.h> header file. */
+#define HAVE_FCNTL_H 1
+
+/* Define to 1 if you have the `ffs' function. */
+#define HAVE_FFS 1
+
+/* Define to 1 if you have the `ffsl' function. */
+#define HAVE_FFSL 1
+
+/* Define to 1 if you have the `fls' function. */
+/* #undef HAVE_FLS */
+
+/* Define to 1 if you have the `flsl' function. */
+/* #undef HAVE_FLSL */
+
+/* Define to 1 if you have the `getpagesize' function. */
+#define HAVE_GETPAGESIZE 1
+
+/* Define to 1 if you have the `getpid' function. */
+#define HAVE_GETPID 1
+
+/* Define to 1 if you have the `gettimeofday' function. */
+#define HAVE_GETTIMEOFDAY 1
+
+/* Define to 1 if the system has the type `GROUP_AFFINITY'. */
+/* #undef HAVE_GROUP_AFFINITY */
+
+/* Define to 1 if the system has the type `GROUP_RELATIONSHIP'. */
+/* #undef HAVE_GROUP_RELATIONSHIP */
+
+/* Define to 1 if you have the `host_info' function. */
+/* #undef HAVE_HOST_INFO */
+
+/* Define to 1 if you have the <infiniband/verbs.h> header file. */
+/* #undef HAVE_INFINIBAND_VERBS_H */
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if the system has the type `KAFFINITY'. */
+/* #undef HAVE_KAFFINITY */
+
+/* Define to 1 if you have the <kstat.h> header file. */
+/* #undef HAVE_KSTAT_H */
+
+/* Define to 1 if you have the <langinfo.h> header file. */
+#define HAVE_LANGINFO_H 1
+
+/* Define to 1 if we have -lgdi32 */
+/* #undef HAVE_LIBGDI32 */
+
+/* Define to 1 if we have -libverbs */
+/* #undef HAVE_LIBIBVERBS */
+
+/* Define to 1 if we have -lkstat */
+/* #undef HAVE_LIBKSTAT */
+
+/* Define to 1 if we have -llgrp */
+/* #undef HAVE_LIBLGRP */
+
+/* Define to 1 if you have the <libudev.h> header file. */
+/* #undef HAVE_LIBUDEV_H */
+
+/* Define to 1 if you have the `localeconv' function. */
+#define HAVE_LOCALECONV 1
+
+/* Define to 1 if you have the <locale.h> header file. */
+#define HAVE_LOCALE_H 1
+
+/* Define to 1 if the system has the type `LOGICAL_PROCESSOR_RELATIONSHIP'. */
+/* #undef HAVE_LOGICAL_PROCESSOR_RELATIONSHIP */
+
+/* Define to 1 if the system has the type 'long long int'. */
+#define HAVE_LONG_LONG_INT 1
+
+/* Define to 1 if you have the <mach/mach_host.h> header file. */
+/* #undef HAVE_MACH_MACH_HOST_H */
+
+/* Define to 1 if you have the <mach/mach_init.h> header file. */
+/* #undef HAVE_MACH_MACH_INIT_H */
+
+/* Define to 1 if you have the <malloc.h> header file. */
+#define HAVE_MALLOC_H 1
+
+/* Define to 1 if you have the `memalign' function. */
+#define HAVE_MEMALIGN 1
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if we have -lmyriexpress */
+/* #undef HAVE_MYRIEXPRESS */
+
+/* Define to 1 if you have the <myriexpress.h> header file. */
+/* #undef HAVE_MYRIEXPRESS_H */
+
+/* Define to 1 if you have the `nl_langinfo' function. */
+#define HAVE_NL_LANGINFO 1
+
+/* Define to 1 if you have the <numaif.h> header file. */
+/* #undef HAVE_NUMAIF_H */
+
+/* Define to 1 if the system has the type `NUMA_NODE_RELATIONSHIP'. */
+/* #undef HAVE_NUMA_NODE_RELATIONSHIP */
+
+/* Define to 1 if you have the <NVCtrl/NVCtrl.h> header file. */
+/* #undef HAVE_NVCTRL_NVCTRL_H */
+
+/* Define to 1 if you have the <nvml.h> header file. */
+/* #undef HAVE_NVML_H */
+
+/* Define to 1 if you have the `open' function. */
+#define HAVE_OPEN 1
+
+/* Define to 1 if you have the `openat' function. */
+#define HAVE_OPENAT 1
+
+/* Define to 1 if you have the <picl.h> header file. */
+/* #undef HAVE_PICL_H */
+
+/* Define to 1 if you have the `posix_memalign' function. */
+#define HAVE_POSIX_MEMALIGN 1
+
+/* Define to 1 if the system has the type `PROCESSOR_CACHE_TYPE'. */
+/* #undef HAVE_PROCESSOR_CACHE_TYPE */
+
+/* Define to 1 if the system has the type `PROCESSOR_GROUP_INFO'. */
+/* #undef HAVE_PROCESSOR_GROUP_INFO */
+
+/* Define to 1 if the system has the type `PROCESSOR_RELATIONSHIP'. */
+/* #undef HAVE_PROCESSOR_RELATIONSHIP */
+
+/* Define to '1' if program_invocation_name is present and usable */
+#define HAVE_PROGRAM_INVOCATION_NAME 1
+
+/* Define to 1 if the system has the type `PSAPI_WORKING_SET_EX_BLOCK'. */
+/* #undef HAVE_PSAPI_WORKING_SET_EX_BLOCK */
+
+/* Define to 1 if the system has the type `PSAPI_WORKING_SET_EX_INFORMATION'.
+   */
+/* #undef HAVE_PSAPI_WORKING_SET_EX_INFORMATION */
+
+/* Define to 1 if you have the <pthread_np.h> header file. */
+/* #undef HAVE_PTHREAD_NP_H */
+
+/* Define to 1 if the system has the type `pthread_t'. */
+#define HAVE_PTHREAD_T 1
+
+/* Define to 1 if you have the `putwc' function. */
+#define HAVE_PUTWC 1
+
+/* Define to 1 if you have the `read' function. */
+#define HAVE_READ 1
+
+/* Define to 1 if the system has the type `RelationProcessorPackage'. */
+/* #undef HAVE_RELATIONPROCESSORPACKAGE */
+
+/* Define to 1 if you have the <sched.h> header file. */
+#define HAVE_SCHED_H 1
+
+/* Define to 1 if you have the `sched_yield' function. */
+#define HAVE_SCHED_YIELD 1
+
+/* Define to 1 if you have the `setlocale' function. */
+#define HAVE_SETLOCALE 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the `strftime' function. */
+#define HAVE_STRFTIME 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the `strncasecmp' function. */
+#define HAVE_STRNCASECMP 1
+
+/* Define to 1 if you have the `strtoll' function. */
+#define HAVE_STRTOLL 1
+
+/* Define to 1 if gcc's __sync builtins are available */
+#define HAVE_SYNC_BUILTINS 1
+
+/* Define to '1' if sysctl is present and usable */
+#define HAVE_SYSCTL 1
+
+/* Define to '1' if sysctlbyname is present and usable */
+/* #undef HAVE_SYSCTLBYNAME */
+
+/* Define to 1 if the system has the type
+   `SYSTEM_LOGICAL_PROCESSOR_INFORMATION'. */
+/* #undef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION */
+
+/* Define to 1 if the system has the type
+   `SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX'. */
+/* #undef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX */
+
+/* Define to 1 if you have the <sys/cpuset.h> header file. */
+/* #undef HAVE_SYS_CPUSET_H */
+
+/* Define to 1 if you have the <sys/lgrp_user.h> header file. */
+/* #undef HAVE_SYS_LGRP_USER_H */
+
+/* Define to 1 if you have the <sys/mman.h> header file. */
+#define HAVE_SYS_MMAN_H 1
+
+/* Define to 1 if you have the <sys/param.h> header file. */
+#define HAVE_SYS_PARAM_H 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/sysctl.h> header file. */
+#define HAVE_SYS_SYSCTL_H 1
+
+/* Define to 1 if you have the <sys/time.h> header file. */
+#define HAVE_SYS_TIME_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <sys/utsname.h> header file. */
+#define HAVE_SYS_UTSNAME_H 1
+
+/* Define to 1 if you have the `uname' function. */
+#define HAVE_UNAME 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* Define to 1 if the system has the type 'unsigned long long int'. */
+#define HAVE_UNSIGNED_LONG_LONG_INT 1
+
+/* Define to 1 if you have the `uselocale' function. */
+#define HAVE_USELOCALE 1
+
+/* Define to 1 if the system has the type `wchar_t'. */
+#define HAVE_WCHAR_T 1
+
+/* Define to 1 if you have the <X11/keysym.h> header file. */
+#define HAVE_X11_KEYSYM_H 1
+
+/* Define to 1 if you have the <X11/Xlib.h> header file. */
+#define HAVE_X11_XLIB_H 1
+
+/* Define to 1 if you have the <X11/Xutil.h> header file. */
+#define HAVE_X11_XUTIL_H 1
+
+/* Define to 1 if you have the <xlocale.h> header file. */
+#define HAVE_XLOCALE_H 1
+
+/* Define to '1' if __progname is present and usable */
+#define HAVE___PROGNAME 1
+
+/* Define to 1 on AIX */
+/* #undef HWLOC_AIX_SYS */
+
+/* Define to 1 on BlueGene/Q */
+/* #undef HWLOC_BGQ_SYS */
+
+/* Whether C compiler supports symbol visibility or not */
+#define HWLOC_C_HAVE_VISIBILITY 1
+
+/* Define to 1 on Darwin */
+/* #undef HWLOC_DARWIN_SYS */
+
+/* Whether we are in debugging mode or not */
+/* #undef HWLOC_DEBUG */
+
+/* Define to 1 on *FREEBSD */
+/* #undef HWLOC_FREEBSD_SYS */
+
+/* Whether your compiler has __attribute__ or not */
+#define HWLOC_HAVE_ATTRIBUTE 1
+
+/* Whether your compiler has __attribute__ aligned or not */
+#define HWLOC_HAVE_ATTRIBUTE_ALIGNED 1
+
+/* Whether your compiler has __attribute__ always_inline or not */
+#define HWLOC_HAVE_ATTRIBUTE_ALWAYS_INLINE 1
+
+/* Whether your compiler has __attribute__ cold or not */
+#define HWLOC_HAVE_ATTRIBUTE_COLD 1
+
+/* Whether your compiler has __attribute__ const or not */
+#define HWLOC_HAVE_ATTRIBUTE_CONST 1
+
+/* Whether your compiler has __attribute__ deprecated or not */
+#define HWLOC_HAVE_ATTRIBUTE_DEPRECATED 1
+
+/* Whether your compiler has __attribute__ format or not */
+#define HWLOC_HAVE_ATTRIBUTE_FORMAT 1
+
+/* Whether your compiler has __attribute__ hot or not */
+#define HWLOC_HAVE_ATTRIBUTE_HOT 1
+
+/* Whether your compiler has __attribute__ malloc or not */
+#define HWLOC_HAVE_ATTRIBUTE_MALLOC 1
+
+/* Whether your compiler has __attribute__ may_alias or not */
+#define HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS 1
+
+/* Whether your compiler has __attribute__ nonnull or not */
+#define HWLOC_HAVE_ATTRIBUTE_NONNULL 1
+
+/* Whether your compiler has __attribute__ noreturn or not */
+#define HWLOC_HAVE_ATTRIBUTE_NORETURN 1
+
+/* Whether your compiler has __attribute__ no_instrument_function or not */
+#define HWLOC_HAVE_ATTRIBUTE_NO_INSTRUMENT_FUNCTION 1
+
+/* Whether your compiler has __attribute__ packed or not */
+#define HWLOC_HAVE_ATTRIBUTE_PACKED 1
+
+/* Whether your compiler has __attribute__ pure or not */
+#define HWLOC_HAVE_ATTRIBUTE_PURE 1
+
+/* Whether your compiler has __attribute__ sentinel or not */
+#define HWLOC_HAVE_ATTRIBUTE_SENTINEL 1
+
+/* Whether your compiler has __attribute__ unused or not */
+#define HWLOC_HAVE_ATTRIBUTE_UNUSED 1
+
+/* Whether your compiler has __attribute__ warn unused result or not */
+#define HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT 1
+
+/* Whether your compiler has __attribute__ weak alias or not */
+#define HWLOC_HAVE_ATTRIBUTE_WEAK_ALIAS 1
+
+/* Define to 1 if your `ffs' function is known to be broken. */
+/* #undef HWLOC_HAVE_BROKEN_FFS */
+
+/* Define to 1 if you have the `cairo' library. */
+#define HWLOC_HAVE_CAIRO 1
+
+/* Define to 1 if you have the `clz' function. */
+/* #undef HWLOC_HAVE_CLZ */
+
+/* Define to 1 if you have the `clzl' function. */
+/* #undef HWLOC_HAVE_CLZL */
+
+/* Define to 1 if the CPU_SET macro works */
+#define HWLOC_HAVE_CPU_SET 1
+
+/* Define to 1 if the CPU_SET_S macro works */
+#define HWLOC_HAVE_CPU_SET_S 1
+
+/* Define to 1 if you have the `cudart' SDK. */
+/* #undef HWLOC_HAVE_CUDART */
+
+/* Define to 1 if function `clz' is declared by system headers */
+/* #undef HWLOC_HAVE_DECL_CLZ */
+
+/* Define to 1 if function `clzl' is declared by system headers */
+/* #undef HWLOC_HAVE_DECL_CLZL */
+
+/* Define to 1 if function `ffs' is declared by system headers */
+#define HWLOC_HAVE_DECL_FFS 1
+
+/* Define to 1 if function `ffsl' is declared by system headers */
+#define HWLOC_HAVE_DECL_FFSL 1
+
+/* Define to 1 if function `fls' is declared by system headers */
+/* #undef HWLOC_HAVE_DECL_FLS */
+
+/* Define to 1 if function `flsl' is declared by system headers */
+/* #undef HWLOC_HAVE_DECL_FLSL */
+
+/* Define to 1 if function `strncasecmp' is declared by system headers */
+#define HWLOC_HAVE_DECL_STRNCASECMP 1
+
+/* Define to 1 if you have the `ffs' function. */
+#define HWLOC_HAVE_FFS 1
+
+/* Define to 1 if you have the `ffsl' function. */
+#define HWLOC_HAVE_FFSL 1
+
+/* Define to 1 if you have the `fls' function. */
+/* #undef HWLOC_HAVE_FLS */
+
+/* Define to 1 if you have the `flsl' function. */
+/* #undef HWLOC_HAVE_FLSL */
+
+/* Define to 1 if you have the GL module components. */
+/* #undef HWLOC_HAVE_GL */
+
+/* Define to 1 if you have a library providing the termcap interface */
+/* #undef HWLOC_HAVE_LIBTERMCAP */
+
+/* Define to 1 if you have the `libxml2' library. */
+/* #undef HWLOC_HAVE_LIBXML2 */
+
+/* Define to 1 if building the Linux PCI component */
+#define HWLOC_HAVE_LINUXPCI 1
+
+/* Define to 1 if mbind is available. */
+/* #undef HWLOC_HAVE_MBIND */
+
+/* Define to 1 if migrate_pages is available. */
+/* #undef HWLOC_HAVE_MIGRATE_PAGES */
+
+/* Define to 1 if you have the `NVML' library. */
+/* #undef HWLOC_HAVE_NVML */
+
+/* Define to 1 if glibc provides the old prototype (without length) of
+   sched_setaffinity() */
+/* #undef HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+
+/* Define to 1 if you have the `OpenCL' library. */
+/* #undef HWLOC_HAVE_OPENCL */
+
+/* Define to 1 if the hwloc library should support dynamically-loaded plugins
+   */
+/* #undef HWLOC_HAVE_PLUGINS */
+
+/* Define to 1 if you have `pthread_getthrds_np' */
+/* #undef HWLOC_HAVE_PTHREAD_GETTHRDS_NP */
+
+/* Define to 1 if pthread mutexes are available */
+#define HWLOC_HAVE_PTHREAD_MUTEX 1
+
+/* Define to 1 if glibc provides a prototype of sched_setaffinity() */
+#define HWLOC_HAVE_SCHED_SETAFFINITY 1
+
+/* Define to 1 if set_mempolicy is available. */
+/* #undef HWLOC_HAVE_SET_MEMPOLICY */
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HWLOC_HAVE_STDINT_H 1
+
+/* Define to 1 if you have the `windows.h' header. */
+/* #undef HWLOC_HAVE_WINDOWS_H */
+
+/* Define to 1 if X11 headers including Xutil.h and keysym.h are available. */
+#define HWLOC_HAVE_X11_KEYSYM 1
+
+/* Define to 1 if you have x86 cpuid */
+#define HWLOC_HAVE_X86_CPUID 1
+
+/* Define to 1 if the _syscall3 macro works */
+/* #undef HWLOC_HAVE__SYSCALL3 */
+
+/* Define to 1 on HP-UX */
+/* #undef HWLOC_HPUX_SYS */
+
+/* Define to 1 on Irix */
+/* #undef HWLOC_IRIX_SYS */
+
+/* Define to 1 on Linux */
+#define HWLOC_LINUX_SYS 1
+
+/* Define to 1 on *NETBSD */
+/* #undef HWLOC_NETBSD_SYS */
+
+/* Define to 1 on OSF */
+/* #undef HWLOC_OSF_SYS */
+
+/* The size of `unsigned int', as computed by sizeof */
+#define HWLOC_SIZEOF_UNSIGNED_INT 4
+
+/* The size of `unsigned long', as computed by sizeof */
+#ifdef __x86_64
+#define HWLOC_SIZEOF_UNSIGNED_LONG 8
+#else
+#if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__)
+#define HWLOC_SIZEOF_UNSIGNED_LONG 4
+#endif
+#endif
+
+/* Define to 1 on Solaris */
+/* #undef HWLOC_SOLARIS_SYS */
+
+/* The hwloc symbol prefix */
+#define HWLOC_SYM_PREFIX likwid_
+
+/* The hwloc symbol prefix in all caps */
+#define HWLOC_SYM_PREFIX_CAPS LIKWID_
+
+/* Whether we need to re-define all the hwloc public symbols or not */
+#define HWLOC_SYM_TRANSFORM 1
+
+/* Define to 1 on unsupported systems */
+/* #undef HWLOC_UNSUPPORTED_SYS */
+
+/* Define to 1 if ncurses works, preferred over curses */
+/* #undef HWLOC_USE_NCURSES */
+
+/* The library version, always available, even in embedded mode, contrary to
+   VERSION */
+#define HWLOC_VERSION "2.0.0a1-git"
+
+/* Define to 1 on WINDOWS */
+/* #undef HWLOC_WIN_SYS */
+
+
+/* Define to 1 on x86_64 */
+#ifdef __x86_64
+#define HWLOC_X86_64_ARCH 1
+#else
+#if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__)
+/* Define to 1 on x86_32 */
+#define HWLOC_X86_32_ARCH 1
+#endif
+#endif
+
+/* Define to the sub-directory in which libtool stores uninstalled libraries.
+   */
+#define LT_OBJDIR ".libs/"
+
+/* Name of package */
+#define PACKAGE "hwloc"
+
+/* Define to the address where bug reports for this package should be sent. */
+#define PACKAGE_BUGREPORT "http://www.open-mpi.org/projects/hwloc/"
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "hwloc"
+
+/* Define to the full name and version of this package. */
+#define PACKAGE_STRING "hwloc 2.0.0a1-git"
+
+/* Define to the one symbol short name of this package. */
+#define PACKAGE_TARNAME "hwloc"
+
+/* Define to the home page for this package. */
+#define PACKAGE_URL ""
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION "2.0.0a1-git"
+
+/* The size of `unsigned int', as computed by sizeof. */
+#define SIZEOF_UNSIGNED_INT 4
+
+/* The size of `unsigned long', as computed by sizeof. */
+#define SIZEOF_UNSIGNED_LONG 8
+
+/* The size of `void *', as computed by sizeof. */
+#define SIZEOF_VOID_P 8
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Enable extensions on HP-UX. */
+#ifndef _HPUX_SOURCE
+# define _HPUX_SOURCE 1
+#endif
+
+
+/* Enable extensions on AIX 3, Interix.  */
+#ifndef _ALL_SOURCE
+# define _ALL_SOURCE 1
+#endif
+/* Enable GNU extensions on systems that have them.  */
+#ifndef _GNU_SOURCE
+# define _GNU_SOURCE 1
+#endif
+/* Enable threading extensions on Solaris.  */
+#ifndef _POSIX_PTHREAD_SEMANTICS
+# define _POSIX_PTHREAD_SEMANTICS 1
+#endif
+/* Enable extensions on HP NonStop.  */
+#ifndef _TANDEM_SOURCE
+# define _TANDEM_SOURCE 1
+#endif
+/* Enable general extensions on Solaris.  */
+#ifndef __EXTENSIONS__
+# define __EXTENSIONS__ 1
+#endif
+
+
+/* Define to 1 if /dev/urandom should be used for seeding the hash function */
+#define USE_URANDOM 1
+
+/* Define to 1 if CryptGenRandom should be used for seeding the hash function
+   */
+#define USE_WINDOWS_CRYPTOAPI 1
+
+/* Version number of package */
+#define VERSION "2.0.0a1-git"
+
+/* Define to 1 if the X Window System is missing or not being used. */
+/* #undef X_DISPLAY_MISSING */
+
+/* Are we building for HP-UX? */
+#define _HPUX_SOURCE 1
+
+/* Define to 1 if on MINIX. */
+/* #undef _MINIX */
+
+/* Define to 2 if the system does not provide POSIX.1 features except with
+   this defined. */
+/* #undef _POSIX_1_SOURCE */
+
+/* Define to 1 if you need to in order for `stat' and other things to work. */
+/* #undef _POSIX_SOURCE */
+
+/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
+   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
+   #define below would cause a syntax error. */
+/* #undef _UINT32_T */
+
+/* Define this to the process ID type */
+#define hwloc_pid_t pid_t
+
+/* Define this to the thread ID type */
+#define hwloc_thread_t pthread_t
+
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+   calls it, or to nothing if 'inline' is not supported under any name.  */
+#ifndef __cplusplus
+/* #undef inline */
+#endif
+
+/* Define to the type of a signed integer type of width exactly 32 bits if
+   such a type exists and the standard includes do not define it. */
+/* #undef int32_t */
+
+/* Define to the type of an unsigned integer type of width exactly 32 bits if
+   such a type exists and the standard includes do not define it. */
+/* #undef uint32_t */
+
+
+#endif /* HWLOC_CONFIGURE_H */
+
diff --git a/ext/hwloc/include/private/components.h b/ext/hwloc/include/private/components.h
new file mode 100644
index 0000000..b366345
--- /dev/null
+++ b/ext/hwloc/include/private/components.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright © 2012 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+
+#ifdef HWLOC_INSIDE_PLUGIN
+/*
+ * these declarations are internal only, they are not available to plugins
+ * (many functions below are internal static symbols).
+ */
+#error This file should not be used in plugins
+#endif
+
+
+#ifndef PRIVATE_COMPONENTS_H
+#define PRIVATE_COMPONENTS_H 1
+
+#include <hwloc/plugins.h>
+
+struct hwloc_topology;
+
+extern int hwloc_disc_component_force_enable(struct hwloc_topology *topology,
+					     int envvar_forced, /* 1 if forced through envvar, 0 if forced through API */
+					     int type, const char *name,
+					     const void *data1, const void *data2, const void *data3);
+extern void hwloc_disc_components_enable_others(struct hwloc_topology *topology);
+
+/* Compute the topology is_thissystem flag based on enabled backends */
+extern void hwloc_backends_is_thissystem(struct hwloc_topology *topology);
+
+/* Disable and destroy all backends used by a topology */
+extern void hwloc_backends_disable_all(struct hwloc_topology *topology);
+
+/* Used by the core to setup/destroy the list of components */
+extern void hwloc_components_init(struct hwloc_topology *topology); /* increases components refcount, should be called exactly once per topology (during init) */
+extern void hwloc_components_destroy_all(struct hwloc_topology *topology); /* decreases components refcount, should be called exactly once per topology (during destroy) */
+
+#endif /* PRIVATE_COMPONENTS_H */
+
diff --git a/ext/hwloc/include/private/cpuid-x86.h b/ext/hwloc/include/private/cpuid-x86.h
new file mode 100644
index 0000000..8a8c48e
--- /dev/null
+++ b/ext/hwloc/include/private/cpuid-x86.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright © 2010-2012, 2014 Université Bordeaux
+ * Copyright © 2010 Cisco Systems, Inc.  All rights reserved.
+ * Copyright © 2014 Inria.  All rights reserved.
+ *
+ * See COPYING in top-level directory.
+ */
+
+/* Internals for x86's cpuid.  */
+
+#ifndef HWLOC_PRIVATE_CPUID_X86_H
+#define HWLOC_PRIVATE_CPUID_X86_H
+
+#if (defined HWLOC_X86_32_ARCH) && (!defined HWLOC_HAVE_MSVC_CPUIDEX)
+static __hwloc_inline int hwloc_have_x86_cpuid(void)
+{ /* Returns 1 if the ID bit (0x00200000) of the flags register can be toggled twice in a row, 0 otherwise. */
+  int ret;
+  unsigned tmp, tmp2; /* Scratch operands for the asm below (%1 and %2) */
+  __asm__(
+      "mov $0,%0\n\t"   /* Not supported a priori */
+
+      "pushfl   \n\t"   /* Save flags */
+
+      "pushfl   \n\t"                                           \
+      "pop %1   \n\t"   /* Get flags */                         \
+
+#define TRY_TOGGLE                                              \
+      "xor $0x00200000,%1\n\t"        /* Try to toggle ID */    \
+      "mov %1,%2\n\t"   /* Save expected value */               \
+      "push %1  \n\t"                                           \
+      "popfl    \n\t"   /* Try to toggle */                     \
+      "pushfl   \n\t"                                           \
+      "pop %1   \n\t"                                           \
+      "cmp %1,%2\n\t"   /* Compare with expected value */       \
+      "jnz 0f\n\t"   /* Unexpected, failure */               \
+
+      TRY_TOGGLE        /* Try to set/clear */
+      TRY_TOGGLE        /* Try to clear/set */
+
+      "mov $1,%0\n\t"   /* Passed the test! */
+
+      "0: \n\t"         /* Common exit; a failed toggle jumps here with %0 still 0 */
+      "popfl    \n\t"   /* Restore flags */
+
+      : "=r" (ret), "=&r" (tmp), "=&r" (tmp2)); /* %0 = ret, %1 = tmp, %2 = tmp2 */
+  return ret;
+}
+#endif /* defined HWLOC_X86_32_ARCH && !defined HWLOC_HAVE_MSVC_CPUIDEX */
+#if (defined HWLOC_X86_64_ARCH) || (defined HWLOC_HAVE_MSVC_CPUIDEX)
+static __hwloc_inline int hwloc_have_x86_cpuid(void) { return 1; } /* No runtime probe in this configuration: always reports 1 */
+#endif /* HWLOC_X86_64_ARCH || HWLOC_HAVE_MSVC_CPUIDEX */
+
+static __hwloc_inline void hwloc_x86_cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
+{ /* Executes cpuid with *eax and *ecx as inputs; all four pointees are overwritten with the resulting registers. */
+#ifdef HWLOC_HAVE_MSVC_CPUIDEX
+  int regs[4]; /* Filled by __cpuidex; copied out below as eax, ebx, ecx, edx in that order */
+  __cpuidex(regs, *eax, *ecx);
+  *eax = regs[0];
+  *ebx = regs[1];
+  *ecx = regs[2];
+  *edx = regs[3];
+#else /* HWLOC_HAVE_MSVC_CPUIDEX */
+  /* Note: gcc might want to use bx or the stack for %1 addressing, so we can't
+   * use them :/ Hence bx is saved in scratch register %2 around cpuid and restored before %1 is written. */
+#ifdef HWLOC_X86_64_ARCH
+  hwloc_uint64_t sav_rbx;
+  __asm__(
+  "mov %%rbx,%2\n\t"      /* Save rbx in the scratch register */
+  "cpuid\n\t"
+  "xchg %2,%%rbx\n\t"     /* Restore rbx; the scratch register now holds what cpuid left in rbx */
+  "movl %k2,%1\n\t"       /* Store the low 32 bits of the scratch register into *ebx */
+  : "+a" (*eax), "=m" (*ebx), "=&r"(sav_rbx),
+    "+c" (*ecx), "=&d" (*edx));
+#elif defined(HWLOC_X86_32_ARCH)
+  unsigned long sav_ebx;
+  __asm__(
+  "mov %%ebx,%2\n\t"      /* Save ebx in the scratch register */
+  "cpuid\n\t"
+  "xchg %2,%%ebx\n\t"     /* Restore ebx; the scratch register now holds what cpuid left in ebx */
+  "movl %k2,%1\n\t"       /* Store the scratch register into *ebx */
+  : "+a" (*eax), "=m" (*ebx), "=&r"(sav_ebx),
+    "+c" (*ecx), "=&d" (*edx));
+#else
+#error unknown architecture
+#endif
+#endif /* HWLOC_HAVE_MSVC_CPUIDEX */
+}
+
+#endif /* HWLOC_PRIVATE_CPUID_X86_H */
diff --git a/ext/hwloc/include/private/cpuid.h b/ext/hwloc/include/private/cpuid.h
new file mode 100644
index 0000000..214ab38
--- /dev/null
+++ b/ext/hwloc/include/private/cpuid.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2010-2012 Université Bordeaux 1
+ * Copyright © 2010 Cisco Systems, Inc.  All rights reserved.
+ * Copyright © 2014 Inria.  All rights reserved.
+ *
+ * See COPYING in top-level directory.
+ */
+
+/* Internals for x86's cpuid.  */
+
+#ifndef HWLOC_PRIVATE_CPUID_H
+#define HWLOC_PRIVATE_CPUID_H
+
+#ifdef HWLOC_X86_32_ARCH
+static __hwloc_inline int hwloc_have_cpuid(void)
+{ /* Returns 1 if the ID bit (0x00200000) of the flags register can be toggled both ways, 0 otherwise. */
+  int ret;
+  unsigned tmp, tmp2;
+  __asm__(
+      "mov $0,%0\n\t"   /* Not supported a priori */
+
+      "pushfl   \n\t"   /* Save flags */
+
+      "pushfl   \n\t"                                           \
+      "pop %1   \n\t"   /* Get flags */                         \
+
+#define TRY_TOGGLE                                              \
+      "xor $0x00200000,%1\n\t"        /* Try to toggle ID */    \
+      "mov %1,%2\n\t"   /* Save expected value */               \
+      "push %1  \n\t"                                           \
+      "popfl    \n\t"   /* Try to toggle */                     \
+      "pushfl   \n\t"                                           \
+      "pop %1   \n\t"                                           \
+      "cmp %1,%2\n\t"   /* Compare with expected value */       \
+      "jnz 0f\n\t"   /* Unexpected, failure */               \
+
+      TRY_TOGGLE        /* Try to set/clear */
+      TRY_TOGGLE        /* Try to clear/set */
+
+      "mov $1,%0\n\t"   /* Passed the test! */
+      /* Local numeric label: a named label would be defined twice if this body is emitted more than once. */
+      "0: \n\t"
+      "popfl    \n\t"   /* Restore flags */
+
+      : "=r" (ret), "=&r" (tmp), "=&r" (tmp2));
+  return ret;
+}
+#endif /* HWLOC_X86_32_ARCH */
+#ifdef HWLOC_X86_64_ARCH
+static __hwloc_inline int hwloc_have_cpuid(void) { return 1; } /* no runtime probe on x86-64: cpuid is reported as always available */
+#endif /* HWLOC_X86_64_ARCH */
+
+static __hwloc_inline void hwloc_cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
+{ /* Runs cpuid with *eax and *ecx as inputs; the four pointers receive the resulting register values. */
+  /* Note: gcc might want to use bx or the stack for %1 addressing, so we can't
+   * use them :/  Instead bx is parked in a scratch register around cpuid. */
+#ifdef HWLOC_X86_64_ARCH
+  hwloc_uint64_t sav_rbx;
+  __asm__( /* __asm__ rather than asm: the plain keyword is unavailable in strict ISO C modes */
+  "mov %%rbx,%2\n\t"   /* park the current rbx in the scratch operand */
+  "cpuid\n\t"
+  "xchg %2,%%rbx\n\t"  /* rbx gets its old value back; the scratch operand now holds cpuid's ebx */
+  "movl %k2,%1\n\t"    /* store the low 32 bits of the scratch operand into *ebx */
+  : "+a" (*eax), "=m" (*ebx), "=&r"(sav_rbx),
+    "+c" (*ecx), "=&d" (*edx));
+#elif defined(HWLOC_X86_32_ARCH)
+  unsigned long sav_ebx;
+  __asm__( /* __asm__ rather than asm: the plain keyword is unavailable in strict ISO C modes */
+  "mov %%ebx,%2\n\t"   /* park the current ebx in the scratch operand */
+  "cpuid\n\t"
+  "xchg %2,%%ebx\n\t"  /* ebx gets its old value back; the scratch operand now holds cpuid's ebx */
+  "movl %k2,%1\n\t"    /* store the scratch operand into *ebx */
+  : "+a" (*eax), "=m" (*ebx), "=&r"(sav_ebx),
+    "+c" (*ecx), "=&d" (*edx));
+#else
+#error unknown architecture
+#endif
+}
+
+#endif /* HWLOC_PRIVATE_CPUID_H */
diff --git a/ext/hwloc/include/private/debug.h b/ext/hwloc/include/private/debug.h
new file mode 100644
index 0000000..4de91bf
--- /dev/null
+++ b/ext/hwloc/include/private/debug.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2012 Inria.  All rights reserved.
+ * Copyright © 2009, 2011 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* The configuration file */
+
+#ifndef HWLOC_DEBUG_H
+#define HWLOC_DEBUG_H
+
+#include <private/autogen/config.h>
+
+#ifdef HWLOC_DEBUG
+#include <stdarg.h>
+#include <stdio.h>
+#endif
+
+static __hwloc_inline void hwloc_debug(const char *s __hwloc_attribute_unused, ...)
+{ /* printf-style debug output: s and its arguments go to stderr when HWLOC_DEBUG is defined; otherwise the body is empty. */
+#ifdef HWLOC_DEBUG
+    va_list ap;
+    /* forward the variadic arguments unchanged to vfprintf */
+    va_start(ap, s);
+    vfprintf(stderr, s, ap);
+    va_end(ap);
+#endif
+}
+
+#ifdef HWLOC_DEBUG
+#define hwloc_debug_bitmap(fmt, bitmap) do { /* print the bitmap, rendered as a string, through fmt on stderr */ \
+  char *s; \
+  hwloc_bitmap_asprintf(&s, bitmap); /* s receives the rendered string; it is released by free() below */ \
+  fprintf(stderr, fmt, s); \
+  free(s); \
+} while (0)
+#define hwloc_debug_1arg_bitmap(fmt, arg1, bitmap) do { /* same, with one extra fmt argument before the bitmap string */ \
+  char *s; \
+  hwloc_bitmap_asprintf(&s, bitmap); \
+  fprintf(stderr, fmt, arg1, s); \
+  free(s); \
+} while (0)
+#define hwloc_debug_2args_bitmap(fmt, arg1, arg2, bitmap) do { /* same, with two extra fmt arguments before the bitmap string */ \
+  char *s; \
+  hwloc_bitmap_asprintf(&s, bitmap); \
+  fprintf(stderr, fmt, arg1, arg2, s); \
+  free(s); \
+} while (0)
+#else /* !HWLOC_DEBUG: the bitmap debug macros expand to empty statements */
+#define hwloc_debug_bitmap(s, bitmap) do { } while(0)
+#define hwloc_debug_1arg_bitmap(s, arg1, bitmap) do { } while(0)
+#define hwloc_debug_2args_bitmap(s, arg1, arg2, bitmap) do { } while(0)
+#endif
+
+#endif /* HWLOC_DEBUG_H */
diff --git a/ext/hwloc/include/private/map.h b/ext/hwloc/include/private/map.h
new file mode 100644
index 0000000..77c18a5
--- /dev/null
+++ b/ext/hwloc/include/private/map.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright © 2013 Inria.  All rights reserved.
+ * Copyright © 2013 Cisco Systems, Inc.  All rights reserved.
+ * Copyright © 2013-2014 University of Wisconsin-La Crosse.
+ *                         All rights reserved.
+ *
+ * See COPYING in top-level directory.
+ *
+ * $HEADER$
+ */
+
+#ifndef _PRIVATE_NETLOC_MAP_H_
+#define _PRIVATE_NETLOC_MAP_H_
+
+#include <hwloc.h>
+#include <netloc.h>
+
+
+struct netloc_map__subnet;
+struct netloc_map__server;
+
+struct netloc_map__port { /* One port record of the map; points at the subnet and the server it is associated with. */
+  struct netloc_map__subnet * subnet;
+  struct netloc_map__server * server;
+
+  netloc_edge_t * edge;
+
+  unsigned hwloc_obj_depth;
+  unsigned hwloc_obj_index;
+  hwloc_obj_t hwloc_obj; /* cached from depth/index above,
+			  * only non-NULL if the topology hasn't been compressed in the meantime.
+			  */
+
+  struct netloc_map__port *prev, *next; /* presumably links in a doubly-linked list of ports -- confirm */
+
+  char id[0]; /* zero-length trailing array (GNU extension); presumably the id string is stored right after the struct -- confirm */
+};
+
+struct netloc_map__subnet { /* One subnet record of the map: a netloc topology plus the ports attached to it. */
+  netloc_topology_t topology;
+  netloc_network_type_t type;
+
+  int port_by_id_ready; /* presumably set once port_by_id below has been built -- confirm */
+  struct netloc_dt_lookup_table port_by_id;
+
+  struct netloc_map__subnet *prev, *next; /* presumably links in a doubly-linked list of subnets -- confirm */
+
+  struct netloc_map__port *port_first, *port_last;
+  unsigned ports_nr;
+
+  char id[0]; /* zero-length trailing array (GNU extension); presumably the id string is stored right after the struct -- confirm */
+};
+
+struct netloc_map__server { /* One server record of the map: its hwloc topology and the ports that belong to it. */
+  hwloc_topology_t topology; /* NULL if compressed */
+#if HWLOC_API_VERSION >= 0x00010800
+  hwloc_topology_diff_t topology_diff;
+  struct netloc_map__server *topology_diff_refserver; /* presumably the server topology_diff is relative to -- confirm */
+#endif
+
+  int usecount; /* references from the application,
+		 * or from topology diff for other servers.
+		 * no compression when > 0
+		 */
+
+  unsigned nr_ports;
+  unsigned nr_ports_allocated;
+  struct netloc_map__port ** ports;
+
+  struct netloc_map__server *prev, *next; /* presumably links in a doubly-linked list of servers -- confirm */
+  struct netloc_map *map;
+
+  char name[0]; /* zero-length trailing array (GNU extension); presumably the name string is stored right after the struct -- confirm */
+};
+
+enum netloc_map_verbose_flags_e {
+  NETLOC_MAP_VERBOSE_FLAG_COMPRESS = (1<<0)
+};
+
+struct netloc_map {
+  unsigned long flags;
+  unsigned long verbose_flags;
+
+  unsigned server_ports_nr; /* needed during build, to create large-enough hash tables */
+
+  char *hwloc_xml_path;
+  struct netloc_dt_lookup_table server_by_name;
+  struct netloc_map__server *server_first, *server_last;
+  unsigned servers_nr;
+
+  char *netloc_data_path;
+  struct netloc_dt_lookup_table subnet_by_id[NETLOC_NETWORK_TYPE_INVALID]; /* enough room for existing types */
+  struct netloc_map__subnet *subnet_first, *subnet_last;
+  unsigned subnets_nr;
+
+  int merged;
+};
+
+struct netloc_map__paths {
+  struct netloc_map *map;
+  unsigned long flags;
+  unsigned nr_paths;
+  struct netloc_map__path {
+    /* FIXME: cache the subnet */
+    unsigned nr_edges;
+    struct netloc_map_edge_s *edges;
+  } * paths;
+};
+
+#endif /* _PRIVATE_NETLOC_MAP_H_ */
diff --git a/ext/hwloc/include/private/misc.h b/ext/hwloc/include/private/misc.h
new file mode 100644
index 0000000..d0e6a46
--- /dev/null
+++ b/ext/hwloc/include/private/misc.h
@@ -0,0 +1,382 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* Misc macros and inlines.  */
+
+#ifndef HWLOC_PRIVATE_MISC_H
+#define HWLOC_PRIVATE_MISC_H
+
+#include <hwloc/autogen/config.h>
+#include <private/autogen/config.h>
+
+#ifdef HWLOC_HAVE_DECL_STRNCASECMP
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#else
+#ifdef HAVE_CTYPE_H
+#include <ctype.h>
+#endif
+#endif
+
+/* Compile-time assertion */
+#define HWLOC_BUILD_ASSERT(condition) ((void)sizeof(char[1 - 2*!(condition)]))
+
+#define HWLOC_BITS_PER_LONG (HWLOC_SIZEOF_UNSIGNED_LONG * 8)
+#define HWLOC_BITS_PER_INT (HWLOC_SIZEOF_UNSIGNED_INT * 8)
+
+#if (HWLOC_BITS_PER_LONG != 32) && (HWLOC_BITS_PER_LONG != 64)
+#error "unknown size for unsigned long."
+#endif
+
+#if (HWLOC_BITS_PER_INT != 16) && (HWLOC_BITS_PER_INT != 32) && (HWLOC_BITS_PER_INT != 64)
+#error "unknown size for unsigned int."
+#endif
+
+
+/**
+ * ffsl helpers.
+ */
+
+#if defined(HWLOC_HAVE_BROKEN_FFS)
+
+/* System has a broken ffs().
+ * We must check this before __GNUC__ or HWLOC_HAVE_FFSL
+ */
+#    define HWLOC_NO_FFS
+
+#elif defined(__GNUC__)
+
+#  if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))
+     /* Starting from 3.4, gcc has a long variant.  */
+#    define hwloc_ffsl(x) __builtin_ffsl(x)
+#  else
+#    define hwloc_ffs(x) __builtin_ffs(x)
+#    define HWLOC_NEED_FFSL
+#  endif
+
+#elif defined(HWLOC_HAVE_FFSL)
+
+#  ifndef HWLOC_HAVE_DECL_FFSL
+extern int ffsl(long) __hwloc_attribute_const;
+#  endif
+
+#  define hwloc_ffsl(x) ffsl(x)
+
+#elif defined(HWLOC_HAVE_FFS)
+
+#  ifndef HWLOC_HAVE_DECL_FFS
+extern int ffs(int) __hwloc_attribute_const;
+#  endif
+
+#  define hwloc_ffs(x) ffs(x)
+#  define HWLOC_NEED_FFSL
+
+#else /* no ffs implementation */
+
+#    define HWLOC_NO_FFS
+
+#endif
+
+#ifdef HWLOC_NO_FFS
+
+/* no ffs or it is known to be broken */
+static __hwloc_inline int
+hwloc_ffsl_manual(unsigned long x) __hwloc_attribute_const;
+/* Portable replacement for ffsl(), used when no working ffs() exists.
+ * Returns the 1-based position of the least significant set bit of x,
+ * or 0 when x is zero.
+ */
+static __hwloc_inline int
+hwloc_ffsl_manual(unsigned long x)
+{
+	/* Widths of the low chunks probed in turn, widest first. */
+	const unsigned widths[] = { 16, 8, 4, 2, 1 };
+	unsigned step;
+	int pos = 1;
+
+	/* No bit set at all: 0 is the "not found" answer. */
+	if (x == 0)
+		return 0;
+
+#if HWLOC_BITS_PER_LONG >= 64
+	/* Shifting by 32 is only meaningful when long has 64 bits. */
+	if ((x & 0xfffffffful) == 0) {
+		x >>= 32;
+		pos += 32;
+	}
+#endif
+	/* Binary search: whenever the low chunk is empty, discard it and
+	 * add its width to the answer. */
+	for (step = 0; step < sizeof(widths) / sizeof(widths[0]); step++) {
+		const unsigned width = widths[step];
+		const unsigned long low_mask = (1ul << width) - 1;
+
+		if ((x & low_mask) == 0) {
+			x >>= width;
+			pos += (int) width;
+		}
+	}
+
+	/* x now has its lowest bit set; pos is that bit's original position. */
+	return pos;
+}
+/* always define hwloc_ffsl as a macro, to avoid renaming breakage */
+#define hwloc_ffsl hwloc_ffsl_manual
+
+#elif defined(HWLOC_NEED_FFSL)
+
+/* We only have an int ffs(int) implementation, build a long one.  */
+
+/* First make it 32 bits if it was only 16.  */
+static __hwloc_inline int
+hwloc_ffs32(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_ffs32(unsigned long x)
+{
+#if HWLOC_BITS_PER_INT == 16
+	/* int only covers 16 bits: scan the low half first, then the high one. */
+	int found = hwloc_ffs(x & 0xfffful);
+	if (!found) {
+		/* Nothing in the low half; a hit in the high half is offset by 16. */
+		found = hwloc_ffs(x >> 16);
+		if (found)
+			found += 16;
+	}
+	/* Still 0 here when neither half had a bit set. */
+	return found;
+#else
+	/* int is wider than 16 bits: hwloc_ffs() is applied to the value directly. */
+	return hwloc_ffs(x);
+#endif
+}
+
+/* Then make it 64 bit if longs are.  */
+static __hwloc_inline int
+hwloc_ffsl_from_ffs32(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_ffsl_from_ffs32(unsigned long x)
+{
+#if HWLOC_BITS_PER_LONG == 64
+	/* long has 64 bits: scan the low 32 bits first, then the high 32. */
+	int found = hwloc_ffs32(x & 0xfffffffful);
+	if (!found) {
+		/* Nothing in the low half; a hit in the high half is offset by 32. */
+		found = hwloc_ffs32(x >> 32);
+		if (found)
+			found += 32;
+	}
+	/* Still 0 here when neither half had a bit set. */
+	return found;
+#else
+	/* long is not 64 bits wide: the 32-bit helper covers the whole value. */
+	return hwloc_ffs32(x);
+#endif
+}
+/* always define hwloc_ffsl as a macro, to avoid renaming breakage */
+#define hwloc_ffsl hwloc_ffsl_from_ffs32
+
+#endif
+
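+/**
+ * flsl helpers.  NOTE(review): the guard below tests __GNUC_____ (five trailing underscores), not __GNUC__, so the builtin branch is presumably never selected -- confirm whether this is intentional.
+ */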
+#ifdef __GNUC_____
+
+#  if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))
+#    define hwloc_flsl(x) (x ? 8*sizeof(long) - __builtin_clzl(x) : 0)
+#  else
+#    define hwloc_fls(x) (x ? 8*sizeof(int) - __builtin_clz(x) : 0)
+#    define HWLOC_NEED_FLSL
+#  endif
+
+#elif defined(HWLOC_HAVE_FLSL)
+
+#  ifndef HWLOC_HAVE_DECL_FLSL
+extern int flsl(long) __hwloc_attribute_const;
+#  endif
+
+#  define hwloc_flsl(x) flsl(x)
+
+#elif defined(HWLOC_HAVE_CLZL)
+
+#  ifndef HWLOC_HAVE_DECL_CLZL
+extern int clzl(long) __hwloc_attribute_const;
+#  endif
+
+#  define hwloc_flsl(x) (x ? 8*sizeof(long) - clzl(x) : 0)
+
+#elif defined(HWLOC_HAVE_FLS)
+
+#  ifndef HWLOC_HAVE_DECL_FLS
+extern int fls(int) __hwloc_attribute_const;
+#  endif
+
+#  define hwloc_fls(x) fls(x)
+#  define HWLOC_NEED_FLSL
+
+#elif defined(HWLOC_HAVE_CLZ)
+
+#  ifndef HWLOC_HAVE_DECL_CLZ
+extern int clz(int) __hwloc_attribute_const;
+#  endif
+
+#  define hwloc_fls(x) (x ? 8*sizeof(int) - clz(x) : 0)
+#  define HWLOC_NEED_FLSL
+
+#else /* no fls implementation */
+
+static __hwloc_inline int
+hwloc_flsl_manual(unsigned long x) __hwloc_attribute_const;
+/* Portable replacement for flsl(), used when no fls/clz primitive exists.
+ * Returns the 1-based position of the most significant set bit of x,
+ * or 0 when x is zero.
+ */
+static __hwloc_inline int
+hwloc_flsl_manual(unsigned long x)
+{
+	/* Widths of the high chunks probed in turn, widest first. */
+	const unsigned widths[] = { 16, 8, 4, 2, 1 };
+	unsigned step;
+	int pos = 1;
+
+	/* No bit set at all: 0 is the "not found" answer. */
+	if (x == 0)
+		return 0;
+
+#if HWLOC_BITS_PER_LONG >= 64
+	/* The upper 32 bits only exist when long has 64 bits. */
+	if ((x & 0xffffffff00000000ul) != 0) {
+		x >>= 32;
+		pos += 32;
+	}
+#endif
+	/* Binary search: whenever the high chunk holds a bit, shift it down
+	 * and add its width to the answer. */
+	for (step = 0; step < sizeof(widths) / sizeof(widths[0]); step++) {
+		const unsigned width = widths[step];
+		const unsigned long high_mask = ((1ul << width) - 1) << width;
+
+		if ((x & high_mask) != 0) {
+			x >>= width;
+			pos += (int) width;
+		}
+	}
+
+	/* x has been reduced to its top bit; pos is that bit's original position. */
+	return pos;
+}
+/* always define hwloc_flsl as a macro, to avoid renaming breakage */
+#define hwloc_flsl hwloc_flsl_manual
+
+#endif
+
+#ifdef HWLOC_NEED_FLSL
+
+/* We only have an int fls(int) implementation, build a long one.  */
+
+/* First make it 32 bits if it was only 16.  */
+static __hwloc_inline int
+hwloc_fls32(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_fls32(unsigned long x)
+{
+#if HWLOC_BITS_PER_INT == 16
+	/* int only covers 16 bits: the high half takes precedence over the low one. */
+	int found = hwloc_fls(x >> 16);
+	if (found) {
+		/* A hit in the high half is offset by 16. */
+		found += 16;
+	} else {
+		found = hwloc_fls(x & 0xfffful);
+	}
+	/* Still 0 here when neither half had a bit set. */
+	return found;
+#else
+	/* int is wider than 16 bits: hwloc_fls() is applied to the value directly. */
+	return hwloc_fls(x);
+#endif
+}
+
+/* Then make it 64 bit if longs are.  */
+static __hwloc_inline int
+hwloc_flsl_from_fls32(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_flsl_from_fls32(unsigned long x)
+{
+#if HWLOC_BITS_PER_LONG == 64
+	/* long has 64 bits: the high 32 bits take precedence over the low 32. */
+	int found = hwloc_fls32(x >> 32);
+	if (found) {
+		/* A hit in the high half is offset by 32. */
+		found += 32;
+	} else {
+		found = hwloc_fls32(x & 0xfffffffful);
+	}
+	/* Still 0 here when neither half had a bit set. */
+	return found;
+#else
+	/* long is not 64 bits wide: the 32-bit helper covers the whole value. */
+	return hwloc_fls32(x);
+#endif
+}
+/* always define hwloc_flsl as a macro, to avoid renaming breakage */
+#define hwloc_flsl hwloc_flsl_from_fls32
+
+#endif
+
+static __hwloc_inline int
+hwloc_weight_long(unsigned long w) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_weight_long(unsigned long w)
+{ /* Returns the number of bits set in w (population count). */
+#if HWLOC_BITS_PER_LONG == 32
+#if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__) >= 4)
+	return __builtin_popcount(w);
+#else
+	unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555); /* per-2-bit counts */
+	res = (res & 0x33333333) + ((res >> 2) & 0x33333333); /* per-4-bit counts */
+	res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F); /* per-byte counts */
+	res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF); /* per-16-bit counts */
+	return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF); /* total over 32 bits */
+#endif
+#else /* HWLOC_BITS_PER_LONG != 32 */
+#if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__) >= 4)
+	return __builtin_popcountll(w);
+#else
+	unsigned long res;
+	res = (w & 0x5555555555555555ul) + ((w >> 1) & 0x5555555555555555ul); /* per-2-bit counts */
+	res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul); /* per-4-bit counts */
+	res = (res & 0x0F0F0F0F0F0F0F0Ful) + ((res >> 4) & 0x0F0F0F0F0F0F0F0Ful); /* per-byte counts */
+	res = (res & 0x00FF00FF00FF00FFul) + ((res >> 8) & 0x00FF00FF00FF00FFul); /* per-16-bit counts */
+	res = (res & 0x0000FFFF0000FFFFul) + ((res >> 16) & 0x0000FFFF0000FFFFul); /* per-32-bit counts */
+	return (res & 0x00000000FFFFFFFFul) + ((res >> 32) & 0x00000000FFFFFFFFul); /* total over 64 bits */
+#endif
+#endif /* HWLOC_BITS_PER_LONG == 32 */
+}
+
+#if !HAVE_DECL_STRTOULL
+unsigned long long int strtoull(const char *nptr, char **endptr, int base);
+#endif
+
+static __hwloc_inline int hwloc_strncasecmp(const char *s1, const char *s2, size_t n)
+{ /* Case-insensitive comparison of at most n characters of s1 and s2; returns 0 when they match. */
+#ifdef HWLOC_HAVE_DECL_STRNCASECMP
+  return strncasecmp(s1, s2, n);
+#else
+  /* tolower() needs an unsigned char value (or EOF): passing a negative plain char is undefined behavior. */
+  for (; n; n--, s1++, s2++) {
+    char c1 = tolower((unsigned char)*s1), c2 = tolower((unsigned char)*s2);
+    if (!c1 || !c2 || c1 != c2)
+      return c1-c2;
+  }
+  return 0;
+#endif
+}
+
+#endif /* HWLOC_PRIVATE_MISC_H */
diff --git a/ext/hwloc/include/private/private.h b/ext/hwloc/include/private/private.h
new file mode 100644
index 0000000..fa344ac
--- /dev/null
+++ b/ext/hwloc/include/private/private.h
@@ -0,0 +1,335 @@
+/*
+ * Copyright © 2009      CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ *
+ * See COPYING in top-level directory.
+ */
+
+/* Internal types and helpers. */
+
+
+#ifdef HWLOC_INSIDE_PLUGIN
+/*
+ * these declarations are internal only, they are not available to plugins
+ * (many functions below are internal static symbols).
+ */
+#error This file should not be used in plugins
+#endif
+
+
+#ifndef HWLOC_PRIVATE_H
+#define HWLOC_PRIVATE_H
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <hwloc/bitmap.h>
+#include <private/components.h>
+#include <private/debug.h>
+#include <sys/types.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+#ifdef HAVE_SYS_UTSNAME_H
+#include <sys/utsname.h>
+#endif
+#include <string.h>
+
+enum hwloc_ignore_type_e {
+  HWLOC_IGNORE_TYPE_NEVER = 0,
+  HWLOC_IGNORE_TYPE_KEEP_STRUCTURE,
+  HWLOC_IGNORE_TYPE_ALWAYS
+};
+
+#define HWLOC_DEPTH_MAX 128
+
+struct hwloc_topology {
+  unsigned nb_levels;					/* Number of horizontal levels */
+  unsigned next_group_depth;				/* Depth of the next Group object that we may create */
+  unsigned level_nbobjects[HWLOC_DEPTH_MAX]; 		/* Number of objects on each horizontal level */
+  struct hwloc_obj **levels[HWLOC_DEPTH_MAX];		/* Direct access to levels, levels[l = 0 .. nblevels-1][0..level_nbobjects[l]] */
+  unsigned long flags;
+  int type_depth[HWLOC_OBJ_TYPE_MAX];
+  enum hwloc_ignore_type_e ignored_types[HWLOC_OBJ_TYPE_MAX];
+  int is_thissystem;
+  int is_loaded;
+  int modified;                                         /* >0 if objects were added/removed recently, which means a reconnect is needed */
+  hwloc_pid_t pid;                                      /* Process ID the topology is view from, 0 for self */
+  void *userdata;
+
+  unsigned bridge_nbobjects;
+  struct hwloc_obj **bridge_level;
+  struct hwloc_obj *first_bridge, *last_bridge;
+  unsigned pcidev_nbobjects;
+  struct hwloc_obj **pcidev_level;
+  struct hwloc_obj *first_pcidev, *last_pcidev;
+  unsigned osdev_nbobjects;
+  struct hwloc_obj **osdev_level;
+  struct hwloc_obj *first_osdev, *last_osdev;
+  unsigned misc_nbobjects;
+  struct hwloc_obj **misc_level;
+  struct hwloc_obj *first_misc, *last_misc;
+
+  struct hwloc_binding_hooks {
+    int (*set_thisproc_cpubind)(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags);
+    int (*get_thisproc_cpubind)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+    int (*set_thisthread_cpubind)(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags);
+    int (*get_thisthread_cpubind)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+    int (*set_proc_cpubind)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t set, int flags);
+    int (*get_proc_cpubind)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
+#ifdef hwloc_thread_t
+    int (*set_thread_cpubind)(hwloc_topology_t topology, hwloc_thread_t tid, hwloc_const_cpuset_t set, int flags);
+    int (*get_thread_cpubind)(hwloc_topology_t topology, hwloc_thread_t tid, hwloc_cpuset_t set, int flags);
+#endif
+
+    int (*get_thisproc_last_cpu_location)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+    int (*get_thisthread_last_cpu_location)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+    int (*get_proc_last_cpu_location)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
+
+    int (*set_thisproc_membind)(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+    int (*get_thisproc_membind)(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+    int (*set_thisthread_membind)(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+    int (*get_thisthread_membind)(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+    int (*set_proc_membind)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+    int (*get_proc_membind)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+    int (*set_area_membind)(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+    int (*get_area_membind)(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+    /* This has to return the same kind of pointer as alloc_membind, so that free_membind can be used on it */
+    void *(*alloc)(hwloc_topology_t topology, size_t len);
+    /* alloc_membind has to always succeed if !(flags & HWLOC_MEMBIND_STRICT).
+     * see hwloc_alloc_or_fail which is convenient for that.  */
+    void *(*alloc_membind)(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+    int (*free_membind)(hwloc_topology_t topology, void *addr, size_t len);
+  } binding_hooks;
+
+  struct hwloc_topology_support support;
+
+  void (*userdata_export_cb)(void *reserved, struct hwloc_topology *topology, struct hwloc_obj *obj);
+  void (*userdata_import_cb)(struct hwloc_topology *topology, struct hwloc_obj *obj, const char *name, const void *buffer, size_t length);
+
+  struct hwloc_os_distances_s {
+    hwloc_obj_type_t type;
+    int nbobjs;
+    unsigned *indexes; /* array of OS indexes before we can convert them into objs. always available.
+			*/
+    struct hwloc_obj **objs; /* array of objects, in the same order as above.
+			      * either given (by a backend) together with the indexes array above.
+			      * or build from the above indexes array when not given (by the user).
+			      */
+    float *distances; /* distance matrices, ordered according to the above indexes/objs array.
+		       * distance from i to j is stored in slot i*nbnodes+j.
+		       * will be copied into the main logical-index-ordered distance at the end of the discovery.
+		       */
+    int forced; /* set if the user forced a matrix to ignore the OS one */
+
+    struct hwloc_os_distances_s *prev, *next;
+  } *first_osdist, *last_osdist;
+
+  /* list of enabled backends. */
+  struct hwloc_backend * backends;
+};
+
+extern void hwloc_alloc_obj_cpusets(hwloc_obj_t obj);
+extern void hwloc_setup_pu_level(struct hwloc_topology *topology, unsigned nb_pus);
+extern int hwloc_get_sysctlbyname(const char *name, int64_t *n);
+extern int hwloc_get_sysctl(int name[], unsigned namelen, int *n);
+extern unsigned hwloc_fallback_nbprocessors(struct hwloc_topology *topology);
+extern void hwloc_connect_children(hwloc_obj_t obj);
+extern int hwloc_connect_levels(hwloc_topology_t topology);
+
+extern int hwloc__object_cpusets_compare_first(hwloc_obj_t obj1, hwloc_obj_t obj2);
+extern void hwloc__reorder_children(hwloc_obj_t parent);
+
+extern void hwloc_topology_setup_defaults(struct hwloc_topology *topology);
+extern void hwloc_topology_clear(struct hwloc_topology *topology);
+
+extern void hwloc__add_info(struct hwloc_obj_info_s **infosp, unsigned *countp, const char *name, const char *value);
+extern char ** hwloc__find_info_slot(struct hwloc_obj_info_s **infosp, unsigned *countp, const char *name);
+extern void hwloc__move_infos(struct hwloc_obj_info_s **dst_infosp, unsigned *dst_countp, struct hwloc_obj_info_s **src_infosp, unsigned *src_countp);
+extern void hwloc__free_infos(struct hwloc_obj_info_s *infos, unsigned count);
+
+/* set native OS binding hooks */
+extern void hwloc_set_native_binding_hooks(struct hwloc_binding_hooks *hooks, struct hwloc_topology_support *support);
+/* set either native OS binding hooks (if thissystem), or dummy ones */
+extern void hwloc_set_binding_hooks(struct hwloc_topology *topology);
+
+#if defined(HWLOC_LINUX_SYS)
+extern void hwloc_set_linuxfs_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_LINUX_SYS */
+
+#if defined(HWLOC_BGQ_SYS)
+extern void hwloc_set_bgq_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_BGQ_SYS */
+
+#ifdef HWLOC_SOLARIS_SYS
+extern void hwloc_set_solaris_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_SOLARIS_SYS */
+
+#ifdef HWLOC_AIX_SYS
+extern void hwloc_set_aix_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_AIX_SYS */
+
+#ifdef HWLOC_OSF_SYS
+extern void hwloc_set_osf_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_OSF_SYS */
+
+#ifdef HWLOC_WIN_SYS
+extern void hwloc_set_windows_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_WIN_SYS */
+
+#ifdef HWLOC_DARWIN_SYS
+extern void hwloc_set_darwin_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_DARWIN_SYS */
+
+#ifdef HWLOC_FREEBSD_SYS
+extern void hwloc_set_freebsd_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_FREEBSD_SYS */
+
+#ifdef HWLOC_NETBSD_SYS
+extern void hwloc_set_netbsd_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_NETBSD_SYS */
+
+#ifdef HWLOC_HPUX_SYS
+extern void hwloc_set_hpux_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_HPUX_SYS */
+
+/* Insert uname-specific names/values in the object infos array.
+ * If cached_uname isn't NULL, it is used as a struct utsname instead of recalling uname.
+ * Any field that starts with \0 is ignored.
+ */
+extern void hwloc_add_uname_info(struct hwloc_topology *topology, void *cached_uname);
+
+/* Free obj and its attributes assuming it doesn't have any children/parent anymore */
+extern void hwloc_free_unlinked_object(hwloc_obj_t obj);
+
+/* Duplicate src and its children under newparent in newtopology */
+extern void hwloc__duplicate_objects(struct hwloc_topology *newtopology, struct hwloc_obj *newparent, struct hwloc_obj *src);
+
+/* This can be used for the alloc field to get allocated data that can be freed by free() */
+void *hwloc_alloc_heap(hwloc_topology_t topology, size_t len);
+
+/* This can be used for the alloc field to get allocated data that can be freed by munmap() */
+void *hwloc_alloc_mmap(hwloc_topology_t topology, size_t len);
+
+/* This can be used for the free_membind field to free data using free() */
+int hwloc_free_heap(hwloc_topology_t topology, void *addr, size_t len);
+
+/* This can be used for the free_membind field to free data using munmap() */
+int hwloc_free_mmap(hwloc_topology_t topology, void *addr, size_t len);
+
+/* Returns NULL when HWLOC_MEMBIND_STRICT is set in flags; otherwise returns
+ * unbound memory obtained from hwloc_alloc(). */
+static __hwloc_inline void *
+hwloc_alloc_or_fail(hwloc_topology_t topology, size_t len, int flags)
+{
+  const int strict = flags & HWLOC_MEMBIND_STRICT;
+
+  return strict ? NULL : hwloc_alloc(topology, len);
+}
+
+extern void hwloc_distances_init(struct hwloc_topology *topology);
+extern void hwloc_distances_destroy(struct hwloc_topology *topology);
+extern void hwloc_distances_set(struct hwloc_topology *topology, hwloc_obj_type_t type, unsigned nbobjs, unsigned *indexes, hwloc_obj_t *objs, float *distances, int force);
+extern void hwloc_distances_set_from_env(struct hwloc_topology *topology);
+extern void hwloc_distances_restrict_os(struct hwloc_topology *topology);
+extern void hwloc_distances_restrict(struct hwloc_topology *topology, unsigned long flags);
+extern void hwloc_distances_finalize_os(struct hwloc_topology *topology);
+extern void hwloc_distances_finalize_logical(struct hwloc_topology *topology);
+extern void hwloc_clear_object_distances(struct hwloc_obj *obj);
+extern void hwloc_clear_object_distances_one(struct hwloc_distances_s *distances);
+extern void hwloc_group_by_distances(struct hwloc_topology *topology);
+
+#ifdef HAVE_USELOCALE
+#include "locale.h"
+#ifdef HAVE_XLOCALE_H
+#include "xlocale.h"
+#endif
+#define hwloc_localeswitch_declare locale_t __old_locale = (locale_t)0, __new_locale
+#define hwloc_localeswitch_init() do {                     \
+  __new_locale = newlocale(LC_ALL_MASK, "C", (locale_t)0); \
+  if (__new_locale != (locale_t)0)                         \
+    __old_locale = uselocale(__new_locale);                \
+} while (0)
+#define hwloc_localeswitch_fini() do { \
+  if (__new_locale != (locale_t)0) {   \
+    uselocale(__old_locale);           \
+    freelocale(__new_locale);          \
+  }                                    \
+} while(0)
+#else /* HAVE_USELOCALE */
+#define hwloc_localeswitch_declare int __dummy_nolocale __hwloc_attribute_unused
+#define hwloc_localeswitch_init()
+#define hwloc_localeswitch_fini()
+#endif /* HAVE_USELOCALE */
+
+#if !HAVE_DECL_FABSF
+#define fabsf(f) fabs((double)(f))
+#endif
+
+#if HAVE_DECL__SC_PAGE_SIZE
+#define hwloc_getpagesize() sysconf(_SC_PAGE_SIZE)
+#elif HAVE_DECL__SC_PAGESIZE
+#define hwloc_getpagesize() sysconf(_SC_PAGESIZE)
+#elif defined HAVE_GETPAGESIZE
+#define hwloc_getpagesize() getpagesize()
+#else
+#undef hwloc_getpagesize
+#endif
+
+/* encode src buffer into target buffer.
+ * targsize must be at least 4*((srclength+2)/3)+1.
+ * target will be 0-terminated.
+ */
+extern int hwloc_encode_to_base64(const char *src, size_t srclength, char *target, size_t targsize);
+/* decode src buffer into target buffer.
+ * src is 0-terminated.
+ * targsize must be at least srclength*3/4+1 (srclength not including \0)
+ * but only srclength*3/4 characters will be meaningful
+ * (the next one may be partially written during decoding, but it should be ignored).
+ */
+extern int hwloc_decode_from_base64(char const *src, char *target, size_t targsize);
+
+/* Check whether needle matches the beginning of haystack, at least n, and up
+ * to a colon or \0 */
+extern int hwloc_namecoloncmp(const char *haystack, const char *needle, size_t n);
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_FORMAT
+# if HWLOC_HAVE_ATTRIBUTE_FORMAT
+#  define __hwloc_attribute_format(type, str, arg)  __attribute__((__format__(type, str, arg)))
+# else
+#  define __hwloc_attribute_format(type, str, arg)
+# endif
+#else
+# define __hwloc_attribute_format(type, str, arg)
+#endif
+
+#define hwloc_memory_size_printf_value(_size, _verbose) \
+  ((_size) < (10ULL<<20) || _verbose ? (((_size)>>9)+1)>>1 : (_size) < (10ULL<<30) ? (((_size)>>19)+1)>>1 : (_size) < (10ULL<<40) ? (((_size)>>29)+1)>>1 : (((_size)>>39)+1)>>1)
+#define hwloc_memory_size_printf_unit(_size, _verbose) \
+  ((_size) < (10ULL<<20) || _verbose ? "KB" : (_size) < (10ULL<<30) ? "MB" : (_size) < (10ULL<<40) ? "GB" : "TB")
+
+/* On some systems, snprintf returns the size of written data, not the actually
+ * required size.  hwloc_snprintf always report the actually required size. */
+extern int hwloc_snprintf(char *str, size_t size, const char *format, ...) __hwloc_attribute_format(printf, 3, 4);
+
+extern void hwloc_obj_add_info_nodup(hwloc_obj_t obj, const char *name, const char *value, int nodup);
+
+/* Return the name of the currently running program, if supported.
+ * If not NULL, must be freed by the caller.
+ */
+extern char * hwloc_progname(struct hwloc_topology *topology);
+
+#define HWLOC_BITMAP_EQUAL 0       /* Bitmaps are equal */
+#define HWLOC_BITMAP_INCLUDED 1    /* First bitmap included in second */
+#define HWLOC_BITMAP_CONTAINS 2    /* First bitmap contains second */
+#define HWLOC_BITMAP_INTERSECTS 3  /* Bitmaps intersect without any inclusion */
+#define HWLOC_BITMAP_DIFFERENT  4  /* Bitmaps do not intersect */
+
+/** \brief Compare bitmaps \p bitmap1 and \p bitmap2 from an inclusion point of view.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_compare_inclusion(hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+#endif /* HWLOC_PRIVATE_H */
diff --git a/ext/hwloc/include/private/solaris-chiptype.h b/ext/hwloc/include/private/solaris-chiptype.h
new file mode 100644
index 0000000..4af80d8
--- /dev/null
+++ b/ext/hwloc/include/private/solaris-chiptype.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright © 2009-2010 Oracle and/or its affiliates.  All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+
+#ifdef HWLOC_INSIDE_PLUGIN
+/*
+ * these declarations are internal only, they are not available to plugins
+ * (functions below are internal static symbols).
+ */
+#error This file should not be used in plugins
+#endif
+
+
+#ifndef HWLOC_PRIVATE_SOLARIS_CHIPTYPE_H
+#define HWLOC_PRIVATE_SOLARIS_CHIPTYPE_H
+
+/* SPARC Chip Modes. */
+#define MODE_UNKNOWN            0
+#define MODE_SPITFIRE           1
+#define MODE_BLACKBIRD          2
+#define MODE_CHEETAH            3
+#define MODE_SPARC64_VI         4
+#define MODE_T1                 5
+#define MODE_T2                 6
+#define MODE_SPARC64_VII        7
+#define MODE_ROCK               8
+
+/* SPARC Chip Implementations. */
+#define IMPL_SPARC64_VI         0x6
+#define IMPL_SPARC64_VII        0x7
+#define IMPL_SPITFIRE           0x10
+#define IMPL_BLACKBIRD          0x11
+#define IMPL_SABRE              0x12
+#define IMPL_HUMMINGBIRD        0x13
+#define IMPL_CHEETAH            0x14
+#define IMPL_CHEETAHPLUS        0x15
+#define IMPL_JALAPENO           0x16
+#define IMPL_JAGUAR             0x18
+#define IMPL_PANTHER            0x19
+#define IMPL_NIAGARA            0x23
+#define IMPL_NIAGARA_2          0x24
+#define IMPL_ROCK               0x25
+
+/* Default Mfg, Cache, Speed settings */
+#define TI_MANUFACTURER         0x17
+#define TWO_MEG_CACHE           2097152
+#define SPITFIRE_SPEED          142943750
+
+char* hwloc_solaris_get_chip_type(void);
+char* hwloc_solaris_get_chip_model(void);
+
+#endif /* HWLOC_PRIVATE_SOLARIS_CHIPTYPE_H */
diff --git a/ext/hwloc/include/private/xml.h b/ext/hwloc/include/private/xml.h
new file mode 100644
index 0000000..75c6c43
--- /dev/null
+++ b/ext/hwloc/include/private/xml.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright © 2009-2014 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#ifndef PRIVATE_XML_H
+#define PRIVATE_XML_H 1
+
+#include <hwloc.h>
+
+#include <sys/types.h>
+
+HWLOC_DECLSPEC int hwloc__xml_verbose(void);
+
+/**************
+ * XML import *
+ **************/
+
+typedef struct hwloc__xml_import_state_s {
+  struct hwloc__xml_import_state_s *parent;
+
+  /* globals shared by the entire stack of states during import */
+  struct hwloc_xml_backend_data_s *global;
+
+  /* opaque data used to store backend-specific data.
+   * statically allocated to allow stack-allocation by the common code without knowing actual backend needs.
+   */
+  char data[32];
+} * hwloc__xml_import_state_t;
+
+HWLOC_DECLSPEC int hwloc__xml_import_diff(hwloc__xml_import_state_t state, hwloc_topology_diff_t *firstdiffp);
+
+struct hwloc_xml_backend_data_s {
+  /* xml backend parameters */
+  int (*look_init)(struct hwloc_xml_backend_data_s *bdata, struct hwloc__xml_import_state_s *state);
+  void (*look_failed)(struct hwloc_xml_backend_data_s *bdata);
+  void (*backend_exit)(struct hwloc_xml_backend_data_s *bdata);
+  int (*next_attr)(struct hwloc__xml_import_state_s * state, char **namep, char **valuep);
+  int (*find_child)(struct hwloc__xml_import_state_s * state, struct hwloc__xml_import_state_s * childstate, char **tagp);
+  int (*close_tag)(struct hwloc__xml_import_state_s * state); /* look for an explicit closing tag </name> */
+  void (*close_child)(struct hwloc__xml_import_state_s * state);
+  int (*get_content)(struct hwloc__xml_import_state_s * state, char **beginp, size_t expected_length);
+  void (*close_content)(struct hwloc__xml_import_state_s * state);
+  char * msgprefix;
+  void *data; /* libxml2 doc, or nolibxml buffer */
+  int nbnumanodes;
+  struct hwloc_xml_imported_distances_s {
+    hwloc_obj_t root;
+    struct hwloc_distances_s distances;
+    struct hwloc_xml_imported_distances_s *prev, *next;
+  } *first_distances, *last_distances;
+};
+
+/**************
+ * XML export *
+ **************/
+
+typedef struct hwloc__xml_export_state_s {
+  struct hwloc__xml_export_state_s *parent;
+
+  void (*new_child)(struct hwloc__xml_export_state_s *parentstate, struct hwloc__xml_export_state_s *state, const char *name);
+  void (*new_prop)(struct hwloc__xml_export_state_s *state, const char *name, const char *value);
+  void (*add_content)(struct hwloc__xml_export_state_s *state, const char *buffer, size_t length);
+  void (*end_object)(struct hwloc__xml_export_state_s *state, const char *name);
+
+  /* opaque data used to store backend-specific data.
+   * statically allocated to allow stack-allocation by the common code without knowing actual backend needs.
+   */
+  char data[40];
+} * hwloc__xml_export_state_t;
+
+HWLOC_DECLSPEC void hwloc__xml_export_object (hwloc__xml_export_state_t state, struct hwloc_topology *topology, struct hwloc_obj *obj);
+
+HWLOC_DECLSPEC void hwloc__xml_export_diff(hwloc__xml_export_state_t parentstate, hwloc_topology_diff_t diff);
+
+/******************
+ * XML components *
+ ******************/
+
+struct hwloc_xml_callbacks {
+  int (*backend_init)(struct hwloc_xml_backend_data_s *bdata, const char *xmlpath, const char *xmlbuffer, int xmlbuflen);
+  int (*export_file)(struct hwloc_topology *topology, const char *filename);
+  int (*export_buffer)(struct hwloc_topology *topology, char **xmlbuffer, int *buflen);
+  void (*free_buffer)(void *xmlbuffer);
+  int (*import_diff)(struct hwloc__xml_import_state_s *state, const char *xmlpath, const char *xmlbuffer, int xmlbuflen, hwloc_topology_diff_t *diff, char **refnamep);
+  int (*export_diff_file)(union hwloc_topology_diff_u *diff, const char *refname, const char *filename);
+  int (*export_diff_buffer)(union hwloc_topology_diff_u *diff, const char *refname, char **xmlbuffer, int *buflen);
+};
+
+struct hwloc_xml_component {
+  struct hwloc_xml_callbacks *nolibxml_callbacks;
+  struct hwloc_xml_callbacks *libxml_callbacks;
+};
+
+HWLOC_DECLSPEC void hwloc_xml_callbacks_register(struct hwloc_xml_component *component);
+HWLOC_DECLSPEC void hwloc_xml_callbacks_reset(void);
+
+#endif /* PRIVATE_XML_H */
diff --git a/ext/hwloc/include/static-components.h b/ext/hwloc/include/static-components.h
new file mode 100644
index 0000000..ad23185
--- /dev/null
+++ b/ext/hwloc/include/static-components.h
@@ -0,0 +1,17 @@
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_noos_component;
+//HWLOC_DECLSPEC extern const struct hwloc_component hwloc_xml_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_synthetic_component;
+//HWLOC_DECLSPEC extern const struct hwloc_component hwloc_xml_nolibxml_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_linux_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_linuxpci_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_x86_component;
+static const struct hwloc_component * hwloc_static_components[] = {
+  &hwloc_noos_component,
+//  &hwloc_xml_component,
+  &hwloc_synthetic_component,
+//  &hwloc_xml_nolibxml_component,
+  &hwloc_linux_component,
+  &hwloc_linuxpci_component,
+  &hwloc_x86_component,
+  NULL
+};
diff --git a/filters/csv b/filters/csv
deleted file mode 100755
index 654f204..0000000
--- a/filters/csv
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-use warnings;
-
-my $FILTERTYPE = 'csv';
-
-my $SEP = ',';
-my $NL = "\n";
-
-if ($#ARGV < 1) {
-    die "Filter failed! Please report bug.\n";
-}
-
-my $filename = $ARGV[0];
-my $fileType  = $ARGV[1];
-my $infile = $filename;
-
-open INFILE,"< $filename";
-$filename =~ s/\.tmp/\.$FILTERTYPE/;
-open OUTFILE,"> $filename";
-
-if ($fileType eq 'topology') {
-    my $region = 'topo';
-    print OUTFILE 'THREADS'.$NL;
-
-    while (<INFILE>) {
-
-        if (/Cache Topology/) {
-            $region = 'cache';
-            print OUTFILE 'CACHES'.$NL;
-        } elsif (/NUMA Topology/) {
-            $region = 'numa';
-            print OUTFILE 'NUMA'.$NL;
-        }
-
-        if ($region eq 'topo') {
-            if (/(CPU type):\t(.*)/) {
-                print OUTFILE $1.$SEP.$2.$NL;
-            }
-            elsif (/([A-Za-z ]*):\t([0-9]*)/) {
-                print OUTFILE $1.$SEP.$2.$NL;
-            } elsif (/(HWThread)\t(Thread)\t\t(Core)\t\t(Socket)/) {
-                print OUTFILE $1.$SEP.$2.$SEP.$3.$SEP.$4.$NL;
-            } elsif (/([0-9]*)\t\t([0-9]*)\t\t([0-9]*)\t\t([0-9]*)/) {
-                print OUTFILE $1.$SEP.$2.$SEP.$3.$SEP.$4.$NL;
-            }
-        } elsif ($region eq 'cache') {
-            if (/(Size):\t([0-9]*) ([kMB]*)/) {
-                my $size = $2;
-                if ($3 eq 'MB') {
-                    $size *= 1024;
-                }
-                print OUTFILE $1.'[kB]'.$SEP.$size.$NL;
-            } elsif (/(Cache groups):\t*(.*)/) {
-                my @groups = split('\) \(',$2);
-
-                my $grpId = 0;
-                foreach (@groups) {
-                    /([0-9 ]+)/;
-                    print OUTFILE 'Cache group '.$grpId.$SEP.$1.$NL;
-                    $grpId++;
-                }
-            } elsif (/(.*):\t*(.*)/) {
-                print OUTFILE $1.$SEP.$2.$NL;
-            }
-        } elsif ($region eq 'numa') {
-            if (/Domain ([0-9]*)/) {
-                print OUTFILE 'Domain ID'.$SEP.$1.$NL;
-            } elsif (/Memory: ([0-9.]+) MB free of total ([0-9.]+) MB/) {
-                print OUTFILE 'Free Memory [MB]'.$SEP.$1.$NL;
-                print OUTFILE 'Total Memory [MB]'.$SEP.$2.$NL;
-            } elsif (/(.*):\t*[ ]*(.*)/) {
-                print OUTFILE $1.$SEP.$2.$NL;
-            }
-        }
-    }
-} elsif ($fileType eq 'perfctr') {
-    my $header = 0;
-    while (<INFILE>) {
-        if (/Event[ ]*\|[ ]*(core.*)\|/) {
-            if (not $header) {
-                my @col = split('\|',$1);
-                my $numcol = $#col+1;
-                print OUTFILE 'NumColumns'.$SEP.$numcol.$NL;
-                print OUTFILE 'Event/Metric';
-                foreach (@col) {
-                    s/[ ]//g;
-                    print OUTFILE $SEP.$_;
-                }
-                print OUTFILE $NL;
-                $header = 1;
-            }
-        }elsif (/STAT/) {
-
-        }elsif (/\|[ ]+([A-Z0-9_]+)[ ]+\|[ ]*(.*)\|/) {
-            my @col = split('\|',$2);
-            print OUTFILE $1;
-            foreach (@col) {
-                s/[ ]//g;
-                print OUTFILE $SEP.$_;
-            }
-            print OUTFILE $NL;
-        } 
-    }
-} else {
-    die "Filter failed! Unknown application type $fileType!\n";
-}
-
-unlink($infile);
-close INFILE;
-close OUTFILE;
-
-
diff --git a/filters/xml b/filters/xml
index b72c430..fa24a9d 100755
--- a/filters/xml
+++ b/filters/xml
@@ -15,62 +15,91 @@ my $filename = $ARGV[0];
 my $fileType  = $ARGV[1];
 my $infile = $filename;
 
-open INFILE,"< $filename";
+if (! -e $filename)
+{
+    die "Input file does not exist!\n";
+}
+
+open(INFILE, '<', $filename) or die "Cannot open input file $filename: $!\n";
 $filename =~ s/\.tmp/\.$FILTERTYPE/;
-open OUTFILE,"> $filename";
+open(OUTFILE, '>', $filename) or die "Cannot open output file $filename: $!\n";
 
 
 if ($fileType eq 'topology') {
     my $region = 'topo';
     my $indomain = 0;
     print OUTFILE '<node>'.$NL;
+    print OUTFILE '<info>'.$NL;
 
     while (<INFILE>) {
-
-        if (/Cache Topology/) {
+        if (/STRUCT,Cache Topology L1/) {
             $region = 'cache';
             print OUTFILE '<caches>'.$NL;
-        } elsif (/NUMA Topology/) {
+        } elsif (/STRUCT,NUMA Topology/) {
             print OUTFILE '</caches>'.$NL;
             print OUTFILE '<numa>'.$NL;
             $region = 'numa';
         }
 
         if ($region eq 'topo') {
-            if (/(CPU type):\t([\w ]*)/) {
+            if (/(CPU type):,([\w ]*),/) {
                 print OUTFILE '<cpu>'.$2.'</cpu>'.$NL;
-            } elsif (/CPU clock:\t([\d.]) GHz/) {
+            } elsif (/CPU name:,([^,]+),/) {
+                print OUTFILE '<name>'.$1.'</name>'.$NL;
+            } elsif (/CPU stepping:,(\d+),/) {
+                print OUTFILE '<stepping>'.$1.'</stepping>'.$NL;
+            } elsif (/CPU clock:,([\d.]+) GHz/) {
                 print OUTFILE '<clock>'.$1.'</clock>'.$NL;
-            } elsif (/(Sockets):\t(\d*)/) {
+            } elsif (/(Sockets):,(\d+),/) {
                 print OUTFILE '<socketsPerNode>'.$2.'</socketsPerNode>'.$NL;
-            } elsif (/(Cores per socket):\t(\d*)/) {
+            } elsif (/(Cores per socket):,(\d+),/) {
                 print OUTFILE '<coresPerSocket>'.$2.'</coresPerSocket>'.$NL;
-            } elsif (/(Threads per core):\t(\d*)/) {
+            } elsif (/(Threads per core):,(\d+),/) {
                 print OUTFILE '<threadsPerCore>'.$2.'</threadsPerCore>'.$NL;
-            } elsif (/([0-9]*)\t\t([0-9]*)\t\t([0-9]*)\t\t([0-9]*)/) {
+            } elsif (/HWThread,Thread,Core,Socket,Available/) {
+                print OUTFILE '</info>'.$NL;
+                print OUTFILE '<threads>'.$NL;
+            } elsif (/(\d+),(\d+),(\d+),(\d+),/) {
                 #TODO Build tree for XML output from table!
+                print OUTFILE '<thread>'.$NL;
+                print OUTFILE '<id>'.$1.'</id>'.$NL;
+                print OUTFILE '<threadid>'.$2.'</threadid>'.$NL;
+                print OUTFILE '<coreid>'.$3.'</coreid>'.$NL;
+                print OUTFILE '<socketid>'.$4.'</socketid>'.$NL;
+                print OUTFILE '</thread>'.$NL;
+            } elsif (/STRUCT,Sockets,/) {
+                print OUTFILE '</threads>'.$NL;
+                $region = 'cache';
             }
         } elsif ($region eq 'cache') {
-            if (/(Size):\t([0-9]*) ([kMB]*)/) {
+            if (/(Size):,(\d+) ([kMB]*)/) {
                 my $size = $2;
                 if ($3 eq 'MB') {
                     $size *= 1024;
                 }
                 print OUTFILE '<size>'.$size.'</size>'.$NL;
-            } elsif (/(Cache groups):\t*(.*)/) {
+            } elsif (/(Cache groups):,([\d ]+),/) {
                 print OUTFILE '</cache>'.$NL;
-            } elsif (/(Associativity):\t*(.*)/) {
+            } elsif (/Type:,(\w+) cache,/) {
+                print OUTFILE '<type>'.lc $1.'</type>'.$NL;
+            } elsif (/(Associativity):,(\d+)/) {
                 print OUTFILE '<associativity>'.$2.'</associativity>'.$NL;
-            } elsif (/(Number of sets):\t*(.*)/) {
+            } elsif (/(Number of sets):,(\d+)/) {
                 print OUTFILE '<sets>'.$2.'</sets>'.$NL;
-            } elsif (/(Cache line size):\t*(.*)/) {
+            } elsif (/(Cache line size):,(\d+)/) {
                 print OUTFILE '<linesize>'.$2.'</linesize>'.$NL;
-            } elsif (/(Level):\t*(.*)/) {
+            } elsif (/Shared by threads:,(\d+),/) {
+                print OUTFILE '<sharedby>'.$1.'</sharedby>'.$NL;
+            } elsif (/Cache type:,Inclusive/) {
+                print OUTFILE '<inclusive>true</inclusive>'.$NL;
+            } elsif (/Cache type:,Non Inclusive/) {
+                print OUTFILE '<inclusive>false</inclusive>'.$NL;
+            } elsif (/(Level):,(\d+)/) {
                 print OUTFILE '<cache>'.$NL;
                 print OUTFILE '<level>'.$2.'</level>'.$NL;
             }
         } elsif ($region eq 'numa') {
-            if (/Domain ([0-9]*)/) {
+            if (/Domain:,(\d+),/) {
                 if ($indomain )
                 {
                     print OUTFILE '</domain>'.$NL;
@@ -78,10 +107,11 @@ if ($fileType eq 'topology') {
                 print OUTFILE '<domain>'.$NL;
                 print OUTFILE '<id>'.$1.'</id>'.$NL;
                 $indomain = 1
-            } elsif (/Memory: ([0-9.]+) MB free of total ([0-9.]+) MB/) {
+            } elsif (/Free memory:,([\d.]+) MB,/) {
                 print OUTFILE '<freememory>'.$1.'</freememory>'.$NL;
-                print OUTFILE '<totalmemory>'.$2.'</totalmemory>'.$NL;
-            } elsif (/Processors:[ ]+([0-9. ]+)/) {
+            } elsif (/Total memory:,([\d.]+) MB,/) {
+                print OUTFILE '<totalmemory>'.$1.'</totalmemory>'.$NL;
+            } elsif (/Processors:,([\d, ]+)/) {
                 print OUTFILE '<processors>'.$1.'</processors>'.$NL;
             }
         }
@@ -96,41 +126,105 @@ if ($fileType eq 'topology') {
 } elsif ($fileType eq 'perfctr') {
     my $header = 0;
     my @col;
+    my @cpus;
+    my $region = 'info';
+    my $group = "1";
     print OUTFILE '<perfctr>'.$NL;
     while (<INFILE>) {
-        if (/Event[ ]*\|[ ]*(core.*)\|/) {
-            if (not $header) {
-                @col = split('\|',$1);
-                foreach (@col) {
-                    s/core //g;
-                    s/[ ]//g;
+        if (/TABLE,Info/) {
+            $region = 'info';
+            print OUTFILE '<info>'.$NL;
+        } elsif (/TABLE,Group (\d+) Raw/) {
+            $group = $1;
+            if (/Stat/) {
+                $region = '';
+            } else {
+                if ($region eq 'info') { # close <info> before $region is overwritten
+                    print OUTFILE '</info>'.$NL;
                 }
-                $header = 1;
+                $region = 'raw';
+                print OUTFILE '<group'.$group.'>'.$NL;
+                print OUTFILE '<rawvalues>'.$NL;
             }
-        }elsif (/STAT/) {
-
-        }elsif (/\|[ ]+([A-Z0-9_]+)[ ]+\|[ ]*(.*)\|/) {
-            my @rescol = split('\|',$2);
-            my $id = 0;
-            print OUTFILE '<result>'.$NL;
-            print OUTFILE '<event>'.$1.'</event>'.$NL;
-            foreach (@rescol) {
-                s/[ ]//g;
-                print OUTFILE '<core>'.$NL;
-                print OUTFILE '<id>'.$col[$id].'</id>'.$NL;
-                print OUTFILE '<value>'.$_.'</value>'.$NL;
-                print OUTFILE '</core>'.$NL;
-                $id++;
+        } elsif (/TABLE,Group (\d+) Metric/) {
+            $group = $1;
+            if (/Stat/) {
+                if ($region eq 'metric')
+                {
+                    print OUTFILE '</metrics>'.$NL;
+                    print OUTFILE '</group'.$group.'>'.$NL;
+                }
+                $region = '';
+            } else {
+                $region = 'metric';
+                print OUTFILE '</rawvalues>'.$NL;
+                print OUTFILE '<metrics>'.$NL;
             }
-            print OUTFILE '</result>'.$NL;
-        } 
+        }
+        if ($region eq 'info') {
+            if (/(CPU type):,([\w ]*),/) {
+                print OUTFILE '<cpu>'.$2.'</cpu>'.$NL;
+            } elsif (/CPU name:,([^,]+),/) {
+                print OUTFILE '<name>'.$1.'</name>'.$NL;
+            } elsif (/CPU clock:,([\d.]+) GHz/) {
+                print OUTFILE '<clock>'.$1.'</clock>'.$NL;
+            }
+        } elsif ($region eq 'raw') {
+            if (/Event,Counter,(.*)/) {
+                if (not $header) {
+                    @cpus = split(',',$1);
+                    foreach (@cpus) {
+                        s/Core //g;
+                        s/[ ]//g;
+                    }
+                    $header = 1;
+                }
+            } elsif (!/TABLE/) {
+                @col = split(',',$_);
+                print OUTFILE '<event>'.$NL;
+                print OUTFILE '<name>'.$col[0].'</name>'.$NL;
+                print OUTFILE '<counter>'.$col[1].'</counter>'.$NL;
+
+                
+                for (my $i=0; $i<@cpus; $i++) {
+                    
+                    print OUTFILE '<cpu'.$cpus[$i].'>'.$col[2+$i].'</cpu'.$cpus[$i].'>'.$NL;
+                }
+                print OUTFILE '</event>'.$NL;
+            }
+        } elsif ($region eq 'metric') {
+            if ((!/Metric,Core/) and (!/TABLE/)) {
+                @col = split(',',$_);
+                print OUTFILE '<metric>'.$NL;
+                my $name = "";
+                my $unit = "";
+                if ($col[0] =~ /\[.*\]/) {
+                    $col[0] =~ m/(.*)\s\[(.*)\]/;
+                    $name = $1;
+                    $unit = $2
+                } else {
+                    $name = $col[0]
+                }
+                print OUTFILE '<name>'.$name.'</name>'.$NL;
+                if ($unit ne "")
+                {
+                    print OUTFILE '<unit>'.$unit.'</unit>'.$NL;
+                }
+                for (my $i=0; $i<@cpus; $i++) {
+                    print OUTFILE '<cpu'.$cpus[$i].'>'.$col[1+$i].'</cpu'.$cpus[$i].'>'.$NL;
+                }
+                print OUTFILE '</metric>'.$NL;
+            }
+        } elsif (/STAT/) {
+
+        }
     }
     print OUTFILE '</perfctr>'.$NL;
 } else {
     die "Filter failed! Unknown application type $fileType!\n";
 }
 
-#unlink($infile);
+unlink($infile);
 close INFILE;
 close OUTFILE;
 
diff --git a/groups/atom/BRANCH.txt b/groups/atom/BRANCH.txt
index 51d2ddd..4213114 100644
--- a/groups/atom/BRANCH.txt
+++ b/groups/atom/BRANCH.txt
@@ -3,11 +3,14 @@ SHORT Branch prediction miss rate/ratio
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  BR_INST_RETIRED_ANY
 PMC1  BR_INST_RETIRED_MISPRED
 
 METRICS
-Runtime [s] FIXC1*inverseClock
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 Branch rate   PMC0/FIXC0
 Branch misprediction rate  PMC1/FIXC0
@@ -15,5 +18,14 @@ Branch misprediction ratio  PMC1/PMC0
 Instructions per branch  FIXC0/PMC0
 
 LONG
-Bla Bla
+Formulas:
+Branch rate = BR_INST_RETIRED_ANY/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_INST_RETIRED_MISPRED/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_INST_RETIRED_MISPRED/BR_INST_RETIRED_ANY
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ANY
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
 
diff --git a/groups/atom/DATA.txt b/groups/atom/DATA.txt
index 1c0f4ae..9349354 100644
--- a/groups/atom/DATA.txt
+++ b/groups/atom/DATA.txt
@@ -3,14 +3,20 @@ SHORT Load to store ratio
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L1D_CACHE_LD
 PMC1  L1D_CACHE_ST
 
 METRICS
-Runtime [s] FIXC1*inverseClock
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
-Bla Bla
+Formulas:
+Load to store ratio = L1D_CACHE_LD/L1D_CACHE_ST
+-
+This is a simple metric to determine your load to store ratio.
 
diff --git a/groups/atom/FLOPS_DP.txt b/groups/atom/FLOPS_DP.txt
index 12905c6..8d966cc 100644
--- a/groups/atom/FLOPS_DP.txt
+++ b/groups/atom/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -9,9 +9,9 @@ PMC1  SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE
 METRICS
 Runtime [s] FIXC1*inverseClock
 CPI  FIXC1/FIXC0
-DP MFlops/s    1.0E-06*(PMC0*2.0+PMC1)/time
+DP MFLOP/s    1.0E-06*(PMC0*2.0+PMC1)/time
 
 
 LONG
-Double Precision MFlops/s Double Precision MFlops/s
+Double Precision MFLOP/s Double Precision MFLOP/s
 
diff --git a/groups/atom/FLOPS_SP.txt b/groups/atom/FLOPS_SP.txt
index f064f38..49ca1f3 100644
--- a/groups/atom/FLOPS_SP.txt
+++ b/groups/atom/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -9,8 +9,8 @@ PMC1  SIMD_COMP_INST_RETIRED_SCALAR_SINGLE
 METRICS
 Runtime [s] FIXC1*inverseClock
 CPI  FIXC1/FIXC0
-SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
+SP MFLOP/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
 
 LONG
-Single Precision MFlops/s Double Precision MFlops/s
+Single Precision MFLOP/s Double Precision MFLOP/s
 
diff --git a/groups/atom/FLOPS_X87.txt b/groups/atom/FLOPS_X87.txt
index ad14a4d..57d2d81 100644
--- a/groups/atom/FLOPS_X87.txt
+++ b/groups/atom/FLOPS_X87.txt
@@ -1,4 +1,4 @@
-SHORT X87 MFlops/s
+SHORT X87 MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -8,8 +8,8 @@ PMC0  X87_COMP_OPS_EXE_ANY_AR
 METRICS
 Runtime [s] FIXC1*inverseClock
 CPI  FIXC1/FIXC0
-X87 MFlops/s  1.0E-06*PMC0/time
+X87 MFLOP/s  1.0E-06*PMC0/time
 
 LONG
-X87 MFlops/s
+X87 MFLOP/s
 
diff --git a/groups/atom/MEM.txt b/groups/atom/MEM.txt
index faf9a0a..db580e5 100644
--- a/groups/atom/MEM.txt
+++ b/groups/atom/MEM.txt
@@ -3,13 +3,21 @@ SHORT Main memory bandwidth in MBytes/s
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  BUS_TRANS_MEM_THIS_CORE_THIS_A
 
 METRICS
-Runtime [s] FIXC1*inverseClock
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 Memory bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+Memory data volume [GBytes] 1.0E-09*PMC0*64.0
 
 LONG
-Bla Bla
+Formulas:
+Memory bandwidth [MBytes/s] = 1.0E-06*BUS_TRANS_MEM_THIS_CORE_THIS_A*64/time
+Memory data volume [GBytes] = 1.0E-09*BUS_TRANS_MEM_THIS_CORE_THIS_A*64.0
+-
+Profiling group to measure memory bandwidth drawn by this core.
 
diff --git a/groups/atom/TLB.txt b/groups/atom/TLB.txt
index d36b413..4952e6c 100644
--- a/groups/atom/TLB.txt
+++ b/groups/atom/TLB.txt
@@ -8,8 +8,9 @@ PMC0  DATA_TLB_MISSES_DTLB_MISS
 METRICS
 Runtime [s] FIXC1*inverseClock
 CPI  FIXC1/FIXC0
+DTLB misses       PMC0
 DTLB miss rate    PMC0/FIXC0
 
 LONG
-Bla Bla
+The DTLB miss rate gives a measure of how often a TLB miss occurred per instruction.
 
diff --git a/groups/broadwell/BRANCH.txt b/groups/broadwell/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/groups/broadwell/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  BR_INST_RETIRED_ALL_BRANCHES
+PMC1  BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Branch rate   PMC0/FIXC0
+Branch misprediction rate  PMC1/FIXC0
+Branch misprediction ratio  PMC1/PMC0
+Instructions per branch  FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
+
diff --git a/groups/broadwell/CLOCK.txt b/groups/broadwell/CLOCK.txt
new file mode 100644
index 0000000..595d3a1
--- /dev/null
+++ b/groups/broadwell/CLOCK.txt
@@ -0,0 +1,23 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+
+LONG
+Formula:
+Power =  PWR_PKG_ENERGY / time
+-
+Broadwell implements the new RAPL interface. This interface makes it possible
+to monitor the consumed energy on the package (socket) level.
+
diff --git a/groups/broadwell/DATA.txt b/groups/broadwell/DATA.txt
new file mode 100644
index 0000000..967cbad
--- /dev/null
+++ b/groups/broadwell/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_UOPS_RETIRED_LOADS
+PMC1  MEM_UOPS_RETIRED_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/broadwell/ENERGY.txt b/groups/broadwell/ENERGY.txt
new file mode 100644
index 0000000..ae1756f
--- /dev/null
+++ b/groups/broadwell/ENERGY.txt
@@ -0,0 +1,39 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0  TEMP_CORE
+PWR0  PWR_PKG_ENERGY
+PWR1  PWR_PP0_ENERGY
+PWR2  PWR_PP1_ENERGY
+PWR3  PWR_DRAM_ENERGY
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Temperature [C]  TMP0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy PP0 [J]  PWR1
+Power PP0 [W] PWR1/time
+Energy PP1 [J]  PWR2
+Power PP1 [W] PWR2/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power PP1 = PWR_PP1_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+Broadwell implements the new RAPL interface. This interface makes it possible
+to monitor the consumed energy on the package (socket) and DRAM level.
+
diff --git a/groups/broadwell/FALSE_SHARE.txt b/groups/broadwell/FALSE_SHARE.txt
new file mode 100644
index 0000000..bb26898
--- /dev/null
+++ b/groups/broadwell/FALSE_SHARE.txt
@@ -0,0 +1,25 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM
+PMC2 MEM_UOPS_RETIRED_LOADS_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Local LLC false sharing [MByte] 1.E-06*PMC0*64
+Local LLC false sharing rate PMC0/PMC2
+
+LONG
+Formula:
+Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64
+Local LLC false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_UOPS_RETIRED_LOADS_ALL
+-
+False-sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false-sharing.
+The false-sharing rate uses all memory load UOPs as reference.
diff --git a/groups/broadwell/FLOPS_AVX.txt b/groups/broadwell/FLOPS_AVX.txt
new file mode 100644
index 0000000..eb047fa
--- /dev/null
+++ b/groups/broadwell/FLOPS_AVX.txt
@@ -0,0 +1,24 @@
+SHORT Packed AVX MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+PMC1  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Packed SP MFLOP/s  1.0E-06*(PMC0*8.0)/time
+Packed DP MFLOP/s  1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formula:
+Packed SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+-
+FLOP rates of 256 bit packed floating-point instructions
+
diff --git a/groups/broadwell/FLOPS_DP.txt b/groups/broadwell/FLOPS_DP.txt
new file mode 100644
index 0000000..60b5d5a
--- /dev/null
+++ b/groups/broadwell/FLOPS_DP.txt
@@ -0,0 +1,29 @@
+SHORT Double Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
+PMC1  FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
+PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+-
+AVX/SSE scalar and packed double precision FLOP rates.
+
diff --git a/groups/broadwell/FLOPS_SP.txt b/groups/broadwell/FLOPS_SP.txt
new file mode 100644
index 0000000..2818d94
--- /dev/null
+++ b/groups/broadwell/FLOPS_SP.txt
@@ -0,0 +1,29 @@
+SHORT Single Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
+PMC1  FP_ARITH_INST_RETIRED_SCALAR_SINGLE
+PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+-
+AVX/SSE scalar and packed single precision FLOP rates.
+
diff --git a/groups/broadwell/ICACHE.txt b/groups/broadwell/ICACHE.txt
new file mode 100644
index 0000000..5f11ad6
--- /dev/null
+++ b/groups/broadwell/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ICACHE_ACCESSES
+PMC1  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/broadwell/L2.txt b/groups/broadwell/L2.txt
new file mode 100644
index 0000000..60c7f79
--- /dev/null
+++ b/groups/broadwell/L2.txt
@@ -0,0 +1,37 @@
+SHORT  L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the
+number of cache lines loaded from the L2 to the L1 data cache and the writebacks from
+the L1 data cache to the L2 cache. The group also outputs total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and cache lines transferred to the instruction
+cache.
diff --git a/groups/broadwell/L2CACHE.txt b/groups/broadwell/L2CACHE.txt
new file mode 100644
index 0000000..9b5dd4b
--- /dev/null
+++ b/groups/broadwell/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_TRANS_ALL_REQUESTS
+PMC1  L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/broadwell/L3.txt b/groups/broadwell/L3.txt
new file mode 100644
index 0000000..4026f85
--- /dev/null
+++ b/groups/broadwell/L3.txt
@@ -0,0 +1,36 @@
+SHORT  L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_LINES_IN_ALL
+PMC1  L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs the data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
+transfers due to a write allocate load on a store miss in L2.
+
diff --git a/groups/broadwell/L3CACHE.txt b/groups/broadwell/L3CACHE.txt
new file mode 100644
index 0000000..f863daa
--- /dev/null
+++ b/groups/broadwell/L3CACHE.txt
@@ -0,0 +1,35 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1  MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2  UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 request rate PMC0/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L3 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/broadwell/RECOVERY.txt b/groups/broadwell/RECOVERY.txt
new file mode 100644
index 0000000..7928ee4
--- /dev/null
+++ b/groups/broadwell/RECOVERY.txt
@@ -0,0 +1,22 @@
+SHORT  Recovery duration
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  INT_MISC_RECOVERY_CYCLES
+PMC1  INT_MISC_RECOVERY_COUNT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Avg. recovery duration PMC0/PMC1
+
+LONG
+Formulas:
+Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT
+-
+This group measures the duration of recoveries after SSE exception, memory
+disambiguation, etc...
diff --git a/groups/broadwell/TLB_DATA.txt b/groups/broadwell/TLB_DATA.txt
new file mode 100644
index 0000000..8d94e05
--- /dev/null
+++ b/groups/broadwell/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT  L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1  DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2  DTLB_LOAD_MISSES_WALK_DURATION
+PMC3  DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB load misses     PMC0
+L1 DTLB load miss rate  PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses     PMC1
+L1 DTLB store miss rate  PMC1/FIXC0
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a walk took, in cycles.
+
diff --git a/groups/broadwell/TLB_INSTR.txt b/groups/broadwell/TLB_INSTR.txt
new file mode 100644
index 0000000..235d977
--- /dev/null
+++ b/groups/broadwell/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ITLB_MISSES_CAUSES_A_WALK
+PMC1  ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a walk took, in cycles.
+
diff --git a/groups/broadwellD/BRANCH.txt b/groups/broadwellD/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/groups/broadwellD/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  BR_INST_RETIRED_ALL_BRANCHES
+PMC1  BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Branch rate   PMC0/FIXC0
+Branch misprediction rate  PMC1/FIXC0
+Branch misprediction ratio  PMC1/PMC0
+Instructions per branch  FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio directly states
+what fraction of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
+
diff --git a/groups/broadwellD/CACHES.txt b/groups/broadwellD/CACHES.txt
new file mode 100644
index 0000000..3c13a52
--- /dev/null
+++ b/groups/broadwellD/CACHES.txt
@@ -0,0 +1,123 @@
+SHORT Cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L1D_M_EVICT
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_TRANS_L2_WB
+CBOX0C0 LLC_LOOKUP_DATA_READ
+CBOX1C0 LLC_LOOKUP_DATA_READ
+CBOX2C0 LLC_LOOKUP_DATA_READ
+CBOX3C0 LLC_LOOKUP_DATA_READ
+CBOX4C0 LLC_LOOKUP_DATA_READ
+CBOX5C0 LLC_LOOKUP_DATA_READ
+CBOX6C0 LLC_LOOKUP_DATA_READ
+CBOX7C0 LLC_LOOKUP_DATA_READ
+CBOX8C0 LLC_LOOKUP_DATA_READ
+CBOX9C0 LLC_LOOKUP_DATA_READ
+CBOX10C0 LLC_LOOKUP_DATA_READ
+CBOX11C0 LLC_LOOKUP_DATA_READ
+CBOX12C0 LLC_LOOKUP_DATA_READ
+CBOX13C0 LLC_LOOKUP_DATA_READ
+CBOX14C0 LLC_LOOKUP_DATA_READ
+CBOX15C0 LLC_LOOKUP_DATA_READ
+CBOX16C0 LLC_LOOKUP_DATA_READ
+CBOX17C0 LLC_LOOKUP_DATA_READ
+CBOX0C1 LLC_VICTIMS_M
+CBOX1C1 LLC_VICTIMS_M
+CBOX2C1 LLC_VICTIMS_M
+CBOX3C1 LLC_VICTIMS_M
+CBOX4C1 LLC_VICTIMS_M
+CBOX5C1 LLC_VICTIMS_M
+CBOX6C1 LLC_VICTIMS_M
+CBOX7C1 LLC_VICTIMS_M
+CBOX8C1 LLC_VICTIMS_M
+CBOX9C1 LLC_VICTIMS_M
+CBOX10C1 LLC_VICTIMS_M
+CBOX11C1 LLC_VICTIMS_M
+CBOX12C1 LLC_VICTIMS_M
+CBOX13C1 LLC_VICTIMS_M
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L3 to L2 load bandwidth [MBytes/s]  1.0E-06*PMC2*64.0/time
+L3 to L2 load data volume [GBytes]  1.0E-09*PMC2*64.0
+L2 to L3 evict bandwidth [MBytes/s]  1.0E-06*PMC3*64.0/time
+L2 to L3 evict data volume [GBytes]  1.0E-09*PMC3*64.0
+L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
+System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0)*64.0/time
+System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0)*64.0
+L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64/time
+L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64
+L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64.0/time
+L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64.0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
+L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64
+L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
+L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64
+L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
+L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time
+L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64
+L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time
+L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64
+L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
+System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ))*64/time
+System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ))*64
+L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M))*64/time
+L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M))*64
+L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64/time
+L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
+-
+Group to measure cache transfers between L1 and memory. Please note that the
+L3 to/from system metrics should contain all traffic to the system (memory,
+Intel QPI, etc.) but do not seem to capture everything, because commonly the memory read
+bandwidth and the L3 to L2 bandwidth are higher than the memory to L3 bandwidth.
+
diff --git a/groups/broadwellD/CLOCK.txt b/groups/broadwellD/CLOCK.txt
new file mode 100644
index 0000000..595d3a1
--- /dev/null
+++ b/groups/broadwellD/CLOCK.txt
@@ -0,0 +1,23 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+
+LONG
+Formula:
+Power =  PWR_PKG_ENERGY / time
+-
+Broadwell implements the new RAPL interface. This interface makes it possible to
+monitor the consumed energy on the package (socket) level.
+
diff --git a/groups/broadwellD/DATA.txt b/groups/broadwellD/DATA.txt
new file mode 100644
index 0000000..967cbad
--- /dev/null
+++ b/groups/broadwellD/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_UOPS_RETIRED_LOADS
+PMC1  MEM_UOPS_RETIRED_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/broadwellD/ENERGY.txt b/groups/broadwellD/ENERGY.txt
new file mode 100644
index 0000000..ae1756f
--- /dev/null
+++ b/groups/broadwellD/ENERGY.txt
@@ -0,0 +1,39 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0  TEMP_CORE
+PWR0  PWR_PKG_ENERGY
+PWR1  PWR_PP0_ENERGY
+PWR2  PWR_PP1_ENERGY
+PWR3  PWR_DRAM_ENERGY
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Temperature [C]  TMP0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy PP0 [J]  PWR1
+Power PP0 [W] PWR1/time
+Energy PP1 [J]  PWR2
+Power PP1 [W] PWR2/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power PP1 = PWR_PP1_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+Broadwell implements the new RAPL interface. This interface makes it possible to
+monitor the consumed energy on the package (socket) and DRAM level.
+
diff --git a/groups/broadwellD/FALSE_SHARE.txt b/groups/broadwellD/FALSE_SHARE.txt
new file mode 100644
index 0000000..a87f7d4
--- /dev/null
+++ b/groups/broadwellD/FALSE_SHARE.txt
@@ -0,0 +1,25 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM
+PMC2 MEM_LOAD_UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Local LLC false sharing [MByte] 1.E-06*PMC0*64
+Local LLC false sharing rate PMC0/PMC2
+
+LONG
+Formula:
+Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64
+Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL
+-
+False-sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false-sharing.
+The false-sharing rate uses all memory loads as reference.
diff --git a/groups/broadwellD/FLOPS_AVX.txt b/groups/broadwellD/FLOPS_AVX.txt
new file mode 100644
index 0000000..eb047fa
--- /dev/null
+++ b/groups/broadwellD/FLOPS_AVX.txt
@@ -0,0 +1,24 @@
+SHORT Packed AVX MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+PMC1  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Packed SP MFLOP/s  1.0E-06*(PMC0*8.0)/time
+Packed DP MFLOP/s  1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formula:
+Packed SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+-
+FLOP rates of 256 bit packed floating-point instructions
+
diff --git a/groups/broadwellD/FLOPS_DP.txt b/groups/broadwellD/FLOPS_DP.txt
new file mode 100644
index 0000000..60b5d5a
--- /dev/null
+++ b/groups/broadwellD/FLOPS_DP.txt
@@ -0,0 +1,29 @@
+SHORT Double Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
+PMC1  FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
+PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+-
+AVX/SSE scalar and packed double precision FLOP rates.
+
diff --git a/groups/broadwellD/FLOPS_SP.txt b/groups/broadwellD/FLOPS_SP.txt
new file mode 100644
index 0000000..2818d94
--- /dev/null
+++ b/groups/broadwellD/FLOPS_SP.txt
@@ -0,0 +1,29 @@
+SHORT Single Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
+PMC1  FP_ARITH_INST_RETIRED_SCALAR_SINGLE
+PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+-
+AVX/SSE scalar and packed single precision FLOP rates.
+
diff --git a/groups/broadwellD/HA.txt b/groups/broadwellD/HA.txt
new file mode 100644
index 0000000..1e5a700
--- /dev/null
+++ b/groups/broadwellD/HA.txt
@@ -0,0 +1,40 @@
+SHORT Main memory bandwidth in MBytes/s seen from Home agent
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+BBOX0C0 IMC_READS_NORMAL
+BBOX0C1 BYPASS_IMC_TAKEN
+BBOX0C2 IMC_WRITES_ALL
+BBOX1C0 IMC_READS_NORMAL
+BBOX1C1 BYPASS_IMC_TAKEN
+BBOX1C2 IMC_WRITES_ALL
+
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(BBOX0C2+BBOX1C2)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(BBOX0C2+BBOX1C2)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_WRITES_ALL))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(IMC_WRITES_ALL))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0
+-
+This group derives the same metrics as the MEM group but uses the events of the
+Home Agent, a central unit that is responsible for the protocol side of memory
+interactions.
diff --git a/groups/broadwellD/ICACHE.txt b/groups/broadwellD/ICACHE.txt
new file mode 100644
index 0000000..5f11ad6
--- /dev/null
+++ b/groups/broadwellD/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ICACHE_ACCESSES
+PMC1  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/broadwellD/L2.txt b/groups/broadwellD/L2.txt
new file mode 100644
index 0000000..60c7f79
--- /dev/null
+++ b/groups/broadwellD/L2.txt
@@ -0,0 +1,37 @@
+SHORT  L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the
+number of cache lines loaded from the L2 to the L1 data cache and the writebacks from
+the L1 data cache to the L2 cache. The group also outputs total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and cache lines transferred to the instruction
+cache.
diff --git a/groups/broadwellD/L2CACHE.txt b/groups/broadwellD/L2CACHE.txt
new file mode 100644
index 0000000..9b5dd4b
--- /dev/null
+++ b/groups/broadwellD/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_TRANS_ALL_REQUESTS
+PMC1  L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/broadwellD/L3.txt b/groups/broadwellD/L3.txt
new file mode 100644
index 0000000..4026f85
--- /dev/null
+++ b/groups/broadwellD/L3.txt
@@ -0,0 +1,36 @@
+SHORT  L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_LINES_IN_ALL
+PMC1  L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
+transfers due to a write allocate load on a store miss in L2.
+
diff --git a/groups/broadwellD/L3CACHE.txt b/groups/broadwellD/L3CACHE.txt
new file mode 100644
index 0000000..f863daa
--- /dev/null
+++ b/groups/broadwellD/L3CACHE.txt
@@ -0,0 +1,35 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1  MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2  UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 request rate PMC0/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L3 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/broadwellD/MEM.txt b/groups/broadwellD/MEM.txt
new file mode 100644
index 0000000..2a17a2c
--- /dev/null
+++ b/groups/broadwellD/MEM.txt
@@ -0,0 +1,52 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+-
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events it is only possible to measure on a
+per socket base. Some of the counters may not be available on your system.
+Also outputs total data volume transferred from main memory.
+The same metrics are provided by the HA group.
+
diff --git a/groups/broadwellD/MEM_DP.txt b/groups/broadwellD/MEM_DP.txt
new file mode 100644
index 0000000..bfea358
--- /dev/null
+++ b/groups/broadwellD/MEM_DP.txt
@@ -0,0 +1,66 @@
+SHORT Overview of arithmetic and main memory performance
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+PWR3  PWR_DRAM_ENERGY
+PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
+PMC1  FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
+PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+--
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events it is only possible to measure on
+a per socket base. Also outputs total data volume transferred from main memory.
+SSE scalar and packed double precision FLOP rates. Also reports on packed AVX
+32b instructions.
diff --git a/groups/broadwellD/MEM_SP.txt b/groups/broadwellD/MEM_SP.txt
new file mode 100644
index 0000000..e7d4642
--- /dev/null
+++ b/groups/broadwellD/MEM_SP.txt
@@ -0,0 +1,68 @@
+SHORT Overview of arithmetic and main memory performance
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+PWR3  PWR_DRAM_ENERGY
+PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
+PMC1  FP_ARITH_INST_RETIRED_SCALAR_SINGLE
+PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formula:
+Power [W] = PWR_PKG_ENERGY/runtime
+Power DRAM [W] = PWR_DRAM_ENERGY/runtime
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+--
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events it is only possible to measure on
+a per socket base. Also outputs total data volume transferred from main memory.
+SSE scalar and packed single precision FLOP rates. Also reports on packed AVX
+32b instructions.
diff --git a/groups/broadwellD/RECOVERY.txt b/groups/broadwellD/RECOVERY.txt
new file mode 100644
index 0000000..7928ee4
--- /dev/null
+++ b/groups/broadwellD/RECOVERY.txt
@@ -0,0 +1,22 @@
+SHORT  Recovery duration
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  INT_MISC_RECOVERY_CYCLES
+PMC1  INT_MISC_RECOVERY_COUNT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Avg. recovery duration PMC0/PMC1
+
+LONG
+Formulas:
+Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT
+-
+This group measures the duration of recoveries after SSE exception, memory
+disambiguation, etc...
diff --git a/groups/broadwellD/TLB_DATA.txt b/groups/broadwellD/TLB_DATA.txt
new file mode 100644
index 0000000..8d94e05
--- /dev/null
+++ b/groups/broadwellD/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT  L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1  DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2  DTLB_LOAD_MISSES_WALK_DURATION
+PMC3  DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB load misses     PMC0
+L1 DTLB load miss rate  PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses     PMC1
+L1 DTLB store miss rate  PMC1/FIXC0
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration is the time in cycles that a page walk took.
+
diff --git a/groups/broadwellD/TLB_INSTR.txt b/groups/broadwellD/TLB_INSTR.txt
new file mode 100644
index 0000000..235d977
--- /dev/null
+++ b/groups/broadwellD/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ITLB_MISSES_CAUSES_A_WALK
+PMC1  ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration is the time in cycles that a page walk took.
+
diff --git a/groups/broadwellEP/BRANCH.txt b/groups/broadwellEP/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/groups/broadwellEP/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  BR_INST_RETIRED_ALL_BRANCHES
+PMC1  BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Branch rate   PMC0/FIXC0
+Branch misprediction rate  PMC1/FIXC0
+Branch misprediction ratio  PMC1/PMC0
+Instructions per branch  FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
+
diff --git a/groups/broadwellEP/CACHES.txt b/groups/broadwellEP/CACHES.txt
new file mode 100644
index 0000000..3c13a52
--- /dev/null
+++ b/groups/broadwellEP/CACHES.txt
@@ -0,0 +1,123 @@
+SHORT Cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L1D_M_EVICT
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_TRANS_L2_WB
+CBOX0C0 LLC_LOOKUP_DATA_READ
+CBOX1C0 LLC_LOOKUP_DATA_READ
+CBOX2C0 LLC_LOOKUP_DATA_READ
+CBOX3C0 LLC_LOOKUP_DATA_READ
+CBOX4C0 LLC_LOOKUP_DATA_READ
+CBOX5C0 LLC_LOOKUP_DATA_READ
+CBOX6C0 LLC_LOOKUP_DATA_READ
+CBOX7C0 LLC_LOOKUP_DATA_READ
+CBOX8C0 LLC_LOOKUP_DATA_READ
+CBOX9C0 LLC_LOOKUP_DATA_READ
+CBOX10C0 LLC_LOOKUP_DATA_READ
+CBOX11C0 LLC_LOOKUP_DATA_READ
+CBOX12C0 LLC_LOOKUP_DATA_READ
+CBOX13C0 LLC_LOOKUP_DATA_READ
+CBOX14C0 LLC_LOOKUP_DATA_READ
+CBOX15C0 LLC_LOOKUP_DATA_READ
+CBOX16C0 LLC_LOOKUP_DATA_READ
+CBOX17C0 LLC_LOOKUP_DATA_READ
+CBOX0C1 LLC_VICTIMS_M
+CBOX1C1 LLC_VICTIMS_M
+CBOX2C1 LLC_VICTIMS_M
+CBOX3C1 LLC_VICTIMS_M
+CBOX4C1 LLC_VICTIMS_M
+CBOX5C1 LLC_VICTIMS_M
+CBOX6C1 LLC_VICTIMS_M
+CBOX7C1 LLC_VICTIMS_M
+CBOX8C1 LLC_VICTIMS_M
+CBOX9C1 LLC_VICTIMS_M
+CBOX10C1 LLC_VICTIMS_M
+CBOX11C1 LLC_VICTIMS_M
+CBOX12C1 LLC_VICTIMS_M
+CBOX13C1 LLC_VICTIMS_M
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L3 to L2 load bandwidth [MBytes/s]  1.0E-06*PMC2*64.0/time
+L3 to L2 load data volume [GBytes]  1.0E-09*PMC2*64.0
+L2 to L3 evict bandwidth [MBytes/s]  1.0E-06*PMC3*64.0/time
+L2 to L3 evict data volume [GBytes]  1.0E-09*PMC3*64.0
+L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
+System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0)*64.0/time
+System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0)*64.0
+L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64/time
+L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64
+L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64.0/time
+L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64.0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
+L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64
+L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
+L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64
+L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
+L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time
+L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64
+L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time
+L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64
+L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
+System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ))*64/time
+System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ))*64
+L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M))*64/time
+L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M))*64
+L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64/time
+L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
+-
+Group to measure cache transfers between L1 and Memory. Please notice that the
+L3 to/from system metrics contain any traffic to the system (memory,
+Intel QPI, etc.) but don't seem to handle anything because commonly memory read
+bandwidth and L3 to L2 bandwidth are higher than the memory to L3 bandwidth.
+
diff --git a/groups/broadwellEP/CLOCK.txt b/groups/broadwellEP/CLOCK.txt
new file mode 100644
index 0000000..595d3a1
--- /dev/null
+++ b/groups/broadwellEP/CLOCK.txt
@@ -0,0 +1,23 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+
+LONG
+Formula:
+Power =  PWR_PKG_ENERGY / time
+-
+Broadwell implements the new RAPL interface. This interface makes it possible to
+monitor the consumed energy on the package (socket) level.
+
diff --git a/groups/broadwellEP/DATA.txt b/groups/broadwellEP/DATA.txt
new file mode 100644
index 0000000..967cbad
--- /dev/null
+++ b/groups/broadwellEP/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_UOPS_RETIRED_LOADS
+PMC1  MEM_UOPS_RETIRED_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/broadwellEP/ENERGY.txt b/groups/broadwellEP/ENERGY.txt
new file mode 100644
index 0000000..28f0256
--- /dev/null
+++ b/groups/broadwellEP/ENERGY.txt
@@ -0,0 +1,35 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0  TEMP_CORE
+PWR0  PWR_PKG_ENERGY
+PWR1  PWR_PP0_ENERGY
+PWR3  PWR_DRAM_ENERGY
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Temperature [C]  TMP0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy PP0 [J]  PWR1
+Power PP0 [W] PWR1/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+Broadwell implements the new RAPL interface. This interface makes it possible to
+monitor the consumed energy on the package (socket) and DRAM level.
+
diff --git a/groups/broadwellEP/FALSE_SHARE.txt b/groups/broadwellEP/FALSE_SHARE.txt
new file mode 100644
index 0000000..9f8a30e
--- /dev/null
+++ b/groups/broadwellEP/FALSE_SHARE.txt
@@ -0,0 +1,30 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM
+PMC1 MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM
+PMC2 MEM_UOPS_RETIRED_LOADS_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Local LLC false sharing [MByte] 1.E-06*PMC0*64
+Local LLC false sharing rate PMC0/PMC2
+Remote LLC false sharing [MByte] 1.E-06*PMC1*64
+Remote LLC false sharing rate PMC1/PMC2
+
+LONG
+Formula:
+Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64
+Local LLC false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_UOPS_RETIRED_LOADS_ALL
+Remote LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM*64
+Remote LLC false sharing rate = MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM/MEM_UOPS_RETIRED_LOADS_ALL
+-
+False-sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false-sharing.
+The false-sharing rate uses all memory load UOPs as reference.
diff --git a/groups/broadwellEP/FLOPS_AVX.txt b/groups/broadwellEP/FLOPS_AVX.txt
new file mode 100644
index 0000000..eb047fa
--- /dev/null
+++ b/groups/broadwellEP/FLOPS_AVX.txt
@@ -0,0 +1,24 @@
+SHORT Packed AVX MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+PMC1  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Packed SP MFLOP/s  1.0E-06*(PMC0*8.0)/time
+Packed DP MFLOP/s  1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formula:
+Packed SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+-
+FLOP rates of 256 bit packed floating-point instructions
+
diff --git a/groups/broadwellEP/FLOPS_DP.txt b/groups/broadwellEP/FLOPS_DP.txt
new file mode 100644
index 0000000..60b5d5a
--- /dev/null
+++ b/groups/broadwellEP/FLOPS_DP.txt
@@ -0,0 +1,29 @@
+SHORT Double Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
+PMC1  FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
+PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+-
+AVX/SSE scalar and packed double precision FLOP rates.
+
diff --git a/groups/broadwellEP/FLOPS_SP.txt b/groups/broadwellEP/FLOPS_SP.txt
new file mode 100644
index 0000000..2818d94
--- /dev/null
+++ b/groups/broadwellEP/FLOPS_SP.txt
@@ -0,0 +1,29 @@
+SHORT Single Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
+PMC1  FP_ARITH_INST_RETIRED_SCALAR_SINGLE
+PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+-
+AVX/SSE scalar and packed single precision FLOP rates.
+
diff --git a/groups/broadwellEP/HA.txt b/groups/broadwellEP/HA.txt
new file mode 100644
index 0000000..1e5a700
--- /dev/null
+++ b/groups/broadwellEP/HA.txt
@@ -0,0 +1,40 @@
+SHORT Main memory bandwidth in MBytes/s seen from Home agent
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+BBOX0C0 IMC_READS_NORMAL
+BBOX0C1 BYPASS_IMC_TAKEN
+BBOX0C2 IMC_WRITES_ALL
+BBOX1C0 IMC_READS_NORMAL
+BBOX1C1 BYPASS_IMC_TAKEN
+BBOX1C2 IMC_WRITES_ALL
+
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(BBOX0C2+BBOX1C2)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(BBOX0C2+BBOX1C2)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_WRITES_ALL))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(IMC_WRITES_ALL))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0
+-
+This group derives the same metrics as the MEM group but uses the events of the
+Home Agent, a central unit that is responsible for the protocol side of memory
+interactions.
diff --git a/groups/broadwellEP/ICACHE.txt b/groups/broadwellEP/ICACHE.txt
new file mode 100644
index 0000000..5f11ad6
--- /dev/null
+++ b/groups/broadwellEP/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ICACHE_ACCESSES
+PMC1  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/broadwellEP/L2.txt b/groups/broadwellEP/L2.txt
new file mode 100644
index 0000000..60c7f79
--- /dev/null
+++ b/groups/broadwellEP/L2.txt
@@ -0,0 +1,37 @@
+SHORT  L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
+number of cache lines loaded from the L2 to the L1 data cache and the writebacks from
+the L1 data cache to the L2 cache. The group also outputs total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and cache lines transferred to the instruction
+cache.
diff --git a/groups/broadwellEP/L2CACHE.txt b/groups/broadwellEP/L2CACHE.txt
new file mode 100644
index 0000000..9b5dd4b
--- /dev/null
+++ b/groups/broadwellEP/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_TRANS_ALL_REQUESTS
+PMC1  L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/broadwellEP/L3.txt b/groups/broadwellEP/L3.txt
new file mode 100644
index 0000000..7d84636
--- /dev/null
+++ b/groups/broadwellEP/L3.txt
@@ -0,0 +1,36 @@
+SHORT  L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_LINES_IN_ALL
+PMC1  L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs data volume transferred between the
+L3 and measured cores L2 caches. Note that this bandwidth also includes data
+transfers due to a write allocate load on a store miss in L2.
+
diff --git a/groups/broadwellEP/L3CACHE.txt b/groups/broadwellEP/L3CACHE.txt
new file mode 100644
index 0000000..f863daa
--- /dev/null
+++ b/groups/broadwellEP/L3CACHE.txt
@@ -0,0 +1,35 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1  MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2  UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 request rate PMC0/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L3 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/broadwellEP/MEM.txt b/groups/broadwellEP/MEM.txt
new file mode 100644
index 0000000..2a17a2c
--- /dev/null
+++ b/groups/broadwellEP/MEM.txt
@@ -0,0 +1,52 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+-
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events it is only possible to measure on a
+per-socket basis. Some of the counters may not be available on your system.
+Also outputs total data volume transferred from main memory.
+The same metrics are provided by the HA group.
+
diff --git a/groups/broadwellEP/MEM_DP.txt b/groups/broadwellEP/MEM_DP.txt
new file mode 100644
index 0000000..bfea358
--- /dev/null
+++ b/groups/broadwellEP/MEM_DP.txt
@@ -0,0 +1,66 @@
+SHORT Overview of arithmetic and main memory performance
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+PWR3  PWR_DRAM_ENERGY
+PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
+PMC1  FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
+PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+--
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events it is only possible to measure on
+a per-socket basis. Also outputs total data volume transferred from main memory.
+SSE scalar and packed double precision FLOP rates. Also reports on packed AVX
+32b instructions.
diff --git a/groups/broadwellEP/MEM_SP.txt b/groups/broadwellEP/MEM_SP.txt
new file mode 100644
index 0000000..e7d4642
--- /dev/null
+++ b/groups/broadwellEP/MEM_SP.txt
@@ -0,0 +1,68 @@
+SHORT Overview of arithmetic and main memory performance
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+PWR3  PWR_DRAM_ENERGY
+PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
+PMC1  FP_ARITH_INST_RETIRED_SCALAR_SINGLE
+PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formula:
+Power [W] = PWR_PKG_ENERGY/runtime
+Power DRAM [W] = PWR_DRAM_ENERGY/runtime
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+--
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events it is only possible to measure on
+a per-socket basis. Also outputs total data volume transferred from main memory.
+SSE scalar and packed single precision FLOP rates. Also reports on packed AVX
+32b instructions.
diff --git a/groups/broadwellEP/NUMA.txt b/groups/broadwellEP/NUMA.txt
new file mode 100644
index 0000000..8fdd0f1
--- /dev/null
+++ b/groups/broadwellEP/NUMA.txt
@@ -0,0 +1,41 @@
+SHORT Local and remote data transfers
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+BBOX0C0 REQUESTS_READS_LOCAL
+BBOX1C0 REQUESTS_READS_LOCAL
+BBOX0C1 REQUESTS_READS_REMOTE
+BBOX1C1 REQUESTS_READS_REMOTE
+BBOX0C2 REQUESTS_WRITES_LOCAL
+BBOX1C2 REQUESTS_WRITES_LOCAL
+BBOX0C3 REQUESTS_WRITES_REMOTE
+BBOX1C3 REQUESTS_WRITES_REMOTE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Local bandwidth [MByte/s]  1.E-06*((BBOX0C0+BBOX1C0+BBOX0C2+BBOX1C2)*64)/time
+Local data volume [GByte]  1.E-09*(BBOX0C0+BBOX1C0+BBOX0C2+BBOX1C2)*64
+Remote bandwidth [MByte/s]  1.E-06*((BBOX0C1+BBOX1C1+BBOX0C3+BBOX1C3)*64)/time
+Remote data volume [GByte]  1.E-09*(BBOX0C1+BBOX1C1+BBOX0C3+BBOX1C3)*64
+Total bandwidth [MByte/s] 1.E-06*((BBOX0C0+BBOX1C0+BBOX0C2+BBOX1C2+BBOX0C1+BBOX1C1+BBOX0C3+BBOX1C3)*64)/time
+Total data volume [GByte] 1.E-09*(BBOX0C0+BBOX1C0+BBOX0C2+BBOX1C2+BBOX0C1+BBOX1C1+BBOX0C3+BBOX1C3)*64
+
+
+LONG
+Formula:
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+Local bandwidth [MByte/s] = 1.E-06*((SUM(REQUESTS_READS_LOCAL)+SUM(REQUESTS_WRITES_LOCAL))*64)/time
+Local data volume [GByte] = 1.E-09*(SUM(REQUESTS_READS_LOCAL)+SUM(REQUESTS_WRITES_LOCAL))*64
+Remote bandwidth [MByte/s] = 1.E-06*((SUM(REQUESTS_READS_REMOTE)+SUM(REQUESTS_WRITES_REMOTE))*64)/time
+Remote data volume [GByte] = 1.E-09*(SUM(REQUESTS_READS_REMOTE)+SUM(REQUESTS_WRITES_REMOTE))*64
+Total bandwidth [MByte/s] = 1.E-06*((SUM(REQUESTS_READS_LOCAL)+SUM(REQUESTS_WRITES_LOCAL)+SUM(REQUESTS_READS_REMOTE)+SUM(REQUESTS_WRITES_REMOTE))*64)/time
+Total data volume [GByte] = 1.E-09*(SUM(REQUESTS_READS_LOCAL)+SUM(REQUESTS_WRITES_LOCAL)+SUM(REQUESTS_READS_REMOTE)+SUM(REQUESTS_WRITES_REMOTE))*64
+--
+This performance group measures the data traffic of CPU sockets to local and remote
+CPU sockets. It uses the Home Agent for calculation. This may include also data from
+other sources than the memory controllers.
diff --git a/groups/broadwellEP/QPI.txt b/groups/broadwellEP/QPI.txt
new file mode 100644
index 0000000..20d7cdf
--- /dev/null
+++ b/groups/broadwellEP/QPI.txt
@@ -0,0 +1,49 @@
+SHORT QPI Link Layer data
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+QBOX0C0 RXL_FLITS_G0_DATA
+QBOX1C0 RXL_FLITS_G0_DATA
+QBOX0C1 RXL_FLITS_G0_NON_DATA
+QBOX1C1 RXL_FLITS_G0_NON_DATA
+QBOX0C2 TXL_FLITS_G0_DATA
+QBOX1C2 TXL_FLITS_G0_DATA
+QBOX0C3 TXL_FLITS_G0_NON_DATA
+QBOX1C3 TXL_FLITS_G0_NON_DATA
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+QPI send data volume [GByte] 1.E-09*(QBOX0C2+QBOX1C2)*8
+QPI send data bandwidth [MByte/s] 1.E-06*(QBOX0C2+QBOX1C2)*8/time
+QPI send link volume [GByte] 1.E-09*(QBOX0C2+QBOX1C2+QBOX0C3+QBOX1C3)*8
+QPI send link bandwidth [MByte/s] 1.E-06*(QBOX0C2+QBOX1C2+QBOX0C3+QBOX1C3)*8/time
+QPI receive data volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0)*8
+QPI receive data bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0)*8/time
+QPI receive link volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0+QBOX0C1+QBOX1C1)*8
+QPI receive link bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0+QBOX0C1+QBOX1C1)*8/time
+QPI total transfer volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0+QBOX0C2+QBOX1C2+QBOX0C1+QBOX1C1+QBOX0C3+QBOX1C3)*8
+QPI total bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0+QBOX0C2+QBOX1C2+QBOX0C1+QBOX1C1+QBOX0C3+QBOX1C3)*8/time
+
+LONG
+Formula:
+QPI send data volume [GByte] = 1.E-09*(sum(TXL_FLITS_G0_DATA)*8)
+QPI send data bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8)/runtime
+QPI send link volume [GByte] = 1.E-09*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)
+QPI send link bandwidth [MByte/s] = 1.E-06*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)/runtime
+QPI receive data volume [GByte] = 1.E-09*(sum(RXL_FLITS_G0_DATA)*8)
+QPI receive data bandwidth [MByte/s] = 1.E-06*(sum(RXL_FLITS_G0_DATA)*8)/runtime
+QPI receive link volume [GByte] = 1.E-09*((sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8)
+QPI receive link bandwidth [MByte/s] = 1.E-06*((sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8)/runtime
+QPI total transfer volume [GByte] = 1.E-09*(sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA)+sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8
+QPI total bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA)+sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8/time
+--
+The Intel QPI Link Layer is responsible for packetizing requests from the caching agent (CBOXes)
+on the way out to the system interface. For Broadwell EP systems, the Link Layer and the
+Ring interface are separated. The QPI link volume contains header, data and trailer while the
+QPI data volume counts only the data flits.
diff --git a/groups/broadwellEP/TLB_DATA.txt b/groups/broadwellEP/TLB_DATA.txt
new file mode 100644
index 0000000..89841d5
--- /dev/null
+++ b/groups/broadwellEP/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT  L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1  DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2  DTLB_LOAD_MISSES_WALK_DURATION
+PMC3  DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB load misses     PMC0
+L1 DTLB load miss rate  PMC0/FIXC0
+L1 DTLB load miss duration PMC2
+L1 DTLB store misses     PMC1
+L1 DTLB store miss rate  PMC1/FIXC0
+L1 DTLB store miss duration PMC3
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration = DTLB_LOAD_MISSES_WALK_DURATION
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration = DTLB_STORE_MISSES_WALK_DURATION
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures, in cycles, how long the page walks took.
+
diff --git a/groups/broadwellEP/TLB_INSTR.txt b/groups/broadwellEP/TLB_INSTR.txt
new file mode 100644
index 0000000..b195452
--- /dev/null
+++ b/groups/broadwellEP/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ITLB_MISSES_CAUSES_A_WALK
+PMC1  ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration PMC1
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration = ITLB_MISSES_WALK_DURATION
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures, in cycles, how long the page walks took.
+
diff --git a/groups/core2/BRANCH.txt b/groups/core2/BRANCH.txt
index 2515d6c..3c66c00 100644
--- a/groups/core2/BRANCH.txt
+++ b/groups/core2/BRANCH.txt
@@ -19,12 +19,12 @@ Instructions per branch  FIXC0/PMC0
 
 LONG
 Formulas:
-Branch rate = BR_INST_RETIRED_ANY / INSTR_RETIRED_ANY
-Branch misprediction rate = BR_INST_RETIRED_MISPRED / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_INST_RETIRED_MISPRED / BR_INST_RETIRED_ANY
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ANY
+Branch rate = BR_INST_RETIRED_ANY/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_INST_RETIRED_MISPRED/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_INST_RETIRED_MISPRED/BR_INST_RETIRED_ANY
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ANY
 -
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
-into relation what ration of all branch instruction where mispredicted.
-Instructions per branch is 1/Branch rate.
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
diff --git a/groups/core2/CACHE.txt b/groups/core2/CACHE.txt
index fd2af0c..1f446b8 100644
--- a/groups/core2/CACHE.txt
+++ b/groups/core2/CACHE.txt
@@ -10,26 +10,25 @@ PMC1  L1D_ALL_CACHE_REF
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Data cache misses PMC0
-Data cache request rate PMC1/FIXC0
-Data cache miss rate PMC0/FIXC0
-Data cache miss ratio PMC0/PMC1
+data cache misses PMC0
+data cache request rate PMC1/FIXC0
+data cache miss rate PMC0/FIXC0
+data cache miss ratio PMC0/PMC1
 
 LONG
 Formulas:
-Data cache request rate =  L1D_ALL_CACHE_REF / INSTR_RETIRED_ANY
-Data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
-Data cache miss ratio =  L1D_REPL / L1D_ALL_CACHE_REF
+data cache request rate =  L1D_ALL_CACHE_REF / INSTR_RETIRED_ANY
+data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
+data cache miss ratio =  L1D_REPL / L1D_ALL_CACHE_REF
 -
 This group measures the locality of your data accesses with regard to the
-L1 Cache. Data cache request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The Data cache miss rate gives a measure how often it was necessary to get
-cachelines from higher levels of the memory hierarchy. And finally 
-Data cache miss ratio tells you how many of your memory references required
-a cacheline to be loaded from a higher level. While the Data cache miss rate 
-might be given by your algorithm you should try to get Data cache miss ratio
+L1 cache. Data cache request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The data cache miss rate gives a measure how often it was necessary to get
+cache lines from higher levels of the memory hierarchy. And finally
+data cache miss ratio tells you how many of your memory references required
+a cache line to be loaded from a higher level. While the data cache miss rate
+might be given by your algorithm you should try to get data cache miss ratio
 as low as possible by increasing your cache reuse.
 
diff --git a/groups/core2/CLOCK.txt b/groups/core2/CLOCK.txt
new file mode 100644
index 0000000..4a5986f
--- /dev/null
+++ b/groups/core2/CLOCK.txt
@@ -0,0 +1,19 @@
+SHORT CPU clock information
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+
+LONG
+Formula:
+CPI = CPU_CLK_UNHALTED_CORE / INSTR_RETIRED_ANY
+-
+Most basic performance group measuring the clock frequency of the machine.
+
diff --git a/groups/core2/DATA.txt b/groups/core2/DATA.txt
index c48ad99..0f5bca5 100644
--- a/groups/core2/DATA.txt
+++ b/groups/core2/DATA.txt
@@ -12,11 +12,11 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to Store ratio = INST_RETIRED_LOADS / INST_RETIRED_STORES
+Load to store ratio = INST_RETIRED_LOADS/INST_RETIRED_STORES
 -
-This is a simple metric to determine your Load to store ratio.
+This is a simple metric to determine your load to store ratio.
 
diff --git a/groups/core2/FLOPS_DP.txt b/groups/core2/FLOPS_DP.txt
index 8e72f07..8164fd3 100644
--- a/groups/core2/FLOPS_DP.txt
+++ b/groups/core2/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -10,15 +10,14 @@ PMC1  SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFlops/s    1.0E-06*(PMC0*2.0+PMC1)/time
+MFLOP/s    1.0E-06*(PMC0*2.0+PMC1)/time
 
 LONG
 Formulas:
-DP MFlops/s = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_DOUBLE*2+SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE)/time
+MFLOP/s = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_DOUBLE*2+SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE)/time
 -
-Profiling group to measure double SSE flops. Dont forget that your code might also execute X87 flops.
+Profiling group to measure double SSE FLOPs. Don't forget that your code might also execute X87 FLOPs.
 On the number of SIMD_COMP_INST_RETIRED_PACKED_DOUBLE you can see how well your code was vectorized.
 
 
diff --git a/groups/core2/FLOPS_SP.txt b/groups/core2/FLOPS_SP.txt
index acd2df7..181be78 100644
--- a/groups/core2/FLOPS_SP.txt
+++ b/groups/core2/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -10,15 +10,14 @@ PMC1  SIMD_COMP_INST_RETIRED_SCALAR_SINGLE
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFlops/s 1.0E-06*(PMC0*4.0+PMC1)/time
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1)/time
 
 LONG
 Formulas:
-SP MFlops/s = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_SINGLE*4+SIMD_COMP_INST_RETIRED_SCALAR_SINGLE)/time
+MFLOP/s = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_SINGLE*4+SIMD_COMP_INST_RETIRED_SCALAR_SINGLE)/time
 -
-Profiling group to measure single precision SSE flops. Dont forget that your code might also execute X87 flops.
+Profiling group to measure single precision SSE FLOPs. Don't forget that your code might also execute X87 FLOPs.
 On the number of SIMD_COMP_INST_RETIRED_PACKED_SINGLE you can see how well your code was vectorized.
 
 
diff --git a/groups/core2/FLOPS_X87.txt b/groups/core2/FLOPS_X87.txt
index 052356e..d44a2fa 100644
--- a/groups/core2/FLOPS_X87.txt
+++ b/groups/core2/FLOPS_X87.txt
@@ -1,4 +1,4 @@
-SHORT X87 MFlops/s
+SHORT X87 MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -9,14 +9,13 @@ PMC0  X87_OPS_RETIRED_ANY
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-X87 MFlops/s  1.0E-06*PMC0/time
+X87 MFLOP/s  1.0E-06*PMC0/time
 
 LONG
 Formulas:
-X87 MFlops/s = 1.0E-06*X87_OPS_RETIRED_ANY/time
+X87 MFLOP/s = 1.0E-06*X87_OPS_RETIRED_ANY/time
 -
-Profiling group to measure X87 flops. Note that also non computational operations
+Profiling group to measure X87 FLOPs. Note that also non computational operations
 are measured by this event.
 
diff --git a/groups/core2/L2.txt b/groups/core2/L2.txt
index 88c75c5..d8cbe0d 100644
--- a/groups/core2/L2.txt
+++ b/groups/core2/L2.txt
@@ -10,23 +10,26 @@ PMC1  L1D_M_EVICT
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L2 load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 evict [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
 L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
 L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
 
 LONG
 Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPL*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
 L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64.0
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64.0
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is
-computed by the number of cacheline allocated in the L1 and the 
-number of modified cachelines evicted from the L1. 
+computed by the number of cache line allocated in the L1 and the
+number of modified cache lines evicted from the L1.
 Note that this bandwidth also includes data transfers due to a
 write allocate load on a store miss in L1.
 
diff --git a/groups/core2/L2CACHE.txt b/groups/core2/L2CACHE.txt
index 34c607a..d3b8776 100644
--- a/groups/core2/L2CACHE.txt
+++ b/groups/core2/L2CACHE.txt
@@ -23,13 +23,12 @@ L2 miss rate  = L2_RQSTS_SELF_I_STATE / INSTR_RETIRED_ANY
 L2 miss ratio = L2_RQSTS_SELF_I_STATE / L2_RQSTS_THIS_CORE_ALL_MESI
 -
 This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
 The L2 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
 
 
diff --git a/groups/core2/MEM.txt b/groups/core2/MEM.txt
index b205dc4..f6522ba 100644
--- a/groups/core2/MEM.txt
+++ b/groups/core2/MEM.txt
@@ -5,18 +5,19 @@ FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  BUS_TRANS_MEM_THIS_CORE_THIS_A
+PMC1  BUS_TRANS_WB_THIS_CORE_ALL_A
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Memory bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-Memory data volume [GBytes] 1.0E-09*PMC0*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
 
 LONG
 Formulas:
 Memory bandwidth [MBytes/s] = 1.0E-06*BUS_TRANS_MEM_THIS_CORE_THIS_A*64/time
-Memory data volume [GBytes] 1.0E-09*BUS_TRANS_MEM_THIS_CORE_THIS_A*64.0
+Memory data volume [GBytes] = 1.0E-09*BUS_TRANS_MEM_THIS_CORE_THIS_A*64.0
 -
-Profiling group to measure memory bandwidth drawn by this core. 
+Profiling group to measure memory bandwidth drawn by this core.
diff --git a/groups/core2/TLB.txt b/groups/core2/TLB.txt
index d536d88..80742f4 100644
--- a/groups/core2/TLB.txt
+++ b/groups/core2/TLB.txt
@@ -10,7 +10,6 @@ PMC1  L1D_ALL_CACHE_REF
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 L1 DTLB request rate    PMC1/FIXC0
 DTLB miss rate    PMC0/FIXC0
@@ -22,9 +21,9 @@ L1 DTLB request rate =  L1D_ALL_CACHE_REF / INSTR_RETIRED_ANY
 DTLB miss rate  = DTLB_MISSES_ANY / INSTR_RETIRED_ANY
 L1 DTLB miss ratio  =  DTLB_MISSES_ANY / L1D_ALL_CACHE_REF
 -
-L1 DTLB request  rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The DTLB miss  rate gives a measure how often a TLB miss occured
+L1 DTLB request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The DTLB miss  rate gives a measure how often a TLB miss occurred
 per instruction. And finally L1 DTLB  miss ratio tells you how many
-of your memory references required caused a TLB miss in average.
+of your memory references caused a TLB miss on average.
 
diff --git a/groups/core2/UOPS.txt b/groups/core2/UOPS.txt
new file mode 100644
index 0000000..8167416
--- /dev/null
+++ b/groups/core2/UOPS.txt
@@ -0,0 +1,22 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  RS_UOPS_DISPATCHED_ALL
+PMC1  UOPS_RETIRED_ANY
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Executed UOPs PMC0
+Retired UOPs PMC1
+
+LONG
+Performance group measures the executed and retired micro ops. The difference
+between executed and retired uOPs are the speculatively executed uOPs.
diff --git a/groups/core2/UOPS_RETIRE.txt b/groups/core2/UOPS_RETIRE.txt
new file mode 100644
index 0000000..be0bf73
--- /dev/null
+++ b/groups/core2/UOPS_RETIRE.txt
@@ -0,0 +1,25 @@
+SHORT UOPs retirement
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_RETIRED_USED_CYCLES
+PMC1  UOPS_RETIRED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio PMC0/FIXC1
+Unused cycles ratio PMC1/FIXC1
+
+
+LONG
+Formulas:
+Used cycles ratio = UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio = UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+-
+This performance group returns the ratios of used and unused CPU cycles. Here
+unused cycles are cycles where no operation is performed due to some stall.
diff --git a/groups/haswell/BRANCH.txt b/groups/haswell/BRANCH.txt
index cbaf834..b8d41b2 100644
--- a/groups/haswell/BRANCH.txt
+++ b/groups/haswell/BRANCH.txt
@@ -19,13 +19,13 @@ Instructions per branch  FIXC0/PMC0
 
 LONG
 Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
 -
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
 into relation what ratio of all branch instruction where mispredicted.
-Instructions per branch is 1/Branch rate.
+Instructions per branch is 1/branch rate.
 
diff --git a/groups/haswell/CACHES.txt b/groups/haswell/CACHES.txt
new file mode 100644
index 0000000..d0d6f33
--- /dev/null
+++ b/groups/haswell/CACHES.txt
@@ -0,0 +1,71 @@
+SHORT Cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L1D_M_EVICT
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_TRANS_L2_WB
+CBOX0C0 CACHE_LOOKUP_READ_MESI
+CBOX1C0 CACHE_LOOKUP_READ_MESI
+CBOX2C0 CACHE_LOOKUP_READ_MESI
+CBOX3C0 CACHE_LOOKUP_READ_MESI
+CBOX0C1 CACHE_LOOKUP_WRITE_MESI
+CBOX1C1 CACHE_LOOKUP_WRITE_MESI
+CBOX2C1 CACHE_LOOKUP_WRITE_MESI
+CBOX3C1 CACHE_LOOKUP_WRITE_MESI
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L3 to L2 load bandwidth [MBytes/s]  1.0E-06*PMC2*64.0/time
+L3 to L2 load data volume [GBytes]  1.0E-09*PMC2*64.0
+L2 to L3 evict bandwidth [MBytes/s]  1.0E-06*PMC3*64.0/time
+L2 to L3 evict data volume [GBytes]  1.0E-09*PMC3*64.0
+L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
+System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0)*64.0/time
+System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0)*64.0
+L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1)*64/time
+L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1)*64
+L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1)*64.0/time
+L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1)*64.0
+
+LONG
+Formulas:
+L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
+L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64
+L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
+L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64
+L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
+L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time
+L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64
+L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time
+L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64
+L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
+System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(CACHE_LOOKUP_READ_MESI))*64/time
+System to L3 data volume [GBytes] = 1.0E-09*(SUM(CACHE_LOOKUP_READ_MESI))*64
+L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(CACHE_LOOKUP_WRITE_MESI))*64/time
+L3 to system data volume [GBytes] = 1.0E-09*(SUM(CACHE_LOOKUP_WRITE_MESI))*64
+L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(CACHE_LOOKUP_READ_MESI)+SUM(CACHE_LOOKUP_WRITE_MESI))*64/time
+L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(CACHE_LOOKUP_READ_MESI)+SUM(CACHE_LOOKUP_WRITE_MESI))*64
+-
+Group to measure cache transfers between L1 and Memory. Please notice that the
+L3 to/from system metrics contain any traffic to the system (memory,
+Intel QPI, etc.) but don't seem to handle anything because commonly memory read
+bandwidth and L3 to L2 bandwidth are higher than the memory to L3 bandwidth.
+
diff --git a/groups/haswell/CLOCK.txt b/groups/haswell/CLOCK.txt
index 276cf16..a2556b4 100644
--- a/groups/haswell/CLOCK.txt
+++ b/groups/haswell/CLOCK.txt
@@ -7,7 +7,7 @@ FIXC2 CPU_CLK_UNHALTED_REF
 PWR0  PWR_PKG_ENERGY
 
 METRICS
-Runtime (RDTSC) [s] time 
+Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
diff --git a/groups/haswell/DATA.txt b/groups/haswell/DATA.txt
index 5f04a23..17948d4 100644
--- a/groups/haswell/DATA.txt
+++ b/groups/haswell/DATA.txt
@@ -4,19 +4,24 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  MEM_UOP_RETIRED_LOADS
-PMC1  MEM_UOP_RETIRED_STORES
+PMC0  MEM_UOPS_RETIRED_LOADS
+PMC1  MEM_UOPS_RETIRED_STORES
+PMC2  UOPS_RETIRED_ALL
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
+Load ratio PMC0/PMC2
+Store ratio PMC1/PMC2
 
 LONG
 Formulas:
-Load to Store ratio = MEM_UOP_RETIRED_LOADS / MEM_UOP_RETIRED_STORES
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+Load ratio = MEM_UOPS_RETIRED_LOADS/UOPS_RETIRED_ALL
+Store ratio = MEM_UOPS_RETIRED_STORES/UOPS_RETIRED_ALL
 -
 This is a metric to determine your load to store ratio.
 
diff --git a/groups/haswell/ENERGY.txt b/groups/haswell/ENERGY.txt
index 15b1c45..e8bed3a 100644
--- a/groups/haswell/ENERGY.txt
+++ b/groups/haswell/ENERGY.txt
@@ -7,10 +7,13 @@ FIXC2 CPU_CLK_UNHALTED_REF
 TMP0  TEMP_CORE
 PWR0  PWR_PKG_ENERGY
 PWR1  PWR_PP0_ENERGY
+PWR2  PWR_PP1_ENERGY
 PWR3  PWR_DRAM_ENERGY
 
+
+
 METRICS
-Runtime (RDTSC) [s] time 
+Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
@@ -19,16 +22,18 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 Energy PP0 [J]  PWR1
 Power PP0 [W] PWR1/time
+Energy PP1 [J]  PWR2
+Power PP1 [W] PWR2/time
 Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 
 LONG
 Formula:
-Power =  PWR_PKG_ENERGY / time
+Power = PWR_PKG_ENERGY / time
 Power PP0 = PWR_PP0_ENERGY / time
+Power PP1 = PWR_PP1_ENERGY / time
 Power DRAM = PWR_DRAM_ENERGY / time
 -
 Haswell implements the new RAPL interface. This interface enables to
 monitor the consumed energy on the package (socket) and DRAM level.
-The PP0 energy domain is often refered to an integrated GPU.
 
diff --git a/groups/haswell/FALSE_SHARE.txt b/groups/haswell/FALSE_SHARE.txt
new file mode 100644
index 0000000..43ea23b
--- /dev/null
+++ b/groups/haswell/FALSE_SHARE.txt
@@ -0,0 +1,28 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM
+PMC2 MEM_LOAD_UOPS_RETIRED_ALL_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Local LLC hit with false sharing [MByte] 1.E-06*PMC0*64
+Local LLC hit with false sharing rate PMC0/PMC2
+
+LONG
+Formula:
+Formula:
+Local LLC hit with false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64
+Local LLC hit with false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL_ALL
+-
+False-sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false-sharing.
+The false-sharing rate uses all memory loads as reference.
+Please keep in mind that the MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM event may
+undercount by as much as 40% (Errata HSD25).
diff --git a/groups/haswell/FLOPS_AVX.txt b/groups/haswell/FLOPS_AVX.txt
new file mode 100644
index 0000000..9efdd1d
--- /dev/null
+++ b/groups/haswell/FLOPS_AVX.txt
@@ -0,0 +1,28 @@
+SHORT Packed AVX MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0   AVX_INSTS_CALC
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Packed SP MFLOP/s  1.0E-06*(PMC0*8.0)/time
+Packed DP MFLOP/s  1.0E-06*(PMC0*4.0)/time
+
+LONG
+Formula:
+Packed SP MFLOP/s = 1.0E-06*(AVX_INSTS_CALC*8)/runtime
+Packed DP MFLOP/s = 1.0E-06*(AVX_INSTS_CALC*4)/runtime
+-
+Packed 32b AVX FLOP/s rates. Approximate counts of AVX & AVX2 256-bit instructions.
+May count non-AVX instructions that employ 256-bit operations, including (but
+not necessarily limited to) rep string instructions that use 256-bit loads and
+stores for optimized performance, XSAVE* and XRSTOR*, and operations that
+transition the x87 FPU data registers between x87 and MMX.
+Caution: The event AVX_INSTS_CALC counts the insertf128 instruction often used
+by the Intel C compilers for (unaligned) vector loads.
diff --git a/groups/haswell/ICACHE.txt b/groups/haswell/ICACHE.txt
index 6ce3ce8..f1e2335 100644
--- a/groups/haswell/ICACHE.txt
+++ b/groups/haswell/ICACHE.txt
@@ -6,6 +6,8 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  ICACHE_ACCESSES
 PMC1  ICACHE_MISSES
+PMC2  ICACHE_IFETCH_STALL
+PMC3  ILD_STALL_IQ_FULL
 
 METRICS
 Runtime (RDTSC) [s] time
@@ -15,11 +17,17 @@ CPI  FIXC1/FIXC0
 L1I request rate PMC0/FIXC0
 L1I miss rate PMC1/FIXC0
 L1I miss ratio PMC1/PMC0
+L1I stalls PMC2
+L1I stall rate PMC2/FIXC0
+L1I queue full stalls PMC3
+L1I queue full stall rate PMC3/FIXC0
 
 LONG
 Formulas:
-L2 request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
-L2 miss rate  = ICACHE_MISSES / INSTR_RETIRED_ANY
-L2 miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I stalls = ICACHE_IFETCH_STALL
+L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY
 -
 This group measures some L1 instruction cache metrics.
diff --git a/groups/haswell/L2.txt b/groups/haswell/L2.txt
index 47d8ec7..60c7f79 100644
--- a/groups/haswell/L2.txt
+++ b/groups/haswell/L2.txt
@@ -6,6 +6,7 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L1D_REPLACEMENT
 PMC1  L2_TRANS_L1D_WB
+PMC2  ICACHE_MISSES
 
 METRICS
 Runtime (RDTSC) [s] time
@@ -21,13 +22,16 @@ L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
 
 LONG
 Formulas:
-L2D load bandwidth [MBytes/s]  1.0E-06*L1D_REPLACEMENT*64.0/time
-L2D load data volume [GBytes]  1.0E-09*L1D_REPLACEMENT*64.0
-L2D evict bandwidth [MBytes/s]  1.0E-06*L2_TRANS_L1D_WB*64.0/time
-L2D evict data volume [GBytes]  1.0E-09*L2_TRANS_L1D_WB*64.0
-L2 bandwidth [MBytes/s] 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
-number of cacheline loaded from the L2 to the L1 data cache and the writebacks from
-the L1 data cache to the L2 cache.
+number of cache lines loaded from the L2 to the L1 data cache and the writebacks from
+the L1 data cache to the L2 cache. The group also outputs total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and cache lines transferred to the instruction
+cache.
diff --git a/groups/haswell/L2CACHE.txt b/groups/haswell/L2CACHE.txt
index 8186f69..9b5dd4b 100644
--- a/groups/haswell/L2CACHE.txt
+++ b/groups/haswell/L2CACHE.txt
@@ -4,7 +4,7 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  L2_RQSTS_REFERENCES
+PMC0  L2_TRANS_ALL_REQUESTS
 PMC1  L2_RQSTS_MISS
 
 METRICS
@@ -18,18 +18,17 @@ L2 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = L2_RQSTS_REFERENCES / INSTR_RETIRED_ANY
-L2 miss rate  = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_RQSTS_REFERENCES
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
 -
 This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
 The L2 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
 
 
diff --git a/groups/haswell/L3.txt b/groups/haswell/L3.txt
index 42d6e4a..f63a918 100644
--- a/groups/haswell/L3.txt
+++ b/groups/haswell/L3.txt
@@ -5,28 +5,32 @@ FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L2_LINES_IN_ALL
-PMC1  L2_LINES_OUT_DEMAND_DIRTY
+PMC1  L2_TRANS_L2_WB
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 Load [MBytes/s]  1.0E-06*PMC0*64.0/time
-L3 Evict [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
 L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
 L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
 
 LONG
 Formulas:
-L3 Load [MBytes/s]  1.0E-06*L2_LINES_IN_ALL*64/time
-L3 Evict [MBytes/s]  1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64/time
-L3 bandwidth [MBytes/s] 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64/time
-L3 data volume [GBytes] 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
 -
 Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L2 and the number of modified cachelines
-evicted from the L2. This group also outputs data volume transfered between the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs data volume transferred between the
 L3 and  measured cores L2 caches. Note that this bandwidth also includes data
 transfers due to a write allocate load on a store miss in L2.
 
diff --git a/groups/haswell/L3CACHE.txt b/groups/haswell/L3CACHE.txt
index d4fd89e..f863daa 100644
--- a/groups/haswell/L3CACHE.txt
+++ b/groups/haswell/L3CACHE.txt
@@ -6,30 +6,30 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  MEM_LOAD_UOPS_RETIRED_L3_ALL
 PMC1  MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2  UOPS_RETIRED_ALL
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 request rate (PMC0)/FIXC0
-L3 miss rate PMC1/FIXC0
+L3 request rate PMC0/PMC2
+L3 miss rate PMC1/PMC2
 L3 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL / INSTR_RETIRED_ANY
-L3 miss rate  = MEM_LOAD_UOPS_RETIRED_L3_MISS / INSTR_RETIRED_ANY
-L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS / MEM_LOAD_UOPS_RETIRED_L3_ALL
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
 -
 This group measures the locality of your data accesses with regard to the
-L3 Cache. L3 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
 The L3 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L3 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
 
 
diff --git a/groups/haswell/RECOVERY.txt b/groups/haswell/RECOVERY.txt
new file mode 100644
index 0000000..7928ee4
--- /dev/null
+++ b/groups/haswell/RECOVERY.txt
@@ -0,0 +1,22 @@
+SHORT  Recovery duration
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  INT_MISC_RECOVERY_CYCLES
+PMC1  INT_MISC_RECOVERY_COUNT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Avg. recovery duration PMC0/PMC1
+
+LONG
+Formulas:
+Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT
+-
+This group measures the duration of recoveries after SSE exception, memory
+disambiguation, etc...
diff --git a/groups/haswell/TLB_DATA.txt b/groups/haswell/TLB_DATA.txt
index 2f59772..8d94e05 100644
--- a/groups/haswell/TLB_DATA.txt
+++ b/groups/haswell/TLB_DATA.txt
@@ -1,4 +1,4 @@
-SHORT  L1 Data TLB miss rate/ratio
+SHORT  L2 data TLB miss rate/ratio
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -16,20 +16,20 @@ Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 L1 DTLB load misses     PMC0
 L1 DTLB load miss rate  PMC0/FIXC0
-L1 DTLB load miss duration PMC2
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
 L1 DTLB store misses     PMC1
 L1 DTLB store miss rate  PMC1/FIXC0
-L1 DTLB store miss duration PMC3
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
 
 LONG
 Formulas:
-L1 DTLB load misses     DTLB_LOAD_MISSES_CAUSES_A_WALK
-L1 DTLB load miss rate  DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 DTLB load miss duration DTLB_LOAD_MISSES_WALK_DURATION
-L1 DTLB store misses     DTLB_STORE_MISSES_CAUSES_A_WALK
-L1 DTLB store miss rate  DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 DTLB store miss duration DTLB_STORE_MISSES_WALK_DURATION
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
 -
-The DTLB load and store miss rates gives a measure how often a TLB miss occured
+The DTLB load and store miss rates gives a measure how often a TLB miss occurred
 per instruction. The duration measures the time in cycles how long a walk did take.
 
diff --git a/groups/haswell/TLB_INSTR.txt b/groups/haswell/TLB_INSTR.txt
index f95f78a..235d977 100644
--- a/groups/haswell/TLB_INSTR.txt
+++ b/groups/haswell/TLB_INSTR.txt
@@ -14,15 +14,15 @@ Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 L1 ITLB misses     PMC0
 L1 ITLB miss rate  PMC0/FIXC0
-L1 ITLB miss duration PMC1
+L1 ITLB miss duration [Cyc] PMC1/PMC0
 
 
 LONG
 Formulas:
-L1 ITLB misses     ITLB_MISSES_CAUSES_A_WALK
-L1 ITLB miss rate  ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 ITLB miss duration ITLB_MISSES_WALK_DURATION
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
 -
-The ITLB miss rates gives a measure how often a TLB miss occured
+The ITLB miss rates gives a measure how often a TLB miss occurred
 per instruction. The duration measures the time in cycles how long a walk did take.
 
diff --git a/groups/haswell/UOPS.txt b/groups/haswell/UOPS.txt
new file mode 100644
index 0000000..178aec5
--- /dev/null
+++ b/groups/haswell/UOPS.txt
@@ -0,0 +1,35 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_ISSUED_ANY
+PMC1  UOPS_EXECUTED_THREAD
+PMC2  UOPS_RETIRED_ALL
+PMC3  UOPS_ISSUED_FLAGS_MERGE
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Issued UOPs PMC0
+Merged UOPs PMC3
+Executed UOPs PMC1
+Retired UOPs PMC2
+
+LONG
+Formula:
+Issued UOPs = UOPS_ISSUED_ANY
+Merged UOPs = UOPS_ISSUED_FLAGS_MERGE
+Executed UOPs = UOPS_EXECUTED_THREAD
+Retired UOPs = UOPS_RETIRED_ALL
+-
+This group returns information about the instruction pipeline. It measures the
+issued, executed and retired uOPs and returns the number of uOPs which were issued
+but not executed as well as the number of uOPs which were executed but never retired.
+The executed but not retired uOPs commonly come from speculatively executed branches.
+
diff --git a/groups/haswell/UOPS_EXEC.txt b/groups/haswell/UOPS_EXEC.txt
new file mode 100644
index 0000000..7042df7
--- /dev/null
+++ b/groups/haswell/UOPS_EXEC.txt
@@ -0,0 +1,31 @@
+SHORT UOPs execution
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_EXECUTED_USED_CYCLES
+PMC1  UOPS_EXECUTED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_EXECUTED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the execution stage in the pipeline. Used cycles are all cycles where uOPs are
+executed while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/haswell/UOPS_ISSUE.txt b/groups/haswell/UOPS_ISSUE.txt
new file mode 100644
index 0000000..9aac923
--- /dev/null
+++ b/groups/haswell/UOPS_ISSUE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs issuing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_ISSUED_USED_CYCLES
+PMC1  UOPS_ISSUED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_ISSUED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the issue stage in the pipeline. Used cycles are all cycles where uOPs are
+issued while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/haswell/UOPS_RETIRE.txt b/groups/haswell/UOPS_RETIRE.txt
new file mode 100644
index 0000000..0f37585
--- /dev/null
+++ b/groups/haswell/UOPS_RETIRE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs retirement
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_RETIRED_USED_CYCLES
+PMC1  UOPS_RETIRED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_RETIRED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the retirement stage in the pipeline (re-order buffer). Used cycles are all
+cycles where uOPs are retired while unused cycles refer to pipeline stalls.
+Moreover, the group calculates the average stall duration in cycles.
diff --git a/groups/haswellEP/BRANCH.txt b/groups/haswellEP/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/groups/haswellEP/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  BR_INST_RETIRED_ALL_BRANCHES
+PMC1  BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Branch rate   PMC0/FIXC0
+Branch misprediction rate  PMC1/FIXC0
+Branch misprediction ratio  PMC1/PMC0
+Instructions per branch  FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
+
diff --git a/groups/haswellEP/CACHES.txt b/groups/haswellEP/CACHES.txt
new file mode 100644
index 0000000..3c13a52
--- /dev/null
+++ b/groups/haswellEP/CACHES.txt
@@ -0,0 +1,123 @@
+SHORT Cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L1D_M_EVICT
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_TRANS_L2_WB
+CBOX0C0 LLC_LOOKUP_DATA_READ
+CBOX1C0 LLC_LOOKUP_DATA_READ
+CBOX2C0 LLC_LOOKUP_DATA_READ
+CBOX3C0 LLC_LOOKUP_DATA_READ
+CBOX4C0 LLC_LOOKUP_DATA_READ
+CBOX5C0 LLC_LOOKUP_DATA_READ
+CBOX6C0 LLC_LOOKUP_DATA_READ
+CBOX7C0 LLC_LOOKUP_DATA_READ
+CBOX8C0 LLC_LOOKUP_DATA_READ
+CBOX9C0 LLC_LOOKUP_DATA_READ
+CBOX10C0 LLC_LOOKUP_DATA_READ
+CBOX11C0 LLC_LOOKUP_DATA_READ
+CBOX12C0 LLC_LOOKUP_DATA_READ
+CBOX13C0 LLC_LOOKUP_DATA_READ
+CBOX14C0 LLC_LOOKUP_DATA_READ
+CBOX15C0 LLC_LOOKUP_DATA_READ
+CBOX16C0 LLC_LOOKUP_DATA_READ
+CBOX17C0 LLC_LOOKUP_DATA_READ
+CBOX0C1 LLC_VICTIMS_M
+CBOX1C1 LLC_VICTIMS_M
+CBOX2C1 LLC_VICTIMS_M
+CBOX3C1 LLC_VICTIMS_M
+CBOX4C1 LLC_VICTIMS_M
+CBOX5C1 LLC_VICTIMS_M
+CBOX6C1 LLC_VICTIMS_M
+CBOX7C1 LLC_VICTIMS_M
+CBOX8C1 LLC_VICTIMS_M
+CBOX9C1 LLC_VICTIMS_M
+CBOX10C1 LLC_VICTIMS_M
+CBOX11C1 LLC_VICTIMS_M
+CBOX12C1 LLC_VICTIMS_M
+CBOX13C1 LLC_VICTIMS_M
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L3 to L2 load bandwidth [MBytes/s]  1.0E-06*PMC2*64.0/time
+L3 to L2 load data volume [GBytes]  1.0E-09*PMC2*64.0
+L2 to L3 evict bandwidth [MBytes/s]  1.0E-06*PMC3*64.0/time
+L2 to L3 evict data volume [GBytes]  1.0E-09*PMC3*64.0
+L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
+System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0)*64.0/time
+System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0)*64.0
+L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64/time
+L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64
+L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64.0/time
+L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64.0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
+L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64
+L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
+L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64
+L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
+L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time
+L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64
+L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time
+L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64
+L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
+System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ))*64/time
+System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ))*64
+L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M))*64/time
+L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M))*64
+L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64/time
+L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
+-
+Group to measure cache transfers between L1 and memory. Please note that the
+L3 to/from system metrics should contain any traffic to the system (memory,
+Intel QPI, etc.) but don't seem to capture everything, because commonly the memory
+read bandwidth and the L3 to L2 bandwidth are higher than the memory to L3 bandwidth.
+
diff --git a/groups/haswellEP/CBOX.txt b/groups/haswellEP/CBOX.txt
new file mode 100644
index 0000000..d9cc13c
--- /dev/null
+++ b/groups/haswellEP/CBOX.txt
@@ -0,0 +1,61 @@
+SHORT CBOX related data and metrics
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+CBOX0C0 LLC_VICTIMS_M
+CBOX1C0 LLC_VICTIMS_M
+CBOX2C0 LLC_VICTIMS_M
+CBOX3C0 LLC_VICTIMS_M
+CBOX4C0 LLC_VICTIMS_M
+CBOX5C0 LLC_VICTIMS_M
+CBOX6C0 LLC_VICTIMS_M
+CBOX7C0 LLC_VICTIMS_M
+CBOX8C0 LLC_VICTIMS_M
+CBOX9C0 LLC_VICTIMS_M
+CBOX10C0 LLC_VICTIMS_M
+CBOX11C0 LLC_VICTIMS_M
+CBOX12C0 LLC_VICTIMS_M
+CBOX13C0 LLC_VICTIMS_M
+CBOX14C0 LLC_VICTIMS_M
+CBOX15C0 LLC_VICTIMS_M
+CBOX16C0 LLC_VICTIMS_M
+CBOX17C0 LLC_VICTIMS_M
+CBOX0C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX1C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX2C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX3C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX4C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX5C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX6C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX7C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX8C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX9C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX10C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX11C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX12C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX13C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX14C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX15C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX16C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX17C1:STATE=0x1 LLC_LOOKUP_ANY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+LLC misses per instruction (CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0)/FIXC0
+LLC data written to MEM [MBytes] 1E-6*(CBOX0C1:STATE=0x1+CBOX1C1:STATE=0x1+CBOX2C1:STATE=0x1+CBOX3C1:STATE=0x1+CBOX4C1:STATE=0x1+CBOX5C1:STATE=0x1+CBOX6C1:STATE=0x1+CBOX7C1:STATE=0x1+CBOX8C1:STATE=0x1+CBOX9C1:STATE=0x1+CBOX10C1:STATE=0x1+CBOX11C1:STATE=0x1+CBOX12C1:STATE=0x1+CBOX13C1:STATE=0x1+CBOX14C1:STATE=0x1+CBOX15C1:STATE=0x1+CBOX16C1:STATE=0x1+CBOX17C1:STATE=0x1)*64
+
+
+LONG
+Formulas:
+LLC misses per instruction = sum(LLC_VICTIMS_M)/INSTR_RETIRED_ANY
+LLC data written to MEM [MBytes] = sum(LLC_LOOKUP_ANY)*64*1E-6
+-
+The CBOXes mediate the traffic from the L2 cache to the segmented L3 cache. Each
+CBOX is responsible for one segment (2.5 MByte). The boxes maintain the coherence between all
+CPU cores of the socket. Depending on the CPU core count, some CBOXes are not attached
+to a 2.5 MByte slice but are still active and track the traffic.
diff --git a/groups/haswellEP/CLOCK.txt b/groups/haswellEP/CLOCK.txt
new file mode 100644
index 0000000..a2556b4
--- /dev/null
+++ b/groups/haswellEP/CLOCK.txt
@@ -0,0 +1,23 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+
+LONG
+Formula:
+Power =  PWR_PKG_ENERGY / time
+-
+Haswell implements the new RAPL interface. This interface allows monitoring
+the consumed energy at the package (socket) level.
+
diff --git a/groups/haswellEP/DATA.txt b/groups/haswellEP/DATA.txt
new file mode 100644
index 0000000..967cbad
--- /dev/null
+++ b/groups/haswellEP/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_UOPS_RETIRED_LOADS
+PMC1  MEM_UOPS_RETIRED_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/haswellEP/ENERGY.txt b/groups/haswellEP/ENERGY.txt
new file mode 100644
index 0000000..6c26b30
--- /dev/null
+++ b/groups/haswellEP/ENERGY.txt
@@ -0,0 +1,35 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0  TEMP_CORE
+PWR0  PWR_PKG_ENERGY
+PWR1  PWR_PP0_ENERGY
+PWR3  PWR_DRAM_ENERGY
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Temperature [C]  TMP0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy PP0 [J]  PWR1
+Power PP0 [W] PWR1/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+Haswell implements the new RAPL interface. This interface allows monitoring
+the consumed energy at the package (socket) and DRAM level.
+
diff --git a/groups/haswellEP/FALSE_SHARE.txt b/groups/haswellEP/FALSE_SHARE.txt
new file mode 100644
index 0000000..ce1a8bb
--- /dev/null
+++ b/groups/haswellEP/FALSE_SHARE.txt
@@ -0,0 +1,34 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM
+PMC1 MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM
+PMC2 MEM_LOAD_UOPS_RETIRED_ALL_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Local LLC hit with false sharing [MByte] 1.E-06*PMC0*64
+Local LLC hit with false sharing rate PMC0/PMC2
+Remote LLC false sharing [MByte] 1.E-06*PMC1*64
+Remote LLC false sharing rate PMC1/PMC2
+
+LONG
+Formula:
+Local LLC hit with false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64
+Local LLC hit with false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL_ALL
+Remote LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM*64
+Remote LLC false sharing rate = MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM/MEM_LOAD_UOPS_RETIRED_ALL_ALL
+-
+False-sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false-sharing.
+The false-sharing rate uses all memory loads as reference.
+For systems with multiple CPU sockets, this performance group also measures the
+false-sharing of cache lines over socket boundaries.
+Please keep in mind that the MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM event may
+undercount by as much as 40% (Errata HSW150).
diff --git a/groups/haswellEP/FLOPS_AVX.txt b/groups/haswellEP/FLOPS_AVX.txt
new file mode 100644
index 0000000..9efdd1d
--- /dev/null
+++ b/groups/haswellEP/FLOPS_AVX.txt
@@ -0,0 +1,28 @@
+SHORT Packed AVX MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0   AVX_INSTS_CALC
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Packed SP MFLOP/s  1.0E-06*(PMC0*8.0)/time
+Packed DP MFLOP/s  1.0E-06*(PMC0*4.0)/time
+
+LONG
+Formula:
+Packed SP MFLOP/s = 1.0E-06*(AVX_INSTS_CALC*8)/runtime
+Packed DP MFLOP/s = 1.0E-06*(AVX_INSTS_CALC*4)/runtime
+-
+Packed 32b AVX FLOP/s rates. Approximate counts of AVX & AVX2 256-bit instructions.
+May count non-AVX instructions that employ 256-bit operations, including (but
+not necessarily limited to) rep string instructions that use 256-bit loads and
+stores for optimized performance, XSAVE* and XRSTOR*, and operations that
+transition the x87 FPU data registers between x87 and MMX.
+Caution: The event AVX_INSTS_CALC counts the insertf128 instruction often used
+by the Intel C compilers for (unaligned) vector loads.
diff --git a/groups/haswellEP/HA.txt b/groups/haswellEP/HA.txt
new file mode 100644
index 0000000..1e5a700
--- /dev/null
+++ b/groups/haswellEP/HA.txt
@@ -0,0 +1,40 @@
+SHORT Main memory bandwidth in MBytes/s seen from Home agent
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+BBOX0C0 IMC_READS_NORMAL
+BBOX0C1 BYPASS_IMC_TAKEN
+BBOX0C2 IMC_WRITES_ALL
+BBOX1C0 IMC_READS_NORMAL
+BBOX1C1 BYPASS_IMC_TAKEN
+BBOX1C2 IMC_WRITES_ALL
+
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(BBOX0C2+BBOX1C2)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(BBOX0C2+BBOX1C2)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_WRITES_ALL))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(IMC_WRITES_ALL))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0
+-
+This group derives the same metrics as the MEM group but uses the events of the
+Home Agent, a central unit that is responsible for the protocol side of memory
+interactions.
diff --git a/groups/haswellEP/ICACHE.txt b/groups/haswellEP/ICACHE.txt
new file mode 100644
index 0000000..f1e2335
--- /dev/null
+++ b/groups/haswellEP/ICACHE.txt
@@ -0,0 +1,33 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ICACHE_ACCESSES
+PMC1  ICACHE_MISSES
+PMC2  ICACHE_IFETCH_STALL
+PMC3  ILD_STALL_IQ_FULL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+L1I stalls PMC2
+L1I stall rate PMC2/FIXC0
+L1I queue full stalls PMC3
+L1I queue full stall rate PMC3/FIXC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I stalls = ICACHE_IFETCH_STALL
+L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/haswellEP/L2.txt b/groups/haswellEP/L2.txt
new file mode 100644
index 0000000..60c7f79
--- /dev/null
+++ b/groups/haswellEP/L2.txt
@@ -0,0 +1,37 @@
+SHORT  L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
+number of cache lines loaded from the L2 to the L1 data cache and the writebacks from
+the L1 data cache to the L2 cache. The group also outputs total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and cache lines transferred to the instruction
+cache.
diff --git a/groups/haswellEP/L2CACHE.txt b/groups/haswellEP/L2CACHE.txt
new file mode 100644
index 0000000..9b5dd4b
--- /dev/null
+++ b/groups/haswellEP/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_TRANS_ALL_REQUESTS
+PMC1  L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/haswellEP/L3.txt b/groups/haswellEP/L3.txt
new file mode 100644
index 0000000..0109db3
--- /dev/null
+++ b/groups/haswellEP/L3.txt
@@ -0,0 +1,36 @@
+SHORT  L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_LINES_IN_ALL
+PMC1  L2_TRANS_L2_WB
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64.0/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64.0
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs the data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
+transfers due to a write allocate load on a store miss in L2.
+
diff --git a/groups/haswellEP/L3CACHE.txt b/groups/haswellEP/L3CACHE.txt
new file mode 100644
index 0000000..f863daa
--- /dev/null
+++ b/groups/haswellEP/L3CACHE.txt
@@ -0,0 +1,35 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1  MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2  UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 request rate PMC0/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L3 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/haswellEP/MEM.txt b/groups/haswellEP/MEM.txt
new file mode 100644
index 0000000..2a17a2c
--- /dev/null
+++ b/groups/haswellEP/MEM.txt
@@ -0,0 +1,52 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+-
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events it is only possible to measure on a
+per-socket basis. Some of the counters may not be available on your system.
+Also outputs total data volume transferred from main memory.
+The same metrics are provided by the HA group.
+
diff --git a/groups/haswellEP/NUMA.txt b/groups/haswellEP/NUMA.txt
new file mode 100644
index 0000000..0c1b8fb
--- /dev/null
+++ b/groups/haswellEP/NUMA.txt
@@ -0,0 +1,33 @@
+SHORT Local and remote memory accesses
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 OFFCORE_RESPONSE_0_LOCAL_DRAM
+PMC1 OFFCORE_RESPONSE_1_REMOTE_DRAM
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Local DRAM data volume [GByte]  1.E-09*PMC0*64
+Local DRAM bandwidth [MByte/s]  1.E-06*(PMC0*64)/time
+Remote DRAM data volume [GByte]  1.E-09*PMC1*64
+Remote DRAM bandwidth [MByte/s]  1.E-06*(PMC1*64)/time
+Memory data volume [GByte]  1.E-09*(PMC0+PMC1)*64
+Memory bandwidth [MByte/s]  1.E-06*((PMC0+PMC1)*64)/time
+
+LONG
+Formula:
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64
+Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time
+Remote DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_1_REMOTE_DRAM*64
+Remote DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_1_REMOTE_DRAM*64)/time
+Memory data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64
+Memory bandwidth [MByte/s] = 1.E-06*((OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64)/time
+--
+This performance group measures the data traffic of CPU cores to local and remote
+memory.
diff --git a/groups/haswellEP/QPI.txt b/groups/haswellEP/QPI.txt
new file mode 100644
index 0000000..4ad0cf8
--- /dev/null
+++ b/groups/haswellEP/QPI.txt
@@ -0,0 +1,49 @@
+SHORT QPI Link Layer data
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+QBOX0C0 RXL_FLITS_G0_DATA
+QBOX1C0 RXL_FLITS_G0_DATA
+QBOX0C1 RXL_FLITS_G0_NON_DATA
+QBOX1C1 RXL_FLITS_G0_NON_DATA
+QBOX0C2 TXL_FLITS_G0_DATA
+QBOX1C2 TXL_FLITS_G0_DATA
+QBOX0C3 TXL_FLITS_G0_NON_DATA
+QBOX1C3 TXL_FLITS_G0_NON_DATA
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+QPI send data volume [GByte] 1.E-09*(QBOX0C2+QBOX1C2)*8
+QPI send data bandwidth [MByte/s] 1.E-06*(QBOX0C2+QBOX1C2)*8/time
+QPI send link volume [GByte] 1.E-09*(QBOX0C2+QBOX1C2+QBOX0C3+QBOX1C3)*8
+QPI send link bandwidth [MByte/s] 1.E-06*(QBOX0C2+QBOX1C2+QBOX0C3+QBOX1C3)*8/time
+QPI receive data volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0)*8
+QPI receive data bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0)*8/time
+QPI receive link volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0+QBOX0C1+QBOX1C1)*8
+QPI receive link bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0+QBOX0C1+QBOX1C1)*8/time
+QPI total transfer volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0+QBOX0C2+QBOX1C2+QBOX0C1+QBOX1C1+QBOX0C3+QBOX1C3)*8
+QPI total bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0+QBOX0C2+QBOX1C2+QBOX0C1+QBOX1C1+QBOX0C3+QBOX1C3)*8/time
+
+LONG
+Formula:
+QPI send data volume [GByte] = 1.E-09*(sum(TXL_FLITS_G0_DATA)*8)
+QPI send data bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8)/runtime
+QPI send link volume [GByte] = 1.E-09*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)
+QPI send link bandwidth [MByte/s] = 1.E-06*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)/runtime
+QPI receive data volume [GByte] = 1.E-09*(sum(RXL_FLITS_G0_DATA)*8)
+QPI receive data bandwidth [MByte/s] = 1.E-06*(sum(RXL_FLITS_G0_DATA)*8)/runtime
+QPI receive link volume [GByte] = 1.E-09*((sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8)
+QPI receive link bandwidth [MByte/s] = 1.E-06*((sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8)/runtime
+QPI total transfer volume [GByte] = 1.E-09*(sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA)+sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8
+QPI total bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA)+sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8/time
+--
+The Intel QPI Link Layer is responsible for packetizing requests from the caching agent (CBOXes)
+on the way out to the system interface. For Haswell EP systems, the Link Layer and the
+Ring interface are separated. The QPI link volume contains header, data and trailer while the
+QPI data volume counts only the data flits.
diff --git a/groups/haswellEP/RECOVERY.txt b/groups/haswellEP/RECOVERY.txt
new file mode 100644
index 0000000..7928ee4
--- /dev/null
+++ b/groups/haswellEP/RECOVERY.txt
@@ -0,0 +1,22 @@
+SHORT  Recovery duration
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  INT_MISC_RECOVERY_CYCLES
+PMC1  INT_MISC_RECOVERY_COUNT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Avg. recovery duration PMC0/PMC1
+
+LONG
+Formulas:
+Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT
+-
+This group measures the duration of recoveries after SSE exception, memory
+disambiguation, etc...
diff --git a/groups/haswellEP/SBOX.txt b/groups/haswellEP/SBOX.txt
new file mode 100644
index 0000000..246deea
--- /dev/null
+++ b/groups/haswellEP/SBOX.txt
@@ -0,0 +1,28 @@
+SHORT Ring Transfer bandwidth
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+SBOX0C0 RING_BL_USED_ANY
+SBOX1C0 RING_BL_USED_ANY
+SBOX2C0 RING_BL_USED_ANY
+SBOX3C0 RING_BL_USED_ANY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Ring transfer bandwidth [MByte/s] 1.E-06*(SBOX0C0+SBOX1C0+SBOX2C0+SBOX3C0)*32/time
+Ring transfer data volume [GByte] 1.E-09*(SBOX0C0+SBOX1C0+SBOX2C0+SBOX3C0)*32
+
+LONG
+Formula:
+Ring transfer bandwidth [MByte/s] = 1.E-06*(SUM(SBOXxC0)*32)/time
+Ring transfer data volume [GByte] = 1.E-09*(SUM(SBOXxC0)*32)
+--
+The SBOXes manage the transfer between the socket local ring(s). For micro architectures
+prior to Haswell, the SBOX and QBOX were similar as only a single ring was used.
+Haswell systems with a high core count assemble two rings that are connected through
+the SBOXes, the traffic between the sockets is handled by the QBOXes.
diff --git a/groups/haswellEP/TLB_DATA.txt b/groups/haswellEP/TLB_DATA.txt
new file mode 100644
index 0000000..8d94e05
--- /dev/null
+++ b/groups/haswellEP/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT  L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1  DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2  DTLB_LOAD_MISSES_WALK_DURATION
+PMC3  DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB load misses     PMC0
+L1 DTLB load miss rate  PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses     PMC1
+L1 DTLB store miss rate  PMC1/FIXC0
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a page walk took, in cycles.
+
diff --git a/groups/haswellEP/TLB_INSTR.txt b/groups/haswellEP/TLB_INSTR.txt
new file mode 100644
index 0000000..235d977
--- /dev/null
+++ b/groups/haswellEP/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ITLB_MISSES_CAUSES_A_WALK
+PMC1  ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a page walk took, in cycles.
+
diff --git a/groups/haswellEP/UOPS.txt b/groups/haswellEP/UOPS.txt
new file mode 100644
index 0000000..178aec5
--- /dev/null
+++ b/groups/haswellEP/UOPS.txt
@@ -0,0 +1,35 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_ISSUED_ANY
+PMC1  UOPS_EXECUTED_THREAD
+PMC2  UOPS_RETIRED_ALL
+PMC3  UOPS_ISSUED_FLAGS_MERGE
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Issued UOPs PMC0
+Merged UOPs PMC3
+Executed UOPs PMC1
+Retired UOPs PMC2
+
+LONG
+Formula:
+Issued UOPs = UOPS_ISSUED_ANY
+Merged UOPs = UOPS_ISSUED_FLAGS_MERGE
+Executed UOPs = UOPS_EXECUTED_THREAD
+Retired UOPs = UOPS_RETIRED_ALL
+-
+This group returns information about the instruction pipeline. It measures the
+issued, executed and retired uOPs and returns the number of uOPs which were issued
+but not executed as well as the number of uOPs which were executed but never retired.
+The executed but not retired uOPs commonly come from speculatively executed branches.
+
diff --git a/groups/haswellEP/UOPS_EXEC.txt b/groups/haswellEP/UOPS_EXEC.txt
new file mode 100644
index 0000000..7042df7
--- /dev/null
+++ b/groups/haswellEP/UOPS_EXEC.txt
@@ -0,0 +1,31 @@
+SHORT UOPs execution
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_EXECUTED_USED_CYCLES
+PMC1  UOPS_EXECUTED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_EXECUTED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the execution stage in the pipeline. Used cycles are all cycles where uOPs are
+executed while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/haswellEP/UOPS_ISSUE.txt b/groups/haswellEP/UOPS_ISSUE.txt
new file mode 100644
index 0000000..9aac923
--- /dev/null
+++ b/groups/haswellEP/UOPS_ISSUE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs issuing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_ISSUED_USED_CYCLES
+PMC1  UOPS_ISSUED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_ISSUED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the issue stage in the pipeline. Used cycles are all cycles where uOPs are
+issued while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/haswellEP/UOPS_RETIRE.txt b/groups/haswellEP/UOPS_RETIRE.txt
new file mode 100644
index 0000000..0f37585
--- /dev/null
+++ b/groups/haswellEP/UOPS_RETIRE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs retirement
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_RETIRED_USED_CYCLES
+PMC1  UOPS_RETIRED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_RETIRED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the retirement stage in the pipeline (re-order buffer). Used cycles are all
+cycles where uOPs are retired while unused cycles refer to pipeline stalls.
+Moreover, the group calculates the average stall duration in cycles.
diff --git a/groups/interlagos/BRANCH.txt b/groups/interlagos/BRANCH.txt
index 1ae9f36..7495b74 100644
--- a/groups/interlagos/BRANCH.txt
+++ b/groups/interlagos/BRANCH.txt
@@ -4,29 +4,23 @@ EVENTSET
 PMC0  RETIRED_INSTRUCTIONS
 PMC1  RETIRED_BRANCH_INSTR
 PMC2  RETIRED_MISPREDICTED_BRANCH_INSTR
-PMC3  RETIRED_TAKEN_BRANCH_INSTR
 
 METRICS
 Runtime (RDTSC) [s] time
 Branch rate   PMC1/PMC0
 Branch misprediction rate  PMC2/PMC0
 Branch misprediction ratio  PMC2/PMC1
-Branch taken rate  PMC3/PMC0
-Branch taken ratio  PMC3/PMC1
 Instructions per branch  PMC0/PMC1
 
 LONG
 Formulas:
-Branch rate = RETIRED_BRANCH_INSTR / RETIRED_INSTRUCTIONS
-Branch misprediction rate = RETIRED_MISPREDICTED_BRANCH_INSTR / RETIRED_INSTRUCTIONS
-Branch misprediction ratio = RETIRED_MISPREDICTED_BRANCH_INSTR / RETIRED_BRANCH_INSTR
-Branch taken rate = RETIRED_TAKEN_BRANCH_INSTR / RETIRED_INSTRUCTIONS
-Branch taken ratio = RETIRED_TAKEN_BRANCH_INSTR / RETIRED_BRANCH_INSTR
-Instructions per branch = RETIRED_INSTRUCTIONS / RETIRED_BRANCH_INSTR
+Branch rate = RETIRED_BRANCH_INSTR/RETIRED_INSTRUCTIONS
+Branch misprediction rate = RETIRED_MISPREDICTED_BRANCH_INSTR/RETIRED_INSTRUCTIONS
+Branch misprediction ratio = RETIRED_MISPREDICTED_BRANCH_INSTR/RETIRED_BRANCH_INSTR
+Instructions per branch = RETIRED_INSTRUCTIONS/RETIRED_BRANCH_INSTR
 -
-The rates state how often in average a branch or a mispredicted branch occured
+The rates state how often on average a branch or a mispredicted branch occurred
 per instruction retired in total. The branch misprediction ratio sets directly
 into relation what ratio of all branch instruction where mispredicted.
-Instructions per branch is 1/Branch rate. The same applies for the branches
-taken metrics.
+Instructions per branch is 1/branch rate.
 
diff --git a/groups/interlagos/CACHE.txt b/groups/interlagos/CACHE.txt
index 23343a5..0d785fc 100644
--- a/groups/interlagos/CACHE.txt
+++ b/groups/interlagos/CACHE.txt
@@ -8,25 +8,25 @@ PMC3  DATA_CACHE_MISSES_ALL
 
 METRICS
 Runtime (RDTSC) [s] time
-Data cache misses PMC3
-Data cache request rate PMC1/PMC0
-Data cache miss rate (PMC2)/PMC0
-Data cache miss ratio (PMC2)/PMC1
+data cache misses PMC3
+data cache request rate PMC1/PMC0
+data cache miss rate (PMC2)/PMC0
+data cache miss ratio (PMC2)/PMC1
 
 LONG
 Formulas:
-Data cache misses = DATA_CACHE_MISSES_ALL
-Data cache request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS
-Data cache miss rate = (DATA_CACHE_REFILLS_VALID) / RETIRED_INSTRUCTIONS
-Data cache miss ratio = (DATA_CACHE_REFILLS_VALID)/DATA_CACHE_ACCESSES
+data cache misses = DATA_CACHE_MISSES_ALL
+data cache request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS
+data cache miss rate = (DATA_CACHE_REFILLS_VALID) / RETIRED_INSTRUCTIONS
+data cache miss ratio = (DATA_CACHE_REFILLS_VALID)/DATA_CACHE_ACCESSES
 -
 This group measures the locality of your data accesses with regard to the
-L1 Cache. Data cache request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The Data cache miss rate gives a measure how often it was necessary to get
-cachelines from higher levels of the memory hierarchy. And finally 
-Data cache miss ratio tells you how many of your memory references required
-a cacheline to be loaded from a higher level. While the Data cache miss rate 
-might be given by your algorithm you should try to get Data cache miss ratio
+L1 cache. Data cache request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The data cache miss rate gives a measure how often it was necessary to get
+cache lines from higher levels of the memory hierarchy. And finally
+data cache miss ratio tells you how many of your memory references required
+a cache line to be loaded from a higher level. While the data cache miss rate
+might be given by your algorithm you should try to get data cache miss ratio
 as low as possible by increasing your cache reuse.
 
diff --git a/groups/interlagos/CPI.txt b/groups/interlagos/CPI.txt
index 47711f4..c0746e7 100644
--- a/groups/interlagos/CPI.txt
+++ b/groups/interlagos/CPI.txt
@@ -13,6 +13,11 @@ CPI (based on uops)   PMC1/PMC2
 IPC   PMC0/PMC1
 
 LONG
+Formulas:
+CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS
+CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS
+IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED
+-
 This group measures how efficient the processor works with
 regard to instruction throughput. Also important as a standalone
 metric is RETIRED_INSTRUCTIONS as it tells you how many instruction
diff --git a/groups/interlagos/DATA.txt b/groups/interlagos/DATA.txt
index 78e4c3c..75f1f60 100644
--- a/groups/interlagos/DATA.txt
+++ b/groups/interlagos/DATA.txt
@@ -6,11 +6,11 @@ PMC1  LS_DISPATCH_STORES
 
 METRICS
 Runtime (RDTSC) [s] time
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to Store ratio = LS_DISPATCH_LOADS / LS_DISPATCH_STORES
+Load to store ratio = LS_DISPATCH_LOADS/LS_DISPATCH_STORES
 -
 This is a simple metric to determine your load to store ratio.
 
diff --git a/groups/interlagos/FLOPS_DP.txt b/groups/interlagos/FLOPS_DP.txt
index d7f5f57..27e58c3 100644
--- a/groups/interlagos/FLOPS_DP.txt
+++ b/groups/interlagos/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
 
 EVENTSET
 PMC0  RETIRED_INSTRUCTIONS
@@ -9,15 +9,15 @@ PMC3  RETIRED_FLOPS_DOUBLE_ALL
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s]  PMC1*inverseClock
-MFlops/s    1.0E-06*(PMC3)/time
+DP MFLOP/s    1.0E-06*(PMC3)/time
 CPI   PMC1/PMC0
 CPI (based on uops)   PMC1/PMC2
 IPC   PMC0/PMC1
 
 LONG
 Formulas:
-DP MFlops/s = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
+DP MFLOP/s = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
 -
-Profiling group to measure double precisision flop rate.
+Profiling group to measure double precision FLOP rate.
 
 
diff --git a/groups/interlagos/FLOPS_SP.txt b/groups/interlagos/FLOPS_SP.txt
index 1c4dcc3..7db569f 100644
--- a/groups/interlagos/FLOPS_SP.txt
+++ b/groups/interlagos/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
 
 EVENTSET
 PMC0  RETIRED_INSTRUCTIONS
@@ -9,15 +9,15 @@ PMC3  RETIRED_FLOPS_SINGLE_ALL
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s]  PMC1*inverseClock
-MFlops/s    1.0E-06*(PMC3)/time
+SP MFLOP/s    1.0E-06*(PMC3)/time
 CPI   PMC1/PMC0
 CPI (based on uops)   PMC1/PMC2
 IPC   PMC0/PMC1
 
 LONG
 Formulas:
-SP MFlops/s = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
+SP MFLOP/s = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
 -
-Profiling group to measure single precision flop rate.
+Profiling group to measure single precision FLOP rate.
 
 
diff --git a/groups/interlagos/FPU_EXCEPTION.txt b/groups/interlagos/FPU_EXCEPTION.txt
index 5c586e4..0969ae1 100644
--- a/groups/interlagos/FPU_EXCEPTION.txt
+++ b/groups/interlagos/FPU_EXCEPTION.txt
@@ -15,7 +15,7 @@ Formulas:
 Overall FP exception rate = FPU_EXCEPTIONS_ALL / INSTRUCTIONS_RETIRED
 FP exception rate = FPU_EXCEPTIONS_ALL / FP_INSTRUCTIONS_RETIRED_ALL
 -
-Floating point exceptions occur e.g. on the treatment of Denormals.
+Floating point exceptions occur e.g. on the treatment of denormal numbers.
 There might be a large penalty if there are too many floating point
 exceptions.
 
diff --git a/groups/interlagos/ICACHE.txt b/groups/interlagos/ICACHE.txt
index be5e5f5..62b91d6 100644
--- a/groups/interlagos/ICACHE.txt
+++ b/groups/interlagos/ICACHE.txt
@@ -8,18 +8,16 @@ PMC3  RETIRED_INSTRUCTIONS
 
 METRICS
 Runtime (RDTSC) [s] time
-Instruction cache misses  PMC1+PMC2
-Instruction cache request rate   PMC0/PMC3
-Instruction cache miss rate    (PMC1+PMC2)/PMC3
-Instruction cache miss ratio   (PMC1+PMC2)/PMC0
+L1I request rate   PMC0/PMC3
+L1I miss rate    (PMC1+PMC2)/PMC3
+L1I miss ratio   (PMC1+PMC2)/PMC0
 
 LONG
 Formulas:
-Instruction cache misses INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS
-Instruction cache request rate INSTRUCTION_CACHE_FETCHES / RETIRED_INSTRUCTIONS
-Instruction cache miss rate  (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS
-Instruction cache miss ratio (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/INSTRUCTION_CACHE_FETCHES
+L1I request rate = INSTRUCTION_CACHE_FETCHES / RETIRED_INSTRUCTIONS
+L1I miss rate = (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS
+L1I miss ratio = (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/INSTRUCTION_CACHE_FETCHES
 -
 This group measures the locality of your instruction code with regard to the
-L1 I-Cache. 
+L1 I-Cache.
 
diff --git a/groups/interlagos/L2.txt b/groups/interlagos/L2.txt
index a1f5714..5bf1843 100644
--- a/groups/interlagos/L2.txt
+++ b/groups/interlagos/L2.txt
@@ -16,14 +16,14 @@ LONG
 Formulas:
 L2 bandwidth [MBytes/s]   1.0E-06*(DATA_CACHE_REFILLS_ALL-DATA_CACHE_REFILLS_SYSTEM)*64/time
 L2 data volume [GBytes]   1.0E-09*(DATA_CACHE_REFILLS_ALL-DATA_CACHE_REFILLS_SYSTEM)*64
-Cache refill bandwidth System/L2 [MBytes/s]   1.0E-06*DATA_CACHE_REFILLS_ALL*64/time
-Cache refill bandwidth System [MBytes/s]   1.0E-06*DATA_CACHE_REFILLS_SYSTEM*64/time
+Cache refill bandwidth system/L2 [MBytes/s]   1.0E-06*DATA_CACHE_REFILLS_ALL*64/time
+Cache refill bandwidth system [MBytes/s]   1.0E-06*DATA_CACHE_REFILLS_SYSTEM*64/time
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is
-computed by the number of cacheline loaded from L2 to L1 and the
-number of modified cachelines evicted from the L1. 
-Note that this bandwidth also includes data transfers due to a
+computed by the number of cache lines loaded from L2 to L1 and the
+number of modified cache lines evicted from the L1.
+Note that this bandwidth also includes data transfers due to a
 write allocate load on a store miss in L1 and copy back transfers if
-originated from L2. L2-L1 data volume is the total data volume transfered 
+originated from L2. L2-L1 data volume is the total data volume transferred
 between L2 and L1.
 
diff --git a/groups/interlagos/L2CACHE.txt b/groups/interlagos/L2CACHE.txt
index 17209e8..49b9555 100644
--- a/groups/interlagos/L2CACHE.txt
+++ b/groups/interlagos/L2CACHE.txt
@@ -7,23 +7,23 @@ PMC2  L2_CACHE_MISS_DC_FILL
 
 METRICS
 Runtime (RDTSC) [s] time
-L2 request rate   (PMC1)/PMC0
+L2 request rate   PMC1/PMC0
 L2 miss rate   PMC2/PMC0
-L2 miss ratio   PMC2/(PMC1)
+L2 miss ratio   PMC2/PMC1
 
 LONG
 Formulas:
-L2 request rate = (L2_REQUESTS_ALL)/INSTRUCTIONS_RETIRED
+L2 request rate = L2_REQUESTS_ALL/INSTRUCTIONS_RETIRED
 L2 miss rate  = L2_MISSES_ALL/INSTRUCTIONS_RETIRED
-L2 miss ratio = L2_MISSES_ALL/(L2_REQUESTS_ALL)
+L2 miss ratio = L2_MISSES_ALL/L2_REQUESTS_ALL
 -
 This group measures the locality of your data accesses with regard to the L2
 Cache. L2 request rate tells you how data intensive your code is or how many
-Data accesses you have in average per instruction.  The L2 miss rate gives a
-measure how often it was necessary to get cachelines from memory. And finally
-L2 miss ratio tells you how many of your memory references required a cacheline
-to be loaded from a higher level.  While the Data cache miss rate might be
-given by your algorithm you should try to get Data cache miss ratio as low as
+data accesses you have on average per instruction.  The L2 miss rate gives a
+measure how often it was necessary to get cache lines from memory. And finally
+L2 miss ratio tells you how many of your memory references required a cache line
+to be loaded from a higher level.  While the data cache miss rate might be
+given by your algorithm you should try to get data cache miss ratio as low as
 possible by increasing your cache reuse.  This group is inspired from the
 whitepaper -Basic Performance Measurements for AMD Athlon 64, AMD Opteron and
 AMD Phenom Processors- from Paul J. Drongowski.
diff --git a/groups/interlagos/L3.txt b/groups/interlagos/L3.txt
index c1a6f17..5c9ea4d 100644
--- a/groups/interlagos/L3.txt
+++ b/groups/interlagos/L3.txt
@@ -7,18 +7,23 @@ PMC2  CPU_CLOCKS_UNHALTED
 
 METRICS
 Runtime (RDTSC) [s] time
-L3 bandwidth [MBytes/s]   1.0E-06*(PMC0+PMC1)*64.0/time
-L3 data volume [GBytes]    1.0E-09*(PMC0+PMC1)*64.0
-L3 refill bandwidth [MBytes/s]   1.0E-06*PMC0*64.0/time
-L3 evict [MBytes/s]    1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
 
 LONG
 Formulas:
-L3 bandwidth [MBytes/s]   1.0E-06*(L2_FILL_WB_FILL+L2_FILL_WB_WB)*64/time
-L3 data volume [GBytes]   1.0E-09*(L2_FILL_WB_FILL+L2_FILL_WB_WB)*64
-L3 refill bandwidth [MBytes/s]   1.0E-06*L2_FILL_WB_FILL*64/time
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_FILL_WB_FILL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_FILL_WB_FILL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_FILL_WB_WB*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_FILL_WB_WB*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_FILL_WB_FILL+L2_FILL_WB_WB)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_FILL_WB_FILL+L2_FILL_WB_WB)*64
 -
 Profiling group to measure L3 cache bandwidth. The bandwidth is
-computed by the number of cacheline loaded from L3 to L2 and the
-number of modified cachelines evicted from the L2. 
+computed by the number of cache lines loaded from L3 to L2 and the
+number of modified cache lines evicted from the L2.
 
diff --git a/groups/interlagos/L3CACHE.txt b/groups/interlagos/L3CACHE.txt
index 4bef1a7..5a442c6 100644
--- a/groups/interlagos/L3CACHE.txt
+++ b/groups/interlagos/L3CACHE.txt
@@ -16,20 +16,20 @@ L3 average access latency [cycles]  UPMC2/UPMC3
 
 LONG
 Formulas:
-L3 request rate = (UNC_READ_REQ_TO_L3_ALL)/INSTRUCTIONS_RETIRED
-L3 miss rate  = UNC_L3_CACHE_MISS_ALL/INSTRUCTIONS_RETIRED
+L3 request rate = UNC_READ_REQ_TO_L3_ALL/INSTRUCTIONS_RETIRED
+L3 miss rate = UNC_L3_CACHE_MISS_ALL/INSTRUCTIONS_RETIRED
 L3 miss ratio = UNC_L3_CACHE_MISS_ALL/UNC_READ_REQ_TO_L3_ALL
 L3 average access latency =  UNC_L3_LATENCY_CYCLE_COUNT/UNC_L3_LATENCY_REQUEST_COUNT
 -
 This group measures the locality of your data accesses with regard to the L3
 Cache. L3 request rate tells you how data intensive your code is or how many
-Data accesses you have in average per instruction.  The L3 miss rate gives a
-measure how often it was necessary to get cachelines from memory. And finally
-L3 miss ratio tells you how many of your memory references required a cacheline
-to be loaded from a higher level.  While the Data cache miss rate might be
-given by your algorithm you should try to get Data cache miss ratio as low as
+data accesses you have on average per instruction.  The L3 miss rate gives a
+measure how often it was necessary to get cache lines from memory. And finally
+L3 miss ratio tells you how many of your memory references required a cache line
+to be loaded from a higher level.  While the data cache miss rate might be
+given by your algorithm you should try to get data cache miss ratio as low as
 possible by increasing your cache reuse.  This group was inspired from the
-whitepaper -Basic Performance Measurements for AMD Athlon 64, AMD Opteron and
-AMD Phenom Processors- from Paul J. Drongowski.
+whitepaper - Basic Performance Measurements for AMD Athlon 64, AMD Opteron and
+AMD Phenom Processors - from Paul J. Drongowski.
 
 
diff --git a/groups/interlagos/LINKS.txt b/groups/interlagos/LINKS.txt
index 649f0d1..4b8ac22 100644
--- a/groups/interlagos/LINKS.txt
+++ b/groups/interlagos/LINKS.txt
@@ -20,7 +20,7 @@ Link bandwidth L1 [MBytes/s]  1.0E-06*UNC_LINK_TRANSMIT_BW_L1_USE*4.0/time
 Link bandwidth L2 [MBytes/s]  1.0E-06*UNC_LINK_TRANSMIT_BW_L2_USE*4.0/time
 Link bandwidth L3 [MBytes/s]  1.0E-06*UNC_LINK_TRANSMIT_BW_L3_USE*4.0/time
 -
-Profiling group to measure the Hypertransport link bandwidth for the four links
-of a local node. This indicates the data flow between different ccNUMA nodes.
+Profiling group to measure the HyperTransport link bandwidth for the four links
+of a local node. This indicates the data flow between different ccNUMA nodes.
 
 
diff --git a/groups/interlagos/MEM.txt b/groups/interlagos/MEM.txt
index 22aa19e..2fa9dfe 100644
--- a/groups/interlagos/MEM.txt
+++ b/groups/interlagos/MEM.txt
@@ -16,5 +16,5 @@ Memory data volume [GBytes] = 1.0E-09*(DRAM_ACCESSES_DCTO_ALL+DRAM_ACCESSES_DCT1
 -
 Profiling group to measure memory bandwidth drawn by all cores of a socket.
 Note: As this group measures the accesses from all cores it only makes sense
-to measure with one core per socket, similiar as with the Intel Nehalem Uncore events.
+to measure with one core per socket, similar as with the Intel Nehalem Uncore events.
 
diff --git a/groups/interlagos/NUMA.txt b/groups/interlagos/NUMA.txt
index d94e735..ed13dbe 100644
--- a/groups/interlagos/NUMA.txt
+++ b/groups/interlagos/NUMA.txt
@@ -20,8 +20,8 @@ DRAM read/write local to 1 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/ti
 DRAM read/write local to 2 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time
 DRAM read/write local to 3 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time
 -
-Profiling group to measure the traffic from local CPU to the different 
-DRAM NUMA nodes. This group allows to detect NUMA problems in a threaded 
+Profiling group to measure the traffic from local CPU to the different
+DRAM NUMA nodes. This group allows to detect NUMA problems in a threaded
 code. You must first determine on which memory domains your code is running.
 A code should only have significant traffic to its own memory domain.
 
diff --git a/groups/interlagos/NUMA_0_3.txt b/groups/interlagos/NUMA_0_3.txt
new file mode 100644
index 0000000..ed13dbe
--- /dev/null
+++ b/groups/interlagos/NUMA_0_3.txt
@@ -0,0 +1,28 @@
+SHORT Read/Write Events between the ccNUMA nodes
+
+EVENTSET
+UPMC0  UNC_CPU_TO_DRAM_LOCAL_TO_0
+UPMC1  UNC_CPU_TO_DRAM_LOCAL_TO_1
+UPMC2  UNC_CPU_TO_DRAM_LOCAL_TO_2
+UPMC3  UNC_CPU_TO_DRAM_LOCAL_TO_3
+
+METRICS
+Runtime (RDTSC) [s] time
+DRAM read/write local to 0 [MegaEvents/s]  1.0E-06*UPMC0/time
+DRAM read/write local to 1 [MegaEvents/s]  1.0E-06*UPMC1/time
+DRAM read/write local to 2 [MegaEvents/s]  1.0E-06*UPMC2/time
+DRAM read/write local to 3 [MegaEvents/s]  1.0E-06*UPMC3/time
+
+LONG
+Formulas:
+DRAM read/write local to 0 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_0/time
+DRAM read/write local to 1 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/time
+DRAM read/write local to 2 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time
+DRAM read/write local to 3 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time
+-
+Profiling group to measure the traffic from local CPU to the different
+DRAM NUMA nodes. This group allows to detect NUMA problems in a threaded
+code. You must first determine on which memory domains your code is running.
+A code should only have significant traffic to its own memory domain.
+
+
diff --git a/groups/interlagos/NUMA_4_7.txt b/groups/interlagos/NUMA_4_7.txt
new file mode 100644
index 0000000..ae16499
--- /dev/null
+++ b/groups/interlagos/NUMA_4_7.txt
@@ -0,0 +1,28 @@
+SHORT Read/Write Events between the ccNUMA nodes
+
+EVENTSET
+UPMC0  UNC_CPU_TO_DRAM_LOCAL_TO_4
+UPMC1  UNC_CPU_TO_DRAM_LOCAL_TO_5
+UPMC2  UNC_CPU_TO_DRAM_LOCAL_TO_6
+UPMC3  UNC_CPU_TO_DRAM_LOCAL_TO_7
+
+METRICS
+Runtime (RDTSC) [s] time
+DRAM read/write local to 4 [MegaEvents/s]  1.0E-06*UPMC0/time
+DRAM read/write local to 5 [MegaEvents/s]  1.0E-06*UPMC1/time
+DRAM read/write local to 6 [MegaEvents/s]  1.0E-06*UPMC2/time
+DRAM read/write local to 7 [MegaEvents/s]  1.0E-06*UPMC3/time
+
+LONG
+Formulas:
+DRAM read/write local to 4 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_4/time
+DRAM read/write local to 5 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_5/time
+DRAM read/write local to 6 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_6/time
+DRAM read/write local to 7 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_7/time
+-
+Profiling group to measure the traffic from local CPU to the different
+DRAM NUMA nodes. This group allows to detect NUMA problems in a threaded
+code. You must first determine on which memory domains your code is running.
+A code should only have significant traffic to its own memory domain.
+
+
diff --git a/groups/ivybridge/BRANCH.txt b/groups/ivybridge/BRANCH.txt
index cbaf834..b8d41b2 100644
--- a/groups/ivybridge/BRANCH.txt
+++ b/groups/ivybridge/BRANCH.txt
@@ -19,13 +19,13 @@ Instructions per branch  FIXC0/PMC0
 
 LONG
 Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
 -
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
 into relation what ratio of all branch instruction where mispredicted.
-Instructions per branch is 1/Branch rate.
+Instructions per branch is 1/branch rate.
 
diff --git a/groups/ivybridge/CLOCK.txt b/groups/ivybridge/CLOCK.txt
index 80891d4..278821e 100644
--- a/groups/ivybridge/CLOCK.txt
+++ b/groups/ivybridge/CLOCK.txt
@@ -7,7 +7,7 @@ FIXC2 CPU_CLK_UNHALTED_REF
 PWR0  PWR_PKG_ENERGY
 
 METRICS
-Runtime (RDTSC) [s] time 
+Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
diff --git a/groups/ivybridge/DATA.txt b/groups/ivybridge/DATA.txt
index 5f04a23..967cbad 100644
--- a/groups/ivybridge/DATA.txt
+++ b/groups/ivybridge/DATA.txt
@@ -4,19 +4,19 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  MEM_UOP_RETIRED_LOADS
-PMC1  MEM_UOP_RETIRED_STORES
+PMC0  MEM_UOPS_RETIRED_LOADS
+PMC1  MEM_UOPS_RETIRED_STORES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to Store ratio = MEM_UOP_RETIRED_LOADS / MEM_UOP_RETIRED_STORES
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
 -
 This is a metric to determine your load to store ratio.
 
diff --git a/groups/ivybridge/ENERGY.txt b/groups/ivybridge/ENERGY.txt
index 3f70077..541c3ad 100644
--- a/groups/ivybridge/ENERGY.txt
+++ b/groups/ivybridge/ENERGY.txt
@@ -7,6 +7,7 @@ FIXC2 CPU_CLK_UNHALTED_REF
 TMP0  TEMP_CORE
 PWR0  PWR_PKG_ENERGY
 PWR1  PWR_PP0_ENERGY
+PWR2  PWR_PP1_ENERGY
 PWR3  PWR_DRAM_ENERGY
 
 METRICS
@@ -19,15 +20,18 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 Energy PP0 [J]  PWR1
 Power PP0 [W] PWR1/time
+Energy PP1 [J]  PWR2
+Power PP1 [W] PWR2/time
 Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 
 LONG
 Formula:
-Power =  PWR_PKG_ENERGY / time
-Power PP0 [W] PWR1/time
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power PP1 = PWR_PP1_ENERGY / time
 Power DRAM = PWR_DRAM_ENERGY / time
 -
 IvyBridge implements the new RAPL interface. This interface enables to
-monitor the consumed energy on the package (socket) and DRAM level.
-
+monitor the consumed energy on the package (socket), the PP0 domain
+and DRAM level. The PP0 domain often refers to only the CPU cores.
diff --git a/groups/ivybridge/FALSE_SHARE.txt b/groups/ivybridge/FALSE_SHARE.txt
new file mode 100644
index 0000000..a87f7d4
--- /dev/null
+++ b/groups/ivybridge/FALSE_SHARE.txt
@@ -0,0 +1,25 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM
+PMC2 MEM_LOAD_UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Local LLC false sharing [MByte] 1.E-06*PMC0*64
+Local LLC false sharing rate PMC0/PMC2
+
+LONG
+Formula:
+Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64
+Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL
+-
+False-sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false-sharing.
+The false-sharing rate uses all memory loads as reference.
diff --git a/groups/ivybridge/FLOPS_AVX.txt b/groups/ivybridge/FLOPS_AVX.txt
index e8074c1..ea459f4 100644
--- a/groups/ivybridge/FLOPS_AVX.txt
+++ b/groups/ivybridge/FLOPS_AVX.txt
@@ -1,4 +1,4 @@
-SHORT Packed AVX MFlops/s
+SHORT Packed AVX MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -12,14 +12,14 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-32b packed SP MFlops/s  1.0E-06*(PMC0*8.0)/time
-32b packed DP MFlops/s  1.0E-06*(PMC1*4.0)/time
+Packed SP MFLOP/s  1.0E-06*(PMC0*8.0)/time
+Packed DP MFLOP/s  1.0E-06*(PMC1*4.0)/time
 
 LONG
 Formula:
-SP MFlops/s =  (SIMD_FP_256_PACKED_SINGLE*8)/ runtime
-DP MFlops/s =  (SIMD_FP_256_PACKED_DOUBLE*4)/ runtime
+Packed SP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed DP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
 -
-AVX flops rates. Please note that the current flop measurements on IvyBridge are
+Packed 32b AVX FLOPs rates. Please note that the current FLOP measurements on IvyBridge are
 potentially wrong. So you cannot trust these counters at the moment!
 
diff --git a/groups/ivybridge/FLOPS_DP.txt b/groups/ivybridge/FLOPS_DP.txt
index 1e47b50..b5e8273 100644
--- a/groups/ivybridge/FLOPS_DP.txt
+++ b/groups/ivybridge/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -13,16 +13,19 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
+MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*4.0)/time
 Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 
 LONG
 Formula:
-MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-AVX MFlops/s =  (SIMD_FP_256_PACKED_DOUBLE*4)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
 -
-SSE scalar and packed double precision flop rates. Please note that the current flop measurements on SandyBridge are
-potentially wrong. So you cannot trust these counters at the moment!
+SSE scalar and packed double precision FLOP rates. Please note that the current
+FLOP measurements on IvyBridge are potentially wrong.
+So you cannot trust these counters at the moment!
 
diff --git a/groups/ivybridge/FLOPS_SP.txt b/groups/ivybridge/FLOPS_SP.txt
index 0be0721..819b81c 100644
--- a/groups/ivybridge/FLOPS_SP.txt
+++ b/groups/ivybridge/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -13,17 +13,19 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-MFlops/s  1.0E-06*(PMC0*4.0+PMC1)/time
-32b AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
+MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*8.0)/time
 Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 
 LONG
 Formula:
-MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-AVX MFlops/s =  (FP_256_PACKED_SINGLE*8)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
 -
-SSE scalar and packed single precision flop rates. Please note that the current
-flop measurements on IvyBridge are potentially wrong. So you cannot trust
-these counters at the moment!
+SSE scalar and packed single precision FLOP rates. Please note that the current
+FLOP measurements on IvyBridge are potentially wrong.
+So you cannot trust these counters at the moment!
 
diff --git a/groups/ivybridge/ICACHE.txt b/groups/ivybridge/ICACHE.txt
index 6ce3ce8..f1e2335 100644
--- a/groups/ivybridge/ICACHE.txt
+++ b/groups/ivybridge/ICACHE.txt
@@ -6,6 +6,8 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  ICACHE_ACCESSES
 PMC1  ICACHE_MISSES
+PMC2  ICACHE_IFETCH_STALL
+PMC3  ILD_STALL_IQ_FULL
 
 METRICS
 Runtime (RDTSC) [s] time
@@ -15,11 +17,17 @@ CPI  FIXC1/FIXC0
 L1I request rate PMC0/FIXC0
 L1I miss rate PMC1/FIXC0
 L1I miss ratio PMC1/PMC0
+L1I stalls PMC2
+L1I stall rate PMC2/FIXC0
+L1I queue full stalls PMC3
+L1I queue full stall rate PMC3/FIXC0
 
 LONG
 Formulas:
-L2 request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
-L2 miss rate  = ICACHE_MISSES / INSTR_RETIRED_ANY
-L2 miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I stalls = ICACHE_IFETCH_STALL
+L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY
 -
 This group measures some L1 instruction cache metrics.
diff --git a/groups/ivybridge/L2.txt b/groups/ivybridge/L2.txt
index 5345b7a..376e974 100644
--- a/groups/ivybridge/L2.txt
+++ b/groups/ivybridge/L2.txt
@@ -6,27 +6,33 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L1D_REPLACEMENT
 PMC1  L1D_M_EVICT
+PMC2  ICACHE_MISSES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
 
 LONG
 Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L1 and the number of modified cachelines
-evicted from the L1. The group also output total data volume transfered between
+number of cache lines allocated in the L1 and the number of modified cache lines
+evicted from the L1. The group also outputs total data volume transferred between
 L2 and L1. Note that this bandwidth also includes data transfers due to a write
-allocate load on a store miss in L1.
+allocate load on a store miss in L1 and cache lines transferred to the instruction
+cache.
 
diff --git a/groups/ivybridge/L2CACHE.txt b/groups/ivybridge/L2CACHE.txt
index 3d7c36e..9b5dd4b 100644
--- a/groups/ivybridge/L2CACHE.txt
+++ b/groups/ivybridge/L2CACHE.txt
@@ -18,18 +18,17 @@ L2 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = L2_TRANS_ALL_REQUESTS / INSTR_RETIRED_ANY
-L2 miss rate  = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_TRANS_ALL_REQUESTS
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
 -
 This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
 The L2 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
 
 
diff --git a/groups/ivybridge/L3.txt b/groups/ivybridge/L3.txt
index 9a7c914..f0a8aad 100644
--- a/groups/ivybridge/L3.txt
+++ b/groups/ivybridge/L3.txt
@@ -12,21 +12,25 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 Load [MBytes/s]  1.0E-06*PMC0*64.0/time
-L3 Evict [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
 L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
 L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
 
 LONG
 Formulas:
-L3 Load [MBytes/s]  1.0E-06*L2_LINES_IN_ALL*64/time
-L3 Evict [MBytes/s]  1.0E-06*L2_LINES_OUT_DIRTY_ALL*64/time
-L3 bandwidth [MBytes/s] 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time
-L3 data volume [GBytes] 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DIRTY_ALL*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DIRTY_ALL*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64
 -
 Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L2 and the number of modified cachelines
-evicted from the L2. This group also outputs data volume transfered between the
-L3 and  measured cores L2 caches. Note that this bandwidth also includes data
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs data volume transferred between the
+L3 and measured cores L2 caches. Note that this bandwidth also includes data
 transfers due to a write allocate load on a store miss in L2.
 
diff --git a/groups/ivybridge/L3CACHE.txt b/groups/ivybridge/L3CACHE.txt
index d4fd89e..9f3036f 100644
--- a/groups/ivybridge/L3CACHE.txt
+++ b/groups/ivybridge/L3CACHE.txt
@@ -6,30 +6,31 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  MEM_LOAD_UOPS_RETIRED_L3_ALL
 PMC1  MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2  UOPS_RETIRED_ALL
+
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 request rate (PMC0)/FIXC0
-L3 miss rate PMC1/FIXC0
+L3 request rate (PMC0)/PMC2
+L3 miss rate PMC1/PMC2
 L3 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL / INSTR_RETIRED_ANY
-L3 miss rate  = MEM_LOAD_UOPS_RETIRED_L3_MISS / INSTR_RETIRED_ANY
-L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS / MEM_LOAD_UOPS_RETIRED_L3_ALL
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
 -
 This group measures the locality of your data accesses with regard to the
-L3 Cache. L3 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
 The L3 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L3 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
 
 
diff --git a/groups/ivybridge/MEM.txt b/groups/ivybridge/MEM.txt
deleted file mode 100644
index 1f9ff4a..0000000
--- a/groups/ivybridge/MEM.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-SHORT Main memory bandwidth in MBytes/s
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
-
-LONG
-Profiling group to measure main memory bandwidth drawn by all cores of
-a socket.  Since this group is based on uncore events it is only possible to
-measure on the granularity of a socket.  If a thread group contains multiple
-threads only one thread per socket will show the results.  Also outputs total
-data volume transfered from main memory.
-
diff --git a/groups/ivybridge/MEM_DP.txt b/groups/ivybridge/MEM_DP.txt
deleted file mode 100644
index 7bc76cd..0000000
--- a/groups/ivybridge/MEM_DP.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-SHORT Power and Energy consumption
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-TMP0  TEMP_CORE
-PWR0  PWR_PKG_ENERGY
-PWR3  PWR_DRAM_ENERGY
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
-PMC2  SIMD_FP_256_PACKED_DOUBLE
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-Temperature TMP0
-Energy [J]  PWR0
-Power [W] PWR0/time
-Energy DRAM [J]  PWR3
-Power DRAM [W] PWR3/time
-AVX MFlops/s  1.0E-06*(4.0*PMC2)/time
-MFlops/s  1.0E-06*(PMC0*2.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
-
-LONG
-Formula:
-Power =  PWR_PKG_ENERGY / runtime
-Power DRAM = PWR_DRAM_ENERGY / runtime
-MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-AVX MFlops/s =  (SIMD_FP_256_PACKED_DOUBLE*4)/ runtime
-Memory Read BW [MBytes/s] 1.0E-06*(CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR)*64.0/time
---
-Profiling group to measure memory bandwidth drawn by all cores of a socket.
-Since this group is based on uncore events it is only possible to measure on
-a per socket base. Also outputs total data volume transfered from main memory,
-SSE scalar and packed double precision flop rates as well as consumed energy and 
-temperature. Also reports on packed AVX 32b instructions.  Please note that the 
-current flop measurements on IvyBridge are potentially wrong. So you cannot trust 
-these counters at the moment!
-
diff --git a/groups/ivybridge/MEM_SP.txt b/groups/ivybridge/MEM_SP.txt
deleted file mode 100644
index 4388cc4..0000000
--- a/groups/ivybridge/MEM_SP.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-SHORT Power and Energy consumption
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-TMP0  TEMP_CORE
-PWR0  PWR_PKG_ENERGY
-PWR3  PWR_DRAM_ENERGY
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
-PMC2  SIMD_FP_256_PACKED_SINGLE
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-Temperature TMP0
-Energy [J]  PWR0
-Power [W] PWR0/time
-Energy DRAM [J]  PWR3
-Power DRAM [W] PWR3/time
-AVX MFlops/s  1.0E-06*(8.0*PMC2)/time
-MFlops/s  1.0E-06*(PMC0*4.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
-
-LONG
-Formula:
-Power =  PWR_PKG_ENERGY / runtime
-Power DRAM = PWR_DRAM_ENERGY / runtime
-MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE * 4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE) / runtime
-AVX MFlops/s =  (SIMD_FP_256_PACKED_SINGLE * 8) / runtime
-Memory Read BW [MBytes/s] 1.0E-06*(CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR)*64.0/time
---
-Profiling group to measure memory bandwidth drawn by all cores of a socket.
-Since this group is based on uncore events it is only possible to measure on
-a per socket base. Also outputs total data volume transfered from main memory.
-SSE scalar and packed single precision flop rates as well as consumed energy and 
-temperature. Also reports on packed AVX 32b instructions. Please note that the 
-current flop measurements on SandyBridge are potentially wrong. So you cannot 
-trust these counters at the moment!
-
diff --git a/groups/ivybridge/RECOVERY.txt b/groups/ivybridge/RECOVERY.txt
new file mode 100644
index 0000000..7928ee4
--- /dev/null
+++ b/groups/ivybridge/RECOVERY.txt
@@ -0,0 +1,22 @@
+SHORT  Recovery duration
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  INT_MISC_RECOVERY_CYCLES
+PMC1  INT_MISC_RECOVERY_COUNT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Avg. recovery duration PMC0/PMC1
+
+LONG
+Formulas:
+Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT
+-
+This group measures the duration of recoveries after SSE exception, memory
+disambiguation, etc...
diff --git a/groups/ivybridge/TLB_DATA.txt b/groups/ivybridge/TLB_DATA.txt
index 2f59772..8d94e05 100644
--- a/groups/ivybridge/TLB_DATA.txt
+++ b/groups/ivybridge/TLB_DATA.txt
@@ -1,4 +1,4 @@
-SHORT  L1 Data TLB miss rate/ratio
+SHORT  L2 data TLB miss rate/ratio
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -16,20 +16,20 @@ Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 L1 DTLB load misses     PMC0
 L1 DTLB load miss rate  PMC0/FIXC0
-L1 DTLB load miss duration PMC2
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
 L1 DTLB store misses     PMC1
 L1 DTLB store miss rate  PMC1/FIXC0
-L1 DTLB store miss duration PMC3
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
 
 LONG
 Formulas:
-L1 DTLB load misses     DTLB_LOAD_MISSES_CAUSES_A_WALK
-L1 DTLB load miss rate  DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 DTLB load miss duration DTLB_LOAD_MISSES_WALK_DURATION
-L1 DTLB store misses     DTLB_STORE_MISSES_CAUSES_A_WALK
-L1 DTLB store miss rate  DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 DTLB store miss duration DTLB_STORE_MISSES_WALK_DURATION
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
 -
-The DTLB load and store miss rates gives a measure how often a TLB miss occured
+The DTLB load and store miss rates give a measure how often a TLB miss occurred
 per instruction. The duration measures the time in cycles how long a walk did take.
 
diff --git a/groups/ivybridge/TLB_INSTR.txt b/groups/ivybridge/TLB_INSTR.txt
index f95f78a..235d977 100644
--- a/groups/ivybridge/TLB_INSTR.txt
+++ b/groups/ivybridge/TLB_INSTR.txt
@@ -14,15 +14,15 @@ Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 L1 ITLB misses     PMC0
 L1 ITLB miss rate  PMC0/FIXC0
-L1 ITLB miss duration PMC1
+L1 ITLB miss duration [Cyc] PMC1/PMC0
 
 
 LONG
 Formulas:
-L1 ITLB misses     ITLB_MISSES_CAUSES_A_WALK
-L1 ITLB miss rate  ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 ITLB miss duration ITLB_MISSES_WALK_DURATION
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
 -
-The ITLB miss rates gives a measure how often a TLB miss occured
+The ITLB miss rates give a measure how often a TLB miss occurred
 per instruction. The duration measures the time in cycles how long a walk did take.
 
diff --git a/groups/ivybridge/UOPS.txt b/groups/ivybridge/UOPS.txt
new file mode 100644
index 0000000..178aec5
--- /dev/null
+++ b/groups/ivybridge/UOPS.txt
@@ -0,0 +1,35 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_ISSUED_ANY
+PMC1  UOPS_EXECUTED_THREAD
+PMC2  UOPS_RETIRED_ALL
+PMC3  UOPS_ISSUED_FLAGS_MERGE
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Issued UOPs PMC0
+Merged UOPs PMC3
+Executed UOPs PMC1
+Retired UOPs PMC2
+
+LONG
+Formula:
+Issued UOPs = UOPS_ISSUED_ANY
+Merged UOPs = UOPS_ISSUED_FLAGS_MERGE
+Executed UOPs = UOPS_EXECUTED_THREAD
+Retired UOPs = UOPS_RETIRED_ALL
+-
+This group returns information about the instruction pipeline. It measures the
+issued, executed and retired uOPs and returns the number of uOPs which were issued
+but not executed as well as the number of uOPs which were executed but never retired.
+The executed but not retired uOPs commonly come from speculatively executed branches.
+
diff --git a/groups/ivybridge/UOPS_EXEC.txt b/groups/ivybridge/UOPS_EXEC.txt
new file mode 100644
index 0000000..7042df7
--- /dev/null
+++ b/groups/ivybridge/UOPS_EXEC.txt
@@ -0,0 +1,31 @@
+SHORT UOPs execution
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_EXECUTED_USED_CYCLES
+PMC1  UOPS_EXECUTED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_EXECUTED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the execution stage in the pipeline. Used cycles are all cycles where uOPs are
+executed while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/ivybridge/UOPS_ISSUE.txt b/groups/ivybridge/UOPS_ISSUE.txt
new file mode 100644
index 0000000..9aac923
--- /dev/null
+++ b/groups/ivybridge/UOPS_ISSUE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs issuing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_ISSUED_USED_CYCLES
+PMC1  UOPS_ISSUED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_ISSUED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the issue stage in the pipeline. Used cycles are all cycles where uOPs are
+issued while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/ivybridge/UOPS_RETIRE.txt b/groups/ivybridge/UOPS_RETIRE.txt
new file mode 100644
index 0000000..0f37585
--- /dev/null
+++ b/groups/ivybridge/UOPS_RETIRE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs retirement
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_RETIRED_USED_CYCLES
+PMC1  UOPS_RETIRED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_RETIRED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the retirement stage in the pipeline (re-order buffer). Used cycles are all
+cycles where uOPs are retired while unused cycles refer to pipeline stalls.
+Moreover, the group calculates the average stall duration in cycles.
diff --git a/groups/ivybridgeEP/BRANCH.txt b/groups/ivybridgeEP/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/groups/ivybridgeEP/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  BR_INST_RETIRED_ALL_BRANCHES
+PMC1  BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Branch rate   PMC0/FIXC0
+Branch misprediction rate  PMC1/FIXC0
+Branch misprediction ratio  PMC1/PMC0
+Instructions per branch  FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
+
diff --git a/groups/ivybridgeEP/CACHES.txt b/groups/ivybridgeEP/CACHES.txt
new file mode 100644
index 0000000..ad63925
--- /dev/null
+++ b/groups/ivybridgeEP/CACHES.txt
@@ -0,0 +1,121 @@
+SHORT Cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L1D_M_EVICT
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DIRTY_ALL
+CBOX0C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX1C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX2C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX3C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX4C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX5C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX6C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX7C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX8C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX9C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX10C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX11C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX12C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX13C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX14C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX0C1 LLC_VICTIMS_M_STATE
+CBOX1C1 LLC_VICTIMS_M_STATE
+CBOX2C1 LLC_VICTIMS_M_STATE
+CBOX3C1 LLC_VICTIMS_M_STATE
+CBOX4C1 LLC_VICTIMS_M_STATE
+CBOX5C1 LLC_VICTIMS_M_STATE
+CBOX6C1 LLC_VICTIMS_M_STATE
+CBOX7C1 LLC_VICTIMS_M_STATE
+CBOX8C1 LLC_VICTIMS_M_STATE
+CBOX9C1 LLC_VICTIMS_M_STATE
+CBOX10C1 LLC_VICTIMS_M_STATE
+CBOX11C1 LLC_VICTIMS_M_STATE
+CBOX12C1 LLC_VICTIMS_M_STATE
+CBOX13C1 LLC_VICTIMS_M_STATE
+CBOX14C1 LLC_VICTIMS_M_STATE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L3 to L2 load bandwidth [MBytes/s]  1.0E-06*PMC2*64.0/time
+L3 to L2 load data volume [GBytes]  1.0E-09*PMC2*64.0
+L2 to L3 evict bandwidth [MBytes/s]  1.0E-06*PMC3*64.0/time
+L2 to L3 evict data volume [GBytes]  1.0E-09*PMC3*64.0
+L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
+System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX8C0:STATE=0x3F+CBOX9C0:STATE=0x3F+CBOX10C0:STATE=0x3F+CBOX11C0:STATE=0x3F+CBOX12C0:STATE=0x3F+CBOX13C0:STATE=0x3F+CBOX14C0:STATE=0x3F)*64.0/time
+System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX8C0:STATE=0x3F+CBOX9C0:STATE=0x3F+CBOX10C0:STATE=0x3F+CBOX11C0:STATE=0x3F+CBOX12C0:STATE=0x3F+CBOX13C0:STATE=0x3F+CBOX14C0:STATE=0x3F)*64.0
+L3 to memory bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1)*64/time
+L3 to memory data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1)*64
+L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX8C0:STATE=0x3F+CBOX9C0:STATE=0x3F+CBOX10C0:STATE=0x3F+CBOX11C0:STATE=0x3F+CBOX12C0:STATE=0x3F+CBOX13C0:STATE=0x3F+CBOX14C0:STATE=0x3F+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1)*64.0/time
+L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX8C0:STATE=0x3F+CBOX9C0:STATE=0x3F+CBOX10C0:STATE=0x3F+CBOX11C0:STATE=0x3F+CBOX12C0:STATE=0x3F+CBOX13C0:STATE=0x3F+CBOX14C0:STATE=0x3F+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1)*64.0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
+L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64
+L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
+L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64
+L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
+L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time
+L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64
+L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DIRTY_ALL*64/time
+L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DIRTY_ALL*64
+L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time
+L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64
+System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F))*64/time
+System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F))*64
+L3 to memory bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M_STATE))*64/time
+L3 to memory data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M_STATE))*64
+L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F)+SUM(LLC_VICTIMS_M_STATE))*64/time
+L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F)+SUM(LLC_VICTIMS_M_STATE))*64
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
+-
+Group to measure cache transfers between L1 and memory. Please note that the
+L3 to/from system metrics contain any traffic to the system (memory,
+Intel QPI, etc.) but do not seem to capture everything, because commonly the memory
+read bandwidth and the L3 to L2 bandwidth are higher than the memory to L3 bandwidth.
+
diff --git a/groups/ivybridgeEP/CBOX.txt b/groups/ivybridgeEP/CBOX.txt
new file mode 100644
index 0000000..ca6c6d5
--- /dev/null
+++ b/groups/ivybridgeEP/CBOX.txt
@@ -0,0 +1,55 @@
+SHORT CBOX related data and metrics
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+CBOX0C0 LLC_VICTIMS_M_STATE
+CBOX1C0 LLC_VICTIMS_M_STATE
+CBOX2C0 LLC_VICTIMS_M_STATE
+CBOX3C0 LLC_VICTIMS_M_STATE
+CBOX4C0 LLC_VICTIMS_M_STATE
+CBOX5C0 LLC_VICTIMS_M_STATE
+CBOX6C0 LLC_VICTIMS_M_STATE
+CBOX7C0 LLC_VICTIMS_M_STATE
+CBOX8C0 LLC_VICTIMS_M_STATE
+CBOX9C0 LLC_VICTIMS_M_STATE
+CBOX10C0 LLC_VICTIMS_M_STATE
+CBOX11C0 LLC_VICTIMS_M_STATE
+CBOX12C0 LLC_VICTIMS_M_STATE
+CBOX13C0 LLC_VICTIMS_M_STATE
+CBOX14C0 LLC_VICTIMS_M_STATE
+CBOX0C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX1C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX2C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX3C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX4C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX5C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX6C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX7C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX8C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX9C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX10C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX11C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX12C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX13C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX14C1:STATE=0x1 LLC_LOOKUP_ANY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+LLC misses per instruction (CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0)/FIXC0
+LLC data written to MEM [MBytes] 1E-6*(CBOX0C1:STATE=0x1+CBOX1C1:STATE=0x1+CBOX2C1:STATE=0x1+CBOX3C1:STATE=0x1+CBOX4C1:STATE=0x1+CBOX5C1:STATE=0x1+CBOX6C1:STATE=0x1+CBOX7C1:STATE=0x1+CBOX8C1:STATE=0x1+CBOX9C1:STATE=0x1+CBOX10C1:STATE=0x1+CBOX11C1:STATE=0x1+CBOX12C1:STATE=0x1+CBOX13C1:STATE=0x1+CBOX14C1:STATE=0x1)*64
+
+
+LONG
+Formulas:
+LLC misses per instruction = sum(LLC_VICTIMS_M_STATE)/INSTR_RETIRED_ANY
+LLC data written to MEM [MBytes] = sum(LLC_LOOKUP_ANY:STATE=0x1)*64*1E-6
+--
+The CBOXes mediate the traffic from the L2 cache to the segmented L3 cache. Each
+CBOX is responsible for one segment (2.5 MByte). The boxes maintain the coherence between all
+CPU cores of the socket. Depending on the CPU core count, some CBOXes are not attached
+to a 2.5 MByte slice but are still active and track the traffic.
diff --git a/groups/ivybridgeEP/CLOCK.txt b/groups/ivybridgeEP/CLOCK.txt
new file mode 100644
index 0000000..278821e
--- /dev/null
+++ b/groups/ivybridgeEP/CLOCK.txt
@@ -0,0 +1,23 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+
+LONG
+Formula:
+Power =  PWR_PKG_ENERGY / time
+-
+IvyBridge implements the new RAPL interface. This interface enables
+monitoring of the consumed energy on the package (socket) level.
+
diff --git a/groups/ivybridgeEP/DATA.txt b/groups/ivybridgeEP/DATA.txt
new file mode 100644
index 0000000..967cbad
--- /dev/null
+++ b/groups/ivybridgeEP/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_UOPS_RETIRED_LOADS
+PMC1  MEM_UOPS_RETIRED_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/ivybridgeEP/ENERGY.txt b/groups/ivybridgeEP/ENERGY.txt
new file mode 100644
index 0000000..07bc59c
--- /dev/null
+++ b/groups/ivybridgeEP/ENERGY.txt
@@ -0,0 +1,33 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0  TEMP_CORE
+PWR0  PWR_PKG_ENERGY
+PWR1  PWR_PP0_ENERGY
+PWR3  PWR_DRAM_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Temperature [C]  TMP0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy PP0 [J]  PWR1
+Power PP0 [W] PWR1/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+IvyBridge implements the new RAPL interface. This interface enables
+monitoring of the consumed energy on the package (socket), the PP0 domain
+and DRAM level. The PP0 domain often refers to only the CPU cores.
diff --git a/groups/ivybridgeEP/FALSE_SHARE.txt b/groups/ivybridgeEP/FALSE_SHARE.txt
new file mode 100644
index 0000000..1d0a49e
--- /dev/null
+++ b/groups/ivybridgeEP/FALSE_SHARE.txt
@@ -0,0 +1,32 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM
+PMC1 MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM
+PMC2 MEM_LOAD_UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Local LLC false sharing [MByte] 1.E-06*PMC0*64
+Local LLC false sharing rate PMC0/PMC2
+Remote LLC false sharing [MByte] 1.E-06*PMC1*64
+Remote LLC false sharing rate PMC1/PMC2
+
+LONG
+Formula:
+Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64
+Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL
+Remote LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM*64
+Remote LLC false sharing rate = MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM/MEM_LOAD_UOPS_RETIRED_ALL
+-
+False-sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false-sharing.
+The false-sharing rate uses all memory loads as reference.
+For systems with multiple CPU sockets, this performance group also measures the
+false-sharing of cache lines over socket boundaries.
diff --git a/groups/ivybridgeEP/FLOPS_AVX.txt b/groups/ivybridgeEP/FLOPS_AVX.txt
new file mode 100644
index 0000000..7ca4aca
--- /dev/null
+++ b/groups/ivybridgeEP/FLOPS_AVX.txt
@@ -0,0 +1,26 @@
+SHORT Packed AVX MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  SIMD_FP_256_PACKED_SINGLE
+PMC1  SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Packed SP MFLOP/s  1.0E-06*(PMC0*8.0)/time
+Packed DP MFLOP/s  1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formula:
+Packed SP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed DP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+-
+Packed 32b AVX FLOPs rates. Please note that the current FLOP measurements on
+IvyBridge are potentially wrong.
+So you cannot trust these counters at the moment!
+
diff --git a/groups/ivybridgeEP/FLOPS_DP.txt b/groups/ivybridgeEP/FLOPS_DP.txt
new file mode 100644
index 0000000..b5e8273
--- /dev/null
+++ b/groups/ivybridgeEP/FLOPS_DP.txt
@@ -0,0 +1,31 @@
+SHORT Double Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2  SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+-
+SSE scalar and packed double precision FLOP rates. Please note that the current
+FLOP measurements on IvyBridge are potentially wrong.
+So you cannot trust these counters at the moment!
+
diff --git a/groups/ivybridgeEP/FLOPS_SP.txt b/groups/ivybridgeEP/FLOPS_SP.txt
new file mode 100644
index 0000000..819b81c
--- /dev/null
+++ b/groups/ivybridgeEP/FLOPS_SP.txt
@@ -0,0 +1,31 @@
+SHORT Single Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2  SIMD_FP_256_PACKED_SINGLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+-
+SSE scalar and packed single precision FLOP rates. Please note that the current
+FLOP measurements on IvyBridge are potentially wrong.
+So you cannot trust these counters at the moment!
+
diff --git a/groups/ivybridgeEP/ICACHE.txt b/groups/ivybridgeEP/ICACHE.txt
new file mode 100644
index 0000000..f1e2335
--- /dev/null
+++ b/groups/ivybridgeEP/ICACHE.txt
@@ -0,0 +1,33 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ICACHE_ACCESSES
+PMC1  ICACHE_MISSES
+PMC2  ICACHE_IFETCH_STALL
+PMC3  ILD_STALL_IQ_FULL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+L1I stalls PMC2
+L1I stall rate PMC2/FIXC0
+L1I queue full stalls PMC3
+L1I queue full stall rate PMC3/FIXC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I stalls = ICACHE_IFETCH_STALL
+L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/ivybridgeEP/L2.txt b/groups/ivybridgeEP/L2.txt
new file mode 100644
index 0000000..376e974
--- /dev/null
+++ b/groups/ivybridgeEP/L2.txt
@@ -0,0 +1,38 @@
+SHORT L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L1D_M_EVICT
+PMC2  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
+number of cache lines allocated in the L1 and the number of modified cache lines
+evicted from the L1. The group also outputs total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and cache lines transferred to the instruction
+cache.
+
diff --git a/groups/ivybridgeEP/L2CACHE.txt b/groups/ivybridgeEP/L2CACHE.txt
new file mode 100644
index 0000000..9b5dd4b
--- /dev/null
+++ b/groups/ivybridgeEP/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_TRANS_ALL_REQUESTS
+PMC1  L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/ivybridgeEP/L3.txt b/groups/ivybridgeEP/L3.txt
new file mode 100644
index 0000000..f0a8aad
--- /dev/null
+++ b/groups/ivybridgeEP/L3.txt
@@ -0,0 +1,36 @@
+SHORT  L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_LINES_IN_ALL
+PMC1  L2_LINES_OUT_DIRTY_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DIRTY_ALL*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DIRTY_ALL*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
+transfers due to a write allocate load on a store miss in L2.
+
diff --git a/groups/ivybridgeEP/L3CACHE.txt b/groups/ivybridgeEP/L3CACHE.txt
new file mode 100644
index 0000000..9f3036f
--- /dev/null
+++ b/groups/ivybridgeEP/L3CACHE.txt
@@ -0,0 +1,36 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1  MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2  UOPS_RETIRED_ALL
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 request rate (PMC0)/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L3 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/ivybridgeEP/MEM.txt b/groups/ivybridgeEP/MEM.txt
new file mode 100644
index 0000000..fd80c2c
--- /dev/null
+++ b/groups/ivybridgeEP/MEM.txt
@@ -0,0 +1,49 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+-
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events it is only possible to measure on a
+per socket base. Some of the counters may not be available on your system.
+Also outputs total data volume transferred from main memory.
+
diff --git a/groups/ivybridgeEP/MEM_DP.txt b/groups/ivybridgeEP/MEM_DP.txt
new file mode 100644
index 0000000..da40bb9
--- /dev/null
+++ b/groups/ivybridgeEP/MEM_DP.txt
@@ -0,0 +1,68 @@
+SHORT Double precision MFLOP/s, main memory bandwidth and energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+PWR3  PWR_DRAM_ENERGY
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2  SIMD_FP_256_PACKED_DOUBLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+--
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events it is only possible to measure on
+a per socket base. Also outputs total data volume transferred from main memory.
+SSE scalar and packed double precision FLOP rates. Also reports on packed AVX
+32b instructions. Please note that the current FLOP measurements on IvyBridge
+are potentially wrong. So you cannot trust these counters at the moment!
+
diff --git a/groups/ivybridgeEP/MEM_SP.txt b/groups/ivybridgeEP/MEM_SP.txt
new file mode 100644
index 0000000..7fe9ea9
--- /dev/null
+++ b/groups/ivybridgeEP/MEM_SP.txt
@@ -0,0 +1,70 @@
+SHORT Single precision MFLOP/s, main memory bandwidth and energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+PWR3  PWR_DRAM_ENERGY
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2  SIMD_FP_256_PACKED_SINGLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formula:
+Power [W] = PWR_PKG_ENERGY/runtime
+Power DRAM [W] = PWR_DRAM_ENERGY/runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+--
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events it is only possible to measure on
+a per socket base. Also outputs total data volume transferred from main memory.
+SSE scalar and packed single precision FLOP rates. Also reports on packed AVX
+32b instructions. Please note that the current FLOP measurements on IvyBridge
+are potentially wrong. So you cannot trust these counters at the moment!
+
diff --git a/groups/ivybridgeEP/NUMA.txt b/groups/ivybridgeEP/NUMA.txt
new file mode 100644
index 0000000..0c1b8fb
--- /dev/null
+++ b/groups/ivybridgeEP/NUMA.txt
@@ -0,0 +1,33 @@
+SHORT Local and remote memory accesses
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 OFFCORE_RESPONSE_0_LOCAL_DRAM
+PMC1 OFFCORE_RESPONSE_1_REMOTE_DRAM
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Local DRAM data volume [GByte]  1.E-09*PMC0*64
+Local DRAM bandwidth [MByte/s]  1.E-06*(PMC0*64)/time
+Remote DRAM data volume [GByte]  1.E-09*PMC1*64
+Remote DRAM bandwidth [MByte/s]  1.E-06*(PMC1*64)/time
+Memory data volume [GByte]  1.E-09*(PMC0+PMC1)*64
+Memory bandwidth [MByte/s]  1.E-06*((PMC0+PMC1)*64)/time
+
+LONG
+Formula:
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64
+Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time
+Remote DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_1_REMOTE_DRAM*64
+Remote DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_1_REMOTE_DRAM*64)/time
+Memory data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64
+Memory bandwidth [MByte/s] = 1.E-06*((OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64)/time
+--
+This performance group measures the data traffic of CPU cores to local and remote
+memory.
diff --git a/groups/ivybridgeEP/QPI.txt b/groups/ivybridgeEP/QPI.txt
new file mode 100644
index 0000000..4dbf8a4
--- /dev/null
+++ b/groups/ivybridgeEP/QPI.txt
@@ -0,0 +1,52 @@
+SHORT QPI Link Layer data
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+SBOX0C0 DIRECT2CORE_SUCCESS_RBT_HIT
+SBOX1C0 DIRECT2CORE_SUCCESS_RBT_HIT
+SBOX2C0 DIRECT2CORE_SUCCESS_RBT_HIT
+SBOX0C1 TXL_FLITS_G0_DATA
+SBOX1C1 TXL_FLITS_G0_DATA
+SBOX2C1 TXL_FLITS_G0_DATA
+SBOX0C2 TXL_FLITS_G0_NON_DATA
+SBOX1C2 TXL_FLITS_G0_NON_DATA
+SBOX2C2 TXL_FLITS_G0_NON_DATA
+SBOX0C3 SBOX_CLOCKTICKS
+SBOX1C3 SBOX_CLOCKTICKS
+SBOX2C3 SBOX_CLOCKTICKS
+SBOX0FIX QPI_RATE
+SBOX1FIX QPI_RATE
+SBOX2FIX QPI_RATE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+QPI Speed Link 0 [GT/s] ((SBOX0C3)/time)*inverseClock*(8/1000)
+QPI Speed Link 1 [GT/s] ((SBOX1C3)/time)*inverseClock*(8/1000)
+QPI Speed Link 2 [GT/s] ((SBOX2C3)/time)*inverseClock*(8/1000)
+QPI Rate Link 0 [GT/s] 1.E-09*SBOX0FIX
+QPI Rate Link 1 [GT/s] 1.E-09*SBOX1FIX
+QPI Rate Link 2 [GT/s] 1.E-09*SBOX2FIX
+data from QPI to LLC [MByte] 1.E-06*(SBOX0C0+SBOX1C0+SBOX2C0)*8
+QPI data volume [MByte] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1)*8
+QPI data bandwidth [MByte/s] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1)*8/time
+QPI link volume [MByte] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1+SBOX0C2+SBOX1C2+SBOX2C2)*8
+QPI link bandwidth [MByte/s] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1+SBOX0C2+SBOX1C2+SBOX2C2)*8/time
+
+LONG
+Formula:
+QPI Speed Link 0/1/2 [GT/s] = ((SBOX_CLOCKTICKS)/time)*inverseClock*(8/1000)
+QPI Rate Link 0/1/2 [GT/s] = 1.E-09*(QPI_RATE)
+data from QPI to LLC [MByte] = 1.E-06*(sum(DIRECT2CORE_SUCCESS_RBT_HIT)*64)
+QPI data volume [MByte] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8)
+QPI data bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8)/runtime
+QPI link volume [MByte] = 1.E-06*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)
+QPI link bandwidth [MByte/s] = 1.E-06*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)/runtime
+--
+The Intel QPI Link Layer is responsible for packetizing requests from the caching agent (CBOXes)
+on the way out to the system interface.
+
diff --git a/groups/ivybridgeEP/RECOVERY.txt b/groups/ivybridgeEP/RECOVERY.txt
new file mode 100644
index 0000000..7928ee4
--- /dev/null
+++ b/groups/ivybridgeEP/RECOVERY.txt
@@ -0,0 +1,22 @@
+SHORT  Recovery duration
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  INT_MISC_RECOVERY_CYCLES
+PMC1  INT_MISC_RECOVERY_COUNT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Avg. recovery duration PMC0/PMC1
+
+LONG
+Formulas:
+Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT
+-
+This group measures the duration of recoveries after SSE exception, memory
+disambiguation, etc...
diff --git a/groups/ivybridgeEP/TLB_DATA.txt b/groups/ivybridgeEP/TLB_DATA.txt
new file mode 100644
index 0000000..8d94e05
--- /dev/null
+++ b/groups/ivybridgeEP/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT  L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1  DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2  DTLB_LOAD_MISSES_WALK_DURATION
+PMC3  DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB load misses     PMC0
+L1 DTLB load miss rate  PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses     PMC1
+L1 DTLB store miss rate  PMC1/FIXC0
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a walk took, in cycles.
+
diff --git a/groups/ivybridgeEP/TLB_INSTR.txt b/groups/ivybridgeEP/TLB_INSTR.txt
new file mode 100644
index 0000000..235d977
--- /dev/null
+++ b/groups/ivybridgeEP/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ITLB_MISSES_CAUSES_A_WALK
+PMC1  ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a walk took, in cycles.
+
diff --git a/groups/ivybridgeEP/UNCORECLOCK.txt b/groups/ivybridgeEP/UNCORECLOCK.txt
new file mode 100644
index 0000000..fef0d36
--- /dev/null
+++ b/groups/ivybridgeEP/UNCORECLOCK.txt
@@ -0,0 +1,84 @@
+SHORT All Clocks
+
+EVENTSET
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+CBOX0C0 CBOX_CLOCKTICKS
+CBOX1C0 CBOX_CLOCKTICKS
+CBOX2C0 CBOX_CLOCKTICKS
+CBOX3C0 CBOX_CLOCKTICKS
+CBOX4C0 CBOX_CLOCKTICKS
+CBOX5C0 CBOX_CLOCKTICKS
+CBOX6C0 CBOX_CLOCKTICKS
+CBOX7C0 CBOX_CLOCKTICKS
+CBOX8C0 CBOX_CLOCKTICKS
+CBOX9C0 CBOX_CLOCKTICKS
+CBOX10C0 CBOX_CLOCKTICKS
+CBOX11C0 CBOX_CLOCKTICKS
+CBOX12C0 CBOX_CLOCKTICKS
+CBOX13C0 CBOX_CLOCKTICKS
+CBOX14C0 CBOX_CLOCKTICKS
+MBOX0C0 DRAM_CLOCKTICKS
+MBOX1C0 DRAM_CLOCKTICKS
+MBOX2C0 DRAM_CLOCKTICKS
+MBOX3C0 DRAM_CLOCKTICKS
+MBOX0FIX DRAM_CLOCKTICKS
+MBOX1FIX DRAM_CLOCKTICKS
+MBOX2FIX DRAM_CLOCKTICKS
+MBOX3FIX DRAM_CLOCKTICKS
+SBOX0C0 SBOX_CLOCKTICKS
+SBOX1C0 SBOX_CLOCKTICKS
+SBOX2C0 SBOX_CLOCKTICKS
+UBOXFIX UBOX_CLOCKTICKS
+BBOX0C0 BBOX_CLOCKTICKS
+BBOX1C0 BBOX_CLOCKTICKS
+WBOX0 WBOX_CLOCKTICKS
+PBOX0 PBOX_CLOCKTICKS
+RBOX0C0 RBOX_CLOCKTICKS
+RBOX1C0 RBOX_CLOCKTICKS
+RBOX2C0 RBOX_CLOCKTICKS
+IBOX0 IBOX_CLOCKTICKS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CBOX0 Frequency [GHz] 1.E-09*CBOX0C0/(FIXC1*inverseClock)
+CBOX1 Frequency [GHz] 1.E-09*CBOX1C0/(FIXC1*inverseClock)
+CBOX2 Frequency [GHz] 1.E-09*CBOX2C0/(FIXC1*inverseClock)
+CBOX3 Frequency [GHz] 1.E-09*CBOX3C0/(FIXC1*inverseClock)
+CBOX4 Frequency [GHz] 1.E-09*CBOX4C0/(FIXC1*inverseClock)
+CBOX5 Frequency [GHz] 1.E-09*CBOX5C0/(FIXC1*inverseClock)
+CBOX6 Frequency [GHz] 1.E-09*CBOX6C0/(FIXC1*inverseClock)
+CBOX7 Frequency [GHz] 1.E-09*CBOX7C0/(FIXC1*inverseClock)
+CBOX8 Frequency [GHz] 1.E-09*CBOX8C0/(FIXC1*inverseClock)
+CBOX9 Frequency [GHz] 1.E-09*CBOX9C0/(FIXC1*inverseClock)
+CBOX10 Frequency [GHz] 1.E-09*CBOX10C0/(FIXC1*inverseClock)
+CBOX11 Frequency [GHz] 1.E-09*CBOX11C0/(FIXC1*inverseClock)
+CBOX12 Frequency [GHz] 1.E-09*CBOX12C0/(FIXC1*inverseClock)
+CBOX13 Frequency [GHz] 1.E-09*CBOX13C0/(FIXC1*inverseClock)
+CBOX14 Frequency [GHz] 1.E-09*CBOX14C0/(FIXC1*inverseClock)
+MBOX0 Frequency [GHz] 1.E-09*MBOX0C0/(FIXC1*inverseClock)
+MBOX0FIX Frequency [GHz] 1.E-09*MBOX0FIX/(FIXC1*inverseClock)
+MBOX1 Frequency [GHz] 1.E-09*MBOX1C0/(FIXC1*inverseClock)
+MBOX1FIX Frequency [GHz] 1.E-09*MBOX1FIX/(FIXC1*inverseClock)
+MBOX2 Frequency [GHz] 1.E-09*MBOX2C0/(FIXC1*inverseClock)
+MBOX2FIX Frequency [GHz] 1.E-09*MBOX2FIX/(FIXC1*inverseClock)
+MBOX3 Frequency [GHz] 1.E-09*MBOX3C0/(FIXC1*inverseClock)
+MBOX3FIX Frequency [GHz] 1.E-09*MBOX3FIX/(FIXC1*inverseClock)
+SBOX0 Frequency [GHz] 1.E-09*SBOX0C0/(FIXC1*inverseClock)
+SBOX1 Frequency [GHz] 1.E-09*SBOX1C0/(FIXC1*inverseClock)
+SBOX2 Frequency [GHz] 1.E-09*SBOX2C0/(FIXC1*inverseClock)
+UBOX Frequency [GHz] 1.E-09*UBOXFIX/(FIXC1*inverseClock)
+BBOX0 Frequency [GHz] 1.E-09*BBOX0C0/(FIXC1*inverseClock)
+BBOX1 Frequency [GHz] 1.E-09*BBOX1C0/(FIXC1*inverseClock)
+WBOX Frequency [GHz] 1.E-09*WBOX0/(FIXC1*inverseClock)
+PBOX Frequency [GHz] 1.E-09*PBOX0/(FIXC1*inverseClock)
+RBOX0 Frequency [GHz] 1.E-09*RBOX0C0/(FIXC1*inverseClock)
+RBOX1 Frequency [GHz] 1.E-09*RBOX1C0/(FIXC1*inverseClock)
+RBOX2 Frequency [GHz] 1.E-09*RBOX2C0/(FIXC1*inverseClock)
+IBOX Frequency [GHz] 1.E-09*IBOX0/(FIXC1*inverseClock)
+
+
+LONG
+Formulas:
diff --git a/groups/ivybridgeEP/UOPS.txt b/groups/ivybridgeEP/UOPS.txt
new file mode 100644
index 0000000..178aec5
--- /dev/null
+++ b/groups/ivybridgeEP/UOPS.txt
@@ -0,0 +1,35 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_ISSUED_ANY
+PMC1  UOPS_EXECUTED_THREAD
+PMC2  UOPS_RETIRED_ALL
+PMC3  UOPS_ISSUED_FLAGS_MERGE
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Issued UOPs PMC0
+Merged UOPs PMC3
+Executed UOPs PMC1
+Retired UOPs PMC2
+
+LONG
+Formula:
+Issued UOPs = UOPS_ISSUED_ANY
+Merged UOPs = UOPS_ISSUED_FLAGS_MERGE
+Executed UOPs = UOPS_EXECUTED_THREAD
+Retired UOPs = UOPS_RETIRED_ALL
+-
+This group returns information about the instruction pipeline. It reports the issued,
+merged, executed and retired uOPs, from which the number of uOPs issued but not
+executed and the number of uOPs executed but never retired can be derived.
+The executed but not retired uOPs commonly come from speculatively executed branches.
+
diff --git a/groups/ivybridgeEP/UOPS_EXEC.txt b/groups/ivybridgeEP/UOPS_EXEC.txt
new file mode 100644
index 0000000..7042df7
--- /dev/null
+++ b/groups/ivybridgeEP/UOPS_EXEC.txt
@@ -0,0 +1,31 @@
+SHORT UOPs execution
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_EXECUTED_USED_CYCLES
+PMC1  UOPS_EXECUTED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_EXECUTED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the execution stage in the pipeline. Used cycles are all cycles where uOPs are
+executed while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/ivybridgeEP/UOPS_ISSUE.txt b/groups/ivybridgeEP/UOPS_ISSUE.txt
new file mode 100644
index 0000000..9aac923
--- /dev/null
+++ b/groups/ivybridgeEP/UOPS_ISSUE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs issuing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_ISSUED_USED_CYCLES
+PMC1  UOPS_ISSUED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_ISSUED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the issue stage in the pipeline. Used cycles are all cycles where uOPs are
+issued while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/ivybridgeEP/UOPS_RETIRE.txt b/groups/ivybridgeEP/UOPS_RETIRE.txt
new file mode 100644
index 0000000..0f37585
--- /dev/null
+++ b/groups/ivybridgeEP/UOPS_RETIRE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs retirement
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_RETIRED_USED_CYCLES
+PMC1  UOPS_RETIRED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_RETIRED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the retirement stage in the pipeline (re-order buffer). Used cycles are all
+cycles where uOPs are retired while unused cycles refer to pipeline stalls.
+Moreover, the group calculates the average stall duration in cycles.
diff --git a/groups/k10/BRANCH.txt b/groups/k10/BRANCH.txt
index cbc6da6..5c4207e 100644
--- a/groups/k10/BRANCH.txt
+++ b/groups/k10/BRANCH.txt
@@ -4,29 +4,23 @@ EVENTSET
 PMC0  INSTRUCTIONS_RETIRED
 PMC1  BRANCH_RETIRED
 PMC2  BRANCH_MISPREDICT_RETIRED
-PMC3  BRANCH_TAKEN_RETIRED
 
 METRICS
 Runtime (RDTSC) [s] time
 Branch rate   PMC1/PMC0
 Branch misprediction rate  PMC2/PMC0
 Branch misprediction ratio  PMC2/PMC1
-Branch taken rate  PMC3/PMC0
-Branch taken ratio  PMC3/PMC1
 Instructions per branch  PMC0/PMC1
 
 LONG
 Formulas:
-Branch rate = BRANCH_RETIRED / INSTRUCTIONS_RETIRED
-Branch misprediction rate = BRANCH_MISPREDICT_RETIRED / INSTRUCTIONS_RETIRED
-Branch misprediction ratio = BRANCH_MISPREDICT_RETIRED / BRANCH_RETIRED
-Branch taken rate = BRANCH_TAKEN_RETIRED / INSTRUCTIONS_RETIRED
-Branch taken ratio = BRANCH_TAKEN_RETIRED / BRANCH_RETIRED
-Instructions per branch = INSTRUCTIONS_RETIRED / BRANCH_RETIRED
+Branch rate = BRANCH_RETIRED/INSTRUCTIONS_RETIRED
+Branch misprediction rate = BRANCH_MISPREDICT_RETIRED/INSTRUCTIONS_RETIRED
+Branch misprediction ratio = BRANCH_MISPREDICT_RETIRED/BRANCH_RETIRED
+Instructions per branch = INSTRUCTIONS_RETIRED/BRANCH_RETIRED
 -
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
 into relation what ration of all branch instruction where mispredicted.
-Instructions per branch is 1/Branch rate. The same applies for the branches
-taken metrics.
+Instructions per branch is 1/branch rate.
 
diff --git a/groups/k10/CACHE.txt b/groups/k10/CACHE.txt
index e70823e..26d799f 100644
--- a/groups/k10/CACHE.txt
+++ b/groups/k10/CACHE.txt
@@ -8,26 +8,26 @@ PMC3  DATA_CACHE_REFILLS_NORTHBRIDGE_ALL
 
 METRICS
 Runtime (RDTSC) [s] time
-Data cache misses PMC2+PMC3
-Data cache request rate PMC1/PMC0
-Data cache miss rate (PMC2+PMC3)/PMC0
-Data cache miss ratio (PMC2+PMC3)/PMC1
+data cache misses PMC2+PMC3
+data cache request rate PMC1/PMC0
+data cache miss rate (PMC2+PMC3)/PMC0
+data cache miss ratio (PMC2+PMC3)/PMC1
 
 LONG
 Formulas:
-Data cache misses = DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL
-Data cache request rate = DATA_CACHE_ACCESSES / INSTRUCTIONS_RETIRED
-Data cache miss rate = (DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/INSTRUCTIONS_RETIRED
-Data cache miss ratio = (DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/DATA_CACHE_ACCESSES
+data cache misses = DATA_CACHE_REFILLS_L2_ALL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL
+data cache request rate = DATA_CACHE_ACCESSES / INSTRUCTIONS_RETIRED
+data cache miss rate = (DATA_CACHE_REFILLS_L2_ALL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/INSTRUCTIONS_RETIRED
+data cache miss ratio = (DATA_CACHE_REFILLS_L2_ALL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/DATA_CACHE_ACCESSES
 -
 This group measures the locality of your data accesses with regard to the
-L1 Cache. Data cache request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The Data cache miss rate gives a measure how often it was necessary to get
-cachelines from higher levels of the memory hierarchy. And finally 
-Data cache miss ratio tells you how many of your memory references required
-a cacheline to be loaded from a higher level. While the Data cache miss rate 
-might be given by your algorithm you should try to get Data cache miss ratio
+L1 cache. Data cache request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The data cache miss rate gives a measure how often it was necessary to get
+cache lines from higher levels of the memory hierarchy. And finally
+data cache miss ratio tells you how many of your memory references required
+a cache line to be loaded from a higher level. While the data cache miss rate
+might be given by your algorithm you should try to get data cache miss ratio
 as low as possible by increasing your cache reuse.
 This group was taken from the whitepaper -Basic Performance Measurements for AMD Athlon 64,
 AMD Opteron and AMD Phenom Processors- from Paul J. Drongowski.
diff --git a/groups/k10/CPI.txt b/groups/k10/CPI.txt
index 6595c2d..850afed 100644
--- a/groups/k10/CPI.txt
+++ b/groups/k10/CPI.txt
@@ -13,6 +13,11 @@ CPI (based on uops)   PMC1/PMC2
 IPC   PMC0/PMC1
 
 LONG
+Formulas:
+CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS
+CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS
+IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED
+-
 This group measures how efficient the processor works with
 regard to instruction throughput. Also important as a standalone
 metric is INSTRUCTIONS_RETIRED as it tells you how many instruction
diff --git a/groups/k10/FLOPS_DP.txt b/groups/k10/FLOPS_DP.txt
index 4eccf8b..aa05d77 100644
--- a/groups/k10/FLOPS_DP.txt
+++ b/groups/k10/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
 
 EVENTSET
 PMC0  SSE_RETIRED_ADD_DOUBLE_FLOPS
@@ -8,15 +8,17 @@ PMC2  CPU_CLOCKS_UNHALTED
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] PMC2*inverseClock
-DP MFlops/s    1.0E-06*(PMC0+PMC1)/time
-DP Add MFlops/s    1.0E-06*PMC0/time
-DP Mult MFlops/s    1.0E-06*PMC1/time
+DP MFLOP/s    1.0E-06*(PMC0+PMC1)/time
+DP Add MFLOP/s    1.0E-06*PMC0/time
+DP Mult MFLOP/s    1.0E-06*PMC1/time
 
 LONG
 Formulas:
-DP MFlops/s = 1.0E-06*(SSE_RETIRED_ADD_DOUBLE_FLOPS+SSE_RETIRED_MULT_DOUBLE_FLOPS)/time
+DP MFLOP/s = 1.0E-06*(SSE_RETIRED_ADD_DOUBLE_FLOPS+SSE_RETIRED_MULT_DOUBLE_FLOPS)/time
+DP Add MFLOP/s = 1.0E-06*(SSE_RETIRED_ADD_DOUBLE_FLOPS)/time
+DP Mult MFLOP/s = 1.0E-06*(SSE_RETIRED_MULT_DOUBLE_FLOPS)/time
 -
-Profiling group to measure double SSE flops.
-Dont forget that your code might also execute X87 flops.
+Profiling group to measure double SSE FLOPs.
+Don't forget that your code might also execute X87 FLOPs.
 
 
diff --git a/groups/k10/FLOPS_SP.txt b/groups/k10/FLOPS_SP.txt
index 7a0bd52..8869557 100644
--- a/groups/k10/FLOPS_SP.txt
+++ b/groups/k10/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
 
 EVENTSET
 PMC0  SSE_RETIRED_ADD_SINGLE_FLOPS
@@ -8,15 +8,17 @@ PMC2  CPU_CLOCKS_UNHALTED
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] PMC2*inverseClock
-SP MFlops/s  1.0E-06*(PMC0+PMC1)/time
-SP Add MFlops/s  1.0E-06*PMC0/time
-SP Mult MFlops/s   1.0E-06*PMC1/time
+SP MFLOP/s  1.0E-06*(PMC0+PMC1)/time
+SP Add MFLOP/s  1.0E-06*PMC0/time
+SP Mult MFLOP/s   1.0E-06*PMC1/time
 
 LONG
 Formulas:
-SP MFlops/s = 1.0E-06*(SSE_RETIRED_ADD_SINGLE_FLOPS+SSE_RETIRED_MULT_SINGLE_FLOPS)/time
+SP MFLOP/s = 1.0E-06*(SSE_RETIRED_ADD_SINGLE_FLOPS+SSE_RETIRED_MULT_SINGLE_FLOPS)/time
+SP Add MFLOP/s = 1.0E-06*(SSE_RETIRED_ADD_SINGLE_FLOPS)/time
+SP Mult MFLOP/s = 1.0E-06*(SSE_RETIRED_MULT_SINGLE_FLOPS)/time
 -
-Profiling group to measure single precision SSE flops.
-Dont forget that your code might also execute X87 flops.
+Profiling group to measure single precision SSE FLOPs.
+Don't forget that your code might also execute X87 FLOPs.
 
 
diff --git a/groups/k10/FLOPS_X87.txt b/groups/k10/FLOPS_X87.txt
index 9a585b4..015ee19 100644
--- a/groups/k10/FLOPS_X87.txt
+++ b/groups/k10/FLOPS_X87.txt
@@ -1,4 +1,4 @@
-SHORT X87 MFlops/s
+SHORT X87 MFLOP/s
 
 EVENTSET
 PMC0  X87_FLOPS_RETIRED_ADD
@@ -9,11 +9,17 @@ PMC3  CPU_CLOCKS_UNHALTED
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] PMC3*inverseClock
-X87 MFlops/s       1.0E-06*(PMC0+PMC1+PMC2)/time
-X87 Add MFlops/s    1.0E-06*PMC0/time
-X87 Mult MFlops/s   1.0E-06*PMC1/time
-X87 Div MFlops/s    1.0E-06*PMC2/time
+X87 MFLOP/s       1.0E-06*(PMC0+PMC1+PMC2)/time
+X87 Add MFLOP/s    1.0E-06*PMC0/time
+X87 Mult MFLOP/s   1.0E-06*PMC1/time
+X87 Div MFLOP/s    1.0E-06*PMC2/time
 
 LONG
-Profiling group to measure X87 flop rates.
+Formulas:
+X87 MFLOP/s = 1.0E-06*(X87_FLOPS_RETIRED_ADD+X87_FLOPS_RETIRED_MULT+X87_FLOPS_RETIRED_DIV)/time
+X87 Add MFLOP/s = 1.0E-06*X87_FLOPS_RETIRED_ADD/time
+X87 Mult MFLOP/s = 1.0E-06*X87_FLOPS_RETIRED_MULT/time
+X87 Div MFLOP/s = 1.0E-06*X87_FLOPS_RETIRED_DIV/time
+-
+Profiling group to measure X87 FLOP rates.
 
diff --git a/groups/k10/FPU_EXCEPTION.txt b/groups/k10/FPU_EXCEPTION.txt
index eff87fc..23d3c54 100644
--- a/groups/k10/FPU_EXCEPTION.txt
+++ b/groups/k10/FPU_EXCEPTION.txt
@@ -15,7 +15,7 @@ Formulas:
 Overall FP exception rate = FPU_EXCEPTIONS_ALL / INSTRUCTIONS_RETIRED
 FP exception rate = FPU_EXCEPTIONS_ALL / FP_INSTRUCTIONS_RETIRED_ALL
 -
-Floating point exceptions occur e.g. on the treatment of Denormals.
+Floating point exceptions occur e.g. on the treatment of denormal numbers.
 There might be a large penalty if there are too many floating point
 exceptions.
 
diff --git a/groups/k10/ICACHE.txt b/groups/k10/ICACHE.txt
index 222ea5d..5150496 100644
--- a/groups/k10/ICACHE.txt
+++ b/groups/k10/ICACHE.txt
@@ -8,18 +8,16 @@ PMC3  ICACHE_REFILLS_MEM
 
 METRICS
 Runtime (RDTSC) [s] time
-Instruction cache misses  PMC2+PMC3
-Instruction cache request rate   PMC1/PMC0
-Instruction cache miss rate    (PMC2+PMC3)/PMC0
-Instruction cache miss ratio   (PMC2+PMC3)/PMC1
+L1I request rate   PMC1/PMC0
+L1I miss rate    (PMC2+PMC3)/PMC0
+L1I miss ratio   (PMC2+PMC3)/PMC1
 
 LONG
 Formulas:
-Instruction cache misses ICACHE_REFILLS_L2 + ICACHE_REFILLS_MEM
-Instruction cache request rate ICACHE_FETCHES / INSTRUCTIONS_RETIRED
-Instruction cache miss rate  (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/INSTRUCTIONS_RETIRED
-Instruction cache miss ratio (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/ICACHE_FETCHES
+L1I request rate = ICACHE_FETCHES / INSTRUCTIONS_RETIRED
+L1I miss rate = (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/INSTRUCTIONS_RETIRED
+L1I miss ratio = (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/ICACHE_FETCHES
 -
 This group measures the locality of your instruction code with regard to the
-L1 I-Cache. 
+L1 I-Cache.
 
diff --git a/groups/k10/L2.txt b/groups/k10/L2.txt
index 8b61bcc..fae6fb0 100644
--- a/groups/k10/L2.txt
+++ b/groups/k10/L2.txt
@@ -8,21 +8,25 @@ PMC2  CPU_CLOCKS_UNHALTED
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s]   PMC2*inverseClock
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
 L2 bandwidth [MBytes/s]   1.0E-06*(PMC0+PMC1)*64.0/time
 L2 data volume [GBytes]   1.0E-09*(PMC0+PMC1)*64.0
-L2 refill bandwidth [MBytes/s]   1.0E-06*PMC0*64.0/time
-L2 evict [MBytes/s]    1.0E-06*PMC1*64.0/time
 
 LONG
 Formulas:
-L2 bandwidth [MBytes/s]   1.0E-06*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64/time
-L2 data volume [GBytes]   1.0E-09*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64
-L2 refill bandwidth [MBytes/s]   1.0E-06*DATA_CACHE_REFILLS_L2_ALL*64/time
-L2 evict [MBytes/s]    1.0E-06*DATA_CACHE_EVICTED_ALL*64/time
+L2D load bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_REFILLS_L2_ALL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*DATA_CACHE_REFILLS_L2_ALL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_EVICTED_ALL*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*DATA_CACHE_EVICTED_ALL*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64/time
+L2 data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is
-computed by the number of cacheline loaded from L2 to L1 and the
-number of modified cachelines evicted from the L1. 
+computed by the number of cache lines loaded from L2 to L1 and the
+number of modified cache lines evicted from the L1.
 Note that this bandwidth also includes data transfers due to a
 write allocate load on a store miss in L1 and copy back transfers if
 originated from L2.
diff --git a/groups/k10/L2CACHE.txt b/groups/k10/L2CACHE.txt
index d384c48..2d29e43 100644
--- a/groups/k10/L2CACHE.txt
+++ b/groups/k10/L2CACHE.txt
@@ -19,13 +19,13 @@ L2 miss rate  = L2_MISSES_ALL/INSTRUCTIONS_RETIRED
 L2 miss ratio = L2_MISSES_ALL/(L2_REQUESTS_ALL+L2_FILL_ALL)
 -
 This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
 The L2 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
 This group was taken from the whitepaper -Basic Performance Measurements for AMD Athlon 64,
 AMD Opteron and AMD Phenom Processors- from Paul J. Drongowski.
 
diff --git a/groups/k10/L3CACHE.txt b/groups/k10/L3CACHE.txt
index 85b4522..e3a2d72 100644
--- a/groups/k10/L3CACHE.txt
+++ b/groups/k10/L3CACHE.txt
@@ -13,20 +13,20 @@ L3 miss ratio  PMC2/PMC1
 
 LONG
 Formulas:
-L3 request rate =  L3_READ_REQUEST_ALL_ALL_CORES / INSTRUCTIONS_RETIRED
-L3 miss rate  = L3_MISSES_ALL_ALL_CORES / INSTRUCTIONS_RETIRED
-L3 miss ratio =  L3_MISSES_ALL_ALL_CORES / L3_READ_REQUEST_ALL_ALL_CORES
+L3 request rate =  L3_READ_REQUEST_ALL_ALL_CORES/INSTRUCTIONS_RETIRED
+L3 miss rate = L3_MISSES_ALL_ALL_CORES/INSTRUCTIONS_RETIRED
+L3 miss ratio =  L3_MISSES_ALL_ALL_CORES/L3_READ_REQUEST_ALL_ALL_CORES
 -
 This group measures the locality of your data accesses with regard to the
-L3 Cache. L3 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
 The L3 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L3 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
 Note: As this group measures the accesses from all cores it only makes sense
-to measure with one core per socket, similiar as with the Intel Nehalem Uncore events.
+to measure with one core per socket, similar to the Intel Nehalem Uncore events.
 This group was taken from the whitepaper -Basic Performance Measurements for AMD Athlon 64,
 AMD Opteron and AMD Phenom Processors- from Paul J. Drongowski.
 
diff --git a/groups/k10/MEM.txt b/groups/k10/MEM.txt
index b6c9f33..f9f5a91 100644
--- a/groups/k10/MEM.txt
+++ b/groups/k10/MEM.txt
@@ -8,19 +8,28 @@ PMC3  DRAM_ACCESSES_DCT1_ALL
 
 METRICS
 Runtime (RDTSC) [s] time
-Read data bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
-Write data bandwidth [MBytes/s]  1.0E-06*PMC1*8.0/time
+Memory read bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+Memory read data volume [GBytes]  1.0E-09*PMC0*64.0
+Memory write bandwidth [MBytes/s]  1.0E-06*PMC1*8.0/time
+Memory write data volume [GBytes]  1.0E-09*PMC1*8.0
 Memory bandwidth [MBytes/s]   1.0E-06*(PMC2+PMC3)*64.0/time
 Memory data volume [GBytes]   1.0E-09*(PMC2+PMC3)*64.0
 
 LONG
 Formulas:
-Read data bandwidth (MBytes/s)  1.0E-06*NORTHBRIDGE_READ_RESPONSE_ALL*64/time
-Write data bandwidth (MBytes/s)  1.0E-06*OCTWORDS_WRITE_TRANSFERS*8/time
+Memory read bandwidth [MBytes/s] = 1.0E-06*NORTHBRIDGE_READ_RESPONSE_ALL*64/time
+Memory read data volume [GBytes] = 1.0E-09*NORTHBRIDGE_READ_RESPONSE_ALL*64
+Memory write bandwidth [MBytes/s] = 1.0E-06*OCTWORDS_WRITE_TRANSFERS*8/time
+Memory write data volume [GBytes] = 1.0E-09*OCTWORDS_WRITE_TRANSFERS*8
 Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_ACCESSES_DCTO_ALL+DRAM_ACCESSES_DCT1_ALL)*64/time
 Memory data volume [GBytes] = 1.0E-09*(DRAM_ACCESSES_DCTO_ALL+DRAM_ACCESSES_DCT1_ALL)*64
 -
 Profiling group to measure memory bandwidth drawn by all cores of a socket.
 Note: As this group measures the accesses from all cores it only makes sense
-to measure with one core per socket, similiar as with the Intel Nehalem Uncore events.
+to measure with one core per socket, similar to the Intel Nehalem Uncore events.
+The memory read bandwidth contains all data from DRAM, L3, or another cache,
+including another core on the same node. The event OCTWORDS_WRITE_TRANSFERS counts
+16 Byte transfers, not 64 Byte.
+
+
 
diff --git a/groups/k10/NUMA.txt b/groups/k10/NUMA.txt
deleted file mode 100644
index 9734e3c..0000000
--- a/groups/k10/NUMA.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-SHORT Bandwidth on the Hypertransport links
-
-EVENTSET
-PMC0  CPU_TO_DRAM_LOCAL_TO_0
-PMC1  CPU_TO_DRAM_LOCAL_TO_1
-PMC2  CPU_TO_DRAM_LOCAL_TO_2
-PMC3  CPU_TO_DRAM_LOCAL_TO_3
-
-METRICS
-Runtime (RDTSC) [s] time
-Mega requests per second to Node 0   1.0E-06*PMC0/time
-Mega requests per second to Node 1   1.0E-06*PMC1/time
-Mega requests per second to Node 2   1.0E-06*PMC2/time
-Mega requests per second to Node 3   1.0E-06*PMC3/time
-
-LONG
-Formulas:
-Mega requests per second to Node X   1.0E-06*PMCX/time
--
-Profiling group to measure the traffic from local CPU to the different 
-DRAM NUMA nodes. This group allows to detect NUMA problems in a threaded 
-code. You must first determine on which memory domains your code is running.
-A code should only have significant traffic to its own memory domain.
-
-
diff --git a/groups/k10/NUMA2.txt b/groups/k10/NUMA2.txt
deleted file mode 100644
index dbfbbb0..0000000
--- a/groups/k10/NUMA2.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-SHORT Bandwidth on the Hypertransport links
-
-EVENTSET
-PMC0  CPU_TO_DRAM_LOCAL_TO_4
-PMC1  CPU_TO_DRAM_LOCAL_TO_5
-PMC2  CPU_TO_DRAM_LOCAL_TO_6
-PMC3  CPU_TO_DRAM_LOCAL_TO_7
-
-METRICS
-Runtime (RDTSC) [s] time
-Hyper Transport link0 bandwidth (MBytes/s)  1.0E-06*PMC0*4.0/time
-Hyper Transport link1 bandwidth (MBytes/s)  1.0E-06*PMC1*4.0/time
-Hyper Transport link2 bandwidth (MBytes/s)  1.0E-06*PMC2*4.0/time
-Hyper Transport link3 bandwidth (MBytes/s)  1.0E-06*PMC3*4.0/time
-
-LONG
-Formulas:
-Hyper Transport linkn bandwidth (MBytes/s)  1.0E-06*HYPERTRANSPORT_LINK0_ALL_SENT*4.0/time
--
-Profiling group to measure the bandwidth over the Hypertransport links. Can be used
-to detect NUMA problems. Usually there should be only limited traffic over the QPI 
-links for optimal performance.
-
-
diff --git a/groups/k10/NUMA_0_3.txt b/groups/k10/NUMA_0_3.txt
new file mode 100644
index 0000000..bdda6e0
--- /dev/null
+++ b/groups/k10/NUMA_0_3.txt
@@ -0,0 +1,27 @@
+SHORT Bandwidth on the Hypertransport links
+
+EVENTSET
+PMC0  CPU_TO_DRAM_LOCAL_TO_0
+PMC1  CPU_TO_DRAM_LOCAL_TO_1
+PMC2  CPU_TO_DRAM_LOCAL_TO_2
+PMC3  CPU_TO_DRAM_LOCAL_TO_3
+
+METRICS
+Runtime (RDTSC) [s] time
+Hyper Transport link0 bandwidth [MBytes/s]  1.0E-06*PMC0*4.0/time
+Hyper Transport link1 bandwidth [MBytes/s]  1.0E-06*PMC1*4.0/time
+Hyper Transport link2 bandwidth [MBytes/s]  1.0E-06*PMC2*4.0/time
+Hyper Transport link3 bandwidth [MBytes/s]  1.0E-06*PMC3*4.0/time
+
+LONG
+Formulas:
+Hyper Transport link0 bandwidth [MBytes/s]  1.0E-06*CPU_TO_DRAM_LOCAL_TO_0*4.0/time
+Hyper Transport link1 bandwidth [MBytes/s]  1.0E-06*CPU_TO_DRAM_LOCAL_TO_1*4.0/time
+Hyper Transport link2 bandwidth [MBytes/s]  1.0E-06*CPU_TO_DRAM_LOCAL_TO_2*4.0/time
+Hyper Transport link3 bandwidth [MBytes/s]  1.0E-06*CPU_TO_DRAM_LOCAL_TO_3*4.0/time
+-
+Profiling group to measure the bandwidth over the Hypertransport links. Can be used
+to detect NUMA problems. Usually there should be only limited traffic over the
+Hypertransport links for optimal performance.
+
+
diff --git a/groups/k10/NUMA_4_7.txt b/groups/k10/NUMA_4_7.txt
new file mode 100644
index 0000000..aa10be0
--- /dev/null
+++ b/groups/k10/NUMA_4_7.txt
@@ -0,0 +1,27 @@
+SHORT Bandwidth on the Hypertransport links
+
+EVENTSET
+PMC0  CPU_TO_DRAM_LOCAL_TO_4
+PMC1  CPU_TO_DRAM_LOCAL_TO_5
+PMC2  CPU_TO_DRAM_LOCAL_TO_6
+PMC3  CPU_TO_DRAM_LOCAL_TO_7
+
+METRICS
+Runtime (RDTSC) [s] time
+Hyper Transport link4 bandwidth [MBytes/s]  1.0E-06*PMC0*4.0/time
+Hyper Transport link5 bandwidth [MBytes/s]  1.0E-06*PMC1*4.0/time
+Hyper Transport link6 bandwidth [MBytes/s]  1.0E-06*PMC2*4.0/time
+Hyper Transport link7 bandwidth [MBytes/s]  1.0E-06*PMC3*4.0/time
+
+LONG
+Formulas:
+Hyper Transport link4 bandwidth [MBytes/s]  1.0E-06*CPU_TO_DRAM_LOCAL_TO_4*4.0/time
+Hyper Transport link5 bandwidth [MBytes/s]  1.0E-06*CPU_TO_DRAM_LOCAL_TO_5*4.0/time
+Hyper Transport link6 bandwidth [MBytes/s]  1.0E-06*CPU_TO_DRAM_LOCAL_TO_6*4.0/time
+Hyper Transport link7 bandwidth [MBytes/s]  1.0E-06*CPU_TO_DRAM_LOCAL_TO_7*4.0/time
+-
+Profiling group to measure the bandwidth over the Hypertransport links. Can be used
+to detect NUMA problems. Usually there should be only limited traffic over the
+Hypertransport links for optimal performance.
+
+
diff --git a/groups/k10/TLB.txt b/groups/k10/TLB.txt
index 2984491..2491c8d 100644
--- a/groups/k10/TLB.txt
+++ b/groups/k10/TLB.txt
@@ -26,10 +26,10 @@ L2 DTLB miss rate  DTLB_L2_MISS_ALL / INSTRUCTIONS_RETIRED
 L2 DTLB miss ratio DTLB_L2_MISS_ALL / (DTLB_L2_HIT_ALL+DTLB_L2_MISS_ALL)
 -
 L1 DTLB request  rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The DTLB miss  rate gives a measure how often a TLB miss occured
+or how many data accesses you have on average per instruction.
+The DTLB miss  rate gives a measure how often a TLB miss occurred
 per instruction. And finally L1 DTLB  miss ratio tells you how many
-of your memory references required caused a TLB miss in average.
+of your memory references caused a TLB miss on average.
 NOTE: The L2 metrics are only relevant if L2 DTLB request rate is equal to the L1 DTLB miss rate!
 This group was taken from the whitepaper Basic -Performance Measurements for AMD Athlon 64,
 AMD Opteron and AMD Phenom Processors- from Paul J. Drongowski.
diff --git a/groups/k8/BRANCH.txt b/groups/k8/BRANCH.txt
index 64e10cd..f465335 100644
--- a/groups/k8/BRANCH.txt
+++ b/groups/k8/BRANCH.txt
@@ -4,28 +4,22 @@ EVENTSET
 PMC0  INSTRUCTIONS_RETIRED
 PMC1  BRANCH_RETIRED
 PMC2  BRANCH_MISPREDICT_RETIRED
-PMC3  BRANCH_TAKEN_RETIRED
 
 METRICS
 Runtime (RDTSC) [s] time
 Branch rate   PMC1/PMC0
 Branch misprediction rate  PMC2/PMC0
 Branch misprediction ratio  PMC2/PMC1
-Branch taken rate  PMC3/PMC0
-Branch taken ratio  PMC3/PMC1
 Instructions per branch  PMC0/PMC1
 
 LONG
 Formulas:
-Branch rate = BRANCH_RETIRED / INSTRUCTIONS_RETIRED
-Branch misprediction rate = BRANCH_MISPREDICT_RETIRED / INSTRUCTIONS_RETIRED
-Branch misprediction ratio = BRANCH_MISPREDICT_RETIRED / BRANCH_RETIRED
-Branch taken rate = BRANCH_TAKEN_RETIRED / INSTRUCTIONS_RETIRED
-Branch taken ratio = BRANCH_TAKEN_RETIRED / BRANCH_RETIRED
-Instructions per branch = INSTRUCTIONS_RETIRED / BRANCH_RETIRED
+Branch rate = BRANCH_RETIRED/INSTRUCTIONS_RETIRED
+Branch misprediction rate = BRANCH_MISPREDICT_RETIRED/INSTRUCTIONS_RETIRED
+Branch misprediction ratio = BRANCH_MISPREDICT_RETIRED/BRANCH_RETIRED
+Instructions per branch = INSTRUCTIONS_RETIRED/BRANCH_RETIRED
 -
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
 into relation what ration of all branch instruction where mispredicted.
-Instructions per branch is 1/Branch rate. The same applies for the branches
-taken metrics.
+Instructions per branch is 1/branch rate.
diff --git a/groups/k8/CACHE.txt b/groups/k8/CACHE.txt
index ff20b5e..e5e813e 100644
--- a/groups/k8/CACHE.txt
+++ b/groups/k8/CACHE.txt
@@ -8,26 +8,26 @@ PMC3  DATA_CACHE_REFILLS_NORTHBRIDGE_ALL
 
 METRICS
 Runtime (RDTSC) [s] time
-Data cache misses PMC2+PMC3
-Data cache request rate PMC1/PMC0
-Data cache miss rate (PMC2+PMC3)/PMC0
-Data cache miss ratio (PMC2+PMC3)/PMC1
+data cache misses PMC2+PMC3
+data cache request rate PMC1/PMC0
+data cache miss rate (PMC2+PMC3)/PMC0
+data cache miss ratio (PMC2+PMC3)/PMC1
 
 LONG
 Formulas:
-Data cache misses = DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL
-Data cache request rate = DATA_CACHE_ACCESSES / INSTRUCTIONS_RETIRED
-Data cache miss rate = (DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/INSTRUCTIONS_RETIRED
-Data cache miss ratio = (DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/DATA_CACHE_ACCESSES
+data cache misses = DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL
+data cache request rate = DATA_CACHE_ACCESSES / INSTRUCTIONS_RETIRED
+data cache miss rate = (DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/INSTRUCTIONS_RETIRED
+data cache miss ratio = (DATA_CACHE_REFILLS_L2_AL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/DATA_CACHE_ACCESSES
 -
 This group measures the locality of your data accesses with regard to the
-L1 Cache. Data cache request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The Data cache miss rate gives a measure how often it was necessary to get
-cachelines from higher levels of the memory hierarchy. And finally 
-Data cache miss ratio tells you how many of your memory references required
-a cacheline to be loaded from a higher level. While the Data cache miss rate 
-might be given by your algorithm you should try to get Data cache miss ratio
+L1 cache. Data cache request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The data cache miss rate gives a measure how often it was necessary to get
+cache lines from higher levels of the memory hierarchy. And finally
+data cache miss ratio tells you how many of your memory references required
+a cache line to be loaded from a higher level. While the data cache miss rate
+might be given by your algorithm you should try to get data cache miss ratio
 as low as possible by increasing your cache reuse.
 This group was taken from the whitepaper -Basic Performance Measurements for AMD Athlon 64,
 AMD Opteron and AMD Phenom Processors- from Paul J. Drongowski.
diff --git a/groups/k8/CPI.txt b/groups/k8/CPI.txt
index 6595c2d..850afed 100644
--- a/groups/k8/CPI.txt
+++ b/groups/k8/CPI.txt
@@ -13,6 +13,11 @@ CPI (based on uops)   PMC1/PMC2
 IPC   PMC0/PMC1
 
 LONG
+Formulas:
+CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS
+CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS
+IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED
+-
 This group measures how efficient the processor works with
 regard to instruction throughput. Also important as a standalone
 metric is INSTRUCTIONS_RETIRED as it tells you how many instruction
diff --git a/groups/k8/ICACHE.txt b/groups/k8/ICACHE.txt
index 222ea5d..5150496 100644
--- a/groups/k8/ICACHE.txt
+++ b/groups/k8/ICACHE.txt
@@ -8,18 +8,16 @@ PMC3  ICACHE_REFILLS_MEM
 
 METRICS
 Runtime (RDTSC) [s] time
-Instruction cache misses  PMC2+PMC3
-Instruction cache request rate   PMC1/PMC0
-Instruction cache miss rate    (PMC2+PMC3)/PMC0
-Instruction cache miss ratio   (PMC2+PMC3)/PMC1
+L1I request rate   PMC1/PMC0
+L1I miss rate    (PMC2+PMC3)/PMC0
+L1I miss ratio   (PMC2+PMC3)/PMC1
 
 LONG
 Formulas:
-Instruction cache misses ICACHE_REFILLS_L2 + ICACHE_REFILLS_MEM
-Instruction cache request rate ICACHE_FETCHES / INSTRUCTIONS_RETIRED
-Instruction cache miss rate  (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/INSTRUCTIONS_RETIRED
-Instruction cache miss ratio (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/ICACHE_FETCHES
+L1I request rate = ICACHE_FETCHES / INSTRUCTIONS_RETIRED
+L1I miss rate = (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/INSTRUCTIONS_RETIRED
+L1I miss ratio = (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/ICACHE_FETCHES
 -
 This group measures the locality of your instruction code with regard to the
-L1 I-Cache. 
+L1 I-Cache.
 
diff --git a/groups/k8/L2.txt b/groups/k8/L2.txt
index 58eae3b..c3ad517 100644
--- a/groups/k8/L2.txt
+++ b/groups/k8/L2.txt
@@ -21,8 +21,8 @@ L2 refill bandwidth [MBytes/s]   1.0E-06*DATA_CACHE_REFILLS_L2_ALL*64/time
 L2 evict [MBytes/s]    1.0E-06*DATA_CACHE_EVICTED_ALL*64/time
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is
-computed by the number of cacheline loaded from L2 to L1 and the
-number of modified cachelines evicted from the L1. 
+computed by the number of cache lines loaded from L2 to L1 and the
+number of modified cache lines evicted from the L1.
 Note that this bandwidth also includes data transfers due to a
 write allocate load on a store miss in L1 and copy back transfers if
 originated from L2.
diff --git a/groups/kabini/BRANCH.txt b/groups/kabini/BRANCH.txt
index 1ae9f36..7495b74 100644
--- a/groups/kabini/BRANCH.txt
+++ b/groups/kabini/BRANCH.txt
@@ -4,29 +4,23 @@ EVENTSET
 PMC0  RETIRED_INSTRUCTIONS
 PMC1  RETIRED_BRANCH_INSTR
 PMC2  RETIRED_MISPREDICTED_BRANCH_INSTR
-PMC3  RETIRED_TAKEN_BRANCH_INSTR
 
 METRICS
 Runtime (RDTSC) [s] time
 Branch rate   PMC1/PMC0
 Branch misprediction rate  PMC2/PMC0
 Branch misprediction ratio  PMC2/PMC1
-Branch taken rate  PMC3/PMC0
-Branch taken ratio  PMC3/PMC1
 Instructions per branch  PMC0/PMC1
 
 LONG
 Formulas:
-Branch rate = RETIRED_BRANCH_INSTR / RETIRED_INSTRUCTIONS
-Branch misprediction rate = RETIRED_MISPREDICTED_BRANCH_INSTR / RETIRED_INSTRUCTIONS
-Branch misprediction ratio = RETIRED_MISPREDICTED_BRANCH_INSTR / RETIRED_BRANCH_INSTR
-Branch taken rate = RETIRED_TAKEN_BRANCH_INSTR / RETIRED_INSTRUCTIONS
-Branch taken ratio = RETIRED_TAKEN_BRANCH_INSTR / RETIRED_BRANCH_INSTR
-Instructions per branch = RETIRED_INSTRUCTIONS / RETIRED_BRANCH_INSTR
+Branch rate = RETIRED_BRANCH_INSTR/RETIRED_INSTRUCTIONS
+Branch misprediction rate = RETIRED_MISPREDICTED_BRANCH_INSTR/RETIRED_INSTRUCTIONS
+Branch misprediction ratio = RETIRED_MISPREDICTED_BRANCH_INSTR/RETIRED_BRANCH_INSTR
+Instructions per branch = RETIRED_INSTRUCTIONS/RETIRED_BRANCH_INSTR
 -
-The rates state how often in average a branch or a mispredicted branch occured
+The rates state how often on average a branch or a mispredicted branch occurred
 per instruction retired in total. The branch misprediction ratio sets directly
 into relation what ratio of all branch instruction where mispredicted.
-Instructions per branch is 1/Branch rate. The same applies for the branches
-taken metrics.
+Instructions per branch is 1/branch rate.
 
diff --git a/groups/kabini/CACHE.txt b/groups/kabini/CACHE.txt
index ef62f76..8a59288 100644
--- a/groups/kabini/CACHE.txt
+++ b/groups/kabini/CACHE.txt
@@ -8,25 +8,25 @@ PMC3  DATA_CACHE_REFILLS_NB_ALL
 
 METRICS
 Runtime (RDTSC) [s] time
-Data cache misses PMC2+PMC3
-Data cache request rate PMC1/PMC0
-Data cache miss rate (PMC2+PMC3)/PMC0
-Data cache miss ratio (PMC2+PMC3)/PMC1
+data cache misses PMC2+PMC3
+data cache request rate PMC1/PMC0
+data cache miss rate (PMC2+PMC3)/PMC0
+data cache miss ratio (PMC2+PMC3)/PMC1
 
 LONG
 Formulas:
-Data cache misses = DATA_CACHE_REFILLS_ALL + DATA_CACHE_REFILLS_NB_ALL
-Data cache request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS
-Data cache miss rate = (DATA_CACHE_REFILLS_ALL + DATA_CACHE_REFILLS_NB_ALL)/RETIRED_INSTRUCTIONS
-Data cache miss ratio = (DATA_CACHE_REFILLS_ALL + DATA_CACHE_REFILLS_NB_ALL)/DATA_CACHE_ACCESSES
+data cache misses = DATA_CACHE_REFILLS_ALL + DATA_CACHE_REFILLS_NB_ALL
+data cache request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS
+data cache miss rate = (DATA_CACHE_REFILLS_ALL + DATA_CACHE_REFILLS_NB_ALL)/RETIRED_INSTRUCTIONS
+data cache miss ratio = (DATA_CACHE_REFILLS_ALL + DATA_CACHE_REFILLS_NB_ALL)/DATA_CACHE_ACCESSES
 -
 This group measures the locality of your data accesses with regard to the
-L1 Cache. Data cache request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The Data cache miss rate gives a measure how often it was necessary to get
-cachelines from higher levels of the memory hierarchy. And finally 
-Data cache miss ratio tells you how many of your memory references required
-a cacheline to be loaded from a higher level. While the Data cache miss rate 
-might be given by your algorithm you should try to get Data cache miss ratio
+L1 cache. Data cache request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The data cache miss rate gives a measure how often it was necessary to get
+cache lines from higher levels of the memory hierarchy. And finally
+data cache miss ratio tells you how many of your memory references required
+a cache line to be loaded from a higher level. While the data cache miss rate
+might be given by your algorithm you should try to get data cache miss ratio
 as low as possible by increasing your cache reuse.
 
diff --git a/groups/kabini/CPI.txt b/groups/kabini/CPI.txt
index 47711f4..c0746e7 100644
--- a/groups/kabini/CPI.txt
+++ b/groups/kabini/CPI.txt
@@ -13,6 +13,11 @@ CPI (based on uops)   PMC1/PMC2
 IPC   PMC0/PMC1
 
 LONG
+Formulas:
+CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS
+CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS
+IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED
+-
 This group measures how efficient the processor works with
 regard to instruction throughput. Also important as a standalone
 metric is RETIRED_INSTRUCTIONS as it tells you how many instruction
diff --git a/groups/kabini/DATA.txt b/groups/kabini/DATA.txt
index 78e4c3c..75f1f60 100644
--- a/groups/kabini/DATA.txt
+++ b/groups/kabini/DATA.txt
@@ -6,11 +6,11 @@ PMC1  LS_DISPATCH_STORES
 
 METRICS
 Runtime (RDTSC) [s] time
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to Store ratio = LS_DISPATCH_LOADS / LS_DISPATCH_STORES
+Load to store ratio = LS_DISPATCH_LOADS/LS_DISPATCH_STORES
 -
 This is a simple metric to determine your load to store ratio.
 
diff --git a/groups/kabini/FLOPS_DP.txt b/groups/kabini/FLOPS_DP.txt
index d7f5f57..d6af2e2 100644
--- a/groups/kabini/FLOPS_DP.txt
+++ b/groups/kabini/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
 
 EVENTSET
 PMC0  RETIRED_INSTRUCTIONS
@@ -9,15 +9,18 @@ PMC3  RETIRED_FLOPS_DOUBLE_ALL
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s]  PMC1*inverseClock
-MFlops/s    1.0E-06*(PMC3)/time
+DP MFLOP/s    1.0E-06*(PMC3)/time
 CPI   PMC1/PMC0
 CPI (based on uops)   PMC1/PMC2
 IPC   PMC0/PMC1
 
 LONG
 Formulas:
-DP MFlops/s = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
+DP MFLOP/s = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
+CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS
+CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS
+IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED
 -
-Profiling group to measure double precisision flop rate.
+Profiling group to measure double precision FLOP rate.
 
 
diff --git a/groups/kabini/FLOPS_SP.txt b/groups/kabini/FLOPS_SP.txt
index 1c4dcc3..0fe4e54 100644
--- a/groups/kabini/FLOPS_SP.txt
+++ b/groups/kabini/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
 
 EVENTSET
 PMC0  RETIRED_INSTRUCTIONS
@@ -9,15 +9,18 @@ PMC3  RETIRED_FLOPS_SINGLE_ALL
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s]  PMC1*inverseClock
-MFlops/s    1.0E-06*(PMC3)/time
+SP MFLOP/s    1.0E-06*(PMC3)/time
 CPI   PMC1/PMC0
 CPI (based on uops)   PMC1/PMC2
 IPC   PMC0/PMC1
 
 LONG
 Formulas:
-SP MFlops/s = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
+SP MFLOP/s = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
+CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS
+CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS
+IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED
 -
-Profiling group to measure single precision flop rate.
+Profiling group to measure single precision FLOP rate.
 
 
diff --git a/groups/kabini/FPU_EXCEPTION.txt b/groups/kabini/FPU_EXCEPTION.txt
index 23814da..5ed02c6 100644
--- a/groups/kabini/FPU_EXCEPTION.txt
+++ b/groups/kabini/FPU_EXCEPTION.txt
@@ -15,7 +15,7 @@ Formulas:
 Overall FP exception rate = FPU_EXCEPTIONS_ALL / RETIRED_INSTRUCTIONS
 FP exception rate = FPU_EXCEPTIONS_ALL / FP_INSTRUCTIONS_RETIRED_ALL
 -
-Floating point exceptions occur e.g. on the treatment of Denormals.
+Floating point exceptions occur e.g. on the treatment of denormal numbers.
 There might be a large penalty if there are too many floating point
 exceptions.
 
diff --git a/groups/kabini/ICACHE.txt b/groups/kabini/ICACHE.txt
index be5e5f5..62b91d6 100644
--- a/groups/kabini/ICACHE.txt
+++ b/groups/kabini/ICACHE.txt
@@ -8,18 +8,16 @@ PMC3  RETIRED_INSTRUCTIONS
 
 METRICS
 Runtime (RDTSC) [s] time
-Instruction cache misses  PMC1+PMC2
-Instruction cache request rate   PMC0/PMC3
-Instruction cache miss rate    (PMC1+PMC2)/PMC3
-Instruction cache miss ratio   (PMC1+PMC2)/PMC0
+L1I request rate   PMC0/PMC3
+L1I miss rate    (PMC1+PMC2)/PMC3
+L1I miss ratio   (PMC1+PMC2)/PMC0
 
 LONG
 Formulas:
-Instruction cache misses INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS
-Instruction cache request rate INSTRUCTION_CACHE_FETCHES / RETIRED_INSTRUCTIONS
-Instruction cache miss rate  (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS
-Instruction cache miss ratio (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/INSTRUCTION_CACHE_FETCHES
+L1I request rate = INSTRUCTION_CACHE_FETCHES / RETIRED_INSTRUCTIONS
+L1I miss rate = (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS
+L1I miss ratio = (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/INSTRUCTION_CACHE_FETCHES
 -
 This group measures the locality of your instruction code with regard to the
-L1 I-Cache. 
+L1 I-Cache.
 
diff --git a/groups/kabini/L2.txt b/groups/kabini/L2.txt
index d06d809..3598a54 100644
--- a/groups/kabini/L2.txt
+++ b/groups/kabini/L2.txt
@@ -8,21 +8,25 @@ PMC2  CPU_CLOCKS_UNHALTED
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s]   PMC2*inverseClock
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
 L2 bandwidth [MBytes/s]   1.0E-06*(PMC0+PMC1)*64.0/time
 L2 data volume [GBytes]   1.0E-09*(PMC0+PMC1)*64.0
-L2 refill bandwidth [MBytes/s]   1.0E-06*PMC0*64.0/time
-L2 evict [MBytes/s]    1.0E-06*PMC1*64.0/time
 
 LONG
 Formulas:
-L2 bandwidth [MBytes/s]   1.0E-06*(DATA_CACHE_REFILLS_ALL+DATA_CACHE_EVICTED_ALL)*64/time
-L2 data volume [GBytes]   1.0E-09*(DATA_CACHE_REFILLS_ALL+DATA_CACHE_EVICTED_ALL)*64
-L2 refill bandwidth [MBytes/s]   1.0E-06*DATA_CACHE_REFILLS_ALL*64/time
-L2 evict [MBytes/s]    1.0E-06*DATA_CACHE_EVICTED_ALL*64/time
+L2D load bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_REFILLS_ALL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*DATA_CACHE_REFILLS_ALL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_EVICTED_ALL*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*DATA_CACHE_EVICTED_ALL*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_ALL+DATA_CACHE_EVICTED_ALL)*64/time
+L2 data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_ALL+DATA_CACHE_EVICTED_ALL)*64
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is
-computed by the number of cacheline loaded from L2 to L1 and the
-number of modified cachelines evicted from the L1. 
+computed by the number of cache lines loaded from L2 to L1 and the
+number of modified cache lines evicted from the L1.
 Note that this bandwidth also includes data transfers due to a
 write allocate load on a store miss in L1 and copy back transfers if
 originated from L2.
diff --git a/groups/kabini/MEM.txt b/groups/kabini/MEM.txt
index 22aa19e..2fa9dfe 100644
--- a/groups/kabini/MEM.txt
+++ b/groups/kabini/MEM.txt
@@ -16,5 +16,5 @@ Memory data volume [GBytes] = 1.0E-09*(DRAM_ACCESSES_DCTO_ALL+DRAM_ACCESSES_DCT1
 -
 Profiling group to measure memory bandwidth drawn by all cores of a socket.
 Note: As this group measures the accesses from all cores it only makes sense
-to measure with one core per socket, similiar as with the Intel Nehalem Uncore events.
+to measure with one core per socket, similar as with the Intel Nehalem Uncore events.
 
diff --git a/groups/kabini/NUMA.txt b/groups/kabini/NUMA.txt
deleted file mode 100644
index d94e735..0000000
--- a/groups/kabini/NUMA.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-SHORT Read/Write Events between the ccNUMA nodes
-
-EVENTSET
-UPMC0  UNC_CPU_TO_DRAM_LOCAL_TO_0
-UPMC1  UNC_CPU_TO_DRAM_LOCAL_TO_1
-UPMC2  UNC_CPU_TO_DRAM_LOCAL_TO_2
-UPMC3  UNC_CPU_TO_DRAM_LOCAL_TO_3
-
-METRICS
-Runtime (RDTSC) [s] time
-DRAM read/write local to 0 [MegaEvents/s]  1.0E-06*UPMC0/time
-DRAM read/write local to 1 [MegaEvents/s]  1.0E-06*UPMC1/time
-DRAM read/write local to 2 [MegaEvents/s]  1.0E-06*UPMC2/time
-DRAM read/write local to 3 [MegaEvents/s]  1.0E-06*UPMC3/time
-
-LONG
-Formulas:
-DRAM read/write local to 0 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_0/time
-DRAM read/write local to 1 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/time
-DRAM read/write local to 2 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time
-DRAM read/write local to 3 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time
--
-Profiling group to measure the traffic from local CPU to the different 
-DRAM NUMA nodes. This group allows to detect NUMA problems in a threaded 
-code. You must first determine on which memory domains your code is running.
-A code should only have significant traffic to its own memory domain.
-
-
diff --git a/groups/kabini/NUMA2.txt b/groups/kabini/NUMA2.txt
deleted file mode 100644
index b10e6fb..0000000
--- a/groups/kabini/NUMA2.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-SHORT Read/Write Events between the ccNUMA nodes
-
-EVENTSET
-UPMC0  UNC_CPU_TO_DRAM_LOCAL_TO_4
-UPMC1  UNC_CPU_TO_DRAM_LOCAL_TO_5
-UPMC2  UNC_CPU_TO_DRAM_LOCAL_TO_6
-UPMC3  UNC_CPU_TO_DRAM_LOCAL_TO_7
-
-METRICS
-Runtime (RDTSC) [s] time
-DRAM read/write local to 4 [MegaEvents/s]  1.0E-06*UPMC0/time
-DRAM read/write local to 5 [MegaEvents/s]  1.0E-06*UPMC1/time
-DRAM read/write local to 6 [MegaEvents/s]  1.0E-06*UPMC2/time
-DRAM read/write local to 7 [MegaEvents/s]  1.0E-06*UPMC3/time
-
-LONG
-Formulas:
-DRAM read/write local to 4 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_4/time
-DRAM read/write local to 5 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_5/time
-DRAM read/write local to 6 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_6/time
-DRAM read/write local to 7 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_7/time
--
-Profiling group to measure the traffic from local CPU to the different 
-DRAM NUMA nodes. This group allows to detect NUMA problems in a threaded 
-code. You must first determine on which memory domains your code is running.
-A code should only have significant traffic to its own memory domain.
-
-
diff --git a/groups/kabini/NUMA_0_3.txt b/groups/kabini/NUMA_0_3.txt
new file mode 100644
index 0000000..ed13dbe
--- /dev/null
+++ b/groups/kabini/NUMA_0_3.txt
@@ -0,0 +1,28 @@
+SHORT Read/Write Events between the ccNUMA nodes
+
+EVENTSET
+UPMC0  UNC_CPU_TO_DRAM_LOCAL_TO_0
+UPMC1  UNC_CPU_TO_DRAM_LOCAL_TO_1
+UPMC2  UNC_CPU_TO_DRAM_LOCAL_TO_2
+UPMC3  UNC_CPU_TO_DRAM_LOCAL_TO_3
+
+METRICS
+Runtime (RDTSC) [s] time
+DRAM read/write local to 0 [MegaEvents/s]  1.0E-06*UPMC0/time
+DRAM read/write local to 1 [MegaEvents/s]  1.0E-06*UPMC1/time
+DRAM read/write local to 2 [MegaEvents/s]  1.0E-06*UPMC2/time
+DRAM read/write local to 3 [MegaEvents/s]  1.0E-06*UPMC3/time
+
+LONG
+Formulas:
+DRAM read/write local to 0 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_0/time
+DRAM read/write local to 1 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/time
+DRAM read/write local to 2 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time
+DRAM read/write local to 3 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time
+-
+Profiling group to measure the traffic from local CPU to the different
+DRAM NUMA nodes. This group allows to detect NUMA problems in a threaded
+code. You must first determine on which memory domains your code is running.
+A code should only have significant traffic to its own memory domain.
+
+
diff --git a/groups/kabini/NUMA_4_7.txt b/groups/kabini/NUMA_4_7.txt
new file mode 100644
index 0000000..b744881
--- /dev/null
+++ b/groups/kabini/NUMA_4_7.txt
@@ -0,0 +1,28 @@
+SHORT Read/Write Events between the ccNUMA nodes
+
+EVENTSET
+UPMC0  UNC_CPU_TO_DRAM_LOCAL_TO_4
+UPMC1  UNC_CPU_TO_DRAM_LOCAL_TO_5
+UPMC2  UNC_CPU_TO_DRAM_LOCAL_TO_6
+UPMC3  UNC_CPU_TO_DRAM_LOCAL_TO_7
+
+METRICS
+Runtime (RDTSC) [s] time
+DRAM read/write local to 4 [MegaEvents/s]  1.0E-06*UPMC0/time
+DRAM read/write local to 5 [MegaEvents/s]  1.0E-06*UPMC1/time
+DRAM read/write local to 6 [MegaEvents/s]  1.0E-06*UPMC2/time
+DRAM read/write local to 7 [MegaEvents/s]  1.0E-06*UPMC3/time
+
+LONG
+Formulas:
+DRAM read/write local to 4 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_4/time
+DRAM read/write local to 5 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_5/time
+DRAM read/write local to 6 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_6/time
+DRAM read/write local to 7 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_7/time
+-
+Profiling group to measure the traffic from local CPU to the different
+DRAM NUMA nodes. This group allows to detect NUMA problems in a threaded
+code. You must first determine on which memory domains your code is running.
+A code should only have significant traffic to its own memory domain.
+
+
diff --git a/groups/kabini/TLB.txt b/groups/kabini/TLB.txt
index 4f170ee..707f888 100644
--- a/groups/kabini/TLB.txt
+++ b/groups/kabini/TLB.txt
@@ -26,8 +26,9 @@ L2 DTLB miss rate  DTLB_MISS_ALL / RETIRED_INSTRUCTIONS
 L2 DTLB miss ratio DTLB_MISS_ALL / (L2_DTLB_HIT_ALL+DTLB_MISS_ALL)
 -
 L1 DTLB request  rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The DTLB miss  rate gives a measure how often a TLB miss occured
+or how many data accesses you have on average per instruction.
+The DTLB miss  rate gives a measure how often a TLB miss occurred
 per instruction. And finally L1 DTLB  miss ratio tells you how many
-of your memory references required caused a TLB miss in average.
-NOTE: The L2 metrics are only relevant if L2 DTLB request rate is equal to the L1 DTLB miss rate!
+of your memory references caused a TLB miss on average.
+NOTE: The L2 metrics are only relevant if L2 DTLB request rate is
+equal to the L1 DTLB miss rate!
diff --git a/groups/nehalem/BRANCH.txt b/groups/nehalem/BRANCH.txt
index 3d81416..1ef9f11 100644
--- a/groups/nehalem/BRANCH.txt
+++ b/groups/nehalem/BRANCH.txt
@@ -19,13 +19,13 @@ Instructions per branch  FIXC0/PMC0
 
 LONG
 Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
 -
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
 into relation what ration of all branch instruction where mispredicted.
-Instructions per branch is 1/Branch rate.
+Instructions per branch is 1/branch rate.
 
diff --git a/groups/nehalem/CACHE.txt b/groups/nehalem/CACHE.txt
index c3e989c..6603171 100644
--- a/groups/nehalem/CACHE.txt
+++ b/groups/nehalem/CACHE.txt
@@ -12,24 +12,25 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Data cache misses PMC0
-Data cache request rate PMC1/FIXC0
-Data cache miss rate PMC0/FIXC0
-Data cache miss ratio PMC0/PMC1
+data cache misses PMC0
+data cache request rate PMC1/FIXC0
+data cache miss rate PMC0/FIXC0
+data cache miss ratio PMC0/PMC1
 
 LONG
 Formulas:
-Data cache request rate =  L1D_ALL_REF_ANY / INSTR_RETIRED_ANY
-Data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
-Data cache miss ratio =  L1D_REPL / L1D_ALL_REF_ANY
+data cache misses = L1D_REPL
+data cache request rate =  L1D_ALL_REF_ANY / INSTR_RETIRED_ANY
+data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
+data cache miss ratio =  L1D_REPL / L1D_ALL_REF_ANY
 -
 This group measures the locality of your data accesses with regard to the
-L1 Cache. Data cache request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The Data cache miss rate gives a measure how often it was necessary to get
-cachelines from higher levels of the memory hierarchy. And finally 
-Data cache miss ratio tells you how many of your memory references required
-a cacheline to be loaded from a higher level. While the Data cache miss rate 
-might be given by your algorithm you should try to get Data cache miss ratio
+L1 cache. Data cache request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The data cache miss rate gives a measure how often it was necessary to get
+cache lines from higher levels of the memory hierarchy. And finally
+data cache miss ratio tells you how many of your memory references required
+a cache line to be loaded from a higher level. While the data cache miss rate
+might be given by your algorithm you should try to get data cache miss ratio
 as low as possible by increasing your cache reuse.
 
diff --git a/groups/nehalem/DATA.txt b/groups/nehalem/DATA.txt
index a5611bc..31bba51 100644
--- a/groups/nehalem/DATA.txt
+++ b/groups/nehalem/DATA.txt
@@ -12,11 +12,11 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to Store ratio = MEM_INST_RETIRED_LOADS / MEM_INST_RETIRED_STORES
+Load to store ratio = MEM_INST_RETIRED_LOADS/MEM_INST_RETIRED_STORES
 -
-This is a simple metric to determine your Load to store ratio.
+This is a simple metric to determine your load to store ratio.
 
diff --git a/groups/nehalem/FLOPS_DP.txt b/groups/nehalem/FLOPS_DP.txt
index c5ba91c..3e75cad 100644
--- a/groups/nehalem/FLOPS_DP.txt
+++ b/groups/nehalem/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFlops/s  1.0E-06*(PMC0*2.0+PMC1)/time
+MFLOP/s  1.0E-06*(PMC0*2.0+PMC1)/time
 Packed MUOPS/s   1.0E-06*PMC0/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,10 +22,14 @@ DP MUOPS/s 1.0E-06*PMC3/time
 
 LONG
 Formula:
-DP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
-The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
-Therefore both Single as well as Double precision are measured to ensure the correctness
+The Nehalem has no possibility to measure MFLOPs if mixed precision calculations are done.
+Therefore both single as well as double precision are measured to ensure the correctness
 of the measurements. You can check if your code was vectorized on the number of
-FP_COMP_OPS_EXE_SSE_FP_PACKED versus the  FP_COMP_OPS_EXE_SSE_FP_SCALAR.
+FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
 
diff --git a/groups/nehalem/FLOPS_SP.txt b/groups/nehalem/FLOPS_SP.txt
index 4478c8f..9768109 100644
--- a/groups/nehalem/FLOPS_SP.txt
+++ b/groups/nehalem/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1)/time
 Packed MUOPS/s   1.0E-06*PMC0/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,10 +22,14 @@ DP MUOPS/s 1.0E-06*PMC3/time
 
 LONG
 Formula:
-SP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
-The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
-Therefore both Single as well as Double precision are measured to ensure the correctness
+The Nehalem has no possibility to measure MFLOPs if mixed precision calculations are done.
+Therefore both single as well as double precision are measured to ensure the correctness
 of the measurements. You can check if your code was vectorized on the number of
-FP_COMP_OPS_EXE_SSE_FP_PACKED versus the  FP_COMP_OPS_EXE_SSE_FP_SCALAR.
+FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
 
diff --git a/groups/nehalem/FLOPS_X87.txt b/groups/nehalem/FLOPS_X87.txt
index 6447b93..a4176f0 100644
--- a/groups/nehalem/FLOPS_X87.txt
+++ b/groups/nehalem/FLOPS_X87.txt
@@ -1,4 +1,4 @@
-SHORT X87 MFlops/s
+SHORT X87 MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -11,8 +11,8 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-X87 MFlops/s  1.0E-06*PMC0/time
+X87 MFLOP/s  1.0E-06*PMC0/time
 
 LONG
-Profiling group to measure X87 flop rate.
+Profiling group to measure X87 FLOP rate.
 
diff --git a/groups/nehalem/ICACHE.txt b/groups/nehalem/ICACHE.txt
new file mode 100644
index 0000000..49943ff
--- /dev/null
+++ b/groups/nehalem/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1I_READS
+PMC1  L1I_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = L1I_READS / INSTR_RETIRED_ANY
+L1I miss rate = L1I_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = L1I_MISSES / L1I_READS
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/nehalem/L2.txt b/groups/nehalem/L2.txt
index d193047..e2715cc 100644
--- a/groups/nehalem/L2.txt
+++ b/groups/nehalem/L2.txt
@@ -6,27 +6,35 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L1D_REPL
 PMC1  L1D_M_EVICT
+PMC2  L1I_MISSES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
 
 LONG
 Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPL*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is
-computed by the number of cacheline allocated in the L1 and the 
-number of modified cachelines evicted from the L1. 
+computed by the number of cache lines allocated in the L1 and the
+number of modified cache lines evicted from the L1. Also reports on
+total data volume transferred between L2 and L1 cache.
 Note that this bandwidth also includes data transfers due to a
-write allocate load on a store miss in L1.
+write allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
+
 
diff --git a/groups/nehalem/L2CACHE.txt b/groups/nehalem/L2CACHE.txt
index 0fd60da..343b263 100644
--- a/groups/nehalem/L2CACHE.txt
+++ b/groups/nehalem/L2CACHE.txt
@@ -4,7 +4,7 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  L2_DATA_RQSTS_DEMAND_ANY
+PMC0  L2_RQSTS_REFERENCES
 PMC1  L2_RQSTS_MISS
 
 METRICS
@@ -18,17 +18,17 @@ L2 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = L2_DATA_RQSTS_DEMAND_ANY / INSTR_RETIRED_ANY
-L2 miss rate  = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_DATA_RQSTS_DEMAND_MESI
+L2 request rate = L2_RQSTS_REFERENCES/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_RQSTS_REFERENCES
 -
 This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
 The L2 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
 
 
diff --git a/groups/nehalem/L3.txt b/groups/nehalem/L3.txt
index 446afee..70b5f29 100644
--- a/groups/nehalem/L3.txt
+++ b/groups/nehalem/L3.txt
@@ -12,20 +12,24 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 Load [MBytes/s]  1.0E-06*PMC0*64.0/time
-L3 Evict [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
 L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
 L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
 
 LONG
 Formulas:
-L3 Load [MBytes/s]  1.0E-06*L2_LINES_IN_ANY*64/time
-L3 Evict [MBytes/s]  1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64/time
-L3 bandwidth [MBytes/s] 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time
-L3 data volume [GBytes] 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ANY*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ANY*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64
 -
 Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L2 and the number of modified cachelines
+number of cache lines allocated in the L2 and the number of modified cache lines
 evicted from the L2. Also reports total data volume between L3 and L2 caches.
 Note that this bandwidth also includes data transfers due to a write allocate
 load on a store miss in L2.
diff --git a/groups/nehalem/L3CACHE.txt b/groups/nehalem/L3CACHE.txt
index b6ec110..15e00ed 100644
--- a/groups/nehalem/L3CACHE.txt
+++ b/groups/nehalem/L3CACHE.txt
@@ -1,36 +1,34 @@
 SHORT L3 cache miss rate/ratio
 
 EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
+FIXC0  INSTR_RETIRED_ANY
+FIXC1  CPU_CLK_UNHALTED_CORE
+FIXC2  CPU_CLK_UNHALTED_REF
 UPMC0  UNC_L3_HITS_ANY
 UPMC1  UNC_L3_MISS_ANY
-UPMC2  UNC_L3_LINES_IN_ANY
-UPMC3  UNC_L3_LINES_OUT_ANY
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 request rate   UPMC0/FIXC0
+L3 request rate   (UPMC0+UPMC1)/FIXC0
 L3 miss rate   UPMC1/FIXC0
-L3 miss ratio  UPMC1/UPMC0
+L3 miss ratio  UPMC1/(UPMC0+UPMC1)
 
 LONG
 Formulas:
-L3 request rate  UNC_L3_HITS_ANY / INSTR_RETIRED_ANY 
-L3 miss rate   UNC_L3_MISS_ANY / INSTR_RETIRED_ANY
-L3 miss ratio  UNC_L3_MISS_ANY / UNC_L3_HITS_ANY
+L3 request rate = (UNC_L3_HITS_ANY+UNC_L3_MISS_ANY)/INSTR_RETIRED_ANY
+L3 miss rate = UNC_L3_MISS_ANY/INSTR_RETIRED_ANY
+L3 miss ratio = UNC_L3_MISS_ANY/(UNC_L3_HITS_ANY+UNC_L3_MISS_ANY)
 -
 This group measures the locality of your data accesses with regard to the
-L3 Cache. L3 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
 The L3 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L3 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
 
 
diff --git a/groups/nehalem/MEM.txt b/groups/nehalem/MEM.txt
index 087b269..d2083f5 100644
--- a/groups/nehalem/MEM.txt
+++ b/groups/nehalem/MEM.txt
@@ -1,36 +1,49 @@
 SHORT Main memory bandwidth in MBytes/s
 
 EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
+FIXC0  INSTR_RETIRED_ANY
+FIXC1  CPU_CLK_UNHALTED_CORE
+FIXC2  CPU_CLK_UNHALTED_REF
 UPMC0  UNC_QMC_NORMAL_READS_ANY
 UPMC1  UNC_QMC_WRITES_FULL_ANY
-UPMC2 UNC_QHL_REQUESTS_REMOTE_READS
-UPMC3 UNC_QHL_REQUESTS_LOCAL_READS 
-UPMC4 UNC_QHL_REQUESTS_REMOTE_WRITES 
+UPMC2  UNC_QHL_REQUESTS_REMOTE_READS
+UPMC3  UNC_QHL_REQUESTS_REMOTE_WRITES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64/time
-Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64
-Remote Read BW [MBytes/s] 1.0E-06*(UPMC2)*64/time
-Remote Write BW [MBytes/s] 1.0E-06*(UPMC4)*64/time
-Remote BW [MBytes/s] 1.0E-06*(UPMC2+UPMC4)*64/time
+Memory read bandwidth [MBytes/s] 1.0E-06*UPMC0*64.0/time
+Memory read data volume [GBytes] 1.0E-09*UPMC0*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*UPMC1*64.0/time
+Memory write data volume [GBytes] 1.0E-09*UPMC1*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64.0
+Remote memory read bandwidth [MBytes/s] 1.0E-06*UPMC2*64.0/time
+Remote memory read data volume [GBytes] 1.0E-09*UPMC2*64.0
+Remote memory write bandwidth [MBytes/s] 1.0E-06*UPMC3*64.0/time
+Remote memory write data volume [GBytes] 1.0E-09*UPMC3*64.0
+Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time
+Remote memory data volume [GBytes] 1.0E-09*(UPMC2+UPMC3)*64.0
 
 LONG
 Formulas:
-Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64/time
-Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64
-Remote Read BW [MBytes/s] =  1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS)*64/time;
-Remote Write BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
-Remote BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
+Memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_NORMAL_READS_ANY*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*UNC_QMC_NORMAL_READS_ANY*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_WRITES_FULL_ANY*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*UNC_QMC_WRITES_FULL_ANY*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0
+Remote memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_READS*64.0/time
+Remote memory read data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_READS*64.0
+Remote memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0/time
+Remote memory write data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0
+Remote memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0/time
+Remote memory data volume [GBytes] = 1.0E-09*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0
 -
 Profiling group to measure memory bandwidth drawn by all cores of a socket.
-This group will be measured by one core per socket. The Remote  Read BW  tells
-you if cachelines are transfered between sockets, meaning that cores access
+This group will be measured by one core per socket. The remote memory read bandwidth tells
+you if cache lines are transferred between sockets, meaning that cores access
 data owned by a remote NUMA domain.
 
diff --git a/groups/nehalem/SCHEDULER.txt b/groups/nehalem/SCHEDULER.txt
index a7bbe37..0e43cce 100644
--- a/groups/nehalem/SCHEDULER.txt
+++ b/groups/nehalem/SCHEDULER.txt
@@ -13,9 +13,13 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-RATIO Port 1  PMC1/PMC0
-RATIO Port 5  PMC2/PMC0
+Ratio Port 1  PMC1/PMC0
+Ratio Port 5  PMC2/PMC0
 
 LONG
+Formulas:
+Ratio Port 1 = UOPS_EXECUTED_PORT1/UOPS_EXECUTED_PORT0
+Ratio Port 5 = UOPS_EXECUTED_PORT5/UOPS_EXECUTED_PORT0
+-
 Measures how many instructions were scheduled on which issue port.
 
diff --git a/groups/nehalem/TLB.txt b/groups/nehalem/TLB.txt
index 5f93d66..c380851 100644
--- a/groups/nehalem/TLB.txt
+++ b/groups/nehalem/TLB.txt
@@ -22,9 +22,9 @@ L1 DTLB request rate =  L1D_ALL_REF_ANY / INSTR_RETIRED_ANY
 DTLB miss rate  = DTLB_MISSES_ANY / INSTR_RETIRED_ANY
 L1 DTLB miss ratio  =   DTLB_MISSES_ANY / L1D_ALL_REF_ANY
 -
-L1 DTLB request  rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The DTLB miss  rate gives a measure how often a TLB miss occured
-per instruction. And finally L1 DTLB  miss ratio tells you how many
-of your memory references required caused a TLB miss in average.
+L1 DTLB request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The DTLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. And finally L1 DTLB miss ratio tells you how many
+of your memory references caused a TLB miss on average.
 
diff --git a/groups/nehalem/VIEW.txt b/groups/nehalem/VIEW.txt
deleted file mode 100644
index 98a856f..0000000
--- a/groups/nehalem/VIEW.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-SHORT Main memory bandwidth in MBytes/s
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR
-PMC2  FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION
-PMC3  FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
-UPMC0  UNC_QMC_NORMAL_READS_ANY
-UPMC1  UNC_QMC_WRITES_FULL_ANY
-UPMC2 UNC_QHL_REQUESTS_REMOTE_READS
-UPMC3 UNC_QHL_REQUESTS_LOCAL_READS
-UPMC4 UNC_QHL_REQUESTS_REMOTE_WRITES
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-DP MFlops/s (DP assumed) 1.0E-06*(PMC0*2.0+PMC1)/time
-SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-SP MUOPS/s 1.0E-06*PMC2/time
-DP MUOPS/s 1.0E-06*PMC3/time
-Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64/time
-Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64
-Remote Read BW [MBytes/s] 1.0E-06*(UPMC2)*64/time
-Remote Write BW [MBytes/s] 1.0E-06*(UPMC4)*64/time
-Remote BW [MBytes/s] 1.0E-06*(UPMC2+UPMC4)*64/time
-
-LONG
-Formulas:
-DP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-SP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-Packed MUOPS/s   1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/time
-Scalar MUOPS/s 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/time
-SP MUOPS/s 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/time
-DP MUOPS/s 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/time
-Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64/time
-Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64
-Remote Read BW [MBytes/s] =  1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS)*64/time;
-Remote Write BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
-Remote BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
--
-This is a overview group using the capabilities of nehalem to measure multiple events at
-the same time.
-
diff --git a/groups/nehalemEX/BRANCH.txt b/groups/nehalemEX/BRANCH.txt
index 3d81416..1ef9f11 100644
--- a/groups/nehalemEX/BRANCH.txt
+++ b/groups/nehalemEX/BRANCH.txt
@@ -19,13 +19,13 @@ Instructions per branch  FIXC0/PMC0
 
 LONG
 Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
 -
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
 into relation what ration of all branch instruction where mispredicted.
-Instructions per branch is 1/Branch rate.
+Instructions per branch is 1/branch rate.
 
diff --git a/groups/nehalemEX/CACHE.txt b/groups/nehalemEX/CACHE.txt
index c3e989c..6603171 100644
--- a/groups/nehalemEX/CACHE.txt
+++ b/groups/nehalemEX/CACHE.txt
@@ -12,24 +12,25 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Data cache misses PMC0
-Data cache request rate PMC1/FIXC0
-Data cache miss rate PMC0/FIXC0
-Data cache miss ratio PMC0/PMC1
+data cache misses PMC0
+data cache request rate PMC1/FIXC0
+data cache miss rate PMC0/FIXC0
+data cache miss ratio PMC0/PMC1
 
 LONG
 Formulas:
-Data cache request rate =  L1D_ALL_REF_ANY / INSTR_RETIRED_ANY
-Data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
-Data cache miss ratio =  L1D_REPL / L1D_ALL_REF_ANY
+data cache misses = L1D_REPL
+data cache request rate =  L1D_ALL_REF_ANY / INSTR_RETIRED_ANY
+data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
+data cache miss ratio =  L1D_REPL / L1D_ALL_REF_ANY
 -
 This group measures the locality of your data accesses with regard to the
-L1 Cache. Data cache request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The Data cache miss rate gives a measure how often it was necessary to get
-cachelines from higher levels of the memory hierarchy. And finally 
-Data cache miss ratio tells you how many of your memory references required
-a cacheline to be loaded from a higher level. While the Data cache miss rate 
-might be given by your algorithm you should try to get Data cache miss ratio
+L1 cache. Data cache request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The data cache miss rate gives a measure how often it was necessary to get
+cache lines from higher levels of the memory hierarchy. And finally
+data cache miss ratio tells you how many of your memory references required
+a cache line to be loaded from a higher level. While the data cache miss rate
+might be given by your algorithm you should try to get data cache miss ratio
 as low as possible by increasing your cache reuse.
 
diff --git a/groups/nehalemEX/DATA.txt b/groups/nehalemEX/DATA.txt
index a5611bc..31bba51 100644
--- a/groups/nehalemEX/DATA.txt
+++ b/groups/nehalemEX/DATA.txt
@@ -12,11 +12,11 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to Store ratio = MEM_INST_RETIRED_LOADS / MEM_INST_RETIRED_STORES
+Load to store ratio = MEM_INST_RETIRED_LOADS/MEM_INST_RETIRED_STORES
 -
-This is a simple metric to determine your Load to store ratio.
+This is a simple metric to determine your load to store ratio.
 
diff --git a/groups/nehalemEX/FLOPS_DP.txt b/groups/nehalemEX/FLOPS_DP.txt
index c5ba91c..3e75cad 100644
--- a/groups/nehalemEX/FLOPS_DP.txt
+++ b/groups/nehalemEX/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFlops/s  1.0E-06*(PMC0*2.0+PMC1)/time
+MFLOP/s  1.0E-06*(PMC0*2.0+PMC1)/time
 Packed MUOPS/s   1.0E-06*PMC0/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,10 +22,14 @@ DP MUOPS/s 1.0E-06*PMC3/time
 
 LONG
 Formula:
-DP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
-The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
-Therefore both Single as well as Double precision are measured to ensure the correctness
+The Nehalem has no possibility to measure MFLOPs if mixed precision calculations are done.
+Therefore both single as well as double precision are measured to ensure the correctness
 of the measurements. You can check if your code was vectorized on the number of
-FP_COMP_OPS_EXE_SSE_FP_PACKED versus the  FP_COMP_OPS_EXE_SSE_FP_SCALAR.
+FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
 
diff --git a/groups/nehalemEX/FLOPS_SP.txt b/groups/nehalemEX/FLOPS_SP.txt
index 4478c8f..9768109 100644
--- a/groups/nehalemEX/FLOPS_SP.txt
+++ b/groups/nehalemEX/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1)/time
 Packed MUOPS/s   1.0E-06*PMC0/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,10 +22,14 @@ DP MUOPS/s 1.0E-06*PMC3/time
 
 LONG
 Formula:
-SP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
-The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
-Therefore both Single as well as Double precision are measured to ensure the correctness
+The Nehalem has no possibility to measure MFLOPs if mixed precision calculations are done.
+Therefore both single as well as double precision are measured to ensure the correctness
 of the measurements. You can check if your code was vectorized on the number of
-FP_COMP_OPS_EXE_SSE_FP_PACKED versus the  FP_COMP_OPS_EXE_SSE_FP_SCALAR.
+FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
 
diff --git a/groups/nehalemEX/FLOPS_X87.txt b/groups/nehalemEX/FLOPS_X87.txt
index 6447b93..a4176f0 100644
--- a/groups/nehalemEX/FLOPS_X87.txt
+++ b/groups/nehalemEX/FLOPS_X87.txt
@@ -1,4 +1,4 @@
-SHORT X87 MFlops/s
+SHORT X87 MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -11,8 +11,8 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-X87 MFlops/s  1.0E-06*PMC0/time
+X87 MFLOP/s  1.0E-06*PMC0/time
 
 LONG
-Profiling group to measure X87 flop rate.
+Profiling group to measure X87 FLOP rate.
 
diff --git a/groups/nehalemEX/ICACHE.txt b/groups/nehalemEX/ICACHE.txt
new file mode 100644
index 0000000..49943ff
--- /dev/null
+++ b/groups/nehalemEX/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1I_READS
+PMC1  L1I_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = L1I_READS / INSTR_RETIRED_ANY
+L1I miss rate = L1I_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = L1I_MISSES / L1I_READS
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/nehalemEX/L2.txt b/groups/nehalemEX/L2.txt
index 2734c5d..e2715cc 100644
--- a/groups/nehalemEX/L2.txt
+++ b/groups/nehalemEX/L2.txt
@@ -6,28 +6,35 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L1D_REPL
 PMC1  L1D_M_EVICT
+PMC2  L1I_MISSES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
 
 LONG
 Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPL*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is
-computed by the number of cacheline allocated in the L1 and the 
-number of modified cachelines evicted from the L1.  Also reports on
-total data volume transfered between L2 and L1 cache.
+computed by the number of cache lines allocated in the L1 and the
+number of modified cache lines evicted from the L1. Also reports on
+total data volume transferred between L2 and L1 cache.
 Note that this bandwidth also includes data transfers due to a
-write allocate load on a store miss in L1.
+write allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
+
 
diff --git a/groups/nehalemEX/L2CACHE.txt b/groups/nehalemEX/L2CACHE.txt
index 49778be..343b263 100644
--- a/groups/nehalemEX/L2CACHE.txt
+++ b/groups/nehalemEX/L2CACHE.txt
@@ -4,7 +4,7 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  L2_DATA_RQSTS_DEMAND_ANY
+PMC0  L2_RQSTS_REFERENCES
 PMC1  L2_RQSTS_MISS
 
 METRICS
@@ -18,18 +18,17 @@ L2 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = L2_DATA_RQSTS_DEMAND_ANY / INSTR_RETIRED_ANY
-L2 miss rate  = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_DATA_RQSTS_DEMAND_ANY
+L2 request rate = L2_RQSTS_REFERENCES/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_RQSTS_REFERENCES
 -
 This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
 The L2 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
 
 
diff --git a/groups/nehalemEX/L3.txt b/groups/nehalemEX/L3.txt
new file mode 100644
index 0000000..51a0811
--- /dev/null
+++ b/groups/nehalemEX/L3.txt
@@ -0,0 +1,37 @@
+SHORT  L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_LINES_IN_ANY
+PMC1  L2_LINES_OUT_DEMAND_DIRTY
+PMC2  L2_LINES_OUT_PREFETCH_DIRTY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*(PMC1+PMC2)*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*(PMC1+PMC2)*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ANY*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ANY*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_OUT_DEMAND_DIRTY+L2_LINES_OUT_PREFETCH_DIRTY)*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*(L2_LINES_OUT_DEMAND_DIRTY+L2_LINES_OUT_PREFETCH_DIRTY)*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY+L2_LINES_OUT_PREFETCH_DIRTY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY+L2_LINES_OUT_PREFETCH_DIRTY)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. Also reports total data volume between L3 and L2 caches.
+Note that this bandwidth also includes data transfers due to a write allocate
+load on a store miss in L2.
+
diff --git a/groups/nehalemEX/L3CACHE.txt b/groups/nehalemEX/L3CACHE.txt
new file mode 100644
index 0000000..c6b204e
--- /dev/null
+++ b/groups/nehalemEX/L3CACHE.txt
@@ -0,0 +1,48 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0  INSTR_RETIRED_ANY
+FIXC1  CPU_CLK_UNHALTED_CORE
+FIXC2  CPU_CLK_UNHALTED_REF
+CBOX0C0 LLC_HITS_ALL
+CBOX0C1 LLC_MISSES_ALL
+CBOX1C0 LLC_HITS_ALL
+CBOX1C1 LLC_MISSES_ALL
+CBOX2C0 LLC_HITS_ALL
+CBOX2C1 LLC_MISSES_ALL
+CBOX3C0 LLC_HITS_ALL
+CBOX3C1 LLC_MISSES_ALL
+CBOX4C0 LLC_HITS_ALL
+CBOX4C1 LLC_MISSES_ALL
+CBOX5C0 LLC_HITS_ALL
+CBOX5C1 LLC_MISSES_ALL
+CBOX6C0 LLC_HITS_ALL
+CBOX6C1 LLC_MISSES_ALL
+CBOX7C0 LLC_HITS_ALL
+CBOX7C1 LLC_MISSES_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 request rate   (CBOX0C0+CBOX0C1+CBOX1C0+CBOX1C1+CBOX2C0+CBOX2C1+CBOX3C0+CBOX3C1+CBOX4C0+CBOX4C1+CBOX5C0+CBOX5C1+CBOX6C0+CBOX6C1+CBOX7C0+CBOX7C1)/FIXC0
+L3 miss rate   (CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)/FIXC0
+L3 miss ratio  (CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)/(CBOX0C0+CBOX0C1+CBOX1C0+CBOX1C1+CBOX2C0+CBOX2C1+CBOX3C0+CBOX3C1+CBOX4C0+CBOX4C1+CBOX5C0+CBOX5C1+CBOX6C0+CBOX6C1+CBOX7C0+CBOX7C1)
+
+LONG
+Formulas:
+L3 request rate = (SUM(LLC_HITS_ALL)+SUM(LLC_MISSES_ALL))/INSTR_RETIRED_ANY
+L3 miss rate = SUM(LLC_MISSES_ALL)/INSTR_RETIRED_ANY
+L3 miss ratio = SUM(LLC_MISSES_ALL)/(SUM(LLC_HITS_ALL)+SUM(LLC_MISSES_ALL))
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L3 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/nehalemEX/MEM.txt b/groups/nehalemEX/MEM.txt
index 86a2e97..510f27b 100644
--- a/groups/nehalemEX/MEM.txt
+++ b/groups/nehalemEX/MEM.txt
@@ -1,39 +1,42 @@
 SHORT Main memory bandwidth
 
 EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-WBOX4 UNCORE_CYCLES
-MBOX0C0 FVC_EV0_BBOX_CMDS_READS 
-MBOX0C1 FVC_EV0_BBOX_RSP_ACK 
-MBOX1C0 FVC_EV0_BBOX_CMDS_READS 
-MBOX1C1 FVC_EV0_BBOX_RSP_ACK 
-BBOX0C1 IMT_INSERTS_WR 
-BBOX1C1 IMT_INSERTS_WR 
-RBOX0C0 NEW_PACKETS_RECV_PORT0_IPERF0_ANY_DRS
-RBOX0C1 NEW_PACKETS_RECV_PORT1_IPERF0_ANY_DRS
-RBOX1C0 NEW_PACKETS_RECV_PORT4_IPERF0_ANY_DRS
-RBOX1C1 NEW_PACKETS_RECV_PORT5_IPERF0_ANY_DRS
+FIXC0   INSTR_RETIRED_ANY
+FIXC1   CPU_CLK_UNHALTED_CORE
+FIXC2   CPU_CLK_UNHALTED_REF
+WBOXFIX UNCORE_CLOCKTICKS
+MBOX0C0 FVC_EV0_BBOX_CMDS_READS
+MBOX0C1 DRAM_CMD_CAS_WR_OPN
+MBOX1C0 FVC_EV0_BBOX_CMDS_READS
+MBOX1C1 DRAM_CMD_CAS_WR_OPN
 
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-Uncore Clock [MHz]  1.E-06*(WBOX4)/time
+Uncore Clock [MHz]  1.E-06*(WBOXFIX)/time
 CPI  FIXC1/FIXC0
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64/time
-Memory Write BW [MBytes/s] 1.0E-06*(BBOX0C1+BBOX1C1)*64/time
-Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+BBOX0C1+BBOX1C1)*64/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+BBOX0C1+BBOX1C1)*64
-Remote write data traffic Port 0 [MBytes/s] 1.0E-06*(RBOX0C0)*64/time
-Remote write data traffic Port 1 [MBytes/s] 1.0E-06*(RBOX0C1)*64/time
-Remote write data traffic Port 4 [MBytes/s] 1.0E-06*(RBOX1C0)*64/time
-Remote write data traffic Port 5 [MBytes/s] 1.0E-06*(RBOX1C1)*64/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64
 
 LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+-
 Profiling group to measure memory bandwidth drawn by all cores of a socket.
-Addional to the bandwidth it also outputs the data volume and the remote
-traffic over QPI links to other sockets.
+On Nehalem EX it is not possible to measure the write operations with the
+FVC_EV0_BBOX_CMDS_WRITES event at the same time as the FVC_EV0_BBOX_CMDS_READS
+because they set contrary bits. The DRAM_CMD_CAS_WR_OPN is an alternative but
+it only measures write operations to open pages, hence writes to closed pages
+are not included here.
 
diff --git a/groups/nehalemEX/SCHEDULER.txt b/groups/nehalemEX/SCHEDULER.txt
index a7bbe37..237fcb8 100644
--- a/groups/nehalemEX/SCHEDULER.txt
+++ b/groups/nehalemEX/SCHEDULER.txt
@@ -13,9 +13,13 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-RATIO Port 1  PMC1/PMC0
-RATIO Port 5  PMC2/PMC0
+Ratio Port 1  PMC1/PMC0
+Ratio Port 5  PMC2/PMC0
 
 LONG
+Formulas:
+Ratio Port 1 = UOPS_EXECUTED_PORT1/UOPS_EXECUTED_PORT0
+Ratio Port 5 = UOPS_EXECUTED_PORT5/UOPS_EXECUTED_PORT0
+-
 Measures how many instructions were scheduled on which issue port.
 
diff --git a/groups/nehalemEX/TLB.txt b/groups/nehalemEX/TLB.txt
index 5f93d66..0e358b8 100644
--- a/groups/nehalemEX/TLB.txt
+++ b/groups/nehalemEX/TLB.txt
@@ -22,9 +22,9 @@ L1 DTLB request rate =  L1D_ALL_REF_ANY / INSTR_RETIRED_ANY
 DTLB miss rate  = DTLB_MISSES_ANY / INSTR_RETIRED_ANY
 L1 DTLB miss ratio  =   DTLB_MISSES_ANY / L1D_ALL_REF_ANY
 -
-L1 DTLB request  rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
-The DTLB miss  rate gives a measure how often a TLB miss occured
+L1 DTLB request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The DTLB miss rate gives a measure how often a TLB miss occurred
 per instruction. And finally L1 DTLB  miss ratio tells you how many
-of your memory references required caused a TLB miss in average.
+of your memory references caused a TLB miss on average.
 
diff --git a/groups/pentiumm/BRANCH.txt b/groups/pentiumm/BRANCH.txt
new file mode 100644
index 0000000..157c331
--- /dev/null
+++ b/groups/pentiumm/BRANCH.txt
@@ -0,0 +1,17 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+PMC0  BR_INST_EXEC
+PMC1  BR_INST_MISSP_EXEC
+
+METRICS
+Runtime (RDTSC) [s] time
+Branch misprediction ratio  PMC1/PMC0
+
+LONG
+Formulas:
+Branch misprediction ratio = BR_INST_MISSP_EXEC / BR_INST_EXEC
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
diff --git a/groups/pentiumm/CPI.txt b/groups/pentiumm/CPI.txt
new file mode 100644
index 0000000..1df7ff8
--- /dev/null
+++ b/groups/pentiumm/CPI.txt
@@ -0,0 +1,22 @@
+SHORT  Cycles per instruction
+
+EVENTSET
+PMC0  UOPS_RETIRED
+PMC1  CPU_CLK_UNHALTED
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI   PMC1/PMC0
+IPC   PMC0/PMC1
+
+LONG
+Formulas:
+CPI = CPU_CLK_UNHALTED/UOPS_RETIRED
+IPC = UOPS_RETIRED/CPU_CLK_UNHALTED
+-
+This group measures how efficient the processor works with
+regard to instruction throughput. Also important as a standalone
+metric is UOPS_RETIRED as it tells you how many uops
+you need to execute for a task. An optimization might show very
+low CPI values but execute many more instructions for it.
+
diff --git a/groups/pentiumm/FLOPS_DP.txt b/groups/pentiumm/FLOPS_DP.txt
new file mode 100644
index 0000000..976c44c
--- /dev/null
+++ b/groups/pentiumm/FLOPS_DP.txt
@@ -0,0 +1,20 @@
+SHORT Double Precision MFLOP/s
+
+EVENTSET
+PMC0 EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP
+PMC1 EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP
+
+METRICS
+Runtime (RDTSC) [s] time
+MFLOP/s  1.0E-06*(PMC0*2.0+PMC1)/time
+Packed MUOPS/s   1.0E-06*(PMC0)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s =  (EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP*2 + EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP )/ runtime
+Packed MUOPS/s = 1.0E-06*(EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP)/time
+Scalar MUOPS/s = 1.0E-06*EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP/time
+-
+SSE scalar and packed double precision FLOP rates.
+
diff --git a/groups/pentiumm/FLOPS_SP.txt b/groups/pentiumm/FLOPS_SP.txt
new file mode 100644
index 0000000..83b73f2
--- /dev/null
+++ b/groups/pentiumm/FLOPS_SP.txt
@@ -0,0 +1,18 @@
+SHORT Single Precision MFLOP/s
+
+EVENTSET
+PMC0 EMON_SSE_SSE2_COMP_INST_RETIRED_ALL_SP
+PMC1 EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_SP
+
+METRICS
+Runtime (RDTSC) [s] time
+MFLOP/s  1.0E-06*(PMC0)/time
+Scalar MUOPS/s 1.0E-06*(PMC1)/time
+
+LONG
+Formula:
+MFLOP/s =  (EMON_SSE_SSE2_COMP_INST_RETIRED_ALL_SP)/ runtime
+Scalar MUOPS/s =  (EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_SP)/ runtime
+-
+SSE scalar and packed single precision FLOP rates.
+
diff --git a/groups/pentiumm/L3.txt b/groups/pentiumm/L3.txt
new file mode 100644
index 0000000..2ed5293
--- /dev/null
+++ b/groups/pentiumm/L3.txt
@@ -0,0 +1,30 @@
+SHORT L3 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0  L2_LINES_IN_ALL_ALL
+PMC1  L2_LINES_OUT_ALL_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_ALL_ALL*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_ALL_ALL*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL_ALL+L2_LINES_OUT_ALL_ALL)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL_ALL+L2_LINES_OUT_ALL_ALL)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. The group also outputs total data volume transferred between
+L3 and L2. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L2.
+
diff --git a/groups/phi/CACHE.txt b/groups/phi/CACHE.txt
index d611965..01ac5e4 100644
--- a/groups/phi/CACHE.txt
+++ b/groups/phi/CACHE.txt
@@ -1,4 +1,4 @@
-SHORT  Compute to Data Access Ratio
+SHORT L1 compute to data access ratio
 
 EVENTSET
 PMC0  VPU_ELEMENTS_ACTIVE
@@ -8,12 +8,15 @@ METRICS
 Runtime (RDTSC) [s] time
 L1 compute intensity   PMC0/PMC1
 
-LONG 
+LONG
+Formulas:
+L1 compute intensity = VPU_ELEMENTS_ACTIVE/DATA_READ_OR_WRITE
+-
 These metric is a way to measure the computational density of an
 application, or how many computations it is performing on average for each
-piece of data loaded.  L1 Compute to Data Access Ratio, should be
+piece of data loaded. L1 compute to data access ratio should be
 used to judge suitability of an application for running on the Intel MIC
-Architecture. Applications that will perform well on the Intel� MIC
-Architecture should be vectorized, and ideally be able to perform multiple
-operations on the same pieces of data (or same cachelines).
+architecture. Applications that will perform well on the Intel MIC
+architecture should be vectorized, and ideally be able to perform multiple
+operations on the same pieces of data (or same cache lines).
 
diff --git a/groups/phi/COMPUTE_TO_DATA_RATIO.txt b/groups/phi/COMPUTE_TO_DATA_RATIO.txt
new file mode 100644
index 0000000..6fdd008
--- /dev/null
+++ b/groups/phi/COMPUTE_TO_DATA_RATIO.txt
@@ -0,0 +1,22 @@
+SHORT L2 compute to data access ratio
+
+EVENTSET
+PMC0  VPU_ELEMENTS_ACTIVE
+PMC1  DATA_READ_MISS_OR_WRITE_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+L2 compute intensity   PMC0/PMC1
+
+LONG
+Formulas:
+L2 compute intensity = VPU_ELEMENTS_ACTIVE/DATA_READ_MISS_OR_WRITE_MISS
+-
+This metric is a way to measure the computational density of an
+application, or how many computations it is performing on average for each
+piece of data loaded. L2 compute to data access ratio should be
+used to judge suitability of an application for running on the Intel MIC
+architecture. Applications that will perform well on the Intel MIC
+architecture should be vectorized, and ideally be able to perform multiple
+operations on the same pieces of data (or same cache lines).
+
diff --git a/groups/phi/CPI.txt b/groups/phi/CPI.txt
index 8d4cf36..f3d8b4e 100644
--- a/groups/phi/CPI.txt
+++ b/groups/phi/CPI.txt
@@ -11,6 +11,10 @@ CPI   PMC1/PMC0
 IPC   PMC0/PMC1
 
 LONG
+Formulas:
+CPI = CPU_CLK_UNHALTED/INSTRUCTIONS_EXECUTED
+IPC = INSTRUCTIONS_EXECUTED/CPU_CLK_UNHALTED
+-
 This group measures how efficient the processor works with
 regard to instruction throughput. Also important as a standalone
 metric is INSTRUCTIONS_RETIRED as it tells you how many instruction
diff --git a/groups/phi/L2CACHE.txt b/groups/phi/L2CACHE.txt
deleted file mode 100644
index 228a5ba..0000000
--- a/groups/phi/L2CACHE.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-SHORT L2 Compute to Data Access Ratio
-
-EVENTSET
-PMC0  VPU_ELEMENTS_ACTIVE
-PMC1  DATA_READ_MISS_OR_WRITE_MISS
-
-METRICS
-Runtime (RDTSC) [s] time
-L2 compute intensity   PMC0/PMC1
-
-LONG
-These metric is a way to measure the computational density of an
-application, or how many computations it is performing on average for each
-piece of data loaded.  L2 Compute to Data Access Ratio, should be
-used to judge suitability of an application for running on the Intel MIC
-Architecture. Applications that will perform well on the Intel� MIC
-Architecture should be vectorized, and ideally be able to perform multiple
-operations on the same pieces of data (or same cachelines).
-
diff --git a/groups/phi/MEM.txt b/groups/phi/MEM.txt
new file mode 100644
index 0000000..8899592
--- /dev/null
+++ b/groups/phi/MEM.txt
@@ -0,0 +1,18 @@
+SHORT Memory bandwidth
+
+EVENTSET
+PMC0  DATA_READ_MISS_OR_WRITE_MISS
+PMC1  DATA_CACHE_LINES_WRITTEN_BACK
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Memory bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+Memory bandwidth [MBytes/s] = 1.0E-06*(DATA_READ_MISS_OR_WRITE_MISS+DATA_CACHE_LINES_WRITTEN_BACK)*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(DATA_READ_MISS_OR_WRITE_MISS+DATA_CACHE_LINES_WRITTEN_BACK)*64.0
+-
+Total memory bandwidth and data volume.
diff --git a/groups/phi/MEM1.txt b/groups/phi/MEM1.txt
index 16e44e0..c9f7fb6 100644
--- a/groups/phi/MEM1.txt
+++ b/groups/phi/MEM1.txt
@@ -1,13 +1,18 @@
-SHORT L2 Write Misses
+SHORT L2 write misses
 
 EVENTSET
 PMC0  L2_DATA_WRITE_MISS_MEM_FILL
 
 METRICS
 Runtime (RDTSC) [s] time
-RFO Data Bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-RFO Volume [GBytes] 1.0E-09*PMC0*64.0
+L2 RFO bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 RFO data volume [GBytes] 1.0E-09*PMC0*64.0
 
 LONG
-Bla
+Formulas:
+L2 RFO bandwidth [MBytes/s] = 1.0E-06*L2_DATA_WRITE_MISS_MEM_FILL*64.0/time
+L2 RFO data volume [GBytes] = 1.0E-09*L2_DATA_WRITE_MISS_MEM_FILL*64.0
+-
+Bandwidth and data volume fetched from memory due to a L2 data write miss. These
+fetches are commonly called write-allocate loads or read-for-ownership (RFO).
 
diff --git a/groups/phi/MEM2.txt b/groups/phi/MEM2.txt
index 9be1f2a..d44a823 100644
--- a/groups/phi/MEM2.txt
+++ b/groups/phi/MEM2.txt
@@ -1,13 +1,17 @@
-SHORT L2 Read Misses
+SHORT L2 read misses
 
 EVENTSET
 PMC0  L2_DATA_READ_MISS_MEM_FILL
 
 METRICS
 Runtime (RDTSC) [s] time
-Read Data Bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-Read Data Volume [GBytes] 1.0E-09*PMC0*64.0
+L2 read bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 read data volume [GBytes] 1.0E-09*PMC0*64.0
 
 LONG
-Bla
+Formulas:
+L2 read bandwidth [MBytes/s] = 1.0E-06*L2_DATA_READ_MISS_MEM_FILL*64.0/time
+L2 read data volume [GBytes] = 1.0E-09*L2_DATA_READ_MISS_MEM_FILL*64.0
+-
+The data volume and bandwidth caused by read misses in the L2 cache.
 
diff --git a/groups/phi/MEM3.txt b/groups/phi/MEM3.txt
index 45ce0de..73de570 100644
--- a/groups/phi/MEM3.txt
+++ b/groups/phi/MEM3.txt
@@ -5,9 +5,13 @@ PMC0  HWP_L2MISS
 
 METRICS
 Runtime (RDTSC) [s] time
-Prefetch Data Bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-Prefetch Data Volume [GBytes] 1.0E-09*PMC0*64.0
+Prefetch bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+Prefetch data volume [GBytes] 1.0E-09*PMC0*64.0
 
 LONG
-Bla
+Formulas:
+Prefetch bandwidth [MBytes/s] = 1.0E-06*HWP_L2MISS*64.0/time
+Prefetch data volume [GBytes] = 1.0E-09*HWP_L2MISS*64.0
+-
+The bandwidth and data volume caused by L2 misses from the hardware prefetcher.
 
diff --git a/groups/phi/MEM4.txt b/groups/phi/MEM4.txt
index a861a8b..9e892bd 100644
--- a/groups/phi/MEM4.txt
+++ b/groups/phi/MEM4.txt
@@ -1,13 +1,17 @@
-SHORT L2 Victim requests
+SHORT L2 victim requests
 
 EVENTSET
 PMC0  L2_VICTIM_REQ_WITH_DATA
 
 METRICS
 Runtime (RDTSC) [s] time
-Victim Data Bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-Victim Data Volume [GBytes] 1.0E-09*PMC0*64.0
+Victim bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+Victim data volume [GBytes] 1.0E-09*PMC0*64.0
 
 LONG
-Bla
+Formulas:
+Victim bandwidth [MBytes/s] = 1.0E-06*L2_VICTIM_REQ_WITH_DATA*64.0/time
+Victim data volume [GBytes] = 1.0E-09*L2_VICTIM_REQ_WITH_DATA*64.0
+-
+Data volume and bandwidth caused by cache line victims.
 
diff --git a/groups/phi/MEM5.txt b/groups/phi/MEM5.txt
index ade9828..49acb98 100644
--- a/groups/phi/MEM5.txt
+++ b/groups/phi/MEM5.txt
@@ -1,13 +1,19 @@
-SHORT L2 Snoop hits
+SHORT L2 snoop hits
 
 EVENTSET
 PMC0  SNP_HITM_L2
 
 METRICS
 Runtime (RDTSC) [s] time
-Snoop Data Bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-Snoop Data Volume [GBytes] 1.0E-09*PMC0*64.0
+Snoop bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+Snoop data volume [GBytes] 1.0E-09*PMC0*64.0
 
 LONG
-Bla
+Formulas:
+Snoop bandwidth [MBytes/s] = 1.0E-06*SNP_HITM_L2*64.0/time
+Snoop data volume [GBytes] = 1.0E-09*SNP_HITM_L2*64.0
+-
+Snoop traffic caused by HITM requests. HITM requests are L2 requests that
+are served by another core's L2 cache but the remote cache line is in modified
+state.
 
diff --git a/groups/phi/MEM6.txt b/groups/phi/MEM6.txt
index 41be52e..835faf8 100644
--- a/groups/phi/MEM6.txt
+++ b/groups/phi/MEM6.txt
@@ -1,13 +1,17 @@
-SHORT L2 Read Misses
+SHORT L2 read misses
 
 EVENTSET
 PMC0  L2_READ_MISS
 
 METRICS
 Runtime (RDTSC) [s] time
-L2 Read Data Bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Read Data Volume [GBytes] 1.0E-09*PMC0*64.0
+L2 read bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 read data volume [GBytes] 1.0E-09*PMC0*64.0
 
 LONG
-Bla
+Formulas:
+L2 read bandwidth [MBytes/s] = 1.0E-06*L2_READ_MISS*64.0/time
+L2 read data volume [GBytes] = 1.0E-09*L2_READ_MISS*64.0
+-
+Data volume and bandwidth caused by read misses in the L2 cache.
 
diff --git a/groups/phi/MEM_READ.txt b/groups/phi/MEM_READ.txt
new file mode 100644
index 0000000..fb107b0
--- /dev/null
+++ b/groups/phi/MEM_READ.txt
@@ -0,0 +1,20 @@
+SHORT Memory read bandwidth
+
+EVENTSET
+PMC0  DATA_READ_MISS
+PMC1  HWP_L2MISS
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Memory read bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(L2_DATA_READ_MISS_MEM_FILL+HWP_L2MISS)*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(L2_DATA_READ_MISS_MEM_FILL+HWP_L2MISS)*64.0
+-
+Bandwidth and data volume of read operations from the memory to L2 cache. The
+metric is introduced in the book 'Intel Xeon Phi Coprocessor High-Performance
+Programming' by James Jeffers and James Reinders.
diff --git a/groups/phi/MEM_WRITE.txt b/groups/phi/MEM_WRITE.txt
new file mode 100644
index 0000000..01043fd
--- /dev/null
+++ b/groups/phi/MEM_WRITE.txt
@@ -0,0 +1,20 @@
+SHORT Memory write bandwidth
+
+EVENTSET
+PMC0  L2_VICTIM_REQ_WITH_DATA
+PMC1  SNP_HITM_L2
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Memory write bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+Memory write bandwidth [MBytes/s] = 1.0E-06*(L2_VICTIM_REQ_WITH_DATA+SNP_HITM_L2)*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(L2_VICTIM_REQ_WITH_DATA+SNP_HITM_L2)*64.0
+-
+Bandwidth and data volume of write operations from the L2 cache to memory. The
+metric is introduced in the book 'Intel Xeon Phi Coprocessor High-Performance
+Programming' by James Jeffers and James Reinders.
diff --git a/groups/phi/PAIRING.txt b/groups/phi/PAIRING.txt
index 2e93cc8..ce3627c 100644
--- a/groups/phi/PAIRING.txt
+++ b/groups/phi/PAIRING.txt
@@ -6,8 +6,16 @@ PMC1  INSTRUCTIONS_EXECUTED_V_PIPE
 
 METRICS
 Runtime (RDTSC) [s] time
-VPipeRatio   PMC1/PMC0
-PairingRatio PMC1/(PMC0-PMC1)
+V-pipe ratio   PMC1/PMC0
+Pairing ratio PMC1/(PMC0-PMC1)
 
 LONG
-Pairing ratio
+Formulas:
+V-pipe ratio = INSTRUCTIONS_EXECUTED_V_PIPE/INSTRUCTIONS_EXECUTED
+Pairing ratio = INSTRUCTIONS_EXECUTED_V_PIPE/(INSTRUCTIONS_EXECUTED-INSTRUCTIONS_EXECUTED_V_PIPE)
+-
+Each hardware thread on the Xeon Phi can execute two instructions simultaneously,
+one in the U-pipe and one in the V-pipe. But this is only possible if the
+instructions can be paired. The instructions executed in paired fashion are counted
+by the event INSTRUCTIONS_EXECUTED_V_PIPE. The event INSTRUCTIONS_EXECUTED increments
+for each instruction, hence the maximal increase per cycle can be 2.
diff --git a/groups/phi/READ_MISS_RATIO.txt b/groups/phi/READ_MISS_RATIO.txt
index c98f91b..dbdaad5 100644
--- a/groups/phi/READ_MISS_RATIO.txt
+++ b/groups/phi/READ_MISS_RATIO.txt
@@ -1,4 +1,4 @@
-SHORT Miss ratio for data read
+SHORT Miss ratio for data reads
 
 EVENTSET
 PMC0  DATA_READ
@@ -6,7 +6,10 @@ PMC1  DATA_READ_MISS
 
 METRICS
 Runtime (RDTSC) [s] time
-Miss ratio PMC1/PMC0
+Read miss ratio PMC1/PMC0
 
 LONG
-Miss ratio for data read
+Formulas:
+Read miss ratio = DATA_READ_MISS/DATA_READ
+--
+Miss ratio for data reads.
diff --git a/groups/phi/TLB.txt b/groups/phi/TLB.txt
new file mode 100644
index 0000000..6f00359
--- /dev/null
+++ b/groups/phi/TLB.txt
@@ -0,0 +1,23 @@
+SHORT TLB Misses
+
+EVENTSET
+PMC0 LONG_DATA_PAGE_WALK
+PMC1 DATA_PAGE_WALK
+
+METRICS
+Runtime (RDTSC) [s] time
+L1 TLB misses [misses/s] PMC1/time
+L2 TLB misses [misses/s] PMC0/time
+L1 TLB misses per L2 TLB miss PMC1/PMC0
+
+LONG
+Formulas:
+L1 TLB misses [misses/s] = DATA_PAGE_WALK/time
+L2 TLB misses [misses/s] = LONG_DATA_PAGE_WALK/time
+L1 TLB misses per L2 TLB miss = DATA_PAGE_WALK/LONG_DATA_PAGE_WALK
+-
+Analysis of the layered TLB of the Intel Xeon Phi. According to the book
+'Intel Xeon Phi Coprocessor High-Performance Programming' by James Jeffers and
+James Reinders, a high L1 TLB misses per L2 TLB miss ratio suggests that your
+working set fits into the L2 TLB but not in L1 TLB. Using large pages may be
+beneficial.
diff --git a/groups/phi/TLB_L1.txt b/groups/phi/TLB_L1.txt
new file mode 100644
index 0000000..d826d04
--- /dev/null
+++ b/groups/phi/TLB_L1.txt
@@ -0,0 +1,23 @@
+SHORT L1 TLB misses
+
+EVENTSET
+PMC0 DATA_PAGE_WALK
+PMC1 DATA_READ_OR_WRITE
+
+METRICS
+Runtime (RDTSC) [s] time
+L1 TLB misses [misses/s] PMC0/time
+L1 TLB miss ratio PMC0/PMC1
+
+LONG
+Formulas:
+L1 TLB misses [misses/s] = DATA_PAGE_WALK/time
+L1 TLB miss ratio = DATA_PAGE_WALK/DATA_READ_OR_WRITE
+-
+This performance group measures the L1 TLB misses. A L1 TLB miss that hits the
+L2 TLB has a penalty of about 25 cycles for 4kB pages. For 2MB pages, the penalty
+for a L1 TLB miss that hits L2 TLB is about 8 cycles. The minimal L1 TLB miss ratio
+is about 1/64, so a high ratio indicates a bad spatial locality. Data of a page
+is only partly accessed. It can also indicate thrashing because when multiple pages
+are accessed in a loop iteration, the size and associativity is not sufficient to
+hold all pages.
diff --git a/groups/phi/TLB_L2.txt b/groups/phi/TLB_L2.txt
new file mode 100644
index 0000000..9a95125
--- /dev/null
+++ b/groups/phi/TLB_L2.txt
@@ -0,0 +1,21 @@
+SHORT L2 TLB misses
+
+EVENTSET
+PMC0 LONG_DATA_PAGE_WALK
+PMC1 DATA_READ_OR_WRITE
+
+METRICS
+Runtime (RDTSC) [s] time
+L2 TLB misses [misses/s] PMC0/time
+L2 TLB miss ratio PMC0/PMC1
+
+LONG
+Formulas:
+L2 TLB misses [misses/s] = LONG_DATA_PAGE_WALK/time
+L2 TLB miss ratio = LONG_DATA_PAGE_WALK/DATA_READ_OR_WRITE
+-
+This performance group measures the L2 TLB misses. A L2 TLB miss has a penalty
+of at least 100 cycles, hence it is important to avoid them. A high ratio can
+indicate thrashing because when multiple pages are accessed in a loop iteration,
+the size and associativity is not sufficient to hold all pages. This would also
+result in a bad ratio for the L1 TLB.
diff --git a/groups/phi/VECTOR.txt b/groups/phi/VECTOR.txt
index 1e91bc4..fd2e27f 100644
--- a/groups/phi/VECTOR.txt
+++ b/groups/phi/VECTOR.txt
@@ -1,4 +1,4 @@
-SHORT  Vector unit usage
+SHORT  Vectorization intensity
 
 EVENTSET
 PMC0  VPU_INSTRUCTIONS_EXECUTED
@@ -6,10 +6,16 @@ PMC1  VPU_ELEMENTS_ACTIVE
 
 METRICS
 Runtime (RDTSC) [s] time
-Vectorization Intensity PMC1/PMC0
+Vectorization intensity PMC1/PMC0
 
 LONG
+Formula:
+Vectorization intensity = VPU_ELEMENTS_ACTIVE / VPU_INSTRUCTIONS_EXECUTED
+-
 Vector instructions include instructions that perform floating-point
 operations, instructions that load vector registers from memory and store them
 to memory, instructions to manipulate vector mask registers, and other special
 purpose instructions such as vector shuffle.
+According to the book 'Intel Xeon Phi Coprocessor High-Performance Programming'
+by James Jeffers and James Reinders, the vectorization intensity should be >=8
+for double precision and >=16 for single precision.
diff --git a/groups/phi/VECTOR2.txt b/groups/phi/VECTOR2.txt
index 487460c..78e6b82 100644
--- a/groups/phi/VECTOR2.txt
+++ b/groups/phi/VECTOR2.txt
@@ -7,11 +7,13 @@ PMC1  VPU_STALL_REG
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s]  PMC1*inverseClock
+VPU stall ratio [%] 100*(PMC1/PMC0)
 
 LONG
+VPU stall ratio [%] = 100*(VPU_STALL_REG/VPU_INSTRUCTIONS_EXECUTED)
+--
 This group measures how efficient the processor works with
-regard to instruction throughput. Also important as a standalone
-metric is INSTRUCTIONS_RETIRED as it tells you how many instruction
-you need to execute for a task. An optimization might show very
-low CPI values but execute many more instruction for it.
+regard to vectorization instruction throughput. The event VPU_STALL_REG counts
+the VPU stalls due to data dependencies. Dependencies are read-after-write,
+write-after-write and write-after-read.
 
diff --git a/groups/phi/VPU_FILL_RATIO_DBL.txt b/groups/phi/VPU_FILL_RATIO_DBL.txt
index 50d3835..6e8065c 100644
--- a/groups/phi/VPU_FILL_RATIO_DBL.txt
+++ b/groups/phi/VPU_FILL_RATIO_DBL.txt
@@ -1,4 +1,4 @@
-SHORT VPU filling for Double
+SHORT VPU filling for double precision data
 
 EVENTSET
 PMC0  VPU_INSTRUCTIONS_EXECUTED
@@ -6,7 +6,13 @@ PMC1  VPU_ELEMENTS_ACTIVE
 
 METRICS
 Runtime (RDTSC) [s] time
-VPUFillRatio PMC0*8/PMC1
+VPU fill ratio PMC0*8/PMC1
 
 LONG
-VPU filling for Double
+Formulas:
+VPU fill ratio = VPU_INSTRUCTIONS_EXECUTED*8/VPU_ELEMENTS_ACTIVE
+--
+This performance group measures the number of vector instructions that are
+performed on each vector loaded to the VPU. It is important to increase the
+ratio to get a high throughput because memory accesses (loading data to the VPU)
+are expensive.
diff --git a/groups/phi/VPU_PAIRING.txt b/groups/phi/VPU_PAIRING.txt
index 998c1d7..024919b 100644
--- a/groups/phi/VPU_PAIRING.txt
+++ b/groups/phi/VPU_PAIRING.txt
@@ -1,4 +1,4 @@
-SHORT VPU Pairing ratio
+SHORT VPU pairing ratio
 
 EVENTSET
 PMC0  VPU_INSTRUCTIONS_EXECUTED
@@ -6,8 +6,15 @@ PMC1  VPU_INSTRUCTIONS_EXECUTED_V_PIPE
 
 METRICS
 Runtime (RDTSC) [s] time
-VPipeRatio   PMC1/PMC0
-PairingRatio PMC1/(PMC0-PMC1)
+V-pipe ratio   PMC1/PMC0
+Pairing ratio PMC1/(PMC0-PMC1)
 
 LONG
-VPU Pairing ratio
+Formulas:
+V-pipe ratio = VPU_INSTRUCTIONS_EXECUTED_V_PIPE/VPU_INSTRUCTIONS_EXECUTED
+Pairing ratio = VPU_INSTRUCTIONS_EXECUTED_V_PIPE/(VPU_INSTRUCTIONS_EXECUTED-VPU_INSTRUCTIONS_EXECUTED_V_PIPE)
+--
+This performance group measures the pairing ratio of vector instructions. The
+V-pipe can only execute a subset of all instructions, the main workload is done
+by the U-pipe. A higher throughput can be achieved if the pairing ratio is
+increased.
diff --git a/groups/phi/VPU_READ_MISS_RATIO.txt b/groups/phi/VPU_READ_MISS_RATIO.txt
index 94ec963..502644a 100644
--- a/groups/phi/VPU_READ_MISS_RATIO.txt
+++ b/groups/phi/VPU_READ_MISS_RATIO.txt
@@ -1,4 +1,4 @@
-SHORT Miss ratio for VPU data read
+SHORT Miss ratio for VPU data reads
 
 EVENTSET
 PMC0  VPU_DATA_READ
@@ -6,7 +6,11 @@ PMC1  VPU_DATA_READ_MISS
 
 METRICS
 Runtime (RDTSC) [s] time
-Miss ratio PMC1/PMC0
+VPU read miss ratio PMC1/PMC0
 
 LONG
-Miss ratio for VPU data read
+Formula:
+VPU read miss ratio = VPU_DATA_READ_MISS/VPU_DATA_READ
+--
+This performance group determines the ratio between reads and reads that miss
+the cache and are issued by the VPU.
diff --git a/groups/phi/VPU_WRITE_MISS_RATIO.txt b/groups/phi/VPU_WRITE_MISS_RATIO.txt
index 429ee6d..b098b6f 100644
--- a/groups/phi/VPU_WRITE_MISS_RATIO.txt
+++ b/groups/phi/VPU_WRITE_MISS_RATIO.txt
@@ -1,4 +1,4 @@
-SHORT Miss ratio for VPU data write
+SHORT Miss ratio for VPU data writes
 
 EVENTSET
 PMC0  VPU_DATA_WRITE
@@ -6,7 +6,11 @@ PMC1  VPU_DATA_WRITE_MISS
 
 METRICS
 Runtime (RDTSC) [s] time
-Miss ratio PMC1/PMC0
+VPU write miss ratio PMC1/PMC0
 
 LONG
-Miss ratio for VPU data write
+Formula:
+VPU write miss ratio = VPU_DATA_WRITE_MISS/VPU_DATA_WRITE
+--
+This performance group determines the ratio between writes and writes that miss
+the cache and are issued by the VPU.
diff --git a/groups/phi/WRITE_MISS_RATIO.txt b/groups/phi/WRITE_MISS_RATIO.txt
index 0544b0e..1e92c76 100644
--- a/groups/phi/WRITE_MISS_RATIO.txt
+++ b/groups/phi/WRITE_MISS_RATIO.txt
@@ -1,4 +1,4 @@
-SHORT Miss ratio for data write
+SHORT Miss ratio for data writes
 
 EVENTSET
 PMC0  DATA_WRITE
@@ -6,7 +6,10 @@ PMC1  DATA_WRITE_MISS
 
 METRICS
 Runtime (RDTSC) [s] time
-Miss ratio PMC1/PMC0
+Write miss ratio PMC1/PMC0
 
 LONG
-Miss ratio for data write
+Formulas:
+Write miss ratio = DATA_WRITE_MISS/DATA_WRITE
+--
+Miss ratio for data writes.
diff --git a/groups/sandybridge/BRANCH.txt b/groups/sandybridge/BRANCH.txt
index cbaf834..b8d41b2 100644
--- a/groups/sandybridge/BRANCH.txt
+++ b/groups/sandybridge/BRANCH.txt
@@ -19,13 +19,13 @@ Instructions per branch  FIXC0/PMC0
 
 LONG
 Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
 -
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
 into relation what ratio of all branch instruction where mispredicted.
-Instructions per branch is 1/Branch rate.
+Instructions per branch is 1/branch rate.
 
diff --git a/groups/sandybridge/CLOCK.txt b/groups/sandybridge/CLOCK.txt
index 0cc92d3..7a5e87d 100644
--- a/groups/sandybridge/CLOCK.txt
+++ b/groups/sandybridge/CLOCK.txt
@@ -8,7 +8,7 @@ PWR0  PWR_PKG_ENERGY
 PWR3  PWR_DRAM_ENERGY
 
 METRICS
-Runtime (RDTSC) [s] time 
+Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
diff --git a/groups/sandybridge/DATA.txt b/groups/sandybridge/DATA.txt
index 5f04a23..967cbad 100644
--- a/groups/sandybridge/DATA.txt
+++ b/groups/sandybridge/DATA.txt
@@ -4,19 +4,19 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  MEM_UOP_RETIRED_LOADS
-PMC1  MEM_UOP_RETIRED_STORES
+PMC0  MEM_UOPS_RETIRED_LOADS
+PMC1  MEM_UOPS_RETIRED_STORES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to Store ratio = MEM_UOP_RETIRED_LOADS / MEM_UOP_RETIRED_STORES
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
 -
 This is a metric to determine your load to store ratio.
 
diff --git a/groups/sandybridge/ENERGY.txt b/groups/sandybridge/ENERGY.txt
index 9261934..2b466c8 100644
--- a/groups/sandybridge/ENERGY.txt
+++ b/groups/sandybridge/ENERGY.txt
@@ -7,10 +7,11 @@ FIXC2 CPU_CLK_UNHALTED_REF
 TMP0  TEMP_CORE
 PWR0  PWR_PKG_ENERGY
 PWR1  PWR_PP0_ENERGY
+PWR2  PWR_PP1_ENERGY
 PWR3  PWR_DRAM_ENERGY
 
 METRICS
-Runtime (RDTSC) [s] time 
+Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
@@ -18,16 +19,19 @@ Temperature [C]  TMP0
 Energy [J]  PWR0
 Power [W] PWR0/time
 Energy PP0 [J]  PWR1
-Power PP0 [W] PWR1/time
+Power PP0 [W]  PWR1/time
+Energy PP1 [J]  PWR2
+Power PP1 [W] PWR2/time
 Energy DRAM [J]  PWR3
-Power DRAM [W] PWR3/time
+Power DRAM [W]  PWR3/time
 
 LONG
 Formula:
-Power =  PWR_PKG_ENERGY / time
+Power = PWR_PKG_ENERGY / time
 Power PP0 = PWR_PP0_ENERGY / time
+Power PP1 = PWR_PP1_ENERGY / time
 Power DRAM = PWR_DRAM_ENERGY / time
 -
 SandyBridge implements the new RAPL interface. This interface enables to
-monitor the consumed energy on the package (socket) and DRAM level.
+monitor the consumed energy on the package (socket) level.
 
diff --git a/groups/sandybridge/FALSE_SHARE.txt b/groups/sandybridge/FALSE_SHARE.txt
new file mode 100644
index 0000000..a87f7d4
--- /dev/null
+++ b/groups/sandybridge/FALSE_SHARE.txt
@@ -0,0 +1,25 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM
+PMC2 MEM_LOAD_UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Local LLC false sharing [MByte] 1.E-06*PMC0*64
+Local LLC false sharing rate PMC0/PMC2
+
+LONG
+Formula:
+Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64
+Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL
+-
+False-sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false-sharing.
+The false-sharing rate uses all memory loads as reference.
diff --git a/groups/sandybridge/FLOPS_AVX.txt b/groups/sandybridge/FLOPS_AVX.txt
index 6850bca..b4ae4e7 100644
--- a/groups/sandybridge/FLOPS_AVX.txt
+++ b/groups/sandybridge/FLOPS_AVX.txt
@@ -1,4 +1,4 @@
-SHORT Packed AVX MFlops/s
+SHORT Packed AVX MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -12,14 +12,15 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-32b packed SP MFlops/s  1.0E-06*(PMC0*8.0)/time
-32b packed DP MFlops/s  1.0E-06*(PMC1*4.0)/time
+Packed SP MFLOP/s  1.0E-06*(PMC0*8.0)/time
+Packed DP MFLOP/s  1.0E-06*(PMC1*4.0)/time
 
 LONG
 Formula:
-32b packed SP MFlops/s =  (SIMD_FP_256_PACKED_SINGLE*8)/ runtime
-32b packed DP MFlops/s =  (SIMD_FP_256_PACKED_DOUBLE*4)/ runtime
+Packed SP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed DP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
 -
-Packed 32b AVX flops rates. Please note that the current flop measurements on SandyBridge are
+Packed 32b AVX FLOPs rates.
+Please note that the current FLOP measurements on SandyBridge are
 potentially wrong. So you cannot trust these counters at the moment!
 
diff --git a/groups/sandybridge/FLOPS_DP.txt b/groups/sandybridge/FLOPS_DP.txt
index cda580a..244e5ce 100644
--- a/groups/sandybridge/FLOPS_DP.txt
+++ b/groups/sandybridge/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -13,17 +13,19 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-32b AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
+MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*4.0)/time
 Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 
 LONG
 Formula:
-MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-AVX MFlops/s =  (SIMD_FP_256_PACKED_DOUBLE*4)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
 -
-SSE scalar and packed double precision flop rates. Please note that the current
-flop measurements on IvyBridge are potentially wrong. So you cannot trust
-these counters at the moment!
+SSE scalar and packed double precision FLOP rates.
+Please note that the current FLOP measurements on SandyBridge are potentially
+wrong. So you cannot trust these counters at the moment!
 
diff --git a/groups/sandybridge/FLOPS_SP.txt b/groups/sandybridge/FLOPS_SP.txt
index 753ade7..8cd8de2 100644
--- a/groups/sandybridge/FLOPS_SP.txt
+++ b/groups/sandybridge/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -13,17 +13,19 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-MFlops/s  1.0E-06*(PMC0*4.0+PMC1)/time
-AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
+MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*8.0)/time
 Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 
 LONG
 Formula:
-MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-AVX MFlops/s =  (SIMD_FP_256_PACKED_SINGLE*8)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
 -
-SSE scalar and packed single precision flop rates. Also shows packed AVX 32b
-flop rates. Please note that the current flop measurements on SandyBridge are
-potentially wrong. So you cannot trust these counters at the moment!
+SSE scalar and packed single precision FLOP rates.
+Please note that the current FLOP measurements on SandyBridge are potentially
+wrong. So you cannot trust these counters at the moment!
 
diff --git a/groups/sandybridge/ICACHE.txt b/groups/sandybridge/ICACHE.txt
new file mode 100644
index 0000000..f1e2335
--- /dev/null
+++ b/groups/sandybridge/ICACHE.txt
@@ -0,0 +1,33 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ICACHE_ACCESSES
+PMC1  ICACHE_MISSES
+PMC2  ICACHE_IFETCH_STALL
+PMC3  ILD_STALL_IQ_FULL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+L1I stalls PMC2
+L1I stall rate PMC2/FIXC0
+L1I queue full stalls PMC3
+L1I queue full stall rate PMC3/FIXC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I stalls = ICACHE_IFETCH_STALL
+L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/sandybridge/L2.txt b/groups/sandybridge/L2.txt
index 5345b7a..1feb44c 100644
--- a/groups/sandybridge/L2.txt
+++ b/groups/sandybridge/L2.txt
@@ -6,27 +6,33 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L1D_REPLACEMENT
 PMC1  L1D_M_EVICT
+PMC2  ICACHE_MISSES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
 
 LONG
 Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L1 and the number of modified cachelines
-evicted from the L1. The group also output total data volume transfered between
+number of cache lines allocated in the L1 and the number of modified cache lines
+evicted from the L1. The group also outputs total data volume transferred between
 L2 and L1. Note that this bandwidth also includes data transfers due to a write
-allocate load on a store miss in L1.
+allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
 
diff --git a/groups/sandybridge/L2CACHE.txt b/groups/sandybridge/L2CACHE.txt
index 3d7c36e..fbc3745 100644
--- a/groups/sandybridge/L2CACHE.txt
+++ b/groups/sandybridge/L2CACHE.txt
@@ -18,18 +18,17 @@ L2 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = L2_TRANS_ALL_REQUESTS / INSTR_RETIRED_ANY
-L2 miss rate  = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_TRANS_ALL_REQUESTS
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
 -
 This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
 The L2 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
 
 
diff --git a/groups/sandybridge/L3.txt b/groups/sandybridge/L3.txt
index 9a7c914..f63a918 100644
--- a/groups/sandybridge/L3.txt
+++ b/groups/sandybridge/L3.txt
@@ -5,28 +5,32 @@ FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L2_LINES_IN_ALL
-PMC1  L2_LINES_OUT_DIRTY_ALL
+PMC1  L2_TRANS_L2_WB
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 Load [MBytes/s]  1.0E-06*PMC0*64.0/time
-L3 Evict [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
 L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
 L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
 
 LONG
 Formulas:
-L3 Load [MBytes/s]  1.0E-06*L2_LINES_IN_ALL*64/time
-L3 Evict [MBytes/s]  1.0E-06*L2_LINES_OUT_DIRTY_ALL*64/time
-L3 bandwidth [MBytes/s] 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time
-L3 data volume [GBytes] 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
 -
 Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L2 and the number of modified cachelines
-evicted from the L2. This group also outputs data volume transfered between the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs data volume transferred between the
 L3 and  measured cores L2 caches. Note that this bandwidth also includes data
 transfers due to a write allocate load on a store miss in L2.
 
diff --git a/groups/sandybridge/L3CACHE.txt b/groups/sandybridge/L3CACHE.txt
index d4fd89e..c1cd039 100644
--- a/groups/sandybridge/L3CACHE.txt
+++ b/groups/sandybridge/L3CACHE.txt
@@ -6,30 +6,30 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  MEM_LOAD_UOPS_RETIRED_L3_ALL
 PMC1  MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2  UOPS_RETIRED_ALL
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 request rate (PMC0)/FIXC0
-L3 miss rate PMC1/FIXC0
+L3 request rate PMC0/PMC2
+L3 miss rate PMC1/PMC2
 L3 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL / INSTR_RETIRED_ANY
-L3 miss rate  = MEM_LOAD_UOPS_RETIRED_L3_MISS / INSTR_RETIRED_ANY
-L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS / MEM_LOAD_UOPS_RETIRED_L3_ALL
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
 -
 This group measures the locality of your data accesses with regard to the
-L3 Cache. L3 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
 The L3 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L3 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
 
 
diff --git a/groups/sandybridge/MEM.txt b/groups/sandybridge/MEM.txt
deleted file mode 100644
index 1f9ff4a..0000000
--- a/groups/sandybridge/MEM.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-SHORT Main memory bandwidth in MBytes/s
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
-
-LONG
-Profiling group to measure main memory bandwidth drawn by all cores of
-a socket.  Since this group is based on uncore events it is only possible to
-measure on the granularity of a socket.  If a thread group contains multiple
-threads only one thread per socket will show the results.  Also outputs total
-data volume transfered from main memory.
-
diff --git a/groups/sandybridge/MEM_DP.txt b/groups/sandybridge/MEM_DP.txt
deleted file mode 100644
index 78fbd18..0000000
--- a/groups/sandybridge/MEM_DP.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-SHORT Overview of arithmetic and main memory performance
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-TMP0  TEMP_CORE
-PWR0  PWR_PKG_ENERGY
-PWR3  PWR_DRAM_ENERGY
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
-PMC2  SIMD_FP_256_PACKED_DOUBLE
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-Temperature TMP0
-Energy [J]  PWR0
-Power [W] PWR0/time
-Energy DRAM [J]  PWR3
-Power DRAM [W] PWR3/time
-MFlops/s  1.0E-06*(PMC0*2.0+PMC1)/time
-32b AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
-
-LONG
-Formula:
-Power =  PWR_PKG_ENERGY / runtime
-Power DRAM = PWR_DRAM_ENERGY / runtime
-MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-AVX MFlops/s =  (SIMD_FP_256_PACKED_DOUBLE*4)/ runtime
-Memory Read BW [MBytes/s] 1.0E-06*(CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR)*64.0/time
---
-Profiling group to measure memory bandwidth drawn by all cores of a socket.
-Since this group is based on uncore events it is only possible to measure on
-a per socket base. Also outputs total data volume transfered from main memory,
-SSE scalar and packed double precision flop rates as well as consumed energy and 
-temperature. Also reports on packed AVX 32b instructions.  Please note that the 
-current flop measurements on SandyBridge are potentially wrong. So you cannot trust these counters at the moment!
diff --git a/groups/sandybridge/MEM_SP.txt b/groups/sandybridge/MEM_SP.txt
deleted file mode 100644
index 1ede713..0000000
--- a/groups/sandybridge/MEM_SP.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-SHORT Overview of arithmetic and main memory performance
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-TMP0  TEMP_CORE
-PWR0  PWR_PKG_ENERGY
-PWR3  PWR_DRAM_ENERGY
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
-PMC2  SIMD_FP_256_PACKED_DOUBLE
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-Temperature TMP0
-Energy [J]  PWR0
-Power [W] PWR0/time
-Energy DRAM [J]  PWR3
-Power DRAM [W] PWR3/time
-MFlops/s  1.0E-06*(PMC0*4.0+PMC1)/time
-32b AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
-
-LONG
-Formula:
-Power =  PWR_PKG_ENERGY / runtime
-Power DRAM = PWR_DRAM_ENERGY / runtime
-MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE * 4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE) / runtime
-AVX MFlops/s =  (SIMD_FP_256_PACKED_SINGLE * 8) / runtime
-Memory Read BW [MBytes/s] 1.0E-06*(CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR)*64.0/time
---
-Profiling group to measure memory bandwidth drawn by all cores of a socket.
-Since this group is based on uncore events it is only possible to measure on
-a per socket base. Also outputs total data volume transfered from main memory.
-SSE scalar and packed single precision flop rates as well as consumed energy and 
-temperature. Also reports on packed AVX 32b instructions. Please note that the 
-current flop measurements on SandyBridge are potentially wrong. So you cannot 
-trust these counters at the moment!
diff --git a/groups/sandybridge/RECOVERY.txt b/groups/sandybridge/RECOVERY.txt
new file mode 100644
index 0000000..7928ee4
--- /dev/null
+++ b/groups/sandybridge/RECOVERY.txt
@@ -0,0 +1,22 @@
+SHORT  Recovery duration
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  INT_MISC_RECOVERY_CYCLES
+PMC1  INT_MISC_RECOVERY_COUNT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Avg. recovery duration PMC0/PMC1
+
+LONG
+Formulas:
+Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT
+-
+This group measures the duration of recoveries after SSE exception, memory
+disambiguation, etc...
diff --git a/groups/sandybridge/TLB_DATA.txt b/groups/sandybridge/TLB_DATA.txt
index 2f59772..8d94e05 100644
--- a/groups/sandybridge/TLB_DATA.txt
+++ b/groups/sandybridge/TLB_DATA.txt
@@ -1,4 +1,4 @@
-SHORT  L1 Data TLB miss rate/ratio
+SHORT  L2 data TLB miss rate/ratio
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -16,20 +16,20 @@ Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 L1 DTLB load misses     PMC0
 L1 DTLB load miss rate  PMC0/FIXC0
-L1 DTLB load miss duration PMC2
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
 L1 DTLB store misses     PMC1
 L1 DTLB store miss rate  PMC1/FIXC0
-L1 DTLB store miss duration PMC3
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
 
 LONG
 Formulas:
-L1 DTLB load misses     DTLB_LOAD_MISSES_CAUSES_A_WALK
-L1 DTLB load miss rate  DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 DTLB load miss duration DTLB_LOAD_MISSES_WALK_DURATION
-L1 DTLB store misses     DTLB_STORE_MISSES_CAUSES_A_WALK
-L1 DTLB store miss rate  DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 DTLB store miss duration DTLB_STORE_MISSES_WALK_DURATION
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
 -
-The DTLB load and store miss rates gives a measure how often a TLB miss occured
+The DTLB load and store miss rates give a measure how often a TLB miss occurred
 per instruction. The duration measures the time in cycles how long a walk did take.
 
diff --git a/groups/sandybridge/TLB_INSTR.txt b/groups/sandybridge/TLB_INSTR.txt
index f95f78a..235d977 100644
--- a/groups/sandybridge/TLB_INSTR.txt
+++ b/groups/sandybridge/TLB_INSTR.txt
@@ -14,15 +14,15 @@ Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 L1 ITLB misses     PMC0
 L1 ITLB miss rate  PMC0/FIXC0
-L1 ITLB miss duration PMC1
+L1 ITLB miss duration [Cyc] PMC1/PMC0
 
 
 LONG
 Formulas:
-L1 ITLB misses     ITLB_MISSES_CAUSES_A_WALK
-L1 ITLB miss rate  ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 ITLB miss duration ITLB_MISSES_WALK_DURATION
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
 -
-The ITLB miss rates gives a measure how often a TLB miss occured
+The ITLB miss rates give a measure how often a TLB miss occurred
 per instruction. The duration measures the time in cycles how long a walk did take.
 
diff --git a/groups/sandybridge/UOPS.txt b/groups/sandybridge/UOPS.txt
new file mode 100644
index 0000000..178aec5
--- /dev/null
+++ b/groups/sandybridge/UOPS.txt
@@ -0,0 +1,35 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_ISSUED_ANY
+PMC1  UOPS_EXECUTED_THREAD
+PMC2  UOPS_RETIRED_ALL
+PMC3  UOPS_ISSUED_FLAGS_MERGE
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Issued UOPs PMC0
+Merged UOPs PMC3
+Executed UOPs PMC1
+Retired UOPs PMC2
+
+LONG
+Formula:
+Issued UOPs = UOPS_ISSUED_ANY
+Merged UOPs = UOPS_ISSUED_FLAGS_MERGE
+Executed UOPs = UOPS_EXECUTED_THREAD
+Retired UOPs = UOPS_RETIRED_ALL
+-
+This group returns information about the instruction pipeline. It measures the
+issued, executed and retired uOPs and returns the number of uOPs which were issued
+but not executed as well as the number of uOPs which were executed but never retired.
+The executed but not retired uOPs commonly come from speculatively executed branches.
+
diff --git a/groups/sandybridge/UOPS_EXEC.txt b/groups/sandybridge/UOPS_EXEC.txt
new file mode 100644
index 0000000..7042df7
--- /dev/null
+++ b/groups/sandybridge/UOPS_EXEC.txt
@@ -0,0 +1,31 @@
+SHORT UOPs execution
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_EXECUTED_USED_CYCLES
+PMC1  UOPS_EXECUTED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_EXECUTED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the execution stage in the pipeline. Used cycles are all cycles where uOPs are
+executed while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/sandybridge/UOPS_ISSUE.txt b/groups/sandybridge/UOPS_ISSUE.txt
new file mode 100644
index 0000000..9aac923
--- /dev/null
+++ b/groups/sandybridge/UOPS_ISSUE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs issuing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_ISSUED_USED_CYCLES
+PMC1  UOPS_ISSUED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_ISSUED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the issue stage in the pipeline. Used cycles are all cycles where uOPs are
+issued while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/sandybridge/UOPS_RETIRE.txt b/groups/sandybridge/UOPS_RETIRE.txt
new file mode 100644
index 0000000..0f37585
--- /dev/null
+++ b/groups/sandybridge/UOPS_RETIRE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs retirement
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_RETIRED_USED_CYCLES
+PMC1  UOPS_RETIRED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_RETIRED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the retirement stage in the pipeline (re-order buffer). Used cycles are all
+cycles where uOPs are retired while unused cycles refer to pipeline stalls.
+Moreover, the group calculates the average stall duration in cycles.
diff --git a/groups/sandybridgeEP/BRANCH.txt b/groups/sandybridgeEP/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/groups/sandybridgeEP/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  BR_INST_RETIRED_ALL_BRANCHES
+PMC1  BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Branch rate   PMC0/FIXC0
+Branch misprediction rate  PMC1/FIXC0
+Branch misprediction ratio  PMC1/PMC0
+Instructions per branch  FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
+
diff --git a/groups/sandybridgeEP/CACHES.txt b/groups/sandybridgeEP/CACHES.txt
new file mode 100644
index 0000000..889cca8
--- /dev/null
+++ b/groups/sandybridgeEP/CACHES.txt
@@ -0,0 +1,97 @@
+SHORT  Some data from the CBOXes
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L1D_M_EVICT
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_TRANS_L2_WB
+CBOX0C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX1C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX2C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX3C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX4C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX5C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX6C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX7C0:STATE=0x3F LLC_LOOKUP_DATA_READ
+CBOX0C1 LLC_VICTIMS_M_STATE
+CBOX1C1 LLC_VICTIMS_M_STATE
+CBOX2C1 LLC_VICTIMS_M_STATE
+CBOX3C1 LLC_VICTIMS_M_STATE
+CBOX4C1 LLC_VICTIMS_M_STATE
+CBOX5C1 LLC_VICTIMS_M_STATE
+CBOX6C1 LLC_VICTIMS_M_STATE
+CBOX7C1 LLC_VICTIMS_M_STATE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L3 to L2 load bandwidth [MBytes/s]  1.0E-06*PMC2*64.0/time
+L3 to L2 load data volume [GBytes]  1.0E-09*PMC2*64.0
+L2 to L3 evict bandwidth [MBytes/s]  1.0E-06*PMC3*64.0/time
+L2 to L3 evict data volume [GBytes]  1.0E-09*PMC3*64.0
+L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
+System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F)*64.0/time
+System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F)*64.0
+L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)*64.0/time
+L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)*64.0
+L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)*64.0/time
+L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)*64.0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+
+
+LONG
+Formulas:
+L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
+L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64
+L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
+L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64
+L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
+L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time
+L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64
+L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time
+L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64
+L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
+System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F))*64/time
+System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F))*64
+L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M_STATE))*64/time
+L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M_STATE))*64
+L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F)+SUM(LLC_VICTIMS_M_STATE))*64/time
+L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F)+SUM(LLC_VICTIMS_M_STATE))*64
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
+-
+Group to measure cache transfers between L1 and Memory. Please notice that the
+L3 to/from system metrics contain any traffic to the system (memory,
+Intel QPI, etc.) but do not seem to capture all of it, because commonly the memory
+read bandwidth and L3 to L2 bandwidth are higher than the memory to L3 bandwidth.
diff --git a/groups/sandybridgeEP/CLOCK.txt b/groups/sandybridgeEP/CLOCK.txt
new file mode 100644
index 0000000..7a5e87d
--- /dev/null
+++ b/groups/sandybridgeEP/CLOCK.txt
@@ -0,0 +1,27 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+PWR3  PWR_DRAM_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power =  PWR_PKG_ENERGY / time
+Power DRAM =  PWR_DRAM_ENERGY / time
+-
+SandyBridge implements the new RAPL interface. This interface enables to
+monitor the consumed energy on the package (socket) and DRAM level.
+
diff --git a/groups/sandybridgeEP/DATA.txt b/groups/sandybridgeEP/DATA.txt
new file mode 100644
index 0000000..967cbad
--- /dev/null
+++ b/groups/sandybridgeEP/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_UOPS_RETIRED_LOADS
+PMC1  MEM_UOPS_RETIRED_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/sandybridgeEP/ENERGY.txt b/groups/sandybridgeEP/ENERGY.txt
new file mode 100644
index 0000000..e5e2b33
--- /dev/null
+++ b/groups/sandybridgeEP/ENERGY.txt
@@ -0,0 +1,33 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0  TEMP_CORE
+PWR0  PWR_PKG_ENERGY
+PWR1  PWR_PP0_ENERGY
+PWR3  PWR_DRAM_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Temperature [C]  TMP0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy PP0 [J]  PWR1
+Power PP0 [W]  PWR1/time
+Energy DRAM [J]  PWR3
+Power DRAM [W]  PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+SandyBridge implements the new RAPL interface. This interface enables to
+monitor the consumed energy on the package (socket) level.
+
diff --git a/groups/sandybridgeEP/FALSE_SHARE.txt b/groups/sandybridgeEP/FALSE_SHARE.txt
new file mode 100644
index 0000000..be9c66c
--- /dev/null
+++ b/groups/sandybridgeEP/FALSE_SHARE.txt
@@ -0,0 +1,27 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM
+PMC2 MEM_LOAD_UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Local LLC false sharing [MByte] 1.E-06*PMC0*64
+Local LLC false sharing rate PMC0/PMC2
+
+LONG
+Formula:
+Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64
+Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL
+-
+False-sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false-sharing.
+The false-sharing rate uses all memory loads as reference.
+Intel SandyBridge EP CPUs do not provide the events to measure the false-sharing
+over CPU socket boundaries.
diff --git a/groups/sandybridgeEP/FLOPS_AVX.txt b/groups/sandybridgeEP/FLOPS_AVX.txt
new file mode 100644
index 0000000..b4ae4e7
--- /dev/null
+++ b/groups/sandybridgeEP/FLOPS_AVX.txt
@@ -0,0 +1,26 @@
+SHORT Packed AVX MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  SIMD_FP_256_PACKED_SINGLE
+PMC1  SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Packed SP MFLOP/s  1.0E-06*(PMC0*8.0)/time
+Packed DP MFLOP/s  1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formula:
+Packed SP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed DP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+-
+Packed 32b AVX FLOPs rates.
+Please note that the current FLOP measurements on SandyBridge are
+potentially wrong. So you cannot trust these counters at the moment!
+
diff --git a/groups/sandybridgeEP/FLOPS_DP.txt b/groups/sandybridgeEP/FLOPS_DP.txt
new file mode 100644
index 0000000..244e5ce
--- /dev/null
+++ b/groups/sandybridgeEP/FLOPS_DP.txt
@@ -0,0 +1,31 @@
+SHORT Double Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2  SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+-
+SSE scalar and packed double precision FLOP rates.
+Please note that the current FLOP measurements on SandyBridge are potentially
+wrong. So you cannot trust these counters at the moment!
+
diff --git a/groups/sandybridgeEP/FLOPS_SP.txt b/groups/sandybridgeEP/FLOPS_SP.txt
new file mode 100644
index 0000000..8cd8de2
--- /dev/null
+++ b/groups/sandybridgeEP/FLOPS_SP.txt
@@ -0,0 +1,31 @@
+SHORT Single Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2  SIMD_FP_256_PACKED_SINGLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+-
+SSE scalar and packed single precision FLOP rates.
+Please note that the current FLOP measurements on SandyBridge are potentially
+wrong. So you cannot trust these counters at the moment!
+
diff --git a/groups/sandybridgeEP/ICACHE.txt b/groups/sandybridgeEP/ICACHE.txt
new file mode 100644
index 0000000..f1e2335
--- /dev/null
+++ b/groups/sandybridgeEP/ICACHE.txt
@@ -0,0 +1,33 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ICACHE_ACCESSES
+PMC1  ICACHE_MISSES
+PMC2  ICACHE_IFETCH_STALL
+PMC3  ILD_STALL_IQ_FULL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+L1I stalls PMC2
+L1I stall rate PMC2/FIXC0
+L1I queue full stalls PMC3
+L1I queue full stall rate PMC3/FIXC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I stalls = ICACHE_IFETCH_STALL
+L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/sandybridgeEP/L2.txt b/groups/sandybridgeEP/L2.txt
new file mode 100644
index 0000000..1feb44c
--- /dev/null
+++ b/groups/sandybridgeEP/L2.txt
@@ -0,0 +1,38 @@
+SHORT L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L1D_M_EVICT
+PMC2  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
+number of cache lines allocated in the L1 and the number of modified cache lines
+evicted from the L1. The group also outputs total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
+
diff --git a/groups/sandybridgeEP/L2CACHE.txt b/groups/sandybridgeEP/L2CACHE.txt
new file mode 100644
index 0000000..fbc3745
--- /dev/null
+++ b/groups/sandybridgeEP/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_TRANS_ALL_REQUESTS
+PMC1  L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/sandybridgeEP/L3.txt b/groups/sandybridgeEP/L3.txt
new file mode 100644
index 0000000..f63a918
--- /dev/null
+++ b/groups/sandybridgeEP/L3.txt
@@ -0,0 +1,36 @@
+SHORT  L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_LINES_IN_ALL
+PMC1  L2_TRANS_L2_WB
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
+transfers due to a write allocate load on a store miss in L2.
+
diff --git a/groups/sandybridgeEP/L3CACHE.txt b/groups/sandybridgeEP/L3CACHE.txt
new file mode 100644
index 0000000..28766be
--- /dev/null
+++ b/groups/sandybridgeEP/L3CACHE.txt
@@ -0,0 +1,36 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1  MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2  UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 request rate PMC0/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. The L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L3 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level as they were not
+stored in the L3 cache.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/sandybridgeEP/MEM.txt b/groups/sandybridgeEP/MEM.txt
new file mode 100644
index 0000000..0be0645
--- /dev/null
+++ b/groups/sandybridgeEP/MEM.txt
@@ -0,0 +1,40 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+FIXC0   INSTR_RETIRED_ANY
+FIXC1   CPU_CLK_UNHALTED_CORE
+FIXC2   CPU_CLK_UNHALTED_REF
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+-
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events it is only possible to measure on a
+per socket basis. Also outputs total data volume transferred from main memory.
+
diff --git a/groups/sandybridgeEP/MEM_DP.txt b/groups/sandybridgeEP/MEM_DP.txt
new file mode 100644
index 0000000..0193575
--- /dev/null
+++ b/groups/sandybridgeEP/MEM_DP.txt
@@ -0,0 +1,59 @@
+SHORT Overview of arithmetic and main memory performance
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+PWR3  PWR_DRAM_ENERGY
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2  SIMD_FP_256_PACKED_DOUBLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+--
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events it is only possible to measure on
+a per socket basis. Also outputs total data volume transferred from main memory.
+SSE scalar and packed double precision FLOP rates. Also reports on packed AVX
+32b instructions.  Please note that the current FLOP measurements on SandyBridge
+are potentially wrong. So you cannot trust these counters at the moment!
diff --git a/groups/sandybridgeEP/MEM_SP.txt b/groups/sandybridgeEP/MEM_SP.txt
new file mode 100644
index 0000000..9e651fa
--- /dev/null
+++ b/groups/sandybridgeEP/MEM_SP.txt
@@ -0,0 +1,61 @@
+SHORT Overview of arithmetic and main memory performance
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+PWR3  PWR_DRAM_ENERGY
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2  SIMD_FP_256_PACKED_SINGLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+
+LONG
+Formula:
+Power [W] = PWR_PKG_ENERGY/runtime
+Power DRAM [W] = PWR_DRAM_ENERGY/runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+--
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events it is only possible to measure on
+a per-socket basis. Also outputs total data volume transferred from main memory.
+SSE scalar and packed single precision FLOP rates. Also reports on packed AVX
+32b instructions. Please note that the current FLOP measurements on SandyBridge
+are potentially wrong. So you cannot trust these counters at the moment!
diff --git a/groups/sandybridgeEP/NUMA.txt b/groups/sandybridgeEP/NUMA.txt
new file mode 100644
index 0000000..0c1b8fb
--- /dev/null
+++ b/groups/sandybridgeEP/NUMA.txt
@@ -0,0 +1,33 @@
+SHORT Local and remote memory accesses
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 OFFCORE_RESPONSE_0_LOCAL_DRAM
+PMC1 OFFCORE_RESPONSE_1_REMOTE_DRAM
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Local DRAM data volume [GByte]  1.E-09*PMC0*64
+Local DRAM bandwidth [MByte/s]  1.E-06*(PMC0*64)/time
+Remote DRAM data volume [GByte]  1.E-09*PMC1*64
+Remote DRAM bandwidth [MByte/s]  1.E-06*(PMC1*64)/time
+Memory data volume [GByte]  1.E-09*(PMC0+PMC1)*64
+Memory bandwidth [MByte/s]  1.E-06*((PMC0+PMC1)*64)/time
+
+LONG
+Formula:
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64
+Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time
+Remote DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_1_REMOTE_DRAM*64
+Remote DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_1_REMOTE_DRAM*64)/time
+Memory data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64
+Memory bandwidth [MByte/s] = 1.E-06*((OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64)/time
+--
+This performance group measures the data traffic of CPU cores to local and remote
+memory.
diff --git a/groups/sandybridgeEP/QPI.txt b/groups/sandybridgeEP/QPI.txt
new file mode 100644
index 0000000..f09df03
--- /dev/null
+++ b/groups/sandybridgeEP/QPI.txt
@@ -0,0 +1,35 @@
+SHORT QPI traffic between sockets
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+SBOX0C0 DIRECT2CORE_SUCCESS
+SBOX0C1 RXL_FLITS_G1_DRS_DATA
+SBOX0C2 RXL_FLITS_G2_NCB_DATA
+SBOX1C0 DIRECT2CORE_SUCCESS
+SBOX1C1 RXL_FLITS_G1_DRS_DATA
+SBOX1C2 RXL_FLITS_G2_NCB_DATA
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Received bandwidth from QPI [MBytes/s] 1.0E-06*(SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2)*8/time
+Received data volume from QPI [GBytes] 1.0E-09*(SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2)*8
+Bandwidth QPI to LLC [MBytes/s] 1.0E-06*(SBOX0C0+SBOX1C0)*64/time
+Data volume QPI to LLC [GBytes] 1.0E-09*(SBOX0C0+SBOX1C0)*64
+Bandwidth QPI to HA or IIO [MBytes/s] 1.0E-06*(((SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2)*8)-((SBOX0C0+SBOX1C0)*64))/time
+Data volume QPI to HA or IIO [GBytes] 1.0E-09*(((SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2)*8)-((SBOX0C0+SBOX1C0)*64))
+
+LONG
+Formulas:
+Received bandwidth from QPI [MBytes/s] = 1.0E-06*(sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8/time
+Received data volume from QPI [GBytes] = 1.0E-09*(sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8
+Bandwidth QPI to LLC [MBytes/s] = 1.0E-06*(sum(DIRECT2CORE_SUCCESS))*64/time
+Data volume QPI to LLC [GBytes] = 1.0E-09*(sum(DIRECT2CORE_SUCCESS))*64
+Bandwidth QPI to HA or IIO [MBytes/s] = 1.0E-06*(((sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8)-((sum(DIRECT2CORE_SUCCESS))*64))/time
+Data volume QPI to HA or IIO [GBytes] = 1.0E-09*(((sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8)-((sum(DIRECT2CORE_SUCCESS))*64))
+-
+Profiling group to measure traffic on the QPI.
diff --git a/groups/sandybridgeEP/RECOVERY.txt b/groups/sandybridgeEP/RECOVERY.txt
new file mode 100644
index 0000000..7928ee4
--- /dev/null
+++ b/groups/sandybridgeEP/RECOVERY.txt
@@ -0,0 +1,22 @@
+SHORT  Recovery duration
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  INT_MISC_RECOVERY_CYCLES
+PMC1  INT_MISC_RECOVERY_COUNT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Avg. recovery duration PMC0/PMC1
+
+LONG
+Formulas:
+Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT
+-
+This group measures the duration of recoveries after SSE exception, memory
+disambiguation, etc...
diff --git a/groups/sandybridgeEP/TLB_DATA.txt b/groups/sandybridgeEP/TLB_DATA.txt
new file mode 100644
index 0000000..8d94e05
--- /dev/null
+++ b/groups/sandybridgeEP/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT  L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1  DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2  DTLB_LOAD_MISSES_WALK_DURATION
+PMC3  DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB load misses     PMC0
+L1 DTLB load miss rate  PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses     PMC1
+L1 DTLB store miss rate  PMC1/FIXC0
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a page walk took, in cycles.
+
diff --git a/groups/sandybridgeEP/TLB_INSTR.txt b/groups/sandybridgeEP/TLB_INSTR.txt
new file mode 100644
index 0000000..235d977
--- /dev/null
+++ b/groups/sandybridgeEP/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ITLB_MISSES_CAUSES_A_WALK
+PMC1  ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a page walk took, in cycles.
+
diff --git a/groups/sandybridgeEP/UOPS.txt b/groups/sandybridgeEP/UOPS.txt
new file mode 100644
index 0000000..178aec5
--- /dev/null
+++ b/groups/sandybridgeEP/UOPS.txt
@@ -0,0 +1,35 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_ISSUED_ANY
+PMC1  UOPS_EXECUTED_THREAD
+PMC2  UOPS_RETIRED_ALL
+PMC3  UOPS_ISSUED_FLAGS_MERGE
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Issued UOPs PMC0
+Merged UOPs PMC3
+Executed UOPs PMC1
+Retired UOPs PMC2
+
+LONG
+Formula:
+Issued UOPs = UOPS_ISSUED_ANY
+Merged UOPs = UOPS_ISSUED_FLAGS_MERGE
+Executed UOPs = UOPS_EXECUTED_THREAD
+Retired UOPs = UOPS_RETIRED_ALL
+-
+This group returns information about the instruction pipeline. It measures the
+issued, executed and retired uOPs and returns the number of uOPs which were issued
+but not executed as well as the number of uOPs which were executed but never retired.
+The executed but not retired uOPs commonly come from speculatively executed branches.
+
diff --git a/groups/sandybridgeEP/UOPS_EXEC.txt b/groups/sandybridgeEP/UOPS_EXEC.txt
new file mode 100644
index 0000000..7042df7
--- /dev/null
+++ b/groups/sandybridgeEP/UOPS_EXEC.txt
@@ -0,0 +1,31 @@
+SHORT UOPs execution
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_EXECUTED_USED_CYCLES
+PMC1  UOPS_EXECUTED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_EXECUTED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the execution stage in the pipeline. Used cycles are all cycles where uOPs are
+executed while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/sandybridgeEP/UOPS_ISSUE.txt b/groups/sandybridgeEP/UOPS_ISSUE.txt
new file mode 100644
index 0000000..9aac923
--- /dev/null
+++ b/groups/sandybridgeEP/UOPS_ISSUE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs issuing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_ISSUED_USED_CYCLES
+PMC1  UOPS_ISSUED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_ISSUED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the issue stage in the pipeline. Used cycles are all cycles where uOPs are
+issued while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/sandybridgeEP/UOPS_RETIRE.txt b/groups/sandybridgeEP/UOPS_RETIRE.txt
new file mode 100644
index 0000000..0f37585
--- /dev/null
+++ b/groups/sandybridgeEP/UOPS_RETIRE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs retirement
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_RETIRED_USED_CYCLES
+PMC1  UOPS_RETIRED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_RETIRED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the retirement stage in the pipeline (re-order buffer). Used cycles are all
+cycles where uOPs are retired while unused cycles refer to pipeline stalls.
+Moreover, the group calculates the average stall duration in cycles.
diff --git a/groups/silvermont/BRANCH.txt b/groups/silvermont/BRANCH.txt
index cbaf834..b8d41b2 100644
--- a/groups/silvermont/BRANCH.txt
+++ b/groups/silvermont/BRANCH.txt
@@ -19,13 +19,13 @@ Instructions per branch  FIXC0/PMC0
 
 LONG
 Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
 -
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
 into relation what ratio of all branch instruction where mispredicted.
-Instructions per branch is 1/Branch rate.
+Instructions per branch is 1/branch rate.
 
diff --git a/groups/silvermont/CLOCK.txt b/groups/silvermont/CLOCK.txt
new file mode 100644
index 0000000..088a776
--- /dev/null
+++ b/groups/silvermont/CLOCK.txt
@@ -0,0 +1,23 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+
+LONG
+Formula:
+Power =  PWR_PKG_ENERGY / time
+-
+Silvermont implements the new RAPL interface. This interface makes it possible to
+monitor the consumed energy on the package (socket) level.
+
diff --git a/groups/silvermont/DATA.txt b/groups/silvermont/DATA.txt
new file mode 100644
index 0000000..61a915b
--- /dev/null
+++ b/groups/silvermont/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_UOPS_RETIRED_ALL_LOADS
+PMC1  MEM_UOPS_RETIRED_ALL_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOPS_RETIRED_ALL_LOADS/MEM_UOPS_RETIRED_ALL_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/silvermont/ENERGY.txt b/groups/silvermont/ENERGY.txt
index 5646a9a..d0996b3 100644
--- a/groups/silvermont/ENERGY.txt
+++ b/groups/silvermont/ENERGY.txt
@@ -6,6 +6,7 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 TMP0  TEMP_CORE
 PWR0  PWR_PKG_ENERGY
+PWR1  PWR_PP0_ENERGY
 
 METRICS
 Runtime (RDTSC) [s] time
@@ -15,10 +16,13 @@ CPI  FIXC1/FIXC0
 Temperature [C]  TMP0
 Energy [J]  PWR0
 Power [W] PWR0/time
+Energy PP0 [J]  PWR1
+Power PP0 [W] PWR1/time
 
 LONG
 Formula:
-Power =  PWR_PKG_ENERGY / time
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
 -
 Silvermont implements the new RAPL interface. This interface enables to
 monitor the consumed energy on the package (socket) level.
diff --git a/groups/silvermont/ICACHE.txt b/groups/silvermont/ICACHE.txt
index 6ce3ce8..5f11ad6 100644
--- a/groups/silvermont/ICACHE.txt
+++ b/groups/silvermont/ICACHE.txt
@@ -18,8 +18,8 @@ L1I miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
-L2 miss rate  = ICACHE_MISSES / INSTR_RETIRED_ANY
-L2 miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
 -
 This group measures some L1 instruction cache metrics.
diff --git a/groups/silvermont/L1TOL2.txt b/groups/silvermont/L1TOL2.txt
deleted file mode 100644
index 225533d..0000000
--- a/groups/silvermont/L1TOL2.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-SHORT L2 load cache bandwidth in MBytes/s
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  MEM_UOPS_RETIRED_L1_MISS_LOADS 
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0)*64.0
-
-LONG
-Formulas:
-L2 Load [MBytes/s] = 1.0E-06*MEM_UOPS_RETIRED_L1_MISS_LOADS*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(MEM_UOPS_RETIRED_L1_MISS_LOADS)*64/time
-L2 data volume [GBytes] = 1.0E-09*(MEM_UOPS_RETIRED_L1_MISS_LOADS)*64
--
-Profiling group to measure L2 load cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L1 cache. Since there is no possibility to retrieve
-the evicted cache lines, this group measures only the load cache bandwidth.
-The group also output totally loaded data volume transfered between L2 and L1.
-
diff --git a/groups/silvermont/L2CACHE.txt b/groups/silvermont/L2CACHE.txt
new file mode 100644
index 0000000..32a1545
--- /dev/null
+++ b/groups/silvermont/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  LONGEST_LAT_CACHE_REFERENCE
+PMC1  LONGEST_LAT_CACHE_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = LONGEST_LAT_CACHE_REFERENCE/INSTR_RETIRED_ANY
+L2 miss rate = LONGEST_LAT_CACHE_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = LONGEST_LAT_CACHE_MISS/LONGEST_LAT_CACHE_REFERENCE
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache
+reuse.
+
diff --git a/groups/silvermont/L2TOMEM.txt b/groups/silvermont/L2TOMEM.txt
deleted file mode 100644
index bc4cbed..0000000
--- a/groups/silvermont/L2TOMEM.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-SHORT L2 to Mem load cache bandwidth in MBytes/s
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  MEM_UOPS_RETIRED_L2_MISS_LOADS 
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-L2 to MEM load bandwidth [MBytes/s] 1.0E-06*(PMC0)*64.0/time
-L2 to MEM load data volume [GBytes] 1.0E-09*(PMC0)*64.0
-
-LONG
-Formulas:
-L2 to MEM load bandwidth [MBytes/s] = 1.0E-06*(MEM_UOPS_RETIRED_L2_MISS_LOADS)*64/time
-L2 to MEM load data volume [GBytes] = 1.0E-09*(MEM_UOPS_RETIRED_L2_MISS_LOADS)*64
--
-Profiling group to measure L2 to MEM load cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L2 cache. Since there is no possibility to retrieve
-the evicted cache lines, this group measures only the load cache bandwidth.
-The group also output totally loaded data volume transfered between memory and L2.
-
diff --git a/groups/silvermont/MEM.txt b/groups/silvermont/MEM.txt
new file mode 100644
index 0000000..de78337
--- /dev/null
+++ b/groups/silvermont/MEM.txt
@@ -0,0 +1,37 @@
+SHORT Memory load bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  LONGEST_LAT_CACHE_MISS
+PMC1  OFFCORE_RESPONSE_1_WB_ANY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(PMC0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(PMC0)*64.0
+Memory writeback bandwidth [MBytes/s] 1.0E-06*(PMC1)*64.0/time
+Memory writeback data volume [GBytes] 1.0E-09*(PMC1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(LONGEST_LAT_CACHE_MISS)*64/time
+Memory read data volume [GBytes] = 1.0E-09*(LONGEST_LAT_CACHE_MISS)*64
+Memory writeback bandwidth [MBytes/s] = 1.0E-06*(OFFCORE_RESPONSE_1_WB_ANY)*64/time
+Memory writeback data volume [GBytes] = 1.0E-09*(OFFCORE_RESPONSE_1_WB_ANY)*64
+Memory bandwidth [MBytes/s] = 1.0E-06*(LONGEST_LAT_CACHE_MISS+OFFCORE_RESPONSE_1_WB_ANY)*64/time
+Memory data volume [GBytes] = 1.0E-09*(LONGEST_LAT_CACHE_MISS+OFFCORE_RESPONSE_1_WB_ANY)*64
+-
+Profiling group to measure L2 to MEM load cache bandwidth. The bandwidth is computed from the
+number of cache lines allocated in the L2 cache. Since there is no possibility to retrieve
+the evicted cache lines, this group measures only the load cache bandwidth. The
+writeback metrics count only modified cache lines that are written back to go to
+exclusive state.
+The group also outputs the total load and writeback data volume transferred between memory and L2.
+
diff --git a/groups/silvermont/MEM_LAT.txt b/groups/silvermont/MEM_LAT.txt
new file mode 100644
index 0000000..516b135
--- /dev/null
+++ b/groups/silvermont/MEM_LAT.txt
@@ -0,0 +1,23 @@
+SHORT Average data read latency
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  OFFCORE_RESPONSE_0_DMND_DATA_RD_AVG_LAT
+PMC1  OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Average data read latency [cyc/read] PMC0/PMC1
+
+LONG
+Formulas:
+Average data read latency [cyc/read] = OFFCORE_RESPONSE_0_DMND_DATA_RD_AVG_LAT/OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY
+-
+The Offcore request facility of Intel Silvermont processors can be used to determine
+the average data read latency. It includes all operations done to read data like
+snoops and hits in upper cache levels.
diff --git a/groups/silvermont/TLB_DATA.txt b/groups/silvermont/TLB_DATA.txt
new file mode 100644
index 0000000..5f2617f
--- /dev/null
+++ b/groups/silvermont/TLB_DATA.txt
@@ -0,0 +1,27 @@
+SHORT  L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  PAGE_WALKS_DTLB_COUNT
+PMC1  PAGE_WALKS_DTLB_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB misses     PMC0
+L1 DTLB miss rate  PMC0/FIXC0
+L1 DTLB miss duration [Cyc] PMC1/PMC0
+
+LONG
+Formulas:
+L1 DTLB misses = PAGE_WALKS_DTLB_COUNT
+L1 DTLB miss rate = PAGE_WALKS_DTLB_COUNT / INSTR_RETIRED_ANY
+L1 DTLB miss duration [Cyc] = PAGE_WALKS_DTLB_CYCLES / PAGE_WALKS_DTLB_COUNT
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a page walk took, in cycles.
+
diff --git a/groups/silvermont/TLB_INSTR.txt b/groups/silvermont/TLB_INSTR.txt
new file mode 100644
index 0000000..f3dd3ec
--- /dev/null
+++ b/groups/silvermont/TLB_INSTR.txt
@@ -0,0 +1,27 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  PAGE_WALKS_ITLB_COUNT
+PMC1  PAGE_WALKS_ITLB_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = PAGE_WALKS_ITLB_COUNT
+L1 ITLB miss rate = PAGE_WALKS_ITLB_COUNT / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = PAGE_WALKS_ITLB_CYCLES / PAGE_WALKS_ITLB_COUNT
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a page walk took, in cycles.
diff --git a/groups/skylake/BRANCH.txt b/groups/skylake/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/groups/skylake/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  BR_INST_RETIRED_ALL_BRANCHES
+PMC1  BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Branch rate   PMC0/FIXC0
+Branch misprediction rate  PMC1/FIXC0
+Branch misprediction ratio  PMC1/PMC0
+Instructions per branch  FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
+
diff --git a/groups/skylake/CLOCK.txt b/groups/skylake/CLOCK.txt
new file mode 100644
index 0000000..79a4480
--- /dev/null
+++ b/groups/skylake/CLOCK.txt
@@ -0,0 +1,27 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+PWR3  PWR_DRAM_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power =  PWR_PKG_ENERGY / time
+Power DRAM =  PWR_DRAM_ENERGY / time
+-
+Skylake implements the RAPL interface. This interface makes it possible to
+monitor the consumed energy on the package (socket) and DRAM level.
+
diff --git a/groups/skylake/DATA.txt b/groups/skylake/DATA.txt
new file mode 100644
index 0000000..4e6e938
--- /dev/null
+++ b/groups/skylake/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_INST_RETIRED_ALL_LOADS
+PMC1  MEM_INST_RETIRED_ALL_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/skylake/ENERGY.txt b/groups/skylake/ENERGY.txt
new file mode 100644
index 0000000..06baa72
--- /dev/null
+++ b/groups/skylake/ENERGY.txt
@@ -0,0 +1,39 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0  TEMP_CORE
+PWR0  PWR_PKG_ENERGY
+PWR1  PWR_PP0_ENERGY
+PWR2  PWR_PP1_ENERGY
+PWR3  PWR_DRAM_ENERGY
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Temperature [C]  TMP0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy PP0 [J]  PWR1
+Power PP0 [W] PWR1/time
+Energy PP1 [J]  PWR2
+Power PP1 [W] PWR2/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power PP1 = PWR_PP1_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+Skylake implements the RAPL interface. This interface makes it possible to
+monitor the consumed energy on the package (socket) and DRAM level.
+
diff --git a/groups/skylake/FALSE_SHARE.txt b/groups/skylake/FALSE_SHARE.txt
new file mode 100644
index 0000000..626277a
--- /dev/null
+++ b/groups/skylake/FALSE_SHARE.txt
@@ -0,0 +1,25 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_L3_HIT_RETIRED_XSNP_HITM
+PMC2 MEM_INST_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Local LLC false sharing [MByte] 1.E-06*PMC0*64
+Local LLC false sharing rate PMC0/PMC2
+
+LONG
+Formula:
+Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_L3_HIT_RETIRED_XSNP_HITM*64
+Local LLC false sharing rate = MEM_LOAD_L3_HIT_RETIRED_XSNP_HITM/MEM_INST_RETIRED_ALL
+-
+False-sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false-sharing.
+The false-sharing rate uses all memory loads as reference.
diff --git a/groups/skylake/FLOPS_AVX.txt b/groups/skylake/FLOPS_AVX.txt
new file mode 100644
index 0000000..6088bca
--- /dev/null
+++ b/groups/skylake/FLOPS_AVX.txt
@@ -0,0 +1,24 @@
+SHORT Packed AVX MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+PMC1  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Packed SP MFLOP/s  1.0E-06*(PMC0*8.0)/time
+Packed DP MFLOP/s  1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formula:
+Packed SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+-
+Packed 256-bit AVX single and double precision FLOP rates.
+
diff --git a/groups/skylake/FLOPS_DP.txt b/groups/skylake/FLOPS_DP.txt
new file mode 100644
index 0000000..c99d2c1
--- /dev/null
+++ b/groups/skylake/FLOPS_DP.txt
@@ -0,0 +1,29 @@
+SHORT Double Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
+PMC1  FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
+PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+-
+SSE scalar/packed and AVX packed double precision FLOP rates.
+
diff --git a/groups/skylake/FLOPS_SP.txt b/groups/skylake/FLOPS_SP.txt
new file mode 100644
index 0000000..a273e84
--- /dev/null
+++ b/groups/skylake/FLOPS_SP.txt
@@ -0,0 +1,29 @@
+SHORT Single Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
+PMC1  FP_ARITH_INST_RETIRED_SCALAR_SINGLE
+PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFLOP/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+-
+SSE scalar/packed and AVX packed single precision FLOP rates.
+
diff --git a/groups/skylake/ICACHE.txt b/groups/skylake/ICACHE.txt
new file mode 100644
index 0000000..aab7dac
--- /dev/null
+++ b/groups/skylake/ICACHE.txt
@@ -0,0 +1,30 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ICACHE_64B_IFTAG_ALL
+PMC1  ICACHE_64B_IFTAG_MISS
+PMC2  ICACHE_64B_IFTAG_STALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+L1I stalls PMC2
+L1I stall rate PMC2/FIXC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_64B_IFTAG_ALL / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_64B_IFTAG_MISS / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_64B_IFTAG_MISS / ICACHE_64B_IFTAG_ALL
+L1I stalls = ICACHE_64B_IFTAG_STALL
+L1I stall rate = ICACHE_64B_IFTAG_STALL / INSTR_RETIRED_ANY
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/skylake/L2.txt b/groups/skylake/L2.txt
new file mode 100644
index 0000000..1a92a95
--- /dev/null
+++ b/groups/skylake/L2.txt
@@ -0,0 +1,38 @@
+SHORT L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L1D_M_EVICT
+PMC2  ICACHE_64B_IFTAG_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
+number of cache lines allocated in the L1 and the number of modified cache lines
+evicted from the L1. The group also outputs total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
+
diff --git a/groups/skylake/L2CACHE.txt b/groups/skylake/L2CACHE.txt
new file mode 100644
index 0000000..fbc3745
--- /dev/null
+++ b/groups/skylake/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_TRANS_ALL_REQUESTS
+PMC1  L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/skylake/L3.txt b/groups/skylake/L3.txt
new file mode 100644
index 0000000..f63a918
--- /dev/null
+++ b/groups/skylake/L3.txt
@@ -0,0 +1,36 @@
+SHORT  L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_LINES_IN_ALL
+PMC1  L2_TRANS_L2_WB
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
+transfers due to a write allocate load on a store miss in L2.
+
diff --git a/groups/skylake/L3CACHE.txt b/groups/skylake/L3CACHE.txt
new file mode 100644
index 0000000..8c91d39
--- /dev/null
+++ b/groups/skylake/L3CACHE.txt
@@ -0,0 +1,35 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_LOAD_RETIRED_L3_HIT
+PMC1  MEM_LOAD_RETIRED_L3_MISS
+PMC2  UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 request rate (PMC0+PMC1)/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/(PMC0+PMC1)
+
+LONG
+Formulas:
+L3 request rate = (MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS)/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_RETIRED_L3_MISS/(MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS)
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L3 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/skylake/RECOVERY.txt b/groups/skylake/RECOVERY.txt
new file mode 100644
index 0000000..7928ee4
--- /dev/null
+++ b/groups/skylake/RECOVERY.txt
@@ -0,0 +1,22 @@
+SHORT  Recovery duration
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  INT_MISC_RECOVERY_CYCLES
+PMC1  INT_MISC_RECOVERY_COUNT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Avg. recovery duration PMC0/PMC1
+
+LONG
+Formulas:
+Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT
+-
+This group measures the duration of recoveries after SSE exceptions, memory
+disambiguation, etc.
diff --git a/groups/skylake/TLB_DATA.txt b/groups/skylake/TLB_DATA.txt
new file mode 100644
index 0000000..10ee5e1
--- /dev/null
+++ b/groups/skylake/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT  L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1  DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2  DTLB_LOAD_MISSES_WALK_ACTIVE
+PMC3  DTLB_STORE_MISSES_WALK_ACTIVE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB load misses     PMC0
+L1 DTLB load miss rate  PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses     PMC1
+L1 DTLB store miss rate  PMC1/FIXC0
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_ACTIVE / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_ACTIVE / DTLB_STORE_MISSES_CAUSES_A_WALK
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a walk took, in cycles.
+
diff --git a/groups/skylake/TLB_INSTR.txt b/groups/skylake/TLB_INSTR.txt
new file mode 100644
index 0000000..9bc65a7
--- /dev/null
+++ b/groups/skylake/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ITLB_MISSES_CAUSES_A_WALK
+PMC1  ITLB_MISSES_WALK_ACTIVE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_ACTIVE / ITLB_MISSES_CAUSES_A_WALK
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a walk took, in cycles.
+
diff --git a/groups/skylake/UOPS.txt b/groups/skylake/UOPS.txt
new file mode 100644
index 0000000..fbb01e1
--- /dev/null
+++ b/groups/skylake/UOPS.txt
@@ -0,0 +1,29 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_ISSUED_ANY
+PMC1  UOPS_EXECUTED_THREAD
+PMC2  UOPS_RETIRED_ALL
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Issued UOPs PMC0
+Executed UOPs PMC1
+Retired UOPs PMC2
+
+LONG
+Formula:
+Issued UOPs = UOPS_ISSUED_ANY
+Executed UOPs = UOPS_EXECUTED_THREAD
+Retired UOPs = UOPS_RETIRED_ALL
+-
+This group returns information about the instruction pipeline. It measures the
+issued, executed and retired uOPs.
diff --git a/groups/skylake/UOPS_EXEC.txt b/groups/skylake/UOPS_EXEC.txt
new file mode 100644
index 0000000..7042df7
--- /dev/null
+++ b/groups/skylake/UOPS_EXEC.txt
@@ -0,0 +1,31 @@
+SHORT UOPs execution
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_EXECUTED_USED_CYCLES
+PMC1  UOPS_EXECUTED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_EXECUTED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the execution stage in the pipeline. Used cycles are all cycles where uOPs are
+executed while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/skylake/UOPS_ISSUE.txt b/groups/skylake/UOPS_ISSUE.txt
new file mode 100644
index 0000000..9aac923
--- /dev/null
+++ b/groups/skylake/UOPS_ISSUE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs issuing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_ISSUED_USED_CYCLES
+PMC1  UOPS_ISSUED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_ISSUED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the issue stage in the pipeline. Used cycles are all cycles where uOPs are
+issued while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/groups/skylake/UOPS_RETIRE.txt b/groups/skylake/UOPS_RETIRE.txt
new file mode 100644
index 0000000..0f37585
--- /dev/null
+++ b/groups/skylake/UOPS_RETIRE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs retirement
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_RETIRED_USED_CYCLES
+PMC1  UOPS_RETIRED_STALL_CYCLES
+PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT  UOPS_RETIRED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the retirement stage in the pipeline (re-order buffer). Used cycles are all
+cycles where uOPs are retired while unused cycles refer to pipeline stalls.
+Moreover, the group calculates the average stall duration in cycles.
diff --git a/groups/westmere/BRANCH.txt b/groups/westmere/BRANCH.txt
index 3d81416..b8d41b2 100644
--- a/groups/westmere/BRANCH.txt
+++ b/groups/westmere/BRANCH.txt
@@ -19,13 +19,13 @@ Instructions per branch  FIXC0/PMC0
 
 LONG
 Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
 -
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
-into relation what ration of all branch instruction where mispredicted.
-Instructions per branch is 1/Branch rate.
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
 
diff --git a/groups/westmere/CACHE.txt b/groups/westmere/CACHE.txt
index 4ceed06..6a5e4fe 100644
--- a/groups/westmere/CACHE.txt
+++ b/groups/westmere/CACHE.txt
@@ -11,15 +11,16 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Data cache misses PMC0
-Data cache miss rate PMC0/FIXC0
+data cache misses PMC0
+data cache miss rate PMC0/FIXC0
 
 LONG
 Formulas:
-Data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
+data cache misses = L1D_REPL
+data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
 -
 This group measures the locality of your data accesses with regard to the
-L1 Cache. 
-The Data cache miss rate gives a measure how often it was necessary to get
-cachelines from higher levels of the memory hierarchy.
+L1 cache.
+The data cache miss rate gives a measure how often it was necessary to get
+cache lines from higher levels of the memory hierarchy.
 
diff --git a/groups/westmere/CLOCK.txt b/groups/westmere/CLOCK.txt
new file mode 100644
index 0000000..9139668
--- /dev/null
+++ b/groups/westmere/CLOCK.txt
@@ -0,0 +1,18 @@
+SHORT CPU clock information
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+
+LONG
+Formula:
+-
+
+
diff --git a/groups/westmere/DATA.txt b/groups/westmere/DATA.txt
index a5611bc..31bba51 100644
--- a/groups/westmere/DATA.txt
+++ b/groups/westmere/DATA.txt
@@ -12,11 +12,11 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to Store ratio = MEM_INST_RETIRED_LOADS / MEM_INST_RETIRED_STORES
+Load to store ratio = MEM_INST_RETIRED_LOADS/MEM_INST_RETIRED_STORES
 -
-This is a simple metric to determine your Load to store ratio.
+This is a simple metric to determine your load to store ratio.
 
diff --git a/groups/westmere/FLOPS_DP.txt b/groups/westmere/FLOPS_DP.txt
index c5ba91c..2773f06 100644
--- a/groups/westmere/FLOPS_DP.txt
+++ b/groups/westmere/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFlops/s  1.0E-06*(PMC0*2.0+PMC1)/time
+MFLOP/s  1.0E-06*(PMC0*2.0+PMC1)/time
 Packed MUOPS/s   1.0E-06*PMC0/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,10 +22,14 @@ DP MUOPS/s 1.0E-06*PMC3/time
 
 LONG
 Formula:
-DP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
-The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
-Therefore both Single as well as Double precision are measured to ensure the correctness
+The Nehalem has no possibility to measure MFLOPs if mixed precision calculations are done.
+Therefore both single as well as double precision are measured to ensure the correctness
 of the measurements. You can check if your code was vectorized on the number of
 FP_COMP_OPS_EXE_SSE_FP_PACKED versus the  FP_COMP_OPS_EXE_SSE_FP_SCALAR.
 
diff --git a/groups/westmere/FLOPS_SP.txt b/groups/westmere/FLOPS_SP.txt
index 4478c8f..8254fd9 100644
--- a/groups/westmere/FLOPS_SP.txt
+++ b/groups/westmere/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1)/time
 Packed MUOPS/s   1.0E-06*PMC0/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,10 +22,14 @@ DP MUOPS/s 1.0E-06*PMC3/time
 
 LONG
 Formula:
-SP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
-The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
-Therefore both Single as well as Double precision are measured to ensure the correctness
+The Nehalem has no possibility to measure MFLOPs if mixed precision calculations are done.
+Therefore both single as well as double precision are measured to ensure the correctness
 of the measurements. You can check if your code was vectorized on the number of
 FP_COMP_OPS_EXE_SSE_FP_PACKED versus the  FP_COMP_OPS_EXE_SSE_FP_SCALAR.
 
diff --git a/groups/westmere/FLOPS_X87.txt b/groups/westmere/FLOPS_X87.txt
index 6447b93..a4176f0 100644
--- a/groups/westmere/FLOPS_X87.txt
+++ b/groups/westmere/FLOPS_X87.txt
@@ -1,4 +1,4 @@
-SHORT X87 MFlops/s
+SHORT X87 MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -11,8 +11,8 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-X87 MFlops/s  1.0E-06*PMC0/time
+X87 MFLOP/s  1.0E-06*PMC0/time
 
 LONG
-Profiling group to measure X87 flop rate.
+Profiling group to measure X87 FLOP rate.
 
diff --git a/groups/westmere/ICACHE.txt b/groups/westmere/ICACHE.txt
new file mode 100644
index 0000000..49943ff
--- /dev/null
+++ b/groups/westmere/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1I_READS
+PMC1  L1I_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = L1I_READS / INSTR_RETIRED_ANY
+L1I miss rate = L1I_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = L1I_MISSES / L1I_READS
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/westmere/L2.txt b/groups/westmere/L2.txt
index 5506f1f..74f7d58 100644
--- a/groups/westmere/L2.txt
+++ b/groups/westmere/L2.txt
@@ -6,27 +6,33 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L1D_REPL
 PMC1  L1D_M_EVICT
+PMC2  L1I_MISSES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
 
 LONG
 Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPL*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L1 and the number of modified cachelines
-evicted from the L1. The group also reports on data volume transfered between
+number of cache lines allocated in the L1 and the number of modified cache lines
+evicted from the L1. The group also reports on data volume transferred between
 L2 and L1 cache. Note that this bandwidth also includes data transfers due to a
-write allocate load on a store miss in L1.
+write allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
 
diff --git a/groups/westmere/L2CACHE.txt b/groups/westmere/L2CACHE.txt
index 49778be..343b263 100644
--- a/groups/westmere/L2CACHE.txt
+++ b/groups/westmere/L2CACHE.txt
@@ -4,7 +4,7 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  L2_DATA_RQSTS_DEMAND_ANY
+PMC0  L2_RQSTS_REFERENCES
 PMC1  L2_RQSTS_MISS
 
 METRICS
@@ -18,18 +18,17 @@ L2 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = L2_DATA_RQSTS_DEMAND_ANY / INSTR_RETIRED_ANY
-L2 miss rate  = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_DATA_RQSTS_DEMAND_ANY
+L2 request rate = L2_RQSTS_REFERENCES/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_RQSTS_REFERENCES
 -
 This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
 The L2 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
 
 
diff --git a/groups/westmere/L3.txt b/groups/westmere/L3.txt
index 6a58f78..a1d95e3 100644
--- a/groups/westmere/L3.txt
+++ b/groups/westmere/L3.txt
@@ -4,28 +4,33 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  L2_LINES_IN_ANY
-PMC1  L2_LINES_OUT_ANY
+PMC0  L2_RQSTS_MISS
+PMC1  L2_LINES_OUT_DIRTY_ANY
+
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 Load [MBytes/s]  1.0E-06*PMC0*64.0/time
-L3 Evict [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*(PMC1)*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*(PMC1)*64.0
 L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
 L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
 
 LONG
 Formulas:
-L3 Load [MBytes/s]  1.0E-06*L2_LINES_IN_ANY*64/time
-L3 Evict [MBytes/s]  1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64/time
-L3 bandwidth [MBytes/s] 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time
-L3 data volume [GBytes] 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_RQSTS_MISS*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_RQSTS_MISS*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DIRTY_ANY*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DIRTY_ANY*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_RQSTS_MISS+L2_LINES_OUT_DIRTY_ANY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_RQSTS_MISS+L2_LINES_OUT_DIRTY_ANY)*64
 -
 Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L2 and the number of modified cachelines
+number of cache lines allocated in the L2 and the number of modified cache lines
 evicted from the L2. The group also reports total data volume between L3 and
 the measured L2 cache. Note that this bandwidth also includes data transfers
 due to a write allocate load on a store miss in L2.
diff --git a/groups/westmere/L3CACHE.txt b/groups/westmere/L3CACHE.txt
index 944bc97..58072c1 100644
--- a/groups/westmere/L3CACHE.txt
+++ b/groups/westmere/L3CACHE.txt
@@ -1,36 +1,34 @@
 SHORT L3 cache miss rate/ratio
 
 EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
+FIXC0  INSTR_RETIRED_ANY
+FIXC1  CPU_CLK_UNHALTED_CORE
+FIXC2  CPU_CLK_UNHALTED_REF
 UPMC0  UNC_L3_HITS_ANY
 UPMC1  UNC_L3_MISS_ANY
-UPMC2  UNC_L3_LINES_IN_ANY
-UPMC3  UNC_L3_LINES_OUT_ANY
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 request rate   UPMC0/FIXC0
+L3 request rate   (UPMC0+UPMC1)/FIXC0
 L3 miss rate   UPMC1/FIXC0
 L3 miss ratio  UPMC1/(UPMC0+UPMC1)
 
 LONG
 Formulas:
-L3 request rate  UNC_L3_HITS_ANY / INSTR_RETIRED_ANY 
-L3 miss rate   UNC_L3_MISS_ANY / INSTR_RETIRED_ANY
-L3 miss ratio  UNC_L3_MISS_ANY / (UNC_L3_HITS_ANY + UNC_L3_MISS_ANY)
+L3 request rate = (UNC_L3_HITS_ANY+UNC_L3_MISS_ANY)/INSTR_RETIRED_ANY
+L3 miss rate = UNC_L3_MISS_ANY/INSTR_RETIRED_ANY
+L3 miss ratio = UNC_L3_MISS_ANY/(UNC_L3_HITS_ANY+UNC_L3_MISS_ANY)
 -
 This group measures the locality of your data accesses with regard to the L3
 Cache. L3 request rate tells you how data intensive your code is or how many
-Data accesses you have in average per instruction. The L3 miss rate gives a
-measure how often it was necessary to get cachelines from memory. And finally
-L3 miss ratio tells you how many of your memory references required a cacheline
-to be loaded from a higher level. While the Data cache miss rate might be given
-by your algorithm you should try to get Data cache miss ratio as low as
+data accesses you have on average per instruction. The L3 miss rate gives a
+measure how often it was necessary to get cache lines from memory. And finally
+L3 miss ratio tells you how many of your memory references required a cache line
+to be loaded from a higher level. While the data cache miss rate might be given
+by your algorithm you should try to get data cache miss ratio as low as
 possible by increasing your cache reuse.
 
 
diff --git a/groups/westmere/MEM.txt b/groups/westmere/MEM.txt
index f9e19ad..513ec60 100644
--- a/groups/westmere/MEM.txt
+++ b/groups/westmere/MEM.txt
@@ -1,37 +1,50 @@
-SHORT Main memory bandwidth
+SHORT Main memory bandwidth in MBytes/s
 
 EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
+FIXC0  INSTR_RETIRED_ANY
+FIXC1  CPU_CLK_UNHALTED_CORE
+FIXC2  CPU_CLK_UNHALTED_REF
 UPMC0  UNC_QMC_NORMAL_READS_ANY
 UPMC1  UNC_QMC_WRITES_FULL_ANY
-UPMC2 UNC_QHL_REQUESTS_REMOTE_READS
-UPMC3 UNC_QHL_REQUESTS_LOCAL_READS 
-UPMC4 UNC_QHL_REQUESTS_REMOTE_WRITES 
+UPMC2  UNC_QHL_REQUESTS_REMOTE_READS
+UPMC3  UNC_QHL_REQUESTS_REMOTE_WRITES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64/time
-Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64
-Remote Read BW [MBytes/s] 1.0E-06*(UPMC2)*64/time
-Remote Write BW [MBytes/s] 1.0E-06*(UPMC4)*64/time
-Remote BW [MBytes/s] 1.0E-06*(UPMC2+UPMC4)*64/time
+Memory read bandwidth [MBytes/s] 1.0E-06*UPMC0*64.0/time
+Memory read data volume [GBytes] 1.0E-09*UPMC0*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*UPMC1*64.0/time
+Memory write data volume [GBytes] 1.0E-09*UPMC1*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64.0
+Remote memory read bandwidth [MBytes/s] 1.0E-06*UPMC2*64.0/time
+Remote memory read data volume [GBytes] 1.0E-09*UPMC2*64.0
+Remote memory write bandwidth [MBytes/s] 1.0E-06*UPMC3*64.0/time
+Remote memory write data volume [GBytes] 1.0E-09*UPMC3*64.0
+Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time
+Remote memory data volume [GBytes] 1.0E-09*(UPMC2+UPMC3)*64.0
 
 LONG
 Formulas:
-Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64/time
-Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64
-Remote Read BW [MBytes/s] =  1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS)*64/time;
-Remote Write BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
-Remote BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
+Memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_NORMAL_READS_ANY*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*UNC_QMC_NORMAL_READS_ANY*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_WRITES_FULL_ANY*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*UNC_QMC_WRITES_FULL_ANY*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0
+Remote memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_READS*64.0/time
+Remote memory read data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_READS*64.0
+Remote memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0/time
+Remote memory write data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0
+Remote memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0/time
+Remote memory data volume [GBytes] = 1.0E-09*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0
 -
 Profiling group to measure memory bandwidth drawn by all cores of a socket.
-This group will be measured by one core per socket. The Remote  Read BW  tells
-you if cachelines are transfered between sockets, meaning that cores access
+This group will be measured by one core per socket. The remote read BW tells
+you if cache lines are transferred between sockets, meaning that cores access
 data owned by a remote NUMA domain. The group also reports total data volume
-transfered from main memory.
+transferred from main memory.
 
diff --git a/groups/westmere/TLB.txt b/groups/westmere/TLB.txt
deleted file mode 100644
index 0077350..0000000
--- a/groups/westmere/TLB.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-SHORT  TLB miss rate/ratio
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  DTLB_MISSES_ANY
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-L1 DTLB miss rate  PMC0/FIXC0
-
-LONG
-Formulas:
-DTLB miss rate  = DTLB_MISSES_ANY / INSTR_RETIRED_ANY
--
-The DTLB miss  rate gives a measure how often a TLB miss occured
-per instruction. 
-
diff --git a/groups/westmere/TLB_DATA.txt b/groups/westmere/TLB_DATA.txt
new file mode 100644
index 0000000..d256b8c
--- /dev/null
+++ b/groups/westmere/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT  L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  DTLB_LOAD_MISSES_ANY
+PMC1  DTLB_MISSES_ANY
+PMC2  DTLB_LOAD_MISSES_WALK_CYCLES
+PMC3  DTLB_MISSES_WALK_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB load misses     PMC0
+L1 DTLB load miss rate  PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses     (PMC1-PMC0)
+L1 DTLB store miss rate  (PMC1-PMC0)/FIXC0
+L1 DTLB store miss duration [Cyc] (PMC3-PMC2)/(PMC1-PMC0)
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_ANY
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_ANY / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_CYCLES / DTLB_LOAD_MISSES_ANY
+L1 DTLB store misses = DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY
+L1 DTLB store miss rate = (DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY) / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = (DTLB_MISSES_WALK_CYCLES-DTLB_LOAD_MISSES_WALK_CYCLES) / (DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY)
+-
+The DTLB miss rate gives a measure how often a TLB miss occurred
+per instruction. The store miss calculations are done using ALL-LOADS TLB walks.
+
diff --git a/groups/westmere/TLB_INSTR.txt b/groups/westmere/TLB_INSTR.txt
new file mode 100644
index 0000000..2f0f90c
--- /dev/null
+++ b/groups/westmere/TLB_INSTR.txt
@@ -0,0 +1,27 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ITLB_MISSES_ANY
+PMC1  ITLB_MISSES_WALK_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_ANY
+L1 ITLB miss rate = ITLB_MISSES_ANY / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_CYCLES / ITLB_MISSES_ANY
+-
+The ITLB miss rate gives a measure how often a TLB miss occurred
+per instruction. The duration measures the time in cycles how long a walk did take.
+
diff --git a/groups/westmere/UOPS.txt b/groups/westmere/UOPS.txt
new file mode 100644
index 0000000..9d738d0
--- /dev/null
+++ b/groups/westmere/UOPS.txt
@@ -0,0 +1,35 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_ISSUED_ANY
+PMC1  UOPS_EXECUTED_THREAD
+PMC2  UOPS_RETIRED_ALL
+PMC3  UOPS_ISSUED_FUSED
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Issued UOPs PMC0
+Merged UOPs PMC3
+Executed UOPs PMC1
+Retired UOPs PMC2
+
+LONG
+Formula:
+Issued UOPs = UOPS_ISSUED_ANY
+Merged UOPs = UOPS_ISSUED_FUSED
+Executed UOPs = UOPS_EXECUTED_THREAD
+Retired UOPs = UOPS_RETIRED_ALL
+-
+This group returns information about the instruction pipeline. It measures the
+issued, executed and retired uOPs and returns the number of uOPs which were issued
+but not executed as well as the number of uOPs which were executed but never retired.
+The executed but not retired uOPs commonly come from speculatively executed branches.
+
diff --git a/groups/westmere/VIEW.txt b/groups/westmere/VIEW.txt
index a0708f4..76809ed 100644
--- a/groups/westmere/VIEW.txt
+++ b/groups/westmere/VIEW.txt
@@ -11,16 +11,16 @@ PMC3  FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
 UPMC0  UNC_QMC_NORMAL_READS_ANY
 UPMC1  UNC_QMC_WRITES_FULL_ANY
 UPMC2 UNC_QHL_REQUESTS_REMOTE_READS
-UPMC3 UNC_QHL_REQUESTS_LOCAL_READS 
-UPMC4 UNC_QHL_REQUESTS_REMOTE_WRITES 
+UPMC3 UNC_QHL_REQUESTS_LOCAL_READS
+UPMC4 UNC_QHL_REQUESTS_REMOTE_WRITES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFlops/s (DP assumed) 1.0E-06*(PMC0*2.0+PMC1)/time
-SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
+DP MFLOP/s (DP assumed) 1.0E-06*(PMC0*2.0+PMC1)/time
+SP MFLOP/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
 Packed MUOPS/s   1.0E-06*PMC0/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 SP MUOPS/s 1.0E-06*PMC2/time
@@ -33,8 +33,8 @@ Remote BW [MBytes/s] 1.0E-06*(UPMC2+UPMC4)*64/time
 
 LONG
 Formula:
-DP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-SP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+DP MFLOP/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+SP MFLOP/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
 Packed MUOPS/s   1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/time
 Scalar MUOPS/s 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/time
 SP MUOPS/s 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/time
@@ -45,6 +45,6 @@ Remote Read BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS)*64/time
 Remote Write BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
 Remote BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
 -
-This is a overview group using the capabilities of westmere to measure multiple events at
+This is an overview group using the capabilities of Westmere to measure multiple events at
 the same time.
 
diff --git a/groups/westmereEX/BRANCH.txt b/groups/westmereEX/BRANCH.txt
index 3d81416..b8d41b2 100644
--- a/groups/westmereEX/BRANCH.txt
+++ b/groups/westmereEX/BRANCH.txt
@@ -19,13 +19,13 @@ Instructions per branch  FIXC0/PMC0
 
 LONG
 Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
 -
-The rates state how often in average a branch or a mispredicted branch occured
-per instruction retired in total. The Branch misprediction ratio sets directly
-into relation what ration of all branch instruction where mispredicted.
-Instructions per branch is 1/Branch rate.
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
 
diff --git a/groups/westmereEX/CACHE.txt b/groups/westmereEX/CACHE.txt
index 490f982..eb160f6 100644
--- a/groups/westmereEX/CACHE.txt
+++ b/groups/westmereEX/CACHE.txt
@@ -11,14 +11,15 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Data cache misses PMC0
-Data cache miss rate PMC0/FIXC0
+data cache misses PMC0
+data cache miss rate PMC0/FIXC0
 
 LONG
 Formulas:
-Data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
+data cache misses = L1D_REPL
+data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
 -
 This group measures the locality of your data accesses with regard to the L1
-Cache. The Data cache miss rate gives a measure how often it was necessary to
-get cachelines from higher levels of the memory hierarchy.
+cache. The data cache miss rate gives a measure how often it was necessary to
+get cache lines from higher levels of the memory hierarchy.
 
diff --git a/groups/westmereEX/DATA.txt b/groups/westmereEX/DATA.txt
index a5611bc..31bba51 100644
--- a/groups/westmereEX/DATA.txt
+++ b/groups/westmereEX/DATA.txt
@@ -12,11 +12,11 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to Store ratio = MEM_INST_RETIRED_LOADS / MEM_INST_RETIRED_STORES
+Load to store ratio = MEM_INST_RETIRED_LOADS/MEM_INST_RETIRED_STORES
 -
-This is a simple metric to determine your Load to store ratio.
+This is a simple metric to determine your load to store ratio.
 
diff --git a/groups/westmereEX/FLOPS_DP.txt b/groups/westmereEX/FLOPS_DP.txt
index a62cbe3..3e75cad 100644
--- a/groups/westmereEX/FLOPS_DP.txt
+++ b/groups/westmereEX/FLOPS_DP.txt
@@ -1,4 +1,4 @@
-SHORT Double Precision MFlops/s
+SHORT Double Precision MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-MFlops/s  1.0E-06*(PMC0*2.0+PMC1)/time
+MFLOP/s  1.0E-06*(PMC0*2.0+PMC1)/time
 Packed MUOPS/s   1.0E-06*PMC0/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,10 +22,14 @@ DP MUOPS/s 1.0E-06*PMC3/time
 
 LONG
 Formula:
-DP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
-The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
-Therefore both Single as well as Double precision are measured to ensure the correctness
+The Westmere EX has no possibility to measure MFLOPs if mixed precision calculations are done.
+Therefore both single as well as double precision are measured to ensure the correctness
 of the measurements. You can check if your code was vectorized on the number of
-FP_COMP_OPS_EXE_SSE_FP_PACKED versus the  FP_COMP_OPS_EXE_SSE_FP_SCALAR.
+FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
 
diff --git a/groups/westmereEX/FLOPS_SP.txt b/groups/westmereEX/FLOPS_SP.txt
index 1485615..601027b 100644
--- a/groups/westmereEX/FLOPS_SP.txt
+++ b/groups/westmereEX/FLOPS_SP.txt
@@ -1,4 +1,4 @@
-SHORT Single Precision MFlops/s
+SHORT Single Precision MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-MFlops/s 1.0E-06*(PMC0*4.0+PMC1)/time
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1)/time
 Packed MUOPS/s   1.0E-06*PMC0/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,10 +22,14 @@ DP MUOPS/s 1.0E-06*PMC3/time
 
 LONG
 Formula:
-SP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
-The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
-Therefore both Single as well as Double precision are measured to ensure the correctness
+The Westmere EX has no possibility to measure MFLOPs if mixed precision calculations are done.
+Therefore both single as well as double precision are measured to ensure the correctness
 of the measurements. You can check if your code was vectorized on the number of
-FP_COMP_OPS_EXE_SSE_FP_PACKED versus the  FP_COMP_OPS_EXE_SSE_FP_SCALAR.
+FP_COMP_OPS_EXE_SSE_FP_PACKED versus the FP_COMP_OPS_EXE_SSE_FP_SCALAR.
 
diff --git a/groups/westmereEX/FLOPS_X87.txt b/groups/westmereEX/FLOPS_X87.txt
index 6447b93..a4176f0 100644
--- a/groups/westmereEX/FLOPS_X87.txt
+++ b/groups/westmereEX/FLOPS_X87.txt
@@ -1,4 +1,4 @@
-SHORT X87 MFlops/s
+SHORT X87 MFLOP/s
 
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
@@ -11,8 +11,8 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-X87 MFlops/s  1.0E-06*PMC0/time
+X87 MFLOP/s  1.0E-06*PMC0/time
 
 LONG
-Profiling group to measure X87 flop rate.
+Profiling group to measure X87 FLOP rate.
 
diff --git a/groups/westmereEX/ICACHE.txt b/groups/westmereEX/ICACHE.txt
new file mode 100644
index 0000000..49943ff
--- /dev/null
+++ b/groups/westmereEX/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1I_READS
+PMC1  L1I_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = L1I_READS / INSTR_RETIRED_ANY
+L1I miss rate = L1I_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = L1I_MISSES / L1I_READS
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/westmereEX/L2.txt b/groups/westmereEX/L2.txt
index 9201cd0..e950021 100644
--- a/groups/westmereEX/L2.txt
+++ b/groups/westmereEX/L2.txt
@@ -6,27 +6,33 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L1D_REPL
 PMC1  L1D_M_EVICT
+PMC2  L1I_MISSES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
 
 LONG
 Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPL*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L1 and the number of modified cachelines
-evicted from the L1. Also reports on total data volume transfered between L2
+number of cache lines allocated in the L1 and the number of modified cache lines
+evicted from the L1. Also reports on total data volume transferred between L2
 and L1 cache. Note that this bandwidth also includes data transfers due to a
-write allocate load on a store miss in L1.
+write allocate load on a store miss in L1 and traffic caused by misses in the
+instruction cache.
 
diff --git a/groups/westmereEX/L2CACHE.txt b/groups/westmereEX/L2CACHE.txt
index 49778be..343b263 100644
--- a/groups/westmereEX/L2CACHE.txt
+++ b/groups/westmereEX/L2CACHE.txt
@@ -4,7 +4,7 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  L2_DATA_RQSTS_DEMAND_ANY
+PMC0  L2_RQSTS_REFERENCES
 PMC1  L2_RQSTS_MISS
 
 METRICS
@@ -18,18 +18,17 @@ L2 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = L2_DATA_RQSTS_DEMAND_ANY / INSTR_RETIRED_ANY
-L2 miss rate  = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_DATA_RQSTS_DEMAND_ANY
+L2 request rate = L2_RQSTS_REFERENCES/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_RQSTS_REFERENCES
 -
 This group measures the locality of your data accesses with regard to the
-L2 Cache. L2 request rate tells you how data intensive your code is
-or how many Data accesses you have in average per instruction.
+L2 cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
 The L2 miss rate gives a measure how often it was necessary to get
-cachelines from memory. And finally L2 miss ratio tells you how many of your
-memory references required a cacheline to be loaded from a higher level.
-While the Data cache miss rate might be given by your algorithm you should
-try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
+cache lines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
 
 
diff --git a/groups/westmereEX/L3.txt b/groups/westmereEX/L3.txt
index f80761a..7e5cb04 100644
--- a/groups/westmereEX/L3.txt
+++ b/groups/westmereEX/L3.txt
@@ -12,21 +12,25 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 Load [MBytes/s]  1.0E-06*PMC0*64.0/time
-L3 Evict [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
 L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
 L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
 
 LONG
 Formulas:
-L3 Load [MBytes/s]  1.0E-06*L2_LINES_IN_ANY*64/time
-L3 Evict [MBytes/s]  1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64/time
-L3 bandwidth [MBytes/s] 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time
-L3 data volume [GBytes] 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ANY*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ANY*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64
 -
 Profiling group to measure L3 cache bandwidth. The bandwidth is
-computed by the number of cacheline allocated in the L2 and the number of
-modified cachelines evicted from the L2. Also reports data volume transfered
+computed by the number of cache lines allocated in the L2 and the number of
+modified cache lines evicted from the L2. Also reports data volume transferred
 between L3 and L2 caches. Note that this bandwidth also includes data transfers
 due to a write allocate load on a store miss in L2.
 
diff --git a/groups/westmereEX/L3CACHE.txt b/groups/westmereEX/L3CACHE.txt
new file mode 100644
index 0000000..262f948
--- /dev/null
+++ b/groups/westmereEX/L3CACHE.txt
@@ -0,0 +1,52 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0  INSTR_RETIRED_ANY
+FIXC1  CPU_CLK_UNHALTED_CORE
+FIXC2  CPU_CLK_UNHALTED_REF
+CBOX0C0 LLC_HITS_ALL
+CBOX0C1 LLC_MISSES_ALL
+CBOX1C0 LLC_HITS_ALL
+CBOX1C1 LLC_MISSES_ALL
+CBOX2C0 LLC_HITS_ALL
+CBOX2C1 LLC_MISSES_ALL
+CBOX3C0 LLC_HITS_ALL
+CBOX3C1 LLC_MISSES_ALL
+CBOX4C0 LLC_HITS_ALL
+CBOX4C1 LLC_MISSES_ALL
+CBOX5C0 LLC_HITS_ALL
+CBOX5C1 LLC_MISSES_ALL
+CBOX6C0 LLC_HITS_ALL
+CBOX6C1 LLC_MISSES_ALL
+CBOX7C0 LLC_HITS_ALL
+CBOX7C1 LLC_MISSES_ALL
+CBOX8C0 LLC_HITS_ALL
+CBOX8C1 LLC_MISSES_ALL
+CBOX9C0 LLC_HITS_ALL
+CBOX9C1 LLC_MISSES_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 request rate   (CBOX0C0+CBOX0C1+CBOX1C0+CBOX1C1+CBOX2C0+CBOX2C1+CBOX3C0+CBOX3C1+CBOX4C0+CBOX4C1+CBOX5C0+CBOX5C1+CBOX6C0+CBOX6C1+CBOX7C0+CBOX7C1+CBOX8C0+CBOX8C1+CBOX9C0+CBOX9C1)/FIXC0
+L3 miss rate   (CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1)/FIXC0
+L3 miss ratio  (CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1)/(CBOX0C0+CBOX0C1+CBOX1C0+CBOX1C1+CBOX2C0+CBOX2C1+CBOX3C0+CBOX3C1+CBOX4C0+CBOX4C1+CBOX5C0+CBOX5C1+CBOX6C0+CBOX6C1+CBOX7C0+CBOX7C1+CBOX8C0+CBOX8C1+CBOX9C0+CBOX9C1)
+
+LONG
+Formulas:
+L3 request rate = (SUM(LLC_HITS_ALL)+SUM(LLC_MISSES_ALL))/INSTR_RETIRED_ANY
+L3 miss rate = SUM(LLC_MISSES_ALL)/INSTR_RETIRED_ANY
+L3 miss ratio = SUM(LLC_MISSES_ALL)/(SUM(LLC_HITS_ALL)+SUM(LLC_MISSES_ALL))
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L3 miss rate gives a measure how often it was necessary to get
+cache lines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/westmereEX/MEM.txt b/groups/westmereEX/MEM.txt
index defa391..5d4fc62 100644
--- a/groups/westmereEX/MEM.txt
+++ b/groups/westmereEX/MEM.txt
@@ -1,19 +1,15 @@
-SHORT Main memory bandwidth
+SHORT Main memory bandwidth in MBytes/s
 
 EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-MBOX0C0 FVC_EV0_BBOX_CMDS_READS 
-MBOX0C1 FVC_EV0_BBOX_RSP_ACK 
-MBOX1C0 FVC_EV0_BBOX_CMDS_READS 
-MBOX1C1 FVC_EV0_BBOX_RSP_ACK 
-BBOX0C1 IMT_INSERTS_WR 
-BBOX1C1 IMT_INSERTS_WR 
-RBOX0C0 NEW_PACKETS_RECV_PORT0_IPERF0_ANY_DRS
-RBOX0C1 NEW_PACKETS_RECV_PORT1_IPERF0_ANY_DRS
-RBOX1C0 NEW_PACKETS_RECV_PORT4_IPERF0_ANY_DRS
-RBOX1C1 NEW_PACKETS_RECV_PORT5_IPERF0_ANY_DRS
+FIXC0   INSTR_RETIRED_ANY
+FIXC1   CPU_CLK_UNHALTED_CORE
+FIXC2   CPU_CLK_UNHALTED_REF
+MBOX0C0 FVC_EV0_BBOX_CMDS_READS
+MBOX0C1 DRAM_CMD_CAS_WR_OPN
+MBOX0C2 DRAM_MISC_CAS_WR_CLS
+MBOX1C0 FVC_EV0_BBOX_CMDS_READS
+MBOX1C1 DRAM_CMD_CAS_WR_OPN
+MBOX1C2 DRAM_MISC_CAS_WR_CLS
 
 
 METRICS
@@ -21,17 +17,22 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64/time
-Memory Write BW [MBytes/s] 1.0E-06*(BBOX0C1+BBOX1C1)*64/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+BBOX0C1+BBOX1C1)*64/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+BBOX0C1+BBOX1C1)*64
-Remote write data traffic Port 0 [MBytes/s] 1.0E-06*(RBOX0C0)*64/time
-Remote write data traffic Port 1 [MBytes/s] 1.0E-06*(RBOX0C1)*64/time
-Remote write data traffic Port 4 [MBytes/s] 1.0E-06*(RBOX1C0)*64/time
-Remote write data traffic Port 5 [MBytes/s] 1.0E-06*(RBOX1C1)*64/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX0C2+MBOX1C1+MBOX1C2)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C2+MBOX1C1+MBOX1C2)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1+MBOX0C2+MBOX1C2)*64/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1+MBOX0C2+MBOX1C2)*64
 
 LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1)+SUM(MBOXxC2))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1)+SUM(MBOXxC2))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1)+SUM(MBOXxC2))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1)+SUM(MBOXxC2))*64.0
+-
 Profiling group to measure memory bandwidth drawn by all cores of a socket.
-Addional to the bandwidth it also outputs the data volume and the remote
-traffic over QPI links to other sockets.
+In addition to the bandwidth it also outputs the data volume.
 
diff --git a/groups/westmereEX/NUMA.txt b/groups/westmereEX/NUMA.txt
new file mode 100644
index 0000000..0c1b8fb
--- /dev/null
+++ b/groups/westmereEX/NUMA.txt
@@ -0,0 +1,33 @@
+SHORT Local and remote memory accesses
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 OFFCORE_RESPONSE_0_LOCAL_DRAM
+PMC1 OFFCORE_RESPONSE_1_REMOTE_DRAM
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Local DRAM data volume [GByte]  1.E-09*PMC0*64
+Local DRAM bandwidth [MByte/s]  1.E-06*(PMC0*64)/time
+Remote DRAM data volume [GByte]  1.E-09*PMC1*64
+Remote DRAM bandwidth [MByte/s]  1.E-06*(PMC1*64)/time
+Memory data volume [GByte]  1.E-09*(PMC0+PMC1)*64
+Memory bandwidth [MByte/s]  1.E-06*((PMC0+PMC1)*64)/time
+
+LONG
+Formula:
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64
+Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time
+Remote DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_1_REMOTE_DRAM*64
+Remote DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_1_REMOTE_DRAM*64)/time
+Memory data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64
+Memory bandwidth [MByte/s] = 1.E-06*((OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64)/time
+--
+This performance group measures the data traffic of CPU cores to local and remote
+memory.
diff --git a/groups/westmereEX/TLB.txt b/groups/westmereEX/TLB.txt
deleted file mode 100644
index 0077350..0000000
--- a/groups/westmereEX/TLB.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-SHORT  TLB miss rate/ratio
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  DTLB_MISSES_ANY
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-L1 DTLB miss rate  PMC0/FIXC0
-
-LONG
-Formulas:
-DTLB miss rate  = DTLB_MISSES_ANY / INSTR_RETIRED_ANY
--
-The DTLB miss  rate gives a measure how often a TLB miss occured
-per instruction. 
-
diff --git a/groups/westmereEX/TLB_DATA.txt b/groups/westmereEX/TLB_DATA.txt
new file mode 100644
index 0000000..d256b8c
--- /dev/null
+++ b/groups/westmereEX/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT  L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  DTLB_LOAD_MISSES_ANY
+PMC1  DTLB_MISSES_ANY
+PMC2  DTLB_LOAD_MISSES_WALK_CYCLES
+PMC3  DTLB_MISSES_WALK_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB load misses     PMC0
+L1 DTLB load miss rate  PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses     (PMC1-PMC0)
+L1 DTLB store miss rate  (PMC1-PMC0)/FIXC0
+L1 DTLB store miss duration [Cyc] (PMC3-PMC2)/(PMC1-PMC0)
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_ANY
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_ANY / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_CYCLES / DTLB_LOAD_MISSES_ANY
+L1 DTLB store misses = DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY
+L1 DTLB store miss rate = (DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY) / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = (DTLB_MISSES_WALK_CYCLES-DTLB_LOAD_MISSES_WALK_CYCLES) / (DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY)
+-
+The DTLB miss rate gives a measure how often a TLB miss occurred
+per instruction. The store miss calculations are done using ALL-LOADS TLB walks.
+
diff --git a/groups/westmereEX/TLB_INSTR.txt b/groups/westmereEX/TLB_INSTR.txt
new file mode 100644
index 0000000..2f0f90c
--- /dev/null
+++ b/groups/westmereEX/TLB_INSTR.txt
@@ -0,0 +1,27 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ITLB_MISSES_ANY
+PMC1  ITLB_MISSES_WALK_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_ANY
+L1 ITLB miss rate = ITLB_MISSES_ANY / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_CYCLES / ITLB_MISSES_ANY
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a walk took, in cycles.
+
diff --git a/groups/westmereEX/UOPS.txt b/groups/westmereEX/UOPS.txt
new file mode 100644
index 0000000..9d738d0
--- /dev/null
+++ b/groups/westmereEX/UOPS.txt
@@ -0,0 +1,35 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  UOPS_ISSUED_ANY
+PMC1  UOPS_EXECUTED_THREAD
+PMC2  UOPS_RETIRED_ALL
+PMC3  UOPS_ISSUED_FUSED
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Issued UOPs PMC0
+Merged UOPs PMC3
+Executed UOPs PMC1
+Retired UOPs PMC2
+
+LONG
+Formula:
+Issued UOPs = UOPS_ISSUED_ANY
+Merged UOPs = UOPS_ISSUED_FUSED
+Executed UOPs = UOPS_EXECUTED_THREAD
+Retired UOPs = UOPS_RETIRED_ALL
+-
+This group returns information about the instruction pipeline. It measures the
+issued, executed and retired uOPs and returns the number of uOPs which were issued
+but not executed as well as the number of uOPs which were executed but never retired.
+The executed but not retired uOPs commonly come from speculatively executed branches.
+
diff --git a/kernel/Makefile b/kernel/Makefile
index fd0ffdf..b9b814a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,4 +9,5 @@ modules_install:
 	install -m 666 enable_rdpmc.ko /lib/modules/$(shell uname -r)/extra/
 
 clean:
-	rm -f *.ko *.o modules.order Module.symvers enable_rdpmc.mod.c
+	rm -f *.ko *.o modules.order Module.symvers enable_rdpmc.mod.c .enable_rdpmc*.cmd
+	rm -rf .tmp_versions
diff --git a/kernel/README b/kernel/README
new file mode 100644
index 0000000..771b217
--- /dev/null
+++ b/kernel/README
@@ -0,0 +1,3 @@
+
+The kernel module enable_rdpmc is deprecated. Please use the sysfs entry
+/sys/devices/cpu/rdpmc to enable or disable the RDPMC instruction.
diff --git a/make/config_checks.mk b/make/config_checks.mk
new file mode 100644
index 0000000..ab266cf
--- /dev/null
+++ b/make/config_checks.mk
@@ -0,0 +1,49 @@
+
+ifneq ($(MAKECMDGOALS),docs)
+# determine kernel Version
+KERNEL_VERSION_MAJOR := $(shell uname -r | awk '{split($$1,a,"."); print a[1]}' | cut -d '-' -f1)
+KERNEL_VERSION := $(shell uname -r | awk  '{split($$1,a,"."); print a[2]}' | cut -d '-' -f1)
+KERNEL_VERSION_MINOR := $(shell uname -r | awk '{split($$1,a,"."); print a[3]}' | cut -d '-' -f1)
+
+HAS_MEMPOLICY = $(shell if [ $(KERNEL_VERSION) -lt 7 -a $(KERNEL_VERSION_MAJOR) -lt 3 -a $(KERNEL_VERSION_MINOR) -lt 8 ]; then \
+               echo 0;  else echo 1; \
+			   fi; )
+HAS_PERFEVENT = $(shell if [ $(KERNEL_VERSION) -lt 6 -a $(KERNEL_VERSION_MAJOR) -lt 2 -a $(KERNEL_VERSION_MINOR) -lt 31 ]; then echo 0; else echo 1; fi; )
+
+# determine glibc Version
+GLIBC_VERSION := $(shell ldd --version | grep ldd |  awk '{ print $$NF }' | awk -F. '{ print $$2 }')
+
+HAS_SCHEDAFFINITY = $(shell if [ $(GLIBC_VERSION) -lt 4 ]; then \
+               echo 0;  else echo 1; \
+			   fi; )
+
+INST_PREFIX := $(INSTALLED_PREFIX)
+ifneq "$(PREFIX)" "$(INST_PREFIX)"
+$(info Info: PREFIX and INSTALLED_PREFIX differ, be aware that you have to move stuff after make install from $(PREFIX) to $(INSTALLED_PREFIX). You can use make move for this.)
+endif
+
+FORTRAN_IF_NAME := likwid.mod
+ifneq ($(FORTRAN_INTERFACE),false)
+HAS_FORTRAN_COMPILER = $(shell $(FC) --version 2>/dev/null || echo 'NOFORTRAN' )
+ifeq ($(HAS_FORTRAN_COMPILER),NOFORTRAN)
+FORTRAN_IF=
+$(info Warning: You have selected the fortran interface in config.mk, but there seems to be no fortran compiler $(FC) - not compiling it!)
+FORTRAN_INSTALL =
+FORTRAN_REMOVE =
+FORTRAN_REMOVE_MOVED =
+else
+FORTRAN_IF := $(FORTRAN_IF_NAME)
+FORTRAN_INSTALL = @echo "===> INSTALL fortran interface to $(PREFIX)/include/"; \
+                  cp -f likwid.mod  $(PREFIX)/include/$(FORTRAN_IF_NAME)
+FORTRAN_REMOVE = @echo "===> REMOVING fortran interface from $(PREFIX)/include/"; \
+                 rm -f $(PREFIX)/include/$(FORTRAN_IF_NAME)
+FORTRAN_REMOVE_MOVED = @echo "===> REMOVING fortran interface from $(INSTALLED_PREFIX)/include/"; \
+                 rm -f $(INSTALLED_PREFIX)/include/$(FORTRAN_IF_NAME)
+endif
+else
+FORTRAN_IF =
+FORTRAN_INSTALL =
+FORTRAN_REMOVE =
+FORTRAN_REMOVE_MOVED =
+endif
+endif
diff --git a/make/config_defines.mk b/make/config_defines.mk
new file mode 100644
index 0000000..f2b632c
--- /dev/null
+++ b/make/config_defines.mk
@@ -0,0 +1,117 @@
+DEFINES   += -DVERSION=$(VERSION)         \
+		 -DRELEASE=$(RELEASE)                 \
+		 -DCFGFILE=$(CFG_FILE_PATH)           \
+		 -DTOPOFILE=$(TOPO_FILE_PATH)           \
+		 -DINSTALL_PREFIX=$(INSTALLED_PREFIX) \
+		 -DMAX_NUM_THREADS=$(MAX_NUM_THREADS) \
+		 -DMAX_NUM_NODES=$(MAX_NUM_NODES)     \
+		 -DACCESSDAEMON=$(INSTALLED_ACCESSDAEMON) \
+		 -DGROUPPATH=$(LIKWIDGROUPPATH) \
+		 -D_GNU_SOURCE
+
+DYNAMIC_TARGET_LIB := liblikwid.so
+STATIC_TARGET_LIB := liblikwid.a
+
+LUA_FOLDER := ext/lua
+SHARED_LIBLUA := liblikwid-lua.so
+STATIC_LIBLUA := liblikwid-lua.a
+HWLOC_FOLDER := ext/hwloc
+STATIC_LIBHWLOC := liblikwid-hwloc.a
+SHARED_LIBHWLOC := liblikwid-hwloc.so
+
+BENCH_FOLDER := bench
+BENCH_NAME := likwid-bench
+BENCH_TARGET := $(BENCH_FOLDER)/$(BENCH_NAME)
+
+ifneq ($(COLOR),NONE)
+DEFINES += -DCOLOR=$(COLOR)
+endif
+
+ifeq ($(BUILDDAEMON),true)
+ifneq ($(COMPILER),MIC)
+    DAEMON_TARGET = likwid-accessD
+else
+    $(info Info: Compiling for Xeon Phi. Disabling build of likwid-accessD.);
+    DAEMON_TARGET =
+endif
+endif
+
+ifeq ($(BUILDFREQ),true)
+ifneq ($(COMPILER),MIC)
+    FREQ_TARGET = likwid-setFreq
+else
+    $(info Info: Compiling for Xeon Phi. Disabling build of likwid-setFreq.);
+endif
+endif
+
+ifeq ($(HAS_MEMPOLICY),1)
+DEFINES += -DHAS_MEMPOLICY
+else
+$(info Kernel 2.6.$(KERNEL_VERSION) has no mempolicy support!);
+endif
+
+
+ifeq ($(SHARED_LIBRARY),true)
+CFLAGS += $(SHARED_CFLAGS)
+LIBS += -L. -pthread -lm -ldl
+TARGET_LIB := $(DYNAMIC_TARGET_LIB)
+TARGET_HWLOC_LIB=$(HWLOC_FOLDER)/$(SHARED_LIBHWLOC)
+TARGET_LUA_LIB=$(LUA_FOLDER)/$(SHARED_LIBLUA)
+else
+TARGET_LIB := $(STATIC_TARGET_LIB)
+TARGET_HWLOC_LIB=$(HWLOC_FOLDER)/$(STATIC_LIBHWLOC)
+TARGET_LUA_LIB=$(LUA_FOLDER)/$(STATIC_LIBLUA)
+endif
+
+ifeq ($(HAS_SCHEDAFFINITY),1)
+DEFINES += -DHAS_SCHEDAFFINITY
+PINLIB  = liblikwidpin.so
+else
+$(info GLIBC version 2.$(GLIBC_VERSION) has no pthread_setaffinity_np support!);
+PINLIB  =
+endif
+
+FILTER_HWLOC_OBJ = yes
+LIBHWLOC =
+ifeq ($(USE_HWLOC),true)
+DEFINES += -DLIKWID_USE_HWLOC
+LIBHWLOC_SHARED = -Lext/hwloc/ -lliblikwid-hwloc
+LIBHWLOC_STATIC = ext/hwloc/liblikwid-hwloc.a
+EXT_TARGETS += ./ext/hwloc
+FILTER_HWLOC_OBJ =
+endif
+
+#DEFINES += -DACCESSDAEMON=$(ACCESSDAEMON)
+
+ifeq ($(ACCESSMODE),sysdaemon)
+ifneq ($(COMPILER),MIC)
+DEFINES += -DACCESSMODE=2
+else
+$(info Info: Compiling for Xeon Phi. Changing accessmode to direct.);
+ACCESSMODE = direct
+DEFINES += -DACCESSMODE=0
+endif
+else
+ifeq ($(ACCESSMODE),accessdaemon)
+ifneq ($(COMPILER),MIC)
+ifneq ($(BUILDDAEMON),true)
+$(info Info: Compiling with accessdaemon access mode but without building the access daemon.);
+$(info Info: Make sure an accessdaemon is installed and the paths ACCESSDAEMON and INSTALLED_ACCESSDAEMON point to it);
+endif
+DEFINES += -DACCESSMODE=1
+else
+$(info Info: Compiling for Xeon Phi. Changing accessmode to direct.);
+DEFINES += -DACCESSMODE=0
+ACCESSMODE = direct
+endif
+else
+DEFINES += -DACCESSMODE=0
+endif
+endif
+
+ifeq ($(DEBUG),true)
+DEBUG_FLAGS = -g
+DEFINES += -DDEBUG_LIKWID
+else
+DEBUG_FLAGS =
+endif
diff --git a/make/include_CLANG.mk b/make/include_CLANG.mk
new file mode 100644
index 0000000..4806e01
--- /dev/null
+++ b/make/include_CLANG.mk
@@ -0,0 +1,28 @@
+CC  = clang
+FC  = ifort
+AS  = as
+AR  = ar
+PAS = ./perl/AsmGen.pl
+GEN_PAS = ./perl/generatePas.pl
+GEN_GROUPS = ./perl/generateGroups.pl
+GEN_PMHEADER = ./perl/gen_events.pl
+
+ANSI_CFLAGS   =
+
+CFLAGS   =  -O2 -std=c99 -Wno-format -fPIC
+FCFLAGS  = -module ./  # ifort
+#FCFLAGS  = -J ./  -fsyntax-only  #gfortran
+PASFLAGS  = x86-64
+ASFLAGS  =
+CPPFLAGS =
+LFLAGS   =  -pthread
+
+SHARED_CFLAGS = -fPIC -fvisibility=hidden
+SHARED_LFLAGS = -shared -fvisibility=hidden
+
+DEFINES  = -DPAGE_ALIGNMENT=4096
+DEFINES  += -DLIKWID_MONITOR_LOCK
+DEFINES  += -DDEBUGLEV=0
+
+INCLUDES =
+LIBS     = -lm -lrt
diff --git a/make/include_GCC.mk b/make/include_GCC.mk
index 1ccfd88..72850a1 100644
--- a/make/include_GCC.mk
+++ b/make/include_GCC.mk
@@ -7,28 +7,27 @@ GEN_PAS = ./perl/generatePas.pl
 GEN_GROUPS = ./perl/generateGroups.pl
 GEN_PMHEADER = ./perl/gen_events.pl
 
-#ANSI_CFLAGS   = -std=c99
+ANSI_CFLAGS   =
 #ANSI_CFLAGS += -pedantic
 #ANSI_CFLAGS += -Wextra
 #ANSI_CFLAGS += -Wall
 
-CFLAGS   =  -O2  -Wno-format -Wno-nonnull -std=c99
+CFLAGS   =  -O2 -std=c99 -Wno-format -fPIC
 FCFLAGS  = -module ./  # ifort
 #FCFLAGS  = -J ./  -fsyntax-only  #gfortran
 PASFLAGS  = x86-64
-ASFLAGS  =
+ASFLAGS  = 
 CPPFLAGS =
 LFLAGS   =  -pthread
 
-SHARED_CFLAGS = -fpic
-SHARED_LFLAGS = -shared
+SHARED_CFLAGS = -fPIC -fvisibility=hidden
+SHARED_LFLAGS = -shared -fvisibility=hidden
 
-DEFINES  = -D_GNU_SOURCE
-DEFINES  += -DPAGE_ALIGNMENT=4096
+DEFINES  = -DPAGE_ALIGNMENT=4096
 DEFINES  += -DLIKWID_MONITOR_LOCK
 DEFINES  += -DDEBUGLEV=0
 
 INCLUDES =
-LIBS     = -lm
+LIBS     = -lm -lrt
 
 
diff --git a/make/include_GCCX86.mk b/make/include_GCCX86.mk
index 19add95..5ebef9a 100644
--- a/make/include_GCCX86.mk
+++ b/make/include_GCCX86.mk
@@ -1,25 +1,25 @@
 CC  = gcc
 AS  = as
 AR  = ar
-PAS = ./perl/AsmGen.pl
-GEN_PAS = ./perl/generatePas.pl
-GEN_GROUPS = ./perl/generateGroups.pl
-GEN_PMHEADER = ./perl/gen_events.pl
+PAS = ./perl/AsmGen.pl 
+GEN_PAS = ./perl/generatePas.pl 
+GEN_GROUPS = ./perl/generateGroups.pl 
+GEN_PMHEADER = ./perl/gen_events.pl 
 
-#ANSI_CFLAGS   = -std=c99
+ANSI_CFLAGS   = -std=c99
 #ANSI_CFLAGS += -pedantic
 #ANSI_CFLAGS += -Wextra
 #ANSI_CFLAGS += -Wall
 
-CFLAGS   =  -O2 -m32 -Wno-format -std=c99
+CFLAGS   =  -O2 -g -m32 -Wno-format -fPIC
 FCFLAGS  = -J ./  -fsyntax-only
 PASFLAGS  = x86
-ASFLAGS  = --32
+ASFLAGS  = --32 -g
 CPPFLAGS =
-LFLAGS   = -m32 -pthread
+LFLAGS   = -m32 -g -pthread
 
-SHARED_CFLAGS = -fpic
-SHARED_LFLAGS = -shared
+SHARED_CFLAGS = -fpic -fvisibility=hidden
+SHARED_LFLAGS = -shared -fvisibility=hidden
 
 DEFINES  = -D_GNU_SOURCE
 DEFINES  += -DPAGE_ALIGNMENT=4096
@@ -27,6 +27,6 @@ DEFINES  += -DLIKWID_MONITOR_LOCK
 DEFINES  += -DDEBUGLEV=0
 
 INCLUDES =
-LIBS     = -lm
+LIBS     = -lm -lrt
 
 
diff --git a/make/include_ICC.mk b/make/include_ICC.mk
index ce49bfe..9dfe66b 100644
--- a/make/include_ICC.mk
+++ b/make/include_ICC.mk
@@ -7,24 +7,22 @@ GEN_PAS = ./perl/generatePas.pl
 GEN_GROUPS = ./perl/generateGroups.pl 
 GEN_PMHEADER = ./perl/gen_events.pl 
 
-ANSI_CFLAGS += -std=c99
+ANSI_CFLAGS  = -std=c99 #-strict-ansi
 
-CFLAGS   =  -O1 -Wno-format
-FCFLAGS  = -module ./
+CFLAGS   =  -O1 -Wno-format -vec-report=0 -fPIC -pthread
+FCFLAGS  = -module ./ 
 ASFLAGS  = -gdwarf-2
 PASFLAGS  = x86-64
 CPPFLAGS =
 LFLAGS   = -pthread
 
-SHARED_CFLAGS = -fpic
-SHARED_LFLAGS = -shared
+SHARED_CFLAGS = -fPIC -pthread -fvisibility=hidden
+SHARED_LFLAGS = -shared -pthread -fvisibility=hidden
 
 DEFINES  = -D_GNU_SOURCE
 DEFINES  += -DPAGE_ALIGNMENT=4096
-#enable this option to build likwid-bench with marker API for likwid-perfctr
-#DEFINES  += -DPERFMON
 
 INCLUDES =
-LIBS     =
+LIBS     = -lrt
 
 
diff --git a/make/include_MIC.mk b/make/include_MIC.mk
index aa3c39a..b63efce 100644
--- a/make/include_MIC.mk
+++ b/make/include_MIC.mk
@@ -1,5 +1,5 @@
 CC  = icc
-FC  = gfortran
+FC  = ifort
 AS  = icc
 AR  = ar
 PAS = ./perl/AsmGen.pl 
@@ -7,27 +7,31 @@ GEN_PAS = ./perl/generatePas.pl
 GEN_GROUPS = ./perl/generateGroups.pl 
 GEN_PMHEADER = ./perl/gen_events.pl 
 
-#ANSI_CFLAGS   = -std=c99
-#ANSI_CFLAGS += -pedantic
+ANSI_CFLAGS   = -std=c99 -fPIC
+ANSI_CFLAGS += -pedantic
 #ANSI_CFLAGS += -Wextra
 #ANSI_CFLAGS += -Wall
 
-CFLAGS   = -mmic -O2 -Wno-format -std=c99
+CFLAGS   = -mmic -O1 -g -Wno-format -fPIC
 FCFLAGS  = -J ./  -fsyntax-only
 #FCFLAGS  = -module ./ 
-ASFLAGS  =  -mmic -c
+ASFLAGS  =  -mmic -c -x assembler
 PASFLAGS  = x86-64
 CPPFLAGS =
 LFLAGS   =  -pthread -g -mmic
 
-SHARED_CFLAGS = -fpic -mmic
-SHARED_LFLAGS = -shared -mmic
+SHARED_CFLAGS = -fpic -mmic -fvisibility=hidden
+SHARED_LFLAGS = -shared -mmic -fvisibility=hidden
 
 DEFINES  = -D_GNU_SOURCE
 DEFINES  += -DPAGE_ALIGNMENT=4096
 DEFINES  += -DDEBUGLEV=0
 
 INCLUDES =
-LIBS     = -lm
-
+LIBS     = -lm -lrt
 
+# colon-separated list of paths to search for libs at runtime on Phi file system
+MIC_LIB_RPATHS = /opt/intel/compilers_and_libraries_2016.1.150/linux/compiler/lib/mic
+ifneq (strip $(MIC_LIB_RPATHS),)
+RPATHS += -Wl,-rpath=$(MIC_LIB_RPATHS)
+endif 
diff --git a/monitoring/README.agent b/monitoring/README.agent
new file mode 100644
index 0000000..756d015
--- /dev/null
+++ b/monitoring/README.agent
@@ -0,0 +1,66 @@
+The likwid-agent application is a daemon that reads hardware performance
+counters in a periodic fashion. Which counters can be measured is determined by
+the system's CPU architecture. Each architecture has its own set of events and
+corresponding counter registers. For the measurement the likwid library is used
+and interfaced through the Lua interface. The measured values can be exported in
+multiple ways like RRD, syslog or gmetric from the Ganglia Monitoring System.
+
+
+The configuration file needs to be given at startup and has the following
+format:
+GROUPPATH <PATH_TO_GROUPS> # default is set during installation
+EVENTSET <SPACE_SEPARATED_LIST_OF_GROUPS>
+DURATION <TIME_IN_SECONDS_TO_MEASURE_EACH_GROUP>
+ACCESSMODE <0/1> # 0 is direct access, 1 forward access to the accessDaemon
+LOGPATH <PATH_TO_STORE_LOGFILES> # each monitoring group creates a logfile there named likwid.<GROUP>.log
+LOGSTYLE <log/update> # log appends new lines, update clears file previously
+GMETRIC <True/False> # send measured values to Ganglia
+GMETRICPATH <PATH_TO_THE_GMETRIC_EXECUTABLE>
+GMETRICCONFIG <EXTRA_CONFIG_OPTIONS_TO_GMETRIC>
+RRD <True/False> # write measured values to RRD files, one RRD per group
+RRDPATH <PATH_TO_STORE_RRD_FILES>
+SYSLOG <True/False> # write measured values to syslog
+SYSLOGPRIO <prio> # Use priority level <prio> for syslog, default is local0.notice
+
+
+
+The group files cannot lie directly in GROUPPATH; you need to create a folder
+named after the short name of the architecture, like sandybridge or ivybridge. This
+allows the same group path to be shared across a set of systems with different
+CPU architectures. The format of a group file is the following:
+SHORT <SHORT_NAME_OF_THE GROUP>
+
+EVENTSET // Starts event/counter definitions
+FIXC0 INSTR_RETIRED_ANY // Measure event INSTR_RETIRED_ANY in counter FIXC0
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS // Starts section of derived metrics and output items
+ONCE Runtime (RDTSC) [s] time # Output runtime only once
+MIN CPI FIXC1/FIXC0 # Output the minimum of the formula FIXC1/FIXC0 named CPI
+AVG CPI FIXC1/FIXC0 # Output the average of the same formula
+MAX L2 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time # Calculate bandwidth and output only the maximum
+MIN L2 load data volume [GBytes]  1.0E-09*PMC0*64.0
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time # Sum up all the values of all CPUs
+SUM L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
+
+LONG
+<LONG DESCRIPTION OF THE GROUP>
+
+Possible functions are:
+ONCE: Output only once (CPU core 0), no aggregation is done
+MIN: Output the minimum of all cores
+MAX: Output the maximum of all cores
+AVG: Output the average of all cores
+SUM: Output the sum of all cores' values
+If no function is set, the values of all HW threads are written to output and
+T<ID> is written in front of the name.
+
+The output metric names can be equal, the function is glued to the output name for later separation.
+
diff --git a/monitoring/groups/atom/BW_MEM.txt b/monitoring/groups/atom/BW_MEM.txt
new file mode 100644
index 0000000..8eb701f
--- /dev/null
+++ b/monitoring/groups/atom/BW_MEM.txt
@@ -0,0 +1,10 @@
+SHORT Memory bandwidth
+
+EVENTSET
+PMC0  BUS_TRANS_MEM_THIS_CORE_THIS_A
+
+METRICS
+SUM Memory bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+
+
+LONG
diff --git a/monitoring/groups/atom/FLOPS_DP.txt b/monitoring/groups/atom/FLOPS_DP.txt
new file mode 100644
index 0000000..14961f0
--- /dev/null
+++ b/monitoring/groups/atom/FLOPS_DP.txt
@@ -0,0 +1,13 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0  SIMD_COMP_INST_RETIRED_PACKED_DOUBLE
+PMC1  SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE
+
+METRICS
+DP MFlops/s    1.0E-06*(PMC0*2.0+PMC1)/time
+
+
+LONG
+Double Precision MFlops/s Double Precision MFlops/s
+
diff --git a/monitoring/groups/atom/FLOPS_SP.txt b/monitoring/groups/atom/FLOPS_SP.txt
new file mode 100644
index 0000000..d67704f
--- /dev/null
+++ b/monitoring/groups/atom/FLOPS_SP.txt
@@ -0,0 +1,12 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0  SIMD_COMP_INST_RETIRED_PACKED_SINGLE
+PMC1  SIMD_COMP_INST_RETIRED_SCALAR_SINGLE
+
+METRICS
+SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
+
+LONG
+Single Precision MFlops/s
+
diff --git a/monitoring/groups/broadwell/BW.txt b/monitoring/groups/broadwell/BW.txt
new file mode 100644
index 0000000..3a2eb90
--- /dev/null
+++ b/monitoring/groups/broadwell/BW.txt
@@ -0,0 +1,13 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/broadwell/ENERGY.txt b/monitoring/groups/broadwell/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/broadwell/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC0/FIXC1
+
+LONG
diff --git a/monitoring/groups/broadwell/FLOPS_DP.txt b/monitoring/groups/broadwell/FLOPS_DP.txt
new file mode 100644
index 0000000..53b2463
--- /dev/null
+++ b/monitoring/groups/broadwell/FLOPS_DP.txt
@@ -0,0 +1,22 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
+PMC1  FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
+PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+-
+AVX/SSE scalar and packed double precision flop rates.
+
diff --git a/monitoring/groups/broadwell/FLOPS_SP.txt b/monitoring/groups/broadwell/FLOPS_SP.txt
new file mode 100644
index 0000000..b04f87a
--- /dev/null
+++ b/monitoring/groups/broadwell/FLOPS_SP.txt
@@ -0,0 +1,22 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
+PMC1  FP_ARITH_INST_RETIRED_SCALAR_SINGLE
+PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+
+METRICS
+MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+-
+AVX/SSE scalar and packed single precision flop rates.
+
diff --git a/monitoring/groups/broadwellEP/BW.txt b/monitoring/groups/broadwellEP/BW.txt
new file mode 100644
index 0000000..3a2eb90
--- /dev/null
+++ b/monitoring/groups/broadwellEP/BW.txt
@@ -0,0 +1,13 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/broadwellEP/ENERGY.txt b/monitoring/groups/broadwellEP/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/broadwellEP/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC0/FIXC1
+
+LONG
diff --git a/monitoring/groups/core2/BW_L2.txt b/monitoring/groups/core2/BW_L2.txt
new file mode 100644
index 0000000..6d73bf8
--- /dev/null
+++ b/monitoring/groups/core2/BW_L2.txt
@@ -0,0 +1,11 @@
+SHORT Cache bandwidth
+
+EVENTSET
+PMC0  L1D_REPL
+PMC1  L1D_M_EVICT
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+
+
+LONG
diff --git a/monitoring/groups/core2/BW_MEM.txt b/monitoring/groups/core2/BW_MEM.txt
new file mode 100644
index 0000000..8eb701f
--- /dev/null
+++ b/monitoring/groups/core2/BW_MEM.txt
@@ -0,0 +1,10 @@
+SHORT Memory bandwidth
+
+EVENTSET
+PMC0  BUS_TRANS_MEM_THIS_CORE_THIS_A
+
+METRICS
+SUM Memory bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+
+
+LONG
diff --git a/monitoring/groups/haswell/BW.txt b/monitoring/groups/haswell/BW.txt
new file mode 100644
index 0000000..3a2eb90
--- /dev/null
+++ b/monitoring/groups/haswell/BW.txt
@@ -0,0 +1,13 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/haswell/ENERGY.txt b/monitoring/groups/haswell/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/haswell/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC0/FIXC1
+
+LONG
diff --git a/monitoring/groups/haswellEP/BW.txt b/monitoring/groups/haswellEP/BW.txt
new file mode 100644
index 0000000..e6f4b73
--- /dev/null
+++ b/monitoring/groups/haswellEP/BW.txt
@@ -0,0 +1,32 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+SUM Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+
+LONG
diff --git a/monitoring/groups/haswellEP/ENERGY.txt b/monitoring/groups/haswellEP/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/haswellEP/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC0/FIXC1
+
+LONG
diff --git a/monitoring/groups/interlagos/BW.txt b/monitoring/groups/interlagos/BW.txt
new file mode 100644
index 0000000..3f465f6
--- /dev/null
+++ b/monitoring/groups/interlagos/BW.txt
@@ -0,0 +1,16 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0  DATA_CACHE_REFILLS_ALL
+PMC1  DATA_CACHE_REFILLS_SYSTEM
+PMC2  L2_FILL_WB_FILL
+PMC3  L2_FILL_WB_WB
+UPMC0  UNC_DRAM_ACCESSES_DCT0_ALL
+UPMC1  UNC_DRAM_ACCESSES_DCT1_ALL
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0-PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
+
+LONG
diff --git a/monitoring/groups/interlagos/CPI.txt b/monitoring/groups/interlagos/CPI.txt
new file mode 100644
index 0000000..d599a34
--- /dev/null
+++ b/monitoring/groups/interlagos/CPI.txt
@@ -0,0 +1,19 @@
+SHORT  Cycles per instruction
+
+EVENTSET
+PMC0  RETIRED_INSTRUCTIONS
+PMC1  CPU_CLOCKS_UNHALTED
+PMC2  RETIRED_UOPS
+
+METRICS
+CPI   PMC1/PMC0
+Cycles per UOPS  PMC1/PMC2
+IPC   PMC0/PMC1
+
+LONG
+This group measures how efficient the processor works with
+regard to instruction throughput. Also important as a standalone
+metric is RETIRED_INSTRUCTIONS as it tells you how many instruction
+you need to execute for a task. An optimization might show very
+low CPI values but execute many more instruction for it.
+
diff --git a/monitoring/groups/interlagos/FLOPS.txt b/monitoring/groups/interlagos/FLOPS.txt
new file mode 100644
index 0000000..7bfb29a
--- /dev/null
+++ b/monitoring/groups/interlagos/FLOPS.txt
@@ -0,0 +1,18 @@
+SHORT Floating point operations
+
+EVENTSET
+PMC0  RETIRED_FLOPS_DOUBLE_ALL
+PMC1  RETIRED_FLOPS_SINGLE_ALL
+
+METRICS
+DP MFlops/s    1.0E-06*(PMC0)/time
+SP MFlops/s    1.0E-06*(PMC1)/time
+
+LONG
+Formulas:
+DP MFlops/s = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
+SP MFlops/s = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
+-
+Profiling group to measure double and single precision flop rates.
+
+
diff --git a/monitoring/groups/ivybridge/BW.txt b/monitoring/groups/ivybridge/BW.txt
new file mode 100644
index 0000000..3a2eb90
--- /dev/null
+++ b/monitoring/groups/ivybridge/BW.txt
@@ -0,0 +1,13 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/ivybridge/ENERGY.txt b/monitoring/groups/ivybridge/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/ivybridge/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+
+LONG
diff --git a/monitoring/groups/ivybridge/FLOPS_DP.txt b/monitoring/groups/ivybridge/FLOPS_DP.txt
new file mode 100644
index 0000000..496b8a5
--- /dev/null
+++ b/monitoring/groups/ivybridge/FLOPS_DP.txt
@@ -0,0 +1,23 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2  SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+-
+SSE scalar and packed double precision flop rates. Please note that the current flop measurements on IvyBridge are
+potentially wrong. So you cannot trust these counters at the moment!
+
diff --git a/monitoring/groups/ivybridge/FLOPS_SP.txt b/monitoring/groups/ivybridge/FLOPS_SP.txt
new file mode 100644
index 0000000..64edd19
--- /dev/null
+++ b/monitoring/groups/ivybridge/FLOPS_SP.txt
@@ -0,0 +1,24 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2  SIMD_FP_256_PACKED_SINGLE
+
+METRICS
+MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+-
+SSE scalar and packed single precision flop rates. Please note that the current
+flop measurements on IvyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/monitoring/groups/ivybridgeEP/BW.txt b/monitoring/groups/ivybridgeEP/BW.txt
new file mode 100644
index 0000000..e6f4b73
--- /dev/null
+++ b/monitoring/groups/ivybridgeEP/BW.txt
@@ -0,0 +1,32 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+SUM Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+
+LONG
diff --git a/monitoring/groups/ivybridgeEP/ENERGY.txt b/monitoring/groups/ivybridgeEP/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/ivybridgeEP/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+
+LONG
diff --git a/monitoring/groups/ivybridgeEP/FLOPS_DP.txt b/monitoring/groups/ivybridgeEP/FLOPS_DP.txt
new file mode 100644
index 0000000..496b8a5
--- /dev/null
+++ b/monitoring/groups/ivybridgeEP/FLOPS_DP.txt
@@ -0,0 +1,23 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2  SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+-
+SSE scalar and packed double precision flop rates. Please note that the current flop measurements on IvyBridge are
+potentially wrong. So you cannot trust these counters at the moment!
+
diff --git a/monitoring/groups/ivybridgeEP/FLOPS_SP.txt b/monitoring/groups/ivybridgeEP/FLOPS_SP.txt
new file mode 100644
index 0000000..64edd19
--- /dev/null
+++ b/monitoring/groups/ivybridgeEP/FLOPS_SP.txt
@@ -0,0 +1,24 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2  SIMD_FP_256_PACKED_SINGLE
+
+METRICS
+MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+-
+SSE scalar and packed single precision flop rates. Please note that the current
+flop measurements on IvyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/monitoring/groups/kabini/BW.txt b/monitoring/groups/kabini/BW.txt
new file mode 100644
index 0000000..7e34078
--- /dev/null
+++ b/monitoring/groups/kabini/BW.txt
@@ -0,0 +1,14 @@
+SHORT Cache and memory bandwidth
+
+EVENTSET
+PMC0  DATA_CACHE_REFILLS_ALL
+PMC1  DATA_CACHE_EVICTED_ALL
+UPMC0  UNC_DRAM_ACCESSES_DCT0_ALL
+UPMC1  UNC_DRAM_ACCESSES_DCT1_ALL
+
+
+METRICS
+SUM L2 bandwidth [MBytes/s]   1.0E-06*(PMC0+PMC1)*64.0/time
+SUM Memory bandwidth [MBytes/s]   1.0E-06*(UPMC0+UPMC1)*64.0/time
+
+LONG
diff --git a/monitoring/groups/kabini/CPI.txt b/monitoring/groups/kabini/CPI.txt
new file mode 100644
index 0000000..d599a34
--- /dev/null
+++ b/monitoring/groups/kabini/CPI.txt
@@ -0,0 +1,19 @@
+SHORT  Cycles per instruction
+
+EVENTSET
+PMC0  RETIRED_INSTRUCTIONS
+PMC1  CPU_CLOCKS_UNHALTED
+PMC2  RETIRED_UOPS
+
+METRICS
+CPI   PMC1/PMC0
+Cycles per UOPS  PMC1/PMC2
+IPC   PMC0/PMC1
+
+LONG
+This group measures how efficiently the processor works with
+regard to instruction throughput. Also important as a standalone
+metric is RETIRED_INSTRUCTIONS as it tells you how many instructions
+you need to execute for a task. An optimization might show very
+low CPI values but execute many more instructions for it.
+
diff --git a/monitoring/groups/kabini/FLOPS.txt b/monitoring/groups/kabini/FLOPS.txt
new file mode 100644
index 0000000..ccb2f92
--- /dev/null
+++ b/monitoring/groups/kabini/FLOPS.txt
@@ -0,0 +1,14 @@
+SHORT Floating point operations
+
+EVENTSET
+PMC0  RETIRED_FLOPS_DOUBLE_ALL
+PMC1  RETIRED_FLOPS_SINGLE_ALL
+
+METRICS
+DP MFlops/s    1.0E-06*(PMC0)/time
+SP MFlops/s    1.0E-06*(PMC1)/time
+
+LONG
+Formulas:
+DP MFlops/s = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
+SP MFlops/s = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
diff --git a/monitoring/groups/nehalem/BW.txt b/monitoring/groups/nehalem/BW.txt
new file mode 100644
index 0000000..ddc8c82
--- /dev/null
+++ b/monitoring/groups/nehalem/BW.txt
@@ -0,0 +1,20 @@
+SHORT Cache and memory bandwidth
+
+EVENTSET
+PMC0  L1D_REPL
+PMC1  L1D_M_EVICT
+PMC2  L2_LINES_IN_ANY
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+UPMC0  UNC_QMC_NORMAL_READS_ANY
+UPMC1  UNC_QMC_WRITES_FULL_ANY
+UPMC2  UNC_QHL_REQUESTS_REMOTE_READS
+UPMC3  UNC_QHL_REQUESTS_REMOTE_WRITES
+
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
+SUM Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/nehalem/CPI.txt b/monitoring/groups/nehalem/CPI.txt
new file mode 100644
index 0000000..9852da8
--- /dev/null
+++ b/monitoring/groups/nehalem/CPI.txt
@@ -0,0 +1,14 @@
+SHORT Cycles per instruction
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+
+METRICS
+CPI  FIXC1/FIXC0
+IPC  FIXC0/FIXC1
+
+
+LONG
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+IPC = INSTR_RETIRED_ANY/CPU_CLK_UNHALTED_CORE
diff --git a/monitoring/groups/nehalem/FLOPS.txt b/monitoring/groups/nehalem/FLOPS.txt
new file mode 100644
index 0000000..e372504
--- /dev/null
+++ b/monitoring/groups/nehalem/FLOPS.txt
@@ -0,0 +1,20 @@
+SHORT Floating point operations
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR
+PMC2  FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION
+PMC3  FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
+
+METRICS
+Packed MUOPS/s   1.0E-06*PMC0/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+SP MUOPS/s 1.0E-06*PMC2/time
+DP MUOPS/s 1.0E-06*PMC3/time
+
+LONG
+Formulas:
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
diff --git a/monitoring/groups/nehalemEX/BW.txt b/monitoring/groups/nehalemEX/BW.txt
new file mode 100644
index 0000000..473ce76
--- /dev/null
+++ b/monitoring/groups/nehalemEX/BW.txt
@@ -0,0 +1,29 @@
+SHORT Cache and memory bandwidth
+
+EVENTSET
+PMC0  L1D_REPL
+PMC1  L1D_M_EVICT
+PMC2  L2_LINES_IN_ANY
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+MBOX0C0 FVC_EV0_BBOX_CMDS_READS
+MBOX0C1 DRAM_CMD_CAS_WR_OPN
+MBOX1C0 FVC_EV0_BBOX_CMDS_READS
+MBOX1C1 DRAM_CMD_CAS_WR_OPN
+
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64/time
+
+LONG
+Formula:
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(FVC_EV0_BBOX_CMDS_READS)+SUM(DRAM_CMD_CAS_WR_OPN))*64.0/time
+
+On Nehalem EX it is not possible to measure the write operations with the
+FVC_EV0_BBOX_CMDS_WRITES event at the same time as the FVC_EV0_BBOX_CMDS_READS
+because they set contrary bits. The DRAM_CMD_CAS_WR_OPN is an alternative but
+it only measures write operations to open pages, hence writes to closed pages
+are not included here.
diff --git a/monitoring/groups/nehalemEX/CPI.txt b/monitoring/groups/nehalemEX/CPI.txt
new file mode 100644
index 0000000..0e4faa3
--- /dev/null
+++ b/monitoring/groups/nehalemEX/CPI.txt
@@ -0,0 +1,12 @@
+SHORT Cycles per instruction
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+
+METRICS
+CPI  FIXC1/FIXC0
+
+
+LONG
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
diff --git a/monitoring/groups/nehalemEX/FLOPS.txt b/monitoring/groups/nehalemEX/FLOPS.txt
new file mode 100644
index 0000000..e372504
--- /dev/null
+++ b/monitoring/groups/nehalemEX/FLOPS.txt
@@ -0,0 +1,20 @@
+SHORT Floating point operations
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR
+PMC2  FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION
+PMC3  FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
+
+METRICS
+Packed MUOPS/s   1.0E-06*PMC0/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+SP MUOPS/s 1.0E-06*PMC2/time
+DP MUOPS/s 1.0E-06*PMC3/time
+
+LONG
+Formulas:
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
diff --git a/monitoring/groups/pentiumm/BW.txt b/monitoring/groups/pentiumm/BW.txt
new file mode 100644
index 0000000..5877abc
--- /dev/null
+++ b/monitoring/groups/pentiumm/BW.txt
@@ -0,0 +1,12 @@
+SHORT Cache and memory bandwidth
+
+EVENTSET
+PMC0  L2_LINES_IN_ALL_ALL
+PMC1  L2_LINES_OUT_ALL_ALL
+
+METRICS
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+
+LONG
+Formulas:
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL_ALL+L2_LINES_OUT_ALL_ALL)*64/time
diff --git a/monitoring/groups/pentiumm/CPI.txt b/monitoring/groups/pentiumm/CPI.txt
new file mode 100644
index 0000000..fb0d97b
--- /dev/null
+++ b/monitoring/groups/pentiumm/CPI.txt
@@ -0,0 +1,17 @@
+SHORT  Cycles per instruction
+
+EVENTSET
+PMC0  UOPS_RETIRED
+PMC1  CPU_CLK_UNHALTED
+
+METRICS
+CPI   PMC1/PMC0
+IPC   PMC0/PMC1
+
+LONG
+This group measures how efficiently the processor works with
+regard to instruction throughput. Also important as a standalone
+metric is UOPS_RETIRED as it tells you how many uops
+you need to execute for a task. An optimization might show very
+low CPI values but execute many more uops for it.
+
diff --git a/monitoring/groups/phi/CPI.txt b/monitoring/groups/phi/CPI.txt
new file mode 100644
index 0000000..0ce61cd
--- /dev/null
+++ b/monitoring/groups/phi/CPI.txt
@@ -0,0 +1,17 @@
+SHORT  Cycles per instruction
+
+EVENTSET
+PMC0  INSTRUCTIONS_EXECUTED
+PMC1  CPU_CLK_UNHALTED
+
+METRICS
+CPI   PMC1/PMC0
+IPC   PMC0/PMC1
+
+LONG
+This group measures how efficiently the processor works with
+regard to instruction throughput. Also important as a standalone
+metric is INSTRUCTIONS_EXECUTED as it tells you how many instructions
+you need to execute for a task. An optimization might show very
+low CPI values but execute many more instructions for it.
+
diff --git a/monitoring/groups/sandybridge/BW.txt b/monitoring/groups/sandybridge/BW.txt
new file mode 100644
index 0000000..3a2eb90
--- /dev/null
+++ b/monitoring/groups/sandybridge/BW.txt
@@ -0,0 +1,13 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/sandybridge/ENERGY.txt b/monitoring/groups/sandybridge/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/sandybridge/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+
+LONG
diff --git a/monitoring/groups/sandybridge/FLOPS_DP.txt b/monitoring/groups/sandybridge/FLOPS_DP.txt
new file mode 100644
index 0000000..c004b88
--- /dev/null
+++ b/monitoring/groups/sandybridge/FLOPS_DP.txt
@@ -0,0 +1,24 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2  SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+-
+SSE scalar and packed double precision flop rates. Please note that the current
+flop measurements on SandyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/monitoring/groups/sandybridge/FLOPS_SP.txt b/monitoring/groups/sandybridge/FLOPS_SP.txt
new file mode 100644
index 0000000..f9e6df7
--- /dev/null
+++ b/monitoring/groups/sandybridge/FLOPS_SP.txt
@@ -0,0 +1,24 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2  SIMD_FP_256_PACKED_SINGLE
+
+METRICS
+MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+-
+SSE scalar and packed single precision flop rates. Please note that the current
+flop measurements on SandyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/monitoring/groups/sandybridgeEP/BW.txt b/monitoring/groups/sandybridgeEP/BW.txt
new file mode 100644
index 0000000..18eea4f
--- /dev/null
+++ b/monitoring/groups/sandybridgeEP/BW.txt
@@ -0,0 +1,24 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+SUM Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+
+LONG
diff --git a/monitoring/groups/sandybridgeEP/ENERGY.txt b/monitoring/groups/sandybridgeEP/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/sandybridgeEP/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+
+LONG
diff --git a/monitoring/groups/sandybridgeEP/FLOPS_DP.txt b/monitoring/groups/sandybridgeEP/FLOPS_DP.txt
new file mode 100644
index 0000000..c004b88
--- /dev/null
+++ b/monitoring/groups/sandybridgeEP/FLOPS_DP.txt
@@ -0,0 +1,24 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2  SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+-
+SSE scalar and packed double precision flop rates. Please note that the current
+flop measurements on SandyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/monitoring/groups/sandybridgeEP/FLOPS_SP.txt b/monitoring/groups/sandybridgeEP/FLOPS_SP.txt
new file mode 100644
index 0000000..f9e6df7
--- /dev/null
+++ b/monitoring/groups/sandybridgeEP/FLOPS_SP.txt
@@ -0,0 +1,24 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2  SIMD_FP_256_PACKED_SINGLE
+
+METRICS
+MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+-
+SSE scalar and packed single precision flop rates. Please note that the current
+flop measurements on SandyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/monitoring/groups/silvermont/BW.txt b/monitoring/groups/silvermont/BW.txt
new file mode 100644
index 0000000..952e64a
--- /dev/null
+++ b/monitoring/groups/silvermont/BW.txt
@@ -0,0 +1,12 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0  LONGEST_LAT_CACHE_MISS
+PMC1  OFFCORE_RESPONSE_1_WB_ANY
+
+METRICS
+SUM Memory read bandwidth [MBytes/s] 1.0E-06*(PMC0)*64.0/time
+SUM Memory writeback bandwidth [MBytes/s] 1.0E-06*(PMC1)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+
+LONG
diff --git a/monitoring/groups/silvermont/CPI.txt b/monitoring/groups/silvermont/CPI.txt
new file mode 100644
index 0000000..4eb4d40
--- /dev/null
+++ b/monitoring/groups/silvermont/CPI.txt
@@ -0,0 +1,14 @@
+SHORT Cycles per instruction
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+
+METRICS
+CPI FIXC1/FIXC0
+IPC FIXC0/FIXC1
+
+LONG
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+IPC = INSTR_RETIRED_ANY/CPU_CLK_UNHALTED_CORE
diff --git a/monitoring/groups/silvermont/ENERGY.txt b/monitoring/groups/silvermont/ENERGY.txt
new file mode 100644
index 0000000..3814560
--- /dev/null
+++ b/monitoring/groups/silvermont/ENERGY.txt
@@ -0,0 +1,16 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+
+LONG
diff --git a/monitoring/groups/westmere/BW.txt b/monitoring/groups/westmere/BW.txt
new file mode 100644
index 0000000..4925077
--- /dev/null
+++ b/monitoring/groups/westmere/BW.txt
@@ -0,0 +1,19 @@
+SHORT Cache and memory bandwidth
+
+EVENTSET
+PMC0  L1D_REPL
+PMC1  L1D_M_EVICT
+PMC2  L2_LINES_IN_ANY
+PMC3  L2_LINES_OUT_ANY
+UPMC0  UNC_QMC_NORMAL_READS_ANY
+UPMC1  UNC_QMC_WRITES_FULL_ANY
+UPMC2  UNC_QHL_REQUESTS_REMOTE_READS
+UPMC3  UNC_QHL_REQUESTS_REMOTE_WRITES
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
+SUM Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/westmere/CPI.txt b/monitoring/groups/westmere/CPI.txt
new file mode 100644
index 0000000..9852da8
--- /dev/null
+++ b/monitoring/groups/westmere/CPI.txt
@@ -0,0 +1,14 @@
+SHORT Cycles per instruction
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+
+METRICS
+CPI  FIXC1/FIXC0
+IPC  FIXC0/FIXC1
+
+
+LONG
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+IPC = INSTR_RETIRED_ANY/CPU_CLK_UNHALTED_CORE
diff --git a/monitoring/groups/westmere/FLOPS.txt b/monitoring/groups/westmere/FLOPS.txt
new file mode 100644
index 0000000..e372504
--- /dev/null
+++ b/monitoring/groups/westmere/FLOPS.txt
@@ -0,0 +1,20 @@
+SHORT Floating point operations
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR
+PMC2  FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION
+PMC3  FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
+
+METRICS
+Packed MUOPS/s   1.0E-06*PMC0/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+SP MUOPS/s 1.0E-06*PMC2/time
+DP MUOPS/s 1.0E-06*PMC3/time
+
+LONG
+Formulas:
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
diff --git a/monitoring/groups/westmereEX/BW.txt b/monitoring/groups/westmereEX/BW.txt
new file mode 100644
index 0000000..a960025
--- /dev/null
+++ b/monitoring/groups/westmereEX/BW.txt
@@ -0,0 +1,20 @@
+SHORT Cache and memory bandwidth
+
+EVENTSET
+PMC0  L1D_REPL
+PMC1  L1D_M_EVICT
+PMC2  L2_LINES_IN_ANY
+PMC3  L2_LINES_OUT_ANY
+MBOX0C0 FVC_EV0_BBOX_CMDS_READS
+MBOX0C1 DRAM_CMD_CAS_WR_OPN
+MBOX0C2 DRAM_MISC_CAS_WR_CLS
+MBOX1C0 FVC_EV0_BBOX_CMDS_READS
+MBOX1C1 DRAM_CMD_CAS_WR_OPN
+MBOX1C2 DRAM_MISC_CAS_WR_CLS
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1+MBOX0C2+MBOX1C2)*64/time
+
+LONG
diff --git a/monitoring/groups/westmereEX/CPI.txt b/monitoring/groups/westmereEX/CPI.txt
new file mode 100644
index 0000000..9852da8
--- /dev/null
+++ b/monitoring/groups/westmereEX/CPI.txt
@@ -0,0 +1,14 @@
+SHORT Cycles per instruction
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+
+METRICS
+CPI  FIXC1/FIXC0
+IPC  FIXC0/FIXC1
+
+
+LONG
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+IPC = INSTR_RETIRED_ANY/CPU_CLK_UNHALTED_CORE
diff --git a/monitoring/groups/westmereEX/FLOPS.txt b/monitoring/groups/westmereEX/FLOPS.txt
new file mode 100644
index 0000000..e372504
--- /dev/null
+++ b/monitoring/groups/westmereEX/FLOPS.txt
@@ -0,0 +1,20 @@
+SHORT Floating point operations
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR
+PMC2  FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION
+PMC3  FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
+
+METRICS
+Packed MUOPS/s   1.0E-06*PMC0/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+SP MUOPS/s 1.0E-06*PMC2/time
+DP MUOPS/s 1.0E-06*PMC3/time
+
+LONG
+Formulas:
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
diff --git a/monitoring/likwid-agent.conf b/monitoring/likwid-agent.conf
new file mode 100644
index 0000000..7aadbda
--- /dev/null
+++ b/monitoring/likwid-agent.conf
@@ -0,0 +1,52 @@
+### Global section ###
+
+# Set path to monitoring group files. Default is the normal LIKWID group path
+# <INSTALLEDPREFIX>/share/likwid/mongroups
+#GROUPPATH <path_to_mon_groups>
+# List of monitoring groups that should be measured
+#EVENTSET <group1> <group2> ...
+# Define access mode for LIKWID. If likwid-agent runs as root, use 0 for direct
+# access to the MSR and PCI registers. If you are running it as a regular user, you
+# have to select 1 to use the accessDaemon of LIKWID. Default is 1.
+#ACCESSMODE <0/1>
+# Define the time in seconds that each given monitoring group should be measured
+#DURATION 1
+
+
+### Output section ###
+
+## Simple logfile output ##
+# Specify path for the logfile. For each monitoring group, a separate logfile is
+# created with the format likwid.<group>.log
+#LOGPATH <path>
+# Specify the logfile writing style. The two possible options are log and
+# update.
+# log appends all new messages to the logfile, while update empties the logfile
+# before performing any writing. The update option is recommended when the
+# output is further parsed with other tools. If LOGPATH is set but no LOGSTYLE
+# set, the style log is selected.
+#LOGSTYLE <log/update>
+
+## Syslog output ##
+# De/Activate the output to the syslog system using shell tool logger
+#SYSLOG <True/False>
+# Define the priority value for logger. Default priority is local0.notice.
+#SYSLOGPRIO local0.notice
+
+## RRD output ##
+# Likwid-agent tries to create basic RRD configurations for the selected
+# groups. Each monitoring group gets its own RRD file containing all metrics
+# as data sources. For better printing, RRAs are created to hold the min, max
+# and average values for every 10 minutes in the last hour, every hour for the
+# last day and every day for the last month.
+#RRD <True/False>
+# Store the RRDs in RRDPATH
+#RRDPATH <path>
+
+## GMetric output ##
+# De/Activate the output to the Ganglia Monitoring System using the gmetric tool
+#GMETRIC <True/False>
+# Set path to the executable of gmetric.
+#GMETRICPATH <path_to_gmetric>
+# Some environments require a special config file to be passed to gmetric.
+#GMETRICCONFIG <path_to_gmetric_config>
diff --git a/perl/AsmGen.pl b/perl/AsmGen.pl
deleted file mode 100755
index dcd7946..0000000
--- a/perl/AsmGen.pl
+++ /dev/null
@@ -1,284 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-no strict "refs";
-use warnings;
-use lib './perl';
-use Parse::RecDescent;
-use Data::Dumper;
-use Getopt::Std;
-use Cwd 'abs_path';
-
-use gas;
-
-my $ROOT = abs_path('./');
-my $DEBUG=0;
-my $VERBOSE=0;
-our $ISA = 'x86';
-our $AS  = 'gas';
-my $OPT_STRING = 'hpvda:i:o:';
-my %OPT;
-my $INPUTFILE;
-my $OUTPUTFILE;
-my $CPP_ARGS='';
-
-# Enable warnings within the Parse::RecDescent module.
-$::RD_ERRORS = 1; # Make sure the parser dies when it encounters an error
-#$::RD_WARN   = 1; # Enable warnings. This will warn on unused rules &c.
-#$::RD_HINT   = 1; # Give out hints to help fix problems.
-#$::RD_TRACE  = 1;     # if defined, also trace parsers' behaviour
-$::RD_AUTOACTION = q { [@item[0..$#item]] };
-
-sub init
-{
-	getopts( "$OPT_STRING", \%OPT ) or usage();
-	if ($OPT{h}) { usage(); };
-	if ($OPT{v}) { $VERBOSE = 1;}
-	if ($OPT{d}) { $DEBUG = 1;}
-
-	if (! $ARGV[0]) {
-		die "ERROR: Please specify a input file!\n\nCall script with argument -h for help.\n";
-	}
-
-	$INPUTFILE = $ARGV[0];
-	$CPP_ARGS = $ARGV[1] if ($ARGV[1]);
-
-	if ($INPUTFILE =~ /.pas$/) {
-		$INPUTFILE =~ s/\.pas//; 
-	} else {
-		die "ERROR: Input file must have pas ending!\n";
-	}
-	if ($OPT{o}) { 
-		$OUTPUTFILE = $OPT{o};
-	}else {
-		$OUTPUTFILE = "$INPUTFILE.s";
-	}
-	if ($OPT{i}) { 
-		$ISA = $OPT{i};
-		print "INFO: Using isa $ISA.\n\n" if ($VERBOSE);
-	} else {
-		print "INFO: No isa specified.\n Using default $ISA.\n\n" if ($VERBOSE);
-	}
-	if ($OPT{a}) { 
-		$AS = $OPT{a};
-		print "INFO: Using as $AS.\n\n" if ($VERBOSE);
-	} else {
-		print "INFO: No as specified.\n Using default $AS.\n\n" if ($VERBOSE);
-	}
-
-  as::isa_init();
-}
-
-sub usage
-{
-    print <<END;
-usage: $0 [-$OPT_STRING]  <INFILE>
-
-Required:
-<INFILE>  : Input pas file
-
-Optional:
--h        : this (help) message
--v        : verbose output
--d        : debug mode: prints out the parse tree
--p        : Print out intermediate preprocessed output
--o <FILE> : Output file
--a <ASM>  : Specify different assembler (Default: gas)
--i <ISA>  : Specify different isa (Default: x86)
-
-Example: 
-$0 -i x86-64  -a masm -o out.s  myfile.pas
-
-END
-
-exit(0);
-}
-
-#=======================================
-# GRAMMAR
-#=======================================
-$main::grammar = <<'_EOGRAMMAR_';
-# Terminals
-FUNC        : /func/i
-LOOP        : /loop/i
-ALLOCATE    : /allocate/i
-FACTOR      : /factor/i
-DEFINE      : /define/i
-USE         : /use/i
-STOP        : /stop/i
-START       : /start/i
-LOCAL       : /local/i
-TIMER       : /timer/i
-INCREMENT   : /increment/i
-ALIGN       : /align/i
-INT         : /int/i
-SINGLE      : /single/i
-DOUBLE      : /double/i
-INUMBER     : NUMBER
-UNUMBER     : NUMBER
-SNUMBER     : NUMBER
-FNUMBER     : NUMBER
-OFFSET      : /([0-9]+\,){15}[0-9]+/
-NUMBER      : /[-+]?[0-9]*\.?[0-9]+/
-SYMBOL      : /[.A-Z-a-z_][A-Za-z0-9_]*/
-REG         : /GPR[0-9]+/i
-SREG         : /GPR[0-9]+/i
-COMMENT     : /#.*/
-{'skip'}
-
-type: SINGLE 
-     |DOUBLE
-	 |INT
-
-align: ALIGN <commit> NUMBER
-{
-{FUNC => 'as::align',
- ARGS => ["$item{NUMBER}[1]"]}
-}
-
-ASMCODE     : /[A-Za-z1-9.:]+.*/
-{
-{FUNC => 'as::emit_code',
- ARGS => [$item[1]]}
-}
-
-function:  FUNC SYMBOL block
-{[
- {FUNC => 'as::function_entry',
-  ARGS => [$item{SYMBOL}[1],0]},
- $item{block},
- {FUNC => 'as::function_exit',
-  ARGS => [$item{SYMBOL}[1]]}
-]}
-
-function_allocate:  FUNC SYMBOL ALLOCATE NUMBER block
-{[
- {FUNC => 'as::function_entry',
-  ARGS => [$item{SYMBOL}[1],$item{NUMBER}[1]]},
- $item{block},
- {FUNC => 'as::function_exit',
-  ARGS => [$item{SYMBOL}[1]]}
-]}
-
-loop:  LOOP SYMBOL INUMBER SNUMBER block
-{[
-{FUNC => 'as::loop_entry',
- ARGS => [$item{SYMBOL}[1],$item{SNUMBER}[1][1]]},
- $item{block},
-{FUNC => 'as::loop_exit',
- ARGS => [$item{SYMBOL}[1],$item{INUMBER}[1][1]]}
-]}
-| LOOP SYMBOL INUMBER SREG block
-{[
-{FUNC => 'as::loop_entry',
- ARGS => [$item{SYMBOL}[1],$item{SREG}[1]]},
- $item{block},
-{FUNC => 'as::loop_exit',
- ARGS => [$item{SYMBOL}[1],$item{INUMBER}[1][1]]}
-]}
-
-timer: START TIMER
-{
-{FUNC => 'isa::start_timer',
- ARGS => []}
-}
-| STOP TIMER
-{
-{FUNC => 'isa::stop_timer',
- ARGS => []}
-}
-
-mode:  START LOCAL
-{
-{FUNC => 'as::mode',
- ARGS => [$item[1][1]]}
-}
-| STOP LOCAL
-{
-{FUNC => 'as::mode',
- ARGS => [$item[1][1]]}
-}
-
-block: '{' expression(s) '}'
-{ $item[2] }
-
-define_data: DEFINE type  SYMBOL  OFFSET
-{
-{FUNC => 'as::define_offset',
- ARGS => [$item{SYMBOL}[1], $item{type}[1][1],"$item{OFFSET}[1]"]}
-}
-
-define_data: DEFINE type  SYMBOL  NUMBER
-{
-{FUNC => 'as::define_data',
- ARGS => [$item{SYMBOL}[1], $item{type}[1][1],"$item{NUMBER}[1]"]}
-}
-
-
-expression:  align
-            |COMMENT
-            |loop
-            |timer
-            |mode
-			|ASMCODE
-{ $item[1] }
-
-instruction : define_data
-            | align
-            | COMMENT
-            | mode
-            | function
-            | function_allocate
-{ $item[1] }
-
-startrule: instruction(s)
-{ $item[1] }
-
-_EOGRAMMAR_
-
-
-#=======================================
-# MAIN
-#=======================================
-init();
-print "INFO: Calling cpp with arguments $CPP_ARGS.\n" if ($VERBOSE);
-my $text = `cpp -x assembler-with-cpp $CPP_ARGS $INPUTFILE.pas`;
-
-if ($OPT{p}) {
-	open FILE,">$INPUTFILE.Pas";
-	print FILE $text;
-	close FILE;
-}
-
-open STDOUT,">$OUTPUTFILE";
-print "$as::AS->{HEADER}\n";
-
-my $parser = new Parse::RecDescent ($main::grammar)  or die "ERROR: Bad grammar!\n";
-my $parse_tree = $parser->startrule($text) or print STDERR "ERROR: Syntax Error\n";
-tree_exec($parse_tree);
-
-if ($DEBUG) {
-	open FILE,'>parse_tree.txt';
-	print FILE Dumper $parse_tree,"\n";
-	close FILE;
-}
-
-print "$as::AS->{FOOTER}\n";
-
-sub tree_exec 
-{
-	my $tree = shift;
-
-	foreach my $node (@$tree) {
-		if ($node !~ /^skip|^instruction|^expression|^loop/) {
-			if (ref($node) eq 'ARRAY')  {
-				tree_exec($node);
-			}else {
-				if (ref($node) eq 'HASH') {
-					&{$node->{FUNC}}(@{$node->{ARGS}});
-				}
-			}
-		}
-	}
-}
-
-
diff --git a/perl/feedGnuplot b/perl/feedGnuplot
index 67aaf37..d379981 100755
--- a/perl/feedGnuplot
+++ b/perl/feedGnuplot
@@ -1,27 +1,36 @@
 #!/usr/bin/perl
+
+package feedgnuplot; # for the metacpan indexer
+
 use strict;
 use warnings;
 use Getopt::Long;
-use Time::HiRes qw( usleep );
+use Time::HiRes qw( usleep gettimeofday tv_interval );
 use IO::Handle;
 use List::Util qw( first );
+use Scalar::Util qw( looks_like_number );
 use Text::ParseWords;
 use threads;
 use threads::shared;
 use Thread::Queue;
 use Pod::Usage;
+use Time::Piece;
 
-
-our $VERSION = '1.11';
+my $VERSION = 1.34;
 
 my %options;
-interpretCommandline(\%options);
+interpretCommandline();
+
+# list containing the plot data. Each element is a hashref of parameters.
+# $curve->{datastring} is a string of all the data in this curve that can be
+# sent directly to gnuplot. $curve->{datastring_meta} is a hashref {domain =>
+# ..., offset_start => ...}. offset_start represents a position in the
+# datastring where this particular data element begins. As the data is culled
+# with --xlen, the offsets are preserved by using $curve->{datastring_offset} to
+# represent the offset IN THE ORIGINAL STRING of the current start of the
+# datastring
 
-my $gnuplotVersion = getGnuplotVersion();
 
-# list containing the plot data. Each element is a reference to a list, representing the data for
-# one curve. The first 'point' is a hash describing various curve parameters. The rest are all
-# references to lists of (x,y) tuples
 my @curves = ();
 
 # list mapping curve names to their indices in the @curves list
@@ -29,24 +38,32 @@ my %curveIndices = ();
 
 # now start the data acquisition and plotting threads
 my $dataQueue;
-my $xwindow;
+
+# Whether any new data has arrived since the last replot
+my $haveNewData;
+
+# when the last replot happened
+my $last_replot_time = [gettimeofday];
+
+# whether the previous replot was timer based
+my $last_replot_is_from_timer = 1;
 
 my $streamingFinished : shared = undef;
+
 if($options{stream})
 {
-  if( $options{hardcopy})
-  {
-    $options{stream} = undef;
-  }
-
   $dataQueue  = Thread::Queue->new();
   my $addThr  = threads->create(\&mainThread);
-  my $plotThr = threads->create(\&plotThread);
+
+  # spawn the plot updating thread. If I'm replotting from a data trigger, I don't need this
+  my $plotThr = threads->create(\&plotUpdateThread) if $options{stream} > 0;
 
   while(<>)
   {
     chomp;
 
+    last if /^exit/;
+
     # place every line of input to the queue, so that the plotting thread can process it. if we are
     # using an implicit domain (x = line number), then we send it on the data queue also, since
     # $. is not meaningful in the plotting thread
@@ -58,8 +75,9 @@ if($options{stream})
   }
 
   $streamingFinished = 1;
+  $dataQueue->enqueue(undef);
 
-  $plotThr->join();
+  $plotThr->join() if defined $plotThr;
   $addThr->join();
 }
 else
@@ -81,86 +99,296 @@ sub interpretCommandline
     unshift @ARGV, shellwords shift @ARGV;
   }
 
-  my $options = shift;
-
   # everything off by default:
   # do not stream in the data by default
   # point plotting by default.
   # no monotonicity checks by default
+  # normal histograms by default
   $options{ maxcurves } = 100;
+  $options{ histstyle}  = 'freq';
+
+  # Previously I was using 'legend=s%' and 'curvestyle=s%' for curve addressing. This had cleaner
+  # syntax, but disregarded the order of the given options. This resulted in arbitrarily ordered
+  # curves. I thus parse these into lists, and then also make hashes, for later use
+
+  # needed for these to be parsed into an array-ref, these default to []
+  $options{legend}     = [];
+  $options{curvestyle} = [];
+  $options{style}      = [];
+  $options{histogram}  = [];
+  $options{y2}         = [];
+  $options{extracmds}  = [];
+  $options{set}        = [];
+  $options{unset}      = [];
+
+  $options{curvestyleall} = '';
+  $options{styleall}      = '';
+  $options{with}          = '';
+
+  $options{rangesize} = [];
+
+  GetOptions(\%options, 'stream:s', 'domain!', 'dataid!', '3d!', 'colormap!', 'lines!', 'points!',
+             'circles', 'legend=s{2}', 'autolegend!', 'xlabel=s', 'ylabel=s', 'y2label=s', 'zlabel=s',
+             'title=s', 'xlen=f', 'ymin=f', 'ymax=f', 'xmin=s', 'xmax=s', 'y2min=f', 'y2max=f',
+             'zmin=f', 'zmax=f', 'y2=s@',
+             'style=s{2}', 'curvestyle=s{2}', 'curvestyleall=s', 'styleall=s', 'with=s', 'extracmds=s@', 'set=s@', 'unset=s@',
+             'square!', 'square_xy!', 'hardcopy=s', 'maxcurves=i', 'monotonic!', 'timefmt=s',
+             'histogram=s@', 'binwidth=f', 'histstyle=s',
+             'terminal=s',
+             'rangesize=s{2}', 'rangesizeall=i', 'extraValuesPerPoint=i',
+             'help', 'dump', 'exit', 'version',
+             'geometry=s') or pod2usage( -exitval => 1,
+                                         -verbose => 1, # synopsis and args
+                                         -output  => \*STDERR );
 
-  GetOptions($options, 'stream!', 'domain!', 'dataid!', '3d!', 'colormap!', 'lines!', 'points!',
-             'circles', 'legend=s%', 'autolegend!', 'xlabel=s', 'ylabel=s', 'y2label=s', 'zlabel=s',
-             'title=s', 'xlen=f', 'ymin=f', 'ymax=f', 'xmin=f', 'xmax=f', 'y2min=f', 'y2max=f',
-             'zmin=f', 'zmax=f', 'y2=s@', 'curvestyle=s%', 'curvestyleall=s', 'extracmds=s@',
-             'size=s', 'square!', 'square_xy!', 'hardcopy=s', 'maxcurves=i', 'monotonic!',
-             'extraValuesPerPoint=i', 'help', 'dump') or pod2usage(1);
 
   # handle various cmdline-option errors
-  if ( $options->{help} )
-  { pod2usage(0); }
+  if ( $options{help} )
+  {
+    pod2usage( -exitval => 0,
+               -verbose => 1, # synopsis and args
+               -output  => \*STDOUT );
+  }
+
+  if( $options{version} )
+  {
+    print "feedgnuplot version $VERSION\n";
+    exit 0;
+  }
+
+  # expand options that are given as comma-separated lists
+  for my $listkey (qw(histogram y2))
+  {
+    @{$options{$listkey}} = map split('\s*,\s*', $_), @{$options{$listkey}}
+      if defined $options{$listkey};
+  }
+
+  # --style and --curvestyle are synonyms, as are --styleall and
+  # --curvestyleall, so fill that in
+  if( $options{styleall} )
+  {
+    if($options{curvestyleall} )
+    {
+      $options{curvestyleall} .= " $options{styleall}";
+    }
+    else
+    {
+      $options{curvestyleall} = $options{styleall};
+    }
+  }
+  push @{$options{curvestyle}}, @{$options{style}};
+
+
+  # --legend and --curvestyle options are conceptually hashes, but are parsed as
+  # arrays in order to preserve the ordering. I parse both of these into hashes
+  # because those are useful to have later. After this I can access individual
+  # legends with $options{legend_hash}{curveid}
+  for my $listkey (qw(legend curvestyle rangesize))
+  {
+    $options{"${listkey}_hash"} = {};
+
+    my $n = scalar @{$options{$listkey}}/2;
+    foreach my $idx (0..$n-1)
+    {
+      $options{"${listkey}_hash"}{$options{$listkey}[$idx*2]} = $options{$listkey}[$idx*2 + 1];
+    }
+  }
+
+  if ( defined $options{hardcopy} && defined $options{stream} )
+  {
+    print STDERR "--stream doesn't make sense together with --hardcopy\n";
+    exit -1;
+  }
+
+  if ( defined $options{rangesizeall} && defined $options{extraValuesPerPoint} )
+  {
+    print STDERR "Only one of --rangesizeall and --extraValuesPerPoint may be given\n";
+    exit -1;
+  }
+
+
+  # I now set up the rangesize to always be
+  #  $options{rangesize_hash}{$id} // $options{rangesize_default}
+  if ( $options{rangesizeall} )
+  {
+      $options{rangesize_default} = $options{rangesizeall};
+  }
+  else
+  {
+      $options{rangesize_default} = 1;
+
+      $options{rangesize_default} += $options{extraValuesPerPoint} if ($options{extraValuesPerPoint});
+      $options{rangesize_default}++                                if ($options{colormap});
+      $options{rangesize_default}++                                if ($options{circles} );
+  }
 
-  $options->{curvestyleall} = '' unless defined $options->{curvestyleall};
 
-  if ($options->{colormap})
+  # parse stream option. Allowed only numbers >= 0 or 'trigger'. After this code
+  # $options{stream} is
+  #  -1 for triggered replotting
+  #  >0 for timed replotting
+  #  undef if not streaming
+  if(defined $options{stream})
+  {
+    # if no streaming period is given, default to 1Hz.
+    $options{stream} = 1 if $options{stream} eq '';
+
+    if( !looks_like_number $options{stream} )
+    {
+      if($options{stream} eq 'trigger')
+      {
+        $options{stream} = 0;
+      }
+      else
+      {
+        print STDERR "--stream can only take in values >=0 or 'trigger'\n";
+        exit -1;
+      }
+    }
+
+    if ( $options{stream} == 0 )
+    {
+      $options{stream} = -1;
+    }
+    elsif ( $options{stream} <= 0)
+    {
+      print STDERR "--stream can only take in values >=0 or 'trigger'\n";
+      exit -1;
+    }
+  }
+
+  if( $options{curvestyleall} && $options{with} )
+  {
+    print STDERR "--curvestyleall and --with are mutually exclusive. Please just use one.\n";
+    exit -1;
+  }
+  if( $options{with} )
+  {
+    $options{curvestyleall} = "with $options{with}";
+    $options{with} = '';
+  }
+
+  if ($options{colormap})
   {
     # colormap styles all curves with palette. Seems like there should be a way to do this with a
     # global setting, but I can't get that to work
-    $options->{curvestyleall} .= ' palette';
+    $options{curvestyleall} .= ' palette';
   }
 
-  if ( $options->{'3d'} )
+  if ( $options{'3d'} )
   {
-    if ( !$options->{domain} )
+    if ( !$options{domain} )
     {
       print STDERR "--3d only makes sense with --domain\n";
       exit -1;
     }
 
-    if ( defined $options->{y2min} || defined $options->{y2max} || defined $options->{y2} )
+    if ( $options{timefmt} )
+    {
+      print STDERR "--3d makes no sense with --timefmt\n";
+      exit -1;
+    }
+
+    if ( defined $options{y2min} || defined $options{y2max} || @{$options{y2}} )
     {
       print STDERR "--3d does not make sense with --y2...\n";
       exit -1;
     }
 
-    if ( defined $options->{xlen} )
+    if ( defined $options{xlen} )
     {
       print STDERR "--3d does not make sense with --xlen\n";
       exit -1;
     }
 
-    if ( defined $options->{monotonic} )
+    if ( defined $options{monotonic} )
     {
       print STDERR "--3d does not make sense with --monotonic\n";
       exit -1;
     }
+
+    if ( defined $options{binwidth} || @{$options{histogram}} )
+    {
+      print STDERR "--3d does not make sense with histograms\n";
+      exit -1;
+    }
+
+    if ( defined $options{circles} )
+    {
+      print STDERR "--3d does not make sense with circles (gnuplot doesn't support this)\n";
+      exit -1;
+    }
   }
   else
   {
-    if(!$options->{colormap})
+    if ( $options{timefmt} && !$options{domain} )
+    {
+      print STDERR "--timefmt makes sense only with --domain\n";
+      exit -1;
+    }
+
+    if(!$options{colormap})
     {
-      if ( defined $options->{zmin} || defined $options->{zmax} || defined $options->{zlabel} )
+      if ( defined $options{zmin} || defined $options{zmax} || defined $options{zlabel} )
       {
         print STDERR "--zmin/zmax/zlabel only makes sense with --3d or --colormap\n";
         exit -1;
       }
     }
 
-    if ( defined $options->{square_xy} )
+    if ( defined $options{square_xy} )
     {
       print STDERR "--square_xy only makes sense with --3d\n";
       exit -1;
     }
   }
 
-  if(defined $options{xlen} && !defined $options{stream} )
+  if(defined $options{xlen} && !$options{stream} )
   {
     print STDERR "--xlen does not make sense without --stream\n";
     exit -1;
   }
 
+  if($options{stream} && defined $options{xlen} &&
+     ( defined $options{xmin} || defined $options{xmax}))
+  {
+    print STDERR "With --stream and --xlen the X bounds are set, so neither --xmin nor --xmax make sense\n";
+    exit -1;
+  }
+
   # --xlen implies an order to the data, so I force monotonicity
-  $options{monotonic} = defined $options{xlen};
+  $options{monotonic} = 1 if defined $options{xlen};
+
+  if( $options{histstyle} !~ /freq|cum|uniq|cnorm/ )
+  {
+    print STDERR "unknown histstyle. Allowed are 'freq...', 'cum...', 'uniq...', 'cnorm...'\n";
+    exit -1;
+  }
+
+  # deal with timefmt
+  if ( $options{timefmt} )
+  {
+    # I need to compute a regex to match the time field and I need to count how
+    # many whitespace-separated fields there are.
+
+    # strip leading and trailing whitespace
+    $options{timefmt} =~ s/^\s*//;
+    $options{timefmt} =~ s/\s*$//;
+
+    my $Nfields = () = split /\s+/, $options{timefmt}, -1;
+    $options{timefmt_Ncols} = $Nfields;
+
+    # make sure --xlen is an integer. With a timefmt xlen goes through strptime
+    # and strftime, and those are integer-only
+    if( defined $options{xlen} )
+    {
+      if( $options{xlen} - int($options{xlen}) )
+      {
+        say STDERR "When streaming --xlen MUST be an integer. Rounding up to the nearest second";
+        $options{xlen} = 1 + int($options{xlen});
+      }
+    }
+  }
 }
 
 sub getGnuplotVersion
@@ -177,31 +405,60 @@ sub getGnuplotVersion
   return $gnuplotVersion;
 }
 
-sub plotThread
+sub plotUpdateThread
 {
   while(! $streamingFinished)
   {
-    sleep(1);
-    $dataQueue->enqueue('Plot now');
+    usleep( $options{stream} * 1e6 );
+
+    # indicate that the timer was the replot source
+    $dataQueue->enqueue('replot timertick');
   }
+}
 
-  $dataQueue->enqueue(undef);
+sub sendRangeCommand
+{
+  my ($name, $min, $max) = @_;
+
+  return unless defined $min || defined $max;
+
+  if( defined $min )
+  { $min = "\"$min\""; }
+  else
+  { $min = ''; }
+
+  if( defined $max )
+  { $max = "\"$max\""; }
+  else
+  { $max = ''; }
 
+  my $cmd = "set $name [$min:$max]\n";
+  print PIPE $cmd;
 }
 
-sub mainThread
+sub makeDomainNumeric
 {
-    my $valuesPerPoint = 1;
-    if($options{extraValuesPerPoint}) { $valuesPerPoint += $options{extraValuesPerPoint}; }
-    if($options{colormap})            { $valuesPerPoint++; }
-    if($options{circles} )            { $valuesPerPoint++; }
+  my ($domain0) = @_;
+
+  if ( $options{timefmt} )
+  {
+    my $timepiece = Time::Piece->strptime( $domain0, $options{timefmt} )
+      or die "Couldn't parse time format. String '$domain0' doesn't fit format '$options{timefmt}'";
+
+    return $timepiece->epoch();
+  }
+
+  return $domain0;
+}
 
+sub mainThread
+{
     local *PIPE;
     my $dopersist = '';
 
-    if($gnuplotVersion >= 4.3)
+    if( !$options{stream} && getGnuplotVersion() >= 4.3)
     {
-      $dopersist = '--persist' if(!$options{stream});
+      $dopersist = '--persist';
     }
 
     if(exists $options{dump})
@@ -210,51 +467,43 @@ sub mainThread
     }
     else
     {
-      open PIPE, "|gnuplot $dopersist" or die "Can't initialize gnuplot\n";
+      my $geometry = defined $options{geometry} ?
+        "-geometry $options{geometry}" : '';
+      open PIPE, "|gnuplot $geometry $dopersist" or die "Can't initialize gnuplot\n";
     }
     autoflush PIPE 1;
 
     my $outputfile;
     my $outputfileType;
-    if( $options{hardcopy})
+    if( defined $options{hardcopy})
     {
       $outputfile = $options{hardcopy};
-      ($outputfileType) = $outputfile =~ /\.(eps|ps|pdf|png)$/;
-      if(!$outputfileType) { die("Only .eps, .ps, .pdf and .png supported\n"); }
+      if( $outputfile =~ /^[^|]                       # starts with anything other than |
+                          .*                          # stuff in the middle
+                          \.(eps|ps|pdf|png|svg)$/ix) # ends with a known extension
+      {
+        $outputfileType = lc $1;
+      }
 
       my %terminalOpts =
       ( eps  => 'postscript solid color enhanced eps',
         ps   => 'postscript solid color landscape 10',
         pdf  => 'pdfcairo solid color font ",10" size 11in,8.5in',
-        png  => 'png size 1280,1024' );
-
-      print PIPE "set terminal $terminalOpts{$outputfileType}\n";
-      print PIPE "set output \"$outputfile\"\n";
-    }
-    else
-    {
-      print PIPE "set terminal x11\n";
-    }
+        png  => 'png size 1280,1024',
+        svg  => 'svg');
 
-    # If a bound isn't given I want to set it to the empty string, so I can communicate it simply to
-    # gnuplot
-    $options{xmin}  = '' unless defined $options{xmin};
-    $options{xmax}  = '' unless defined $options{xmax};
-    $options{ymin}  = '' unless defined $options{ymin};
-    $options{ymax}  = '' unless defined $options{ymax};
-    $options{y2min} = '' unless defined $options{y2min};
-    $options{y2max} = '' unless defined $options{y2max};
-    $options{zmin}  = '' unless defined $options{zmin};
-    $options{zmax}  = '' unless defined $options{zmax};
+      if( !defined $options{terminal} &&
+           defined $outputfileType    &&
+           $terminalOpts{$outputfileType} )
+      {
+        $options{terminal} = $terminalOpts{$outputfileType};
+      }
 
-    print PIPE "set xtics\n";
-    if($options{y2})
-    {
-      print PIPE "set ytics nomirror\n";
-      print PIPE "set y2tics\n";
-      # if any of the ranges are given, set the range
-      print PIPE "set y2range [". $options{y2min} . ":" . $options{y2max} ."]\n" if length( $options{y2min} . $options{y2max} );
+      die "Asked to plot to file '$outputfile', but I don't know which terminal to use, and no --terminal given"
+        unless $options{terminal};
     }
+    print PIPE "set terminal $options{terminal}\n" if $options{terminal};
+    print PIPE "set output \"$outputfile\"\n"      if $outputfile;
 
     # set up plotting style
     my $style = '';
@@ -265,94 +514,143 @@ sub mainThread
       $options{curvestyleall} = "with circles $options{curvestyleall}";
     }
 
-    # if any of the ranges are given, set the range
-    print PIPE "set xrange [". $options{xmin} . ":" . $options{xmax} ."]\n" if length( $options{xmin} . $options{xmax} );
-    print PIPE "set yrange [". $options{ymin} . ":" . $options{ymax} ."]\n" if length( $options{ymin} . $options{ymax} );
-    print PIPE "set zrange [". $options{zmin} . ":" . $options{zmax} ."]\n" if length( $options{zmin} . $options{zmax} );
     print PIPE "set style data $style\n" if $style;
     print PIPE "set grid\n";
 
-    print(PIPE "set xlabel  \"" . $options{xlabel } . "\"\n") if defined $options{xlabel};
-    print(PIPE "set ylabel  \"" . $options{ylabel } . "\"\n") if defined $options{ylabel};
-    print(PIPE "set zlabel  \"" . $options{zlabel } . "\"\n") if defined $options{zlabel};
-    print(PIPE "set y2label \"" . $options{y2label} . "\"\n") if defined $options{y2label};
-    print(PIPE "set title   \"" . $options{title  } . "\"\n") if defined $options{title};
+    print(PIPE "set xlabel  \"$options{xlabel }\"\n") if defined $options{xlabel};
+    print(PIPE "set ylabel  \"$options{ylabel }\"\n") if defined $options{ylabel};
+    print(PIPE "set zlabel  \"$options{zlabel }\"\n") if defined $options{zlabel};
+    print(PIPE "set y2label \"$options{y2label}\"\n") if defined $options{y2label};
+    print(PIPE "set title   \"$options{title  }\"\n") if defined $options{title};
 
     if($options{square})
     {
       # set a square aspect ratio. Gnuplot does this differently for 2D and 3D plots
       if(! $options{'3d'})
       {
-        $options{size} = '' unless defined $options{size};
-        $options{size} .= ' ratio -1';
+        print(PIPE "set size ratio -1\n");
       }
       else
       {
         print(PIPE "set view equal xyz\n");
       }
     }
-    print(PIPE "set size $options{size}\n")                     if defined $options{size};
 
     if($options{square_xy})
     {
       print(PIPE "set view equal xy\n");
     }
 
-    if($options{colormap})
-    {
-      print PIPE "set cbrange [". $options{zmin} . ":" . $options{zmax} ."]\n" if length( $options{zmin} . $options{zmax} );
-    }
-
 # For the specified values, set the legend entries to 'title "blah blah"'
-    if($options{legend})
+    if(@{$options{legend}})
     {
-      foreach my $id (keys %{$options{legend}})
+      # @{$options{legend}} is a list where consecutive pairs are (curveID,
+      # legend). I use $options{legend} here instead of $options{legend_hash}
+      # because I create a new curve when I see a new one, and the hash is
+      # unordered, thus messing up the ordering
+      my $n = scalar @{$options{legend}}/2;
+      foreach my $idx (0..$n-1)
       {
-        setCurveLabel($id, $options{legend}{$id});
+        setCurveLabel($options{legend}[$idx*2    ],
+                      $options{legend}[$idx*2 + 1]);
       }
     }
 
 # add the extra curve options
-    if($options{curvestyle})
+    if(@{$options{curvestyle}})
     {
-      foreach my $id (keys %{$options{curvestyle}})
+      # @{$options{curvestyle}} is a list where consecutive pairs are (curveID,
+      # style). I use $options{curvestyle} here instead of
+      # $options{curvestyle_hash} because I create a new curve when I see a new
+      # one, and the hash is unordered, thus messing up the ordering
+      my $n = scalar @{$options{curvestyle}}/2;
+      foreach my $idx (0..$n-1)
       {
-        addCurveOption($id, $options{curvestyle}{$id});
+        addCurveOption($options{curvestyle}[$idx*2    ],
+                       $options{curvestyle}[$idx*2 + 1]);
       }
     }
 
 # For the values requested to be printed on the y2 axis, set that
-    foreach (@{$options{y2}})
+    addCurveOption($_, 'axes x1y2') foreach (@{$options{y2}});
+
+# timefmt
+    if( $options{timefmt} )
     {
-      addCurveOption($_, 'axes x1y2 linewidth 3');
+      print(PIPE "set timefmt '$options{timefmt}'\n");
+      print(PIPE "set xdata time\n");
     }
 
 # add the extra global options
-    if($options{extracmds})
+    print(PIPE "$_\n")       foreach (@{$options{extracmds}});
+    print(PIPE "set $_\n")   foreach (@{$options{set}});
+    print(PIPE "unset $_\n") foreach (@{$options{unset}});
+
+# set up histograms
+    $options{binwidth} ||= 1;   # if no binwidth given, set it to 1
+    print PIPE
+      "set boxwidth $options{binwidth}\n" .
+      "histbin(x) = $options{binwidth} * floor(0.5 + x/$options{binwidth})\n";
+
+    setCurveAsHistogram( $_ ) foreach (@{$options{histogram}});
+
+# set all the axis ranges
+    # If a bound isn't given I want to set it to the empty string, so I can communicate it simply to
+    # gnuplot
+    print PIPE "set xtics\n";
+
+    if(@{$options{y2}})
     {
-      foreach (@{$options{extracmds}})
-      {
-        print(PIPE "$_\n");
-      }
+      print PIPE "set ytics nomirror\n";
+      print PIPE "set y2tics\n";
+      # if any of the ranges are given, set the range
+      sendRangeCommand( "y2range", $options{y2min}, $options{y2max} );
     }
 
-    # regexp for a possibly floating point, possibly scientific notation number
-    my $numRE   = '-?\d*\.?\d+(?:[Ee][-+]?\d+)?';
+    # if any of the ranges are given, set the range
+    sendRangeCommand( "xrange",  $options{xmin}, $options{xmax} );
+    sendRangeCommand( "yrange",  $options{ymin}, $options{ymax} );
+    sendRangeCommand( "zrange",  $options{zmin}, $options{zmax} );
+    sendRangeCommand( "cbrange", $options{zmin}, $options{zmax} ) if($options{colormap});
+
 
-    # a point may be preceded by an id
-    my $pointRE = $options{dataid} ? '(\w+)\s+' : '()';
-    $pointRE .= '(' . join('\s+', ($numRE) x $valuesPerPoint) . ')';
-    $pointRE = qr/$pointRE/;
 
+
+    # latest domain variable present in our data
+    my $latestX;
+
+    # The domain of the current point
     my @domain;
-    my $haveNewData;
+
+    # The x-axis domain represented as a number. This is exactly the same as
+    # $domain[0] unless the x-axis domain uses a timefmt. Then this is the
+    # number of seconds since the UNIX epoch.
+    my $domain0_numeric;
 
     # I should be using the // operator, but I'd like to be compatible with perl 5.8
     while( $_ = (defined $dataQueue ? $dataQueue->dequeue() : <>))
     {
       next if /^#/o;
 
-      if($_ ne 'Plot now')
+      if( $options{stream} )
+      {
+        if(/^clear/o )
+        {
+          clearCurves();
+          next;
+        }
+
+        if(/^replot/o )
+        {
+          # /timertick/ determines if the timer was the source of the replot
+          replot( $domain0_numeric, /timertick/ );
+          next;
+        }
+
+        # /exit/ is handled in the data-reading thread
+      }
+
+      if(! /^replot/o)
       {
         # parse the incoming data lines. The format is
         # x id0 dat0 id1 dat1 ....
@@ -364,14 +662,49 @@ sub mainThread
         # line is used)
         # 3d plots require $options{domain}, and dictate "x y" for the domain instead of just "x"
 
+        my @fields = split;
+
         if($options{domain})
         {
-          /($numRE)/go or next;
-          $domain[0] = $1;
-          if($options{'3d'})
+          if( $options{timefmt} )
           {
-            /($numRE)/go or next;
-            $domain[1] = $1;
+              # no point in doing anything unless I have at least the domain and
+              # 1 piece of data
+              next if @fields < $options{timefmt_Ncols}+1;
+
+              $domain[0] = join (' ', splice( @fields, 0, $options{timefmt_Ncols}) );
+              $domain0_numeric = makeDomainNumeric( $domain[0] );
+          }
+          elsif(!$options{'3d'})
+          {
+              # no point in doing anything unless I have at least the domain and
+              # 1 piece of data
+              next if @fields < 1+1;
+
+              $domain[0] = $domain0_numeric = shift @fields;
+          }
+          else
+          {
+              # no point in doing anything unless I have at least the domain and
+              # 1 piece of data
+              next if @fields < 2+1;
+
+              @domain = splice(@fields, 0, 2);
+          }
+
+          if( $options{monotonic} )
+          {
+            if( defined $latestX && $domain0_numeric < $latestX )
+            {
+              # the x-coordinate of the new point is in the past, so I wipe out
+              # all the data and start anew. Before I wipe the old data, I
+              # replot the old data
+              replot( $domain0_numeric );
+              clearCurves();
+              $latestX = undef;
+            }
+            else
+            { $latestX = $domain0_numeric; }
           }
         }
         else
@@ -380,53 +713,67 @@ sub mainThread
           # $. on the data queue in that case
           if(defined $dataQueue)
           {
-            s/ ([\d]+)$//o;
-            $domain[0] = $1;
+            $domain[0] = pop @fields;
           }
           else
           {
             $domain[0] = $.;
           }
+          $domain0_numeric = makeDomainNumeric( $domain[0] );
         }
 
         my $id = -1;
-        while (/$pointRE/go)
-        {
-          if($1 ne '') {$id = $1;}
-          else         {$id++;   }
-
-          $haveNewData = 1;
-          pushPoint(getCurve($id),
-                    [@domain, split( /\s+/, $2)]);
-        }
-      }
-
-      elsif($options{stream})
-      {
-        # only redraw a streaming plot if there's new data to plot
-        next unless $haveNewData;
-        $haveNewData = undef;
 
-        if( $options{xlen} )
+        while(@fields)
         {
-          pruneOldData($domain[0] - $options{xlen});
-          plotStoredData($domain[0] - $options{xlen}, $domain[0]);
+            if($options{dataid})
+            {
+                $id = shift @fields;
+            }
+            else
+            {
+                $id++;
+            }
+
+            # I'd like to use //, but I guess some people are still on perl 5.8
+            my $rangesize = exists $options{rangesize_hash}{$id} ?
+              $options{rangesize_hash}{$id} :
+              $options{rangesize_default};
+
+            last if @fields < $rangesize;
+
+            pushPoint(getCurve($id),
+                      join(' ',
+                           @domain,
+                           splice( @fields, 0, $rangesize ) ) . "\n",
+                      $domain0_numeric);
         }
-        else
-        { plotStoredData(); }
       }
     }
 
+    # if we were streaming, we're now done!
+    if( $options{stream} )
+    {
+      return;
+    }
+
     # finished reading in all. Plot what we have
     plotStoredData();
 
-    if ( $options{hardcopy})
+    if ( defined $options{hardcopy})
     {
       print PIPE "set output\n";
-      # sleep until the plot file exists, and it is closed. Sometimes the output is
-      # still being written at this point
-      usleep(100_000) until -e $outputfile;
-      usleep(100_000) until(system("fuser -s \"$outputfile\""));
+
+      # sleep until the plot file exists, and it is closed. Sometimes the output
+      # is still being written at this point. If the output filename starts with
+      # '|', gnuplot pipes the output to that process, instead of writing to a
+      # file. In that case I don't make sure the file exists, since there IS no
+      # file
+      if( $options{hardcopy} !~ /^\|/ )
+      {
+        usleep(100_000) until -e $outputfile;
+        usleep(100_000) until(system("fuser -s \"$outputfile\""));
+      }
 
       print "Wrote output to $outputfile\n";
       return;
@@ -435,46 +782,53 @@ sub mainThread
     # we persist gnuplot, so we shouldn't need this sleep. However, once
     # gnuplot exits, but the persistent window sticks around, you can no
     # longer interactively zoom the plot. So we still sleep
-    sleep(100000);
+    sleep(100000) unless $options{dump} || $options{exit};
 }
 
 sub pruneOldData
 {
   my ($oldestx) = @_;
 
-  foreach my $xy (@curves)
+  foreach my $curve (@curves)
   {
-    if( @$xy > 1 )
+    next unless $curve->{datastring};
+
+    my $meta = $curve->{datastring_meta};
+
+    my $firstInWindow = first {$meta->[$_]{domain} >= $oldestx} 0..$#$meta;
+    if ( !defined $firstInWindow )
     {
-      if( my $firstInWindow = first {$xy->[$_][0] >= $oldestx} 1..$#$xy )
-      { splice( @$xy, 1, $firstInWindow-1 ); }
-      else
-      { splice( @$xy, 1); }
+      # everything is too old. Clear out all the data
+      $curve->{datastring}        = '';
+      $curve->{datastring_meta}   = [];
+      $curve->{datastring_offset} = 0;
+    }
+    elsif ( $firstInWindow >= 2 )
+    {
+      # clear out everything that's too old, except for one point. This point
+      # will be off the plot, but if we're plotting lines there will be a
+      # connecting line to it. Some of the line will be visible
+      substr( $curve->{datastring}, 0,
+              $meta->[$firstInWindow-1]{offset_start} - $curve->{datastring_offset},
+              '' );
+      $curve->{datastring_offset} = $meta->[$firstInWindow-1]{offset_start};
     }
   }
 }
 
 sub plotStoredData
 {
-  my ($xmin, $xmax) = @_;
-  print PIPE "set xrange [$xmin:$xmax]\n" if defined $xmin;
+  # get the options for those curves that have any data
+  my @nonemptyCurves = grep { $_->{datastring} } @curves;
+  my @extraopts = map {$_->{options}} @nonemptyCurves;
 
-  # get the options for those curves that have any data
-  my @nonemptyCurves = grep {@$_ > 1} @curves;
-  my @extraopts = map {$_->[0]{options}} @nonemptyCurves;
-
-  my $body = join(', ' , map({ '"-"' . $_} @extraopts) );
+  my $body = join(', ' , map({ "'-' $_" } @extraopts) );
   if($options{'3d'}) { print PIPE "splot $body\n"; }
   else               { print PIPE  "plot $body\n"; }
 
-  foreach my $buf (@nonemptyCurves)
+  foreach my $curve (@nonemptyCurves)
   {
-    # send each point to gnuplot. Ignore the first "point" since it's the
-    # curve options
-    for my $elem (@{$buf}[1..$#$buf])
-    {
-      print PIPE "@$elem\n";
-    }
+    print PIPE $curve->{datastring};
     print PIPE "e\n";
   }
 }
@@ -486,19 +840,51 @@ sub updateCurveOptions
   # case. When no title is specified, gnuplot will still add a legend entry with an unhelpful '-'
   # label. Thus I explicitly do 'notitle' for that case
 
-  my ($curveoptions, $id) = @_;
+  my ($curve, $id) = @_;
 
   # use the given title, unless we're generating a legend automatically. Given titles
   # override autolegend
   my $title;
-  if(defined $curveoptions->{title})
-  { $title = $curveoptions->{title}; }
+  if(defined $curve->{title})
+  { $title = $curve->{title}; }
   elsif( $options{autolegend} )
   { $title = $id; }
 
   my $titleoption = defined $title ? "title \"$title\"" : "notitle";
-  my $extraoption = defined $options{curvestyleall} ? $options{curvestyleall} : '';
-  $curveoptions->{options} = "$titleoption $curveoptions->{extraoptions} $extraoption";
+
+  my ($curvestyleall);
+  if( defined $options{curvestyle_hash}{$id} )
+  {
+    # I have a curve-specific style set with --curvestyle. This style lives in
+    # $curve->{extraoptions}, and it overrides the global styles
+    $curvestyleall = '';
+  }
+  else
+  {
+    $curvestyleall = $options{curvestyleall};
+  }
+
+  my $histoptions = $curve->{histoptions} || '';
+
+  my $usingoptions = '';
+  if( $options{timefmt} )
+  {
+      # with --timefmt I need an explicit 'using' specification. I specify the
+      # columns as 1:2:3..... I need the right number of columns (this is given
+      # as 1 + rangesize). I also need to start the range at the first column
+      # past the timefmt
+
+      # I'd like to use //, but I guess some people are still on perl 5.8
+      my $rangesize = exists $options{rangesize_hash}{$id} ?
+        $options{rangesize_hash}{$id} :
+        $options{rangesize_default};
+
+      my @rest = map {$_ + $options{timefmt_Ncols}} (1..$rangesize);
+
+      $usingoptions = "using 1:" . join(':', @rest);
+  }
+
+  $curve->{options} = "$histoptions $usingoptions $titleoption $curve->{extraoptions} $curvestyleall";
 }
 
 sub getCurve
@@ -510,17 +896,20 @@ sub getCurve
   {
     print STDERR "Tried to exceed the --maxcurves setting.\n";
     print STDERR "Invoke with a higher --maxcurves limit if you really want to do this.\n";
-    exit;
+    exit -1;
   }
 
   my ($id) = @_;
 
   if( !exists $curveIndices{$id} )
   {
-    push @curves, [{extraoptions => ' '}]; # push a curve with no data and no options
+    push @curves, {extraoptions      => ' ',
+                   datastring        => '',
+                   datastring_meta   => [],
+                   datastring_offset => 0}; # push a curve with no data and no options
     $curveIndices{$id} =  $#curves;
 
-    updateCurveOptions($curves[$#curves][0], $id);
+    updateCurveOptions($curves[$#curves], $id);
   }
   return $curves[$curveIndices{$id}];
 }
@@ -530,8 +919,8 @@ sub addCurveOption
   my ($id, $str) = @_;
 
   my $curve = getCurve($id);
-  $curve->[0]{extraoptions} .= "$str ";
-  updateCurveOptions($curve->[0], $id);
+  $curve->{extraoptions} .= "$str ";
+  updateCurveOptions($curve, $id);
 }
 
 sub setCurveLabel
@@ -539,37 +928,114 @@ sub setCurveLabel
   my ($id, $str) = @_;
 
   my $curve = getCurve($id);
-  $curve->[0]{title} = $str;
-  updateCurveOptions($curve->[0], $id);
+  $curve->{title} = $str;
+  updateCurveOptions($curve, $id);
 }
 
-# function to add a point to the plot. Assumes that the curve indexed by $idx already exists
-sub pushPoint
+sub setCurveAsHistogram
 {
-  my ($curve, $xy) = @_;
+  my ($id, $str) = @_;
+
+  my $curve = getCurve($id);
+  $curve->{histoptions} = 'using (histbin($2)):(1.0) smooth ' . $options{histstyle};
+
+  updateCurveOptions($curve, $id);
+}
+
+# remove all the curve data
+sub clearCurves
+{
+  foreach my $curve(@curves)
+  {
+    $curve->{datastring}        = '';
+    $curve->{datastring_meta}   = [];
+    $curve->{datastring_offset} = 0;
+  }
+}
+
+sub replot
+{
+  return unless $haveNewData;
+  $haveNewData = undef;
+
+  return if !$options{stream};
 
-  if($options{monotonic})
+
+  # The logic involving domain rollover replotting due to --monotonic is a bit
+  # tricky. I want this:
+
+  # if( domain rolls over slowly )
+  # {
+  #   should update on a timer;
+  #   when the domain rolls over, --monotonic should force a replot
+  # }
+  # if( domain rolls over quickly )
+  # {
+  #   should update when the domain rolls over,
+  #     at most as quickly as the timer indicates
+  # }
+
+
+  my ($domain0_numeric, $replot_is_from_timer) = @_;
+
+  my $now = [gettimeofday];
+
+  if( # If there is no replot timer at all, replot at any indication
+      $options{stream} < 0 ||
+
+      # if the last replot was timer-based, but this one isn't, force a replot.
+      # This makes sure that a replot happens for a domain rollover shortly
+      # after a timer replot
+      !$replot_is_from_timer && $last_replot_is_from_timer ||
+
+      # if enough time has elapsed since the last replot, it's ok to replot
+      tv_interval ( $last_replot_time, $now ) > 0.8*$options{stream} )
   {
-    if( @$curve > 1 && $xy->[0] < $curve->[$#{$curve}][0] )
+    # ok, then. We really need to replot
+    if ( defined $options{xlen} )
     {
-      # the x-coordinate of the new point is in the past, so I wipe out all the data for this curve
-      # and start anew
-      splice( @$curve, 1, @$curve-1 );
+      # we have an --xlen, so we need to clean out the old data
+      pruneOldData( $domain0_numeric - $options{xlen} );
+
+      my ($xmin, $xmax) = ($domain0_numeric - $options{xlen}, $domain0_numeric);
+      if ( defined $options{timefmt} )
+      {
+        # if we're using a timefmt, I need to convert my xmin range from
+        # seconds-since-the-epoch BACK to the timefmt. Sheesh
+        ($xmin, $xmax) = map {Time::Piece->strptime( $_, '%s' )->strftime( $options{timefmt} ) } ($xmin, $xmax);
+      }
+      sendRangeCommand( "xrange", $xmin, $xmax );
     }
+
+    plotStoredData();
+
+
+    # update replot state
+    $last_replot_time          = $now;
+    $last_replot_is_from_timer = $replot_is_from_timer;
   }
+}
+
+# function to add a point to the plot. Assumes that the curve indexed by $idx already exists
+sub pushPoint
+{
+  my ($curve, $datastring, $domain0_numeric) = @_;
+
+  push @{$curve->{datastring_meta}}, { offset_start => length( $curve->{datastring} ) + $curve->{datastring_offset},
+                                       domain       => $domain0_numeric };
+  $curve->{datastring} .= $datastring;
 
-  push @$curve, $xy;
+  $haveNewData = 1;
 }
 
-__END__
 
 =head1 NAME
 
-feedGnuplot - A pipe-oriented frontend to Gnuplot
+feedgnuplot - General purpose pipe-oriented plotting tool
 
 =head1 SYNOPSIS
 
-Simple plotting of stored data:
+Simple plotting of piped data:
 
  $ seq 5 | awk '{print 2*$1, $1*$1}'
  2 1
@@ -579,14 +1045,55 @@ Simple plotting of stored data:
  10 25
 
  $ seq 5 | awk '{print 2*$1, $1*$1}' |
-   feedGnuplot --lines --points --legend 0="data 0" --title "Test plot" --y2 1
+   feedgnuplot --lines --points --legend 0 "data 0" --title "Test plot" --y2 1
+               --terminal 'dumb 80,40' --exit
+
+                                  Test plot
+
+  10 ++------+--------+-------+-------+-------+--------+-------+------*A 25
+     +       +        +       +       +       +        +       +    **#+
+     |       :        :       :       :       :        : data 0+**A*** |
+     |       :        :       :       :       :        :       :** #   |
+   9 ++.......................................................**.##....|
+     |       :        :       :       :       :        :    ** :#      |
+     |       :        :       :       :       :        :  **   #       |
+     |       :        :       :       :       :        :**   ##:      ++ 20
+   8 ++................................................A....#..........|
+     |       :        :       :       :       :      **:   #   :       |
+     |       :        :       :       :       :    **  : ##    :       |
+     |       :        :       :       :       :  **    :#      :       |
+     |       :        :       :       :       :**      B       :       |
+   7 ++......................................**......##................|
+     |       :        :       :       :    ** :    ##  :       :      ++ 15
+     |       :        :       :       :  **   :   #    :       :       |
+     |       :        :       :       :**     : ##     :       :       |
+   6 ++..............................*A.......##.......................|
+     |       :        :       :    ** :     ##:        :       :       |
+     |       :        :       :  **   :    #  :        :       :       |
+     |       :        :       :**     :  ##   :        :       :      ++ 10
+   5 ++......................**........##..............................|
+     |       :        :    ** :      #B       :        :       :       |
+     |       :        :  **   :    ## :       :        :       :       |
+     |       :        :**     :  ##   :       :        :       :       |
+   4 ++...............A.......###......................................|
+     |       :      **:     ##:       :       :        :       :       |
+     |       :    **  :   ##  :       :       :        :       :      ++ 5
+     |       :  **    : ##    :       :       :        :       :       |
+     |       :**    ##B#      :       :       :        :       :       |
+   3 ++.....**..####...................................................|
+     |    **####      :       :       :       :        :       :       |
+     |  **## :        :       :       :       :        :       :       |
+     B**     +        +       +       +       +        +       +       +
+   2 A+------+--------+-------+-------+-------+--------+-------+------++ 0
+     1      1.5       2      2.5      3      3.5       4      4.5      5
+
 
 Simple real-time plotting example: plot how much data is received on the wlan0
 network interface in bytes/second (uses bash, awk and Linux):
 
  $ while true; do sleep 1; cat /proc/net/dev; done |
-   awk '/wlan0/ {if(b) {print $2-b; fflush()} b=$2}' |
-   feedGnuplot --lines --stream --xlen 10 --ylabel 'Bytes/sec' --xlabel seconds
+   gawk '/wlan0/ {if(b) {print $2-b; fflush()} b=$2}' |
+   feedgnuplot --lines --stream --xlen 10 --ylabel 'Bytes/sec' --xlabel seconds
 
 =head1 DESCRIPTION
 
@@ -595,23 +1102,31 @@ plots from data coming in on STDIN or given in a filename passed on the
 commandline. Various data representations are supported, as is hardcopy
 output and streaming display of live data. A simple example:
 
- $ seq 5 | awk '{print 2*$1, $1*$1}' | feedGnuplot
+ $ seq 5 | awk '{print 2*$1, $1*$1}' | feedgnuplot
 
 You should see a plot with two curves. The C<awk> command generates some data to
-plot and the C<feedGnuplot> reads it in from STDIN and generates the plot. The
+plot and the C<feedgnuplot> reads it in from STDIN and generates the plot. The
 C<awk> invocation is just an example; more interesting things would be plotted
 in normal usage. No commandline-options are required for the most basic
 plotting. Input parsing is flexible; every line need not have the same number of
 points. New curves will be created as needed.
 
 The most commonly used functionality of gnuplot is supported directly by the
-script. Anything not directly supported can still be done with the
-C<--extracmds> and C<--curvestyle> options. Arbitrary gnuplot commands can be
-passed in with C<--extracmds>. For example, to turn off the grid, pass in
-C<--extracmds 'unset grid'>. As many of these options as needed can be passed
-in. To add arbitrary curve styles, use C<--curvestyle curveID=extrastyle>. Pass
-these more than once to affect more than one curve. To apply an extra style to
-I<all> the curves, pass in C<--curvestyleall extrastyle>.
+script. Anything not directly supported can still be done with options such as
+C<--set>, C<--extracmds>, C<--style>, etc. Arbitrary gnuplot commands can be
+passed in with C<--extracmds>. For example, to turn off the grid, you can pass
+in C<--extracmds 'unset grid'>. Commands C<--set> and C<--unset> exist to
+provide nicer syntax, so this is equivalent to passing C<--unset grid>. As many
+of these options as needed can be passed in. To add arbitrary curve styles, use
+C<--style curveID extrastyle>. Pass these more than once to affect more than one
+curve.
+
+To apply an extra style to I<all> the curves that lack an explicit C<--style>,
+pass in C<--styleall extrastyle>. In the most common case, the extra style is
+C<with something>. To support this more simply, you can pass in C<--with
+something> instead of C<--styleall 'with something'>. C<--styleall> and
+C<--with> are mutually exclusive. Furthermore any curve-specific C<--style>
+overrides the global C<--styleall> or C<--with> setting.
 
 =head2 Data formats
 
@@ -627,9 +1142,9 @@ interpreted as the I<X>-value for the rest of the data on that line. Without
 C<--domain> the I<X>-value is the line number, and the first value on a line is
 a plain data point like the others. Default is C<--nodomain>. Thus the original
 example above produces 2 curves, with B<1,2,3,4,5> as the I<X>-values. If we run
-the same command with --domain:
+the same command with C<--domain>:
 
- $ seq 5 | awk '{print 2*$1, $1*$1}' | feedGnuplot --domain
+ $ seq 5 | awk '{print 2*$1, $1*$1}' | feedgnuplot --domain
 
 we get only 1 curve, with B<2,4,6,8,10> as the I<X>-values. As many points as
 desired can appear on a single line, but all points on a line are associated
@@ -642,7 +1157,7 @@ data is to be plotted. With the C<--dataid> option, each point is represented by
 2 values: a string identifying the curve, and the value itself. If we add
 C<--dataid> to the original example:
 
- $ seq 5 | awk '{print 2*$1, $1*$1}' | feedGnuplot --dataid --autolegend
+ $ seq 5 | awk '{print 2*$1, $1*$1}' | feedgnuplot --dataid --autolegend
 
 we get 5 different curves with one point in each. The first column, as produced
 by C<awk>, is B<2,4,6,8,10>. These are interpreted as the IDs of the curves to
@@ -654,18 +1169,24 @@ conjunction with C<--dataid>.
 =head3 Multi-value style support
 
 Depending on how gnuplot is plotting the data, more than one value may be needed
-to represent a single point. For example, the script has support to plot all the
-data with C<--circles>. This requires a radius to be specified for each point in
-addition to the position of the point. Thus, when plotting with C<--circles>, 2
-numbers are read for each data point instead of 1. A similar situation exists
-with C<--colormap> where each point contains the position I<and> the
-color. There are other gnuplot styles that require more data (such as error
-bars), but none of these are directly supported by the script. They can still be
-used, though, by specifying the specific style with C<--curvestyle>, and
-specifying how many extra values are needed for each point with
-C<--extraValuesPerPoint extra>. C<--extraValuesPerPoint> is ONLY needed for the
-styles not explicitly supported; supported styles set that variable
-automatically.
+to represent the range of a single point. Basic 2D plots have 2 numbers
+representing each point: 1 domain and 1 range. But if plotting with
+C<--circles>, for instance, then there's an extra range value: the radius. A
+similar situation exists with C<--colormap> where each point contains the
+position I<and> the color. There are other gnuplot styles that require more data
+(such as error bars), but none of these are directly supported by the script.
+They can still be used, however, by specifying the specific style with
+C<--style>, and specifying how many values are needed for each point with
+C<--rangesizeall> or C<--rangesize> or C<--extraValuesPerPoint>. Those options
+that specify the range size are required I<only> for styles not explicitly
+supported by feedgnuplot; supported styles do the right thing automatically.
+
+More examples: if making a 2d plot of y error bars where gnuplot expects a
+(x,y,ydelta) tuple for each point, you want C<--rangesizeall 2> because you have
+one domain value (x) and 2 range values (y,ydelta). Gnuplot can also plot
+lopsided y errorbars by giving a tuple (x,y,ylow,yhigh). This is similar to
+before, but you want C<--rangesizeall 3> instead.
+
 
 =head3 3D data
 
@@ -676,21 +1197,96 @@ instead of I<Y> as a function of I<X>). Thus the first 2 values on each line are
 interpreted as the domain instead of just 1. The rest of the processing happens
 the same way as before.
 
+=head3 Time/date data
+
+If the input data domain is a time/date, this can be interpreted with
+C<--timefmt>. This option takes a single argument: the format to use to parse
+the data. The format is documented in 'set timefmt' in gnuplot, although the
+common flags that C<strftime> understands are generally supported. The backslash
+sequences in the format are I<not> supported, so if you want a tab, put in a tab
+instead of \t. Whitespace in the format I<is> supported. When this flag is
+given, some other options act a little bit differently:
+
+=over
+
+=item
+
+C<--xlen> is an I<integer> in seconds
+
+=item
+
+C<--xmin> and C<--xmax> I<must> use the format passed in to C<--timefmt>
+
+=back
+
+Using this option changes both the way the input is parsed I<and> the way the
+x-axis tics are labelled. Gnuplot tries to be intelligent in this labelling, but
+it doesn't always do what the user wants. The labelling can be controlled with
+the gnuplot C<set format> command, which takes the same type of format string as
+C<--timefmt>. Example:
+
+ $ sar 1 -1 |
+   awk '$1 ~ /..:..:../ && $8 ~/^[0-9\.]*$/ {print $1,$8; fflush()}' |
+   feedgnuplot --stream --domain
+                --lines --timefmt '%H:%M:%S'
+                --set 'format x "%H:%M:%S"'
+
+This plots the 'idle' CPU consumption against time.
+
+Note that while gnuplot supports the time/date on any axis, I<feedgnuplot>
+currently supports it I<only> as the x-axis domain. This may change in the
+future.
+
 =head2 Real-time streaming data
 
-To plot real-time data, pass in the C<--stream> option. Data will then be
-plotted as it is received, with the refresh rate limited to 1Hz (currently
-hard-coded). To plot only the most recent data (instead of I<all> the data),
-C<--xlen windowsize> can be given. This will create an constantly-updating,
-scrolling view of the recent past. C<windowsize> should be replaced by the
-desired length of the domain window to plot, in domain units (passed-in values
-if C<--domain> or line numbers otherwise).
+To plot real-time data, pass in the C<--stream [refreshperiod]> option. Data
+will then be plotted as it is received. The plot will be updated every
+C<refreshperiod> seconds. If the period isn't specified, a 1Hz refresh rate is
+used. To refresh at specific intervals indicated by the data, set the
+refreshperiod to 0 or to 'trigger'. The plot will then I<only> be refreshed when
+a data line 'replot' is received. This 'replot' command works in both triggered
+and timed modes, but in triggered mode, it's the only way to replot. Look in
+L</"Special data commands"> for more information.
+
+To plot only the most recent data (instead of I<all> the data), C<--xlen
+windowsize> can be given. This will create a constantly-updating, scrolling
+view of the recent past. C<windowsize> should be replaced by the desired length
+of the domain window to plot, in domain units (passed-in values if C<--domain>
+or line numbers otherwise). If the domain is a time/date via C<--timefmt>, then
+C<windowsize> is an I<integer> in seconds.
+
+=head3 Special data commands
+
+If we are reading streaming data, the input stream can contain special commands
+in addition to the raw data. Feedgnuplot looks for these at the start of every
+input line. If a command is detected, the rest of the line is discarded. These
+commands are
+
+=over
+
+=item C<replot>
+
+This command refreshes the plot right now, instead of waiting for the next
+refresh time indicated by the timer. This command works in addition to the timed
+refresh, as indicated by C<--stream [refreshperiod]>.
+
+=item C<clear>
+
+This command clears out the current data in the plot. The plotting process
+continues, however, to any data following the C<clear>.
+
+=item C<exit>
+
+This command causes feedgnuplot to exit.
+
+=back
 
 =head2 Hardcopy output
 
 The script is able to produce hardcopy output with C<--hardcopy outputfile>. The
-output type is inferred from the filename with B<.ps>, B<.eps>, B<.pdf> and
-B<.png> currently supported.
+output type can be inferred from the filename, if B<.ps>, B<.eps>, B<.pdf>,
+B<.svg> or B<.png> is requested. If any other file type is requested,
+C<--terminal> I<must> be passed in to tell gnuplot how to make the plot.
 
 =head2 Self-plotting data files
 
@@ -702,7 +1298,7 @@ doing this: with a shebang (#!) or with inline perl data.
 A self-plotting, executable data file C<data> is formatted as
 
  $ cat data
- #!/usr/bin/feedGnuplot --lines --points
+ #!/usr/bin/feedgnuplot --lines --points
  2 1
  4 4
  6 9
@@ -724,10 +1320,10 @@ data file can be plotted simply with
 
  $ ./data
 
-The caveats here are that on Linux the whole #! line is limited to 127 charaters
-and that the full path to feedGnuplot must be given. The 127 character limit is
-a serious limitation, but this can likely be resolved with a kernel patch. I
-have only tried on Linux 2.6.
+The caveats here are that on Linux the whole #! line is limited to 127
+characters and that the full path to feedgnuplot must be given. The 127
+character limit is a serious limitation, but this can likely be resolved with a
+kernel patch. I have only tried on Linux 2.6.
 
 =head3 Self-plotting data with perl inline data
 
@@ -739,7 +1335,7 @@ create self-plotting files:
  use strict;
  use warnings;
 
- open PLOT, "| feedGnuplot --lines --points" or die "Couldn't open plotting pipe";
+ open PLOT, "| feedgnuplot --lines --points" or die "Couldn't open plotting pipe";
  while( <DATA> )
  {
    my @xy = split;
@@ -763,127 +1359,407 @@ create self-plotting files:
  30 225
 
 This is especially useful if the logged data is not in a format directly
-supported by feedGnuplot. Raw data can be stored after the __DATA__ directive,
+supported by feedgnuplot. Raw data can be stored after the __DATA__ directive,
 with a small perl script to manipulate the data into a useable format and send
 it to the plotter.
 
 =head1 ARGUMENTS
 
-  --[no]domain         If enabled, the first element of each line is the
-                       domain variable.  If not, the point index is used
+=over
+
+=item
+
+C<--[no]domain>
+
+If enabled, the first element of each line is the domain variable. If not, the
+point index is used
+
+=item
+
+C<--[no]dataid>
+
+If enabled, each data point is preceded by the ID of the data set that point
+corresponds to. This ID is interpreted as a string, NOT as just a number. If not
+enabled, the order of the point is used.
+
+As an example, if line 3 of the input is "0 9 1 20" then
+
+=over
+
+=item
+
+C<--nodomain --nodataid> would parse the 4 numbers as points in 4 different
+curves at x=3
+
+=item
+
+C<--domain --nodataid> would parse the 4 numbers as points in 3 different
+curves at x=0. Here, 0 is the x-variable and 9,1,20 are the data values
+
+=item
+
+C<--nodomain --dataid> would parse the 4 numbers as points in 2 different
+curves at x=3. Here 0 and 1 are the data IDs and 9 and 20 are the
+data values
+
+=item
+
+C<--domain --dataid> would parse the 4 numbers as a single point at
+x=0. Here 9 is the data ID and 1 is the data value. 20 is an extra
+value, so it is ignored. If another value followed 20, we'd get another
+point in curve ID 20
+
+=back
+
+=item
+
+C<--[no]3d>
+
+Do [not] plot in 3D. This only makes sense with C<--domain>. Each domain here is
+an (x,y) tuple
+
+=item
+
+C<--timefmt [format]>
+
+Interpret the X data as a time/date, parsed with the given format
+
+=item
+
+C<--colormap>
+
+Show a colormapped xy plot. Requires extra data for the color. zmin/zmax can be
+used to set the extents of the colors. Automatically sets the C<--rangesize>.
+
+=item
+
+C<--stream [period]>
+
+Plot the data as it comes in, in realtime. If period is given, replot every
+period seconds. If no period is given, replot at 1Hz. If the period is given as
+0 or 'trigger', replot I<only> when the incoming data dictates this. See the
+L</"Real-time streaming data"> section of the man page.
+
+=item
+
+C<--[no]lines>
+
+Do [not] draw lines to connect consecutive points
+
+=item
+
+C<--[no]points>
 
-  --[no]dataid         If enabled, each data point is preceded by the ID
-                       of the data set that point corresponds to. This ID is
-                       interpreted as a string, NOT as just a number. If not
-                       enabled, the order of the point is used.
+Do [not] draw points
 
-As an example, if line 3 of the input is "0 9 1 20"
- '--nodomain --nodataid' would parse the 4 numbers as points in 4
-   different curves at x=3
-
- '--domain --nodataid' would parse the 4 numbers as points in 3 different
-   curves at x=0. Here, 0 is the x-variable and 9,1,20 are the data values
-
- '--nodomain --dataid' would parse the 4 numbers as points in 2 different
-   curves at x=3. Here 0 and 1 are the data IDs and 9 and 20 are the
-   data values
+=item
 
- '--domain --dataid' would parse the 4 numbers as a single point at
-   x=0. Here 9 is the data ID and 1 is the data value. 20 is an extra
-   value, so it is ignored. If another value followed 20, we'd get another
-   point in curve ID 20
+C<--circles>
 
-  --[no]3d             Do [not] plot in 3D. This only makes sense with --domain.
-                       Each domain here is an (x,y) tuple
+Plot with circles. This requires a radius be specified for each point.
+Automatically sets the C<--rangesize>. I<Not> supported for 3d plots.
 
-  --colormap           Show a colormapped xy plot. Requires extra data for the color.
-                       zmin/zmax can be used to set the extents of the colors.
-                       Automatically increments extraValuesPerPoint
+=item
 
-  --[no]stream         Do [not] display the data a point at a time, as it
-                       comes in
+C<--title xxx>
 
-  --[no]lines          Do [not] draw lines to connect consecutive points
-  --[no]points         Do [not] draw points
-  --circles            Plot with circles. This requires a radius be specified for
-                       each point. Automatically increments extraValuesPerPoint
+Set the title of the plot
 
-  --xlabel xxx         Set x-axis label
-  --ylabel xxx         Set y-axis label
-  --y2label xxx        Set y2-axis label. Does not apply to 3d plots
-  --zlabel xxx         Set y-axis label. Only applies to 3d plots
+=item
 
-  --title  xxx         Set the title of the plot
+C<--legend curveID legend>
 
-  --legend curveID=legend
-                       Set the label for a curve plot. Use this option multiple times
-                       for multiple curves. With --dataid, curveID is the ID. Otherwise,
-                       it's the index of the curve, starting at 0
+Set the label for a curve plot. Use this option multiple times for multiple
+curves. With C<--dataid>, curveID is the ID. Otherwise, it's the index of the
+curve, starting at 0
 
-  --autolegend         Use the curve IDs for the legend. Titles given with --legend
-                       override these
+=item
 
-  --xlen xxx           When using --stream, sets the size of the x-window to plot.
-                       Omit this or set it to 0 to plot ALL the data. Does not
-                       make sense with 3d plots. Implies --monotonic
+C<--autolegend>
 
-  --xmin  xxx          Set the range for the x axis. These are ignored in a
-                       streaming plot
-  --xmax  xxx          Set the range for the x axis. These are ignored in a
-                       streaming plot
-  --ymin  xxx          Set the range for the y axis.
-  --ymax  xxx          Set the range for the y axis.
-  --y2min xxx          Set the range for the y2 axis. Does not apply to 3d plots.
-  --y2max xxx          Set the range for the y2 axis. Does not apply to 3d plots.
-  --zmin  xxx          Set the range for the z axis. Only applies to 3d plots or colormaps.
-  --zmax  xxx          Set the range for the z axis. Only applies to 3d plots or colormaps.
+Use the curve IDs for the legend. Titles given with C<--legend> override these
 
-  --y2    xxx          Plot the data specified by this curve ID on the y2 axis.
-                       Without --dataid, the ID is just an ordered 0-based index.
-                       Does not apply to 3d plots.
+=item
 
-  --curvestyle curveID=style
-                       Additional styles per curve. With --dataid, curveID is the
-                       ID. Otherwise, it's the index of the curve, starting at 0. Use
-                       this option multiple times for multiple curves
+C<--xlen xxx>
 
-  --curvestyleall xxx  Additional styles for ALL curves.
+When using C<--stream>, sets the size of the x-window to plot. Omit this or set
+it to 0 to plot ALL the data. Does not make sense with 3d plots. Implies
+C<--monotonic>
 
-  --extracmds xxx      Additional commands. These could contain extra global styles
-                       for instance
+=item
 
-  --size  xxx          Gnuplot size option
+C<--xmin/xmax/ymin/ymax/y2min/y2max/zmin/zmax xxx>
 
-  --square             Plot data with aspect ratio 1. For 3D plots, this controls the
-                       aspect ratio for all 3 axes
+Set the range for the given axis. These x-axis bounds are ignored in a streaming
+plot. The y2-axis bounds do not apply in 3d plots. The z-axis bounds apply
+I<only> to 3d plots or colormaps.
 
-  --square_xy          For 3D plots, set square aspect ratio for ONLY the x,y axes
+=item
 
-  --hardcopy xxx       If not streaming, output to a file specified here. Format
-                       inferred from filename
+C<--xlabel/ylabel/y2label/zlabel xxx>
 
-  --maxcurves xxx      The maximum allowed number of curves. This is 100 by default,
-                       but can be reset with this option. This exists purely to
-                       prevent perl from allocating all of the system's memory when
-                       reading bogus data
+Label the given axis. The y2-axis label does not apply to 3d plots while the
+z-axis label applies I<only> to 3d plots.
 
-  --monotonic          If --domain is given, checks to make sure that the x-
-                       coordinate in the input data is monotonically increasing.
-                       If a given x-variable is in the past, all data currently
-                       cached for this curve is purged. Without --monotonic, all
-                       data is kept. Does not make sense with 3d plots.
-                       No --monotonic by default.
-
-  --extraValuesPerPoint xxx
-                       How many extra values are given for each data point. Normally this
-                       is 0, and does not need to be specified, but sometimes we want
-                       extra data, like for colors or point sizes or error bars, etc.
-                       feedGnuplot options that require this (colormap, circles)
-                       automatically set it. This option is ONLY needed if unknown styles are
-                       used, with --curvestyleall for instance
-
-  --dump               Instead of printing to gnuplot, print to STDOUT. For
-                       debugging.
+=item
+
+C<--y2 xxx>
+
+Plot the data specified by this curve ID on the y2 axis. Without C<--dataid>,
+the ID is just an ordered 0-based index. Does not apply to 3d plots. Can be
+passed multiple times, or passed a comma-separated list. By default the y2-axis
+curves look the same as the y-axis ones. I.e. the viewer of the resulting plot
+has to be told which is which via an axes label, legend, etc. Prior to version
+1.25 of feedgnuplot the curves plotted on the y2 axis were drawn with a thicker
+line. This is no longer the case, but that behavior can be brought back by
+passing something like
+
+ --y2 curveid --style curveid 'linewidth 3'
+
+=item
+
+C<--histogram curveID>
+
+
+Set up this specific curve to plot a histogram. The bin width is given with
+the C<--binwidth> option (assumed 1.0 if omitted). C<--histogram> does I<not>
+touch the drawing style. It is often desired to plot these with boxes, and this
+I<must> be explicitly requested by C<--with boxes>. This works with C<--domain>
+and/or C<--stream>, but in those cases the x-value is used I<only> to cull old
+data because of C<--xlen> or C<--monotonic>. I.e. the x-values are I<not> drawn
+in any way. Can be passed multiple times, or passed a comma-separated list
+
+=item
+
+C<--binwidth width>
+
+The width of bins when making histograms. This setting applies to ALL histograms
+in the plot. Defaults to 1.0 if not given.
+
+=item
+
+C<--histstyle style>
+
+Normally, histograms are generated with the 'smooth freq' gnuplot style.
+C<--histstyle> can be used to select different 'smooth' settings. Allowed are
+'unique', 'cumulative' and 'cnormal'. 'unique' indicates whether a bin has at
+least one item in it: instead of counting the items, it'll always report 0 or 1.
+'cumulative' is the integral of the "normal" histogram. 'cnormal' is like
+'cumulative', but rescaled to end up at 1.0.
+
+=item
+
+C<--style curveID style>
+
+Additional styles per curve. With C<--dataid>, curveID is the ID. Otherwise,
+it's the index of the curve, starting at 0. Use this option multiple times for
+multiple curves. C<--styleall> does I<not> apply to curves that have a
+C<--style>
+
+=item
+
+C<--curvestyle curveID>
+
+Synonym for C<--style>
+
+=item
+
+C<--styleall xxx>
+
+Additional styles for all curves that have no C<--style>. This is overridden by
+any applicable C<--style>. Exclusive with C<--with>.
+
+=item
+
+C<--curvestyleall xxx>
+
+Synonym for C<--styleall>
+
+=item
+
+C<--with xxx>
+
+Same as C<--styleall>, but prefixed with "with". Thus
+
+ --with boxes
+
+is equivalent to
+
+ --styleall 'with boxes'
+
+Exclusive with C<--styleall>.
+
+=item
+
+C<--extracmds xxx>
+
+Additional commands to pass on to gnuplot verbatim. These could contain extra
+global styles for instance. Can be passed multiple times.
+
+=item
+
+C<--set xxx>
+
+Additional 'set' commands to pass on to gnuplot verbatim. C<--set 'a b c'> will
+result in gnuplot seeing a C<set a b c> command. Can be passed multiple times.
+
+=item
+
+C<--unset xxx>
+
+Additional 'unset' commands to pass on to gnuplot verbatim. C<--unset 'a b c'>
+will result in gnuplot seeing a C<unset a b c> command. Can be passed multiple
+times.
+
+=item
+
+C<--square>
+
+Plot data with aspect ratio 1. For 3D plots, this controls the aspect ratio for
+all 3 axes
+
+=item
+
+C<--square_xy>
+
+For 3D plots, set square aspect ratio for ONLY the x,y axes
+
+=item
+
+C<--hardcopy xxx>
+
+If not streaming, output to a file specified here. Format inferred from
+filename, unless specified by C<--terminal>
+
+=item
+
+C<--terminal xxx>
+
+String passed to 'set terminal'. No attempts are made to validate this.
+C<--hardcopy> sets this to some sensible defaults if --hardcopy is given .png,
+.pdf, .ps, .eps or .svg. If any other file type is desired, use both
+C<--hardcopy> and C<--terminal>
+
+=item
+
+C<--maxcurves xxx>
+
+The maximum allowed number of curves. This is 100 by default, but can be reset
+with this option. This exists purely to prevent perl from allocating all of the
+system's memory when reading bogus data
+
+=item
+
+C<--monotonic>
+
+If C<--domain> is given, checks to make sure that the x-coordinate in the input
+data is monotonically increasing. If a given x-variable is in the past, all data
+currently cached for this curve is purged. Without C<--monotonic>, all data is
+kept. Does not make sense with 3d plots. No C<--monotonic> by default. The data is
+replotted before being purged
+
+=item
+
+C<--rangesize curveID xxx>
+
+The options C<--rangesizeall>, C<--rangesize> and C<--extraValuesPerPoint> set
+the number of values needed to represent each point being plotted (see
+L</"Multi-value style support"> above). These options are I<only> needed if
+unknown styles are used, with C<--styleall> or C<--with> for instance.
+
+C<--rangesize> is used to set how many values are needed to represent the range
+of a point for a particular curve. This overrides any defaults that may exist
+for this curve only.
+
+=item
+
+C<--rangesizeall xxx>
+
+Like C<--rangesize>, but applies to I<all> the curves.
+
+C<--extraValuesPerPoint xxx>
+
+Like C<--rangesizeall>, but instead of overriding the default, adds to it. For
+example, if plotting non-lopsided y errorbars gnuplot wants (x,y,ydelta) tuples.
+These can be specified both with C<--rangesizeall 2> (because there are 2 range
+values) or C<--extraValuesPerPoint 1> (because there's 1 more value than usual).
+
+This option is I<only> needed if unknown styles are used, with C<--styleall> or
+C<--with> for instance.
+
+=item
+
+C<--dump>
+
+Instead of printing to gnuplot, print to STDOUT. Very useful for debugging. It
+is possible to send the output produced this way to gnuplot directly.
+
+=item
+
+C<--exit>
+
+Terminate the feedgnuplot process after passing data to gnuplot. The window will
+persist but will not be interactive. Without this option feedgnuplot keeps
+running and must be killed by the user. Note that this option works only with
+later versions of gnuplot and only with some gnuplot terminals.
+
+=item
+
+C<--geometry>
+
+If using X11, specifies the size, position of the plot window
+
+=item
+
+C<--version>
+
+Print the version and exit
+
+=back
+
+=head1 RECIPES
+
+=head2 Basic plotting of piped data
+
+ $ seq 5 | awk '{print 2*$1, $1*$1}'
+ 2 1
+ 4 4
+ 6 9
+ 8 16
+ 10 25
+
+ $ seq 5 | awk '{print 2*$1, $1*$1}' |
+   feedgnuplot --lines --points --legend 0 "data 0" --title "Test plot" --y2 1
+
+=head2 Realtime plot of network throughput
+
+Looks at wlan0 on Linux.
+
+ $ while true; do sleep 1; cat /proc/net/dev; done |
+   gawk '/wlan0/ {if(b) {print $2-b; fflush()} b=$2}' |
+   feedgnuplot --lines --stream --xlen 10 --ylabel 'Bytes/sec' --xlabel seconds
+
+=head2 Realtime plot of battery charge in respect to time
+
+Uses the result of the C<acpi> command.
+
+ $ while true; do acpi; sleep 15; done |
+   perl -nE 'BEGIN{ $| = 1; } /([0-9]*)%/; say join(" ", time(), $1);' |
+   feedgnuplot --stream --ymin 0 --ymax 100 --lines --domain --xlabel 'Time' --timefmt '%s' --ylabel "Battery charge (%)"
+
+=head2 Realtime plot of temperatures in an IBM Thinkpad
+
+Uses C</proc/acpi/ibm/thermal>, which reports temperatures at various locations
+in a Thinkpad.
+
+ $ while true; do cat /proc/acpi/ibm/thermal | awk '{$1=""; print}' ; sleep 1; done |
+   feedgnuplot --stream --xlen 100 --lines --autolegend --ymax 100 --ymin 20 --ylabel 'Temperature (deg C)'
+
+=head2 Plotting a histogram of file sizes in a directory
+
+ $ ls -l | awk '{print $5/1e6}' |
+   feedgnuplot --histogram 0 --with boxes --ymin 0 --xlabel 'File size (MB)' --ylabel Frequency
 
 =head1 ACKNOWLEDGEMENT
 
@@ -897,11 +1773,11 @@ L<https://github.com/dkogan/feedgnuplot>
 
 =head1 AUTHOR
 
-Dima Kogan, C<< <dkogan at cds.caltech.edu> >>
+Dima Kogan, C<< <dima at secretsauce.net> >>
 
 =head1 LICENSE AND COPYRIGHT
 
-Copyright 2011 Dima Kogan.
+Copyright 2011-2012 Dima Kogan.
 
 This program is free software; you can redistribute it and/or modify it
 under the terms of either: the GNU General Public License as published
@@ -910,3 +1786,4 @@ by the Free Software Foundation; or the Artistic License.
 See http://dev.perl.org/licenses/ for more information.
 
 =cut
+
diff --git a/perl/gas.pm b/perl/gas.pm
deleted file mode 100644
index 106bee3..0000000
--- a/perl/gas.pm
+++ /dev/null
@@ -1,211 +0,0 @@
-#!/usr/bin/perl 
-
-package as;
-use Data::Dumper;
-use isax86;
-use isax86_64;
-
-$AS = { HEADER     => '.intel_syntax noprefix',
-	    FOOTER     => ''};
-
-$LOCAL = {};
-$MODE = 'GLOBAL';
-
-my $CURRENT_SECTION='NONE';
-my $WORDLENGTH;
-my $STACKPTR;
-my $BASEPTR;
-my $REG;
-my $ARG;
-
-sub emit_code
-{
-	my $code = shift;
-	$code =~ s/([GF]PR[0-9]+)/$REG->{$1}/g;
-	$code =~ s/(ARG[0-9]+)/$ARG->{$1}/g;
-	$code =~ s/(LOCAL[0-9]+)/$LOCAL->{$1}/g;
-	print "$code\n";
-}
-
-sub align
-{
-	my $number = shift;
-	print ".align $number\n";
-
-}
-
-sub mode
-{
-	$cmd = shift;
-
-	if ($cmd eq 'START') {
-		$MODE = 'LOCAL';
-	} elsif ($cmd eq 'STOP') {
-		$MODE = 'GLOBAL';
-	}
-}
-
-sub function_entry
-{
-	my $symbolname = shift;
-	my $allocate = shift;
-	my $distance;
-
-	foreach ( (0 .. $allocate) ) {
-		$distance =  $_ * $WORDLENGTH;
-		$LOCAL->{"LOCAL$_"} = "[$BASEPTR-$distance]";
-	}
-
-	if($CURRENT_SECTION ne 'text') {
-		$CURRENT_SECTION = 'text';
-		print ".text\n";
-	}
-
-	print ".globl $symbolname\n";
-	print ".type $symbolname, \@function\n";
-	print "$symbolname :\n";
-
-	if ($main::ISA eq 'x86') {
-		print "push ebp\n";
-		print "mov ebp, esp\n";
-		$distance = $allocate * $WORDLENGTH;
-		print "sub  esp, $distance\n" if ($allocate);
-		print "push ebx\n";
-		print "push esi\n";
-		print "push edi\n";
-	} elsif ($main::ISA eq 'x86-64') {
-		print "push rbp\n";
-		print "mov rbp, rsp\n";
-		$distance = $allocate * $WORDLENGTH;
-		print "sub  rsp, $distance\n" if ($allocate);
-		print "push rbx\n";
-		print "push r12\n";
-		print "push r13\n";
-		print "push r14\n";
-		print "push r15\n";
-	}
-}
-
-sub function_exit
-{
-	my $symbolname = shift;
-
-	$LOCAL = {};
-
-	if ($main::ISA eq 'x86') {
-		print "pop edi\n";
-		print "pop esi\n";
-		print "pop ebx\n";
-		print "mov  esp, ebp\n";
-		print "pop ebp\n";
-	} elsif ($main::ISA eq 'x86-64') {
-		print "pop r15\n";
-		print "pop r14\n";
-		print "pop r13\n";
-		print "pop r12\n";
-		print "pop rbx\n";
-		print "mov  rsp, rbp\n";
-		print "pop rbp\n";
-	}
-	print "ret\n";
-	print ".size $symbolname, .-$symbolname\n";
-	print "\n";
-}
-
-sub define_data
-{
-	my $symbolname = shift;
-	my $type = shift;
-	my $value = shift;
-
-	if($CURRENT_SECTION ne 'data') {
-		$CURRENT_SECTION = 'data';
-		print ".data\n";
-	}
-	print ".align 64\n";
-	print "$symbolname:\n";
-	if ($type eq 'DOUBLE') {
-		print ".double $value, $value, $value, $value, $value, $value, $value, $value\n"
-	} elsif ($type eq 'SINGLE') {
-		print ".single $value, $value, $value, $value, $value, $value, $value, $value\n"
-	} elsif ($type eq 'INT') {
-		print ".int $value, $value\n"
-	}
-}
-
-sub define_offset
-{
-	my $symbolname = shift;
-	my $type = shift;
-	my $value = shift;
-
-	if($CURRENT_SECTION ne 'data') {
-		$CURRENT_SECTION = 'data';
-		print ".data\n";
-	}
-	print ".align 16\n";
-	print "$symbolname:\n";
-  print ".int $value\n";
-}
-
-
-sub loop_entry
-{
-  my $symbolname = shift;
-  my $stopping_criterion = shift;
-  $stopping_criterion = $REG->{$stopping_criterion} if( exists $REG->{$stopping_criterion});
-
-  if ($main::ISA eq 'x86') {
-    print "xor   eax, eax\n";
-  } elsif ($main::ISA eq 'x86-64') {
-    print "xor   rax, rax\n";
-  }
-  print ".align 16\n";
-  if ($MODE eq 'GLOBAL') {
-    print "$symbolname :\n";
-  }else {
-    print "1:\n";
-  }
-
-}
-
-
-sub loop_exit
-{
-  my $symbolname = shift;
-  my $step = shift;
-
-  if ($main::ISA eq 'x86') {
-    print "add eax, $step\n";
-    print "cmp eax, edi\n";
-  } elsif ($main::ISA eq 'x86-64') {
-    print "add rax, $step\n";
-    print "cmp rax, rdi\n";
-  }
-  if ($MODE eq 'GLOBAL') {
-    print "jl $symbolname\n";
-  }else {
-    print "jl 1b\n";
-  }
-  print "\n";
-}
-
-sub isa_init
-{
-  if ($main::ISA eq 'x86') {
-    $WORDLENGTH = $isax86::WORDLENGTH_X86 ;
-    $STACKPTR = $isax86::STACKPTR_X86 ;
-    $BASEPTR = $isax86::BASEPTR_X86 ;
-    $REG = $isax86::REG_X86;
-    $ARG = $isax86::ARG_X86 ;
-  } elsif ($main::ISA eq 'x86-64') {
-    $WORDLENGTH = $isax86_64::WORDLENGTH_X86_64;
-    $STACKPTR = $isax86_64::STACKPTR_X86_64 ;
-    $BASEPTR = $isax86_64::BASEPTR_X86_64 ;
-    $REG = $isax86_64::REG_X86_64;
-    $ARG = $isax86_64::ARG_X86_64 ;
-  }
-}
-
-
-1;
diff --git a/perl/gen_events.pl b/perl/gen_events.pl
index f5736ad..4833ccc 100755
--- a/perl/gen_events.pl
+++ b/perl/gen_events.pl
@@ -5,11 +5,16 @@ use warnings;
 
 my $arch;
 my $key;
+my $optkey = "";
 my $eventId;
+my $eventname;
 my $limit;
 my $umask;
 my $cmask;
 my $cfg;
+my $opts = "";
+my $defoptkey = "";
+my $defopts = "";
 my $num_events=0;
 my @events = ();
 
@@ -33,31 +38,94 @@ while (<INFILE>) {
     if (/^#/) {
         # Skip comment
     }elsif (/(EVENT_[A-Z0-9_]*)[ ]+(0x[A-F0-9]+)[ ]+([A-Z0-9|]+)/) {
+        $eventname = $1;
         $eventId = $2;
         $limit = $3;
+        $opts = "EVENT_OPTION_NONE_MASK";
     } elsif (/UMASK_([A-Z0-9_]*)[ ]*(0x[A-F0-9]+)[ ]*(0x[A-F0-9]+)[ ]*(0x[A-F0-9]+)/) {
         $key   = $1;
         $umask = $2;
         $cfg   = $3;
         $cmask = $4;
+        my $defaultopts = "{";
+        my $nropts = 0;
+        if ($key ne $optkey or $optkey eq "")
+        {
+            $opts = "EVENT_OPTION_NONE_MASK";
+        }
+        if ($key =~ m/$defoptkey[A-Z0-9_]*/)
+        {
+            my @optlist = split(",", $defopts);
+            foreach my $opt (@optlist)
+            {
+                my @tmplist = split("=", $opt);
+                $defaultopts = $defaultopts."{".$tmplist[0].",".$tmplist[1]."},";
+                $nropts++;
+            }
+        }
+        if (length($defaultopts) > 1)
+        {
+            substr($defaultopts,length($defaultopts)-1,1) = '}';
+        }
+        else
+        {
+            $defaultopts = $defaultopts."}";
+        }
         push(@events,{name=>$key,
                 limit=>$limit,
                 eventId=>$eventId,
                 cfg=>$cfg,
                 cmask=>$cmask,
-                mask=>$umask});
+                mask=>$umask,
+                nropts=>$nropts,
+                opts=>$opts,
+                defopts=>$defaultopts});
         $num_events++;
     } elsif (/UMASK_([A-Z0-9_]*)[ ]*(0x[A-F0-9]+)/) {
         $key = $1;
         $umask = $2;
+        my $defaultopts = "{";
+        my $nropts = 0;
+        if ($key ne $optkey or $optkey eq "")
+        {
+            $opts = "EVENT_OPTION_NONE_MASK"
+        }
+        if ($key =~ m/$defoptkey[A-Z0-9_]*/)
+        {
+            my @optlist = split(",", $defopts);
+            foreach my $opt (@optlist)
+            {
+                my @tmplist = split("=", $opt);
+                $defaultopts = $defaultopts."{".$tmplist[0].",".$tmplist[1]."},";
+                $nropts++;
+            }
+        }
+        if (length($defaultopts) > 1)
+        {
+            substr($defaultopts,length($defaultopts)-1,1) = '}';
+        }
+        else
+        {
+            $defaultopts = $defaultopts."}";
+        }
         push(@events,{name=>$key,
                 limit=>$limit,
                 eventId=>$eventId,
                 cfg=>0x00,
                 cmask=>0x00,
-                mask=>$umask});
+                mask=>$umask,
+                nropts=>$nropts,
+                opts=>$opts,
+                defopts=>$defaultopts});
         $num_events++;
     }
+    elsif (/DEFAULT_OPTIONS_([A-Z0-9_]*)[ ]*([xA-Z0-9_=,]*)/) {
+        $defoptkey = $1;
+        $defopts = $2;
+    } elsif (/OPTIONS_([A-Z0-9_]*)[ ]*([A-Z0-9_\|]+)/) {
+        $optkey = $1;
+        $opts = $2;
+    }
 }
 close INFILE;
 
@@ -72,11 +140,8 @@ print OUTFILE "#define NUM_ARCH_EVENTS_$ucArch $num_events\n\n";
 print OUTFILE "static PerfmonEvent  ".$arch."_arch_events[NUM_ARCH_EVENTS_$ucArch] = {\n";
 
 foreach my $event (@events) {
-
     print OUTFILE <<END;
-$delim {\"$event->{name}\",
-   \"$event->{limit}\", 
-   $event->{eventId},$event->{mask},$event->{cfg},$event->{cmask}}
+$delim {\"$event->{name}\", \"$event->{limit}\", $event->{eventId},$event->{mask},$event->{cfg},$event->{cmask},$event->{nropts},$event->{opts},$event->{defopts}}
 END
     $delim = ',';
 }
diff --git a/perl/generateGroups.pl b/perl/generateGroups.pl
deleted file mode 100755
index bbfb9b9..0000000
--- a/perl/generateGroups.pl
+++ /dev/null
@@ -1,142 +0,0 @@
-#!/usr/bin/perl
-
-use strict;
-use warnings;
-use lib './perl';
-use File::Copy;
-use Cwd 'abs_path';
-use Data::Dumper;
-use Template;
-
-my $name;
-my $shortHelp;
-
-my %groupEnum;
-my $GroupRoot = $ARGV[0];
-my $OutputDirectory = $ARGV[1];
-my $TemplateRoot = $ARGV[2];
-my $DEBUG = 0;
-
-my $tpl = Template->new({
-        INCLUDE_PATH => ["$TemplateRoot"]
-        })|| die Template->error(), "\n";
-
-# First open the architecture directories
-opendir (DIR, "./$GroupRoot") or die "Cannot open groups directory: $!\n";
-my $rule;
-my $metric;
-
-while (defined(my $arch = readdir(DIR))) {
-    if ($arch !~ /^\./) {
-        print "SCANNING $arch\n" if ($DEBUG);
-        if (-d "$GroupRoot/$arch") {
-
-            my $Vars;
-            my @groups;
-            opendir (ARCHDIR, "$GroupRoot/$arch") or die "Cannot open current directory: $!\n";
-
-            while (defined(my $group = readdir(ARCHDIR))) {
-
-                next unless ($group !~ /^\./);
-                print "SCANNING GROUP $group\n" if ($DEBUG);
-                my $eventSet;
-                my @metrics;
-                my $isUncore = 0;
-                $Vars->{groups} = [];
-
-                $group =~ /([A-Za-z_0-9]+)\.txt/;
-                $name = $1;
-
-                open FILE, "<$GroupRoot/$arch/$group";
-
-                my $isInSet = 0;
-                my $isInMetrics = 0;
-                my $isInLong = 0;
-                my $msg = '';
-
-                while (<FILE>) {
-                    my $line = $_;
-
-                    if($line =~ /SHORT[ ]+(.+)/) {
-                        $shortHelp = $1;
-                    } elsif ($line =~ /EVENTSET/) {
-                        $isInSet = 1;
-                    } elsif ($line =~ /METRICS/) {
-                        $isInSet = 0;
-                        $isInMetrics = 1;
-                        $eventSet =~ s/,$//;
-                    } elsif ($line =~ /LONG/) {
-                        $isInSet = 0;
-                        $isInMetrics = 0;
-                        $isInLong = 1;
-                    } else {
-                        if ($isInSet) {
-                            if ($line =~ /([A-Z0-9]+)[ ]+([A-Z_0-9]+)/) {
-                                $eventSet .= "$2:$1,";
-                            }
-                        } elsif ($isInMetrics)  {
-                            if ($line =~ /(.+)[ ]+(.+)/) {
-                                $metric = $1;
-                                $rule = $2;
-                                $rule =~ s/(UPMC[0-9]+)/perfmon_getResult(threadId,"$1")/g;
-                                $rule =~ s/([^U]|^)(PMC[0-9]+)/$1perfmon_getResult(threadId,"$2")/g;
-                                $rule =~ s/(FIXC[0-9]+)/perfmon_getResult(threadId,"$1")/g;
-                                $rule =~ s/(WBOX[0-9]+)/perfmon_getResult(threadId,"$1")/g;
-                                $rule =~ s/(BBOX[C0-9]+)/perfmon_getResult(threadId,"$1")/g;
-                                $rule =~ s/(MBOX[CC0-9]+)/perfmon_getResult(threadId,"$1")/g;
-                                $rule =~ s/(SBOX[P0-9]+)/perfmon_getResult(threadId,"$1")/g;
-                                $rule =~ s/(RBOX[C0-9]+)/perfmon_getResult(threadId,"$1")/g;
-                                $rule =~ s/(PWR[0-9]+)/perfmon_getResult(threadId,"$1")/g;
-                                $rule =~ s/(TMP[0-9]+)/perfmon_getResult(threadId,"$1")/g;
-                                $rule =~ s/(MBOXFIX)/perfmon_getResult(threadId,"$1")/g;
-
-                                $metric =~ s/(^\s+|\s+$)//g;
-                                push (@metrics, {label => $metric,
-                                        rule  => $rule});
-                            }
-                        } elsif ($isInLong) {
-                            $msg .= $line;
-                        }
-                    }
-                }
-                close FILE;
-                $msg =~ s/\n/\\n\\\n/g;
-
-                if ($eventSet =~ /WBOX|BBOX|MBOX|SBOX|RBOX/) {
-                    $isUncore = 1;
-                }
-
-                push (@groups, {name => $name,
-                        shortHelp => $shortHelp,
-                        longHelp  => $msg,
-                        isUncore  => $isUncore,
-                        eventSet  => $eventSet,
-                        numRows   => $#metrics+1,
-                        metrics   => \@metrics});
-
-                if (not exists($groupEnum{$name})) {
-                    $groupEnum{$name} = 1;
-                }
-
-            }
-
-            $Vars->{arch} = $arch;
-            my @groupsSorted = sort {$a->{name} cmp $b->{name}} @groups;
-            $Vars->{groups} = \@groupsSorted;
-            $Vars->{numGroups} = $#groupsSorted+1;
-
-
-            $tpl->process('group.tt', $Vars, "$OutputDirectory/perfmon_$arch"."_groups.h")|| die $tpl->error(), "\n";
-#            print Dumper($Vars);
-            closedir ARCHDIR;
-        }
-    }
-}
-closedir DIR;
-
-my $Vars;
-$Vars->{groups} = \%groupEnum;
-$tpl->process('group_types.tt', $Vars, "$OutputDirectory/perfmon_group_types.h")|| die $tpl->error(), "\n";
-
-
-
diff --git a/perl/generatePas.pl b/perl/generatePas.pl
deleted file mode 100755
index 9c1dcd1..0000000
--- a/perl/generatePas.pl
+++ /dev/null
@@ -1,163 +0,0 @@
-#!/usr/bin/perl
-
-use lib 'util';
-use strict;
-use warnings;
-use lib './perl';
-use File::Copy;
-use Cwd 'abs_path';
-use Data::Dumper;
-use Template;
-
-my @Testcases;
-my $name;
-my $streams;
-my $type;
-my $flops;
-my $bytes;
-my $prolog='';
-my $loop='';
-my $increment;
-my $isLoop=0;
-my $skip=0;
-my $multi=0;
-
-my $BenchRoot = $ARGV[0];
-my $OutputDirectory = $ARGV[1];
-my $TemplateRoot = $ARGV[2];
-my $DEBUG = 0;
-
-my $stream_lookup = {
-    STR0 => 'ARG2',
-    STR1 => 'ARG3',
-    STR2 => 'ARG4',
-    STR3 => 'ARG5',
-    STR4 => 'ARG6',
-    STR5 =>  '[rbp+16]',
-    STR6 =>  '[rbp+24]',
-    STR7 =>  '[rbp+32]',
-    STR8 =>  '[rbp+40]',
-    STR9 => '[rbp+48]',
-    STR10 => '[rbp+56]',
-    STR11 => '[rbp+64]',
-    STR12 => '[rbp+72]',
-    STR13 => '[rbp+80]',
-    STR14 => '[rbp+88]',
-    STR15 => '[rbp+96]',
-    STR16 => '[rbp+104]',
-    STR17 => '[rbp+112]',
-    STR18 => '[rbp+120]',
-    STR19 => '[rbp+128]',
-    STR20 => '[rbp+136]',
-    STR21 => '[rbp+144]',
-    STR22 => '[rbp+152]',
-    STR23 => '[rbp+160]',
-    STR24 => '[rbp+168]',
-    STR25 => '[rbp+176]',
-    STR26 => '[rbp+184]',
-    STR27 => '[rbp+192]',
-    STR28 => '[rbp+200]',
-    STR29 => '[rbp+208]',
-    STR30 => '[rbp+216]',
-    STR31 => '[rbp+224]',
-    STR32 => '[rbp+232]',
-    STR33 => '[rbp+240]',
-    STR34 => '[rbp+248]',
-    STR35 => '[rbp+256]',
-    STR36 => '[rbp+264]',
-    STR37 => '[rbp+272]',
-    STR38 => '[rbp+280]',
-    STR39 => '[rbp+288]',
-    STR40 => '[rbp+296]'};
-
-opendir (DIR, "./$BenchRoot") or die "Cannot open bench directory: $!\n";
-my $tpl = Template->new({
-        INCLUDE_PATH => ["$TemplateRoot"]
-        });
-
-while (defined(my $file = readdir(DIR))) {
-    if ($file !~ /^\./) {
-        print "SCANNING $file\n" if ($DEBUG);
-
-        $file =~ /([A-Za-z_0-9]+)\.ptt/;
-        $name = $1;
-
-        $isLoop = 0;
-        $skip=0;
-        $multi=0;
-        $prolog='';
-        $loop='';
-        open FILE, "<$BenchRoot/$file";
-        while (<FILE>) {
-            my $line = $_;
-
-            if($line =~ /STREAMS[ ]+([0-9]+)/) {
-                $streams = $1;
-                if ($streams > 10) {
-                    $multi = 1;
-                }
-            } elsif ($line =~ /TYPE[ ]+(SINGLE|DOUBLE)/) {
-                $type = $1;
-            } elsif ($line =~ /FLOPS[ ]+([0-9.]+)/) {
-                $flops = $1;
-            } elsif ($line =~ /BYTES[ ]+([0-9]+)/) {
-                $bytes = $1;
-            } elsif ($line =~ /INC[ ]+([0-9]+)/) {
-                $increment = $1;
-                $skip = 1;
-            } elsif ($line =~ /LOOP[ ]+([0-9]+)/) {
-                $increment = $1;
-                $isLoop = 1;
-            } else {
-                if ($isLoop) {
-                    if($line =~ /SET[ ]+(STR[0-9]+)[ ]+(GPR[0-9]+)/) {
-                        $loop .= "#define $1  $2\n";
-                        $loop .= "mov $2, $stream_lookup->{$1}\n";
-                    } else {
-                        $loop .= $line;
-                    }
-                } else {
-                    $prolog .= $line;
-                }
-            }
-        }
-        close FILE;
-
-        if (($streams > 5) &&  ($streams < 10)) {
-            my $arg = 7;
-            foreach my $stream ( 5 .. $streams ) {
-                $prolog .= "mov STR$stream, ARG$arg\n";
-                $arg++;
-            }
-        }
-
-        $streams = 'STREAM_'.$streams;
-        my $Vars;
-        $Vars->{name} = $name;
-        $Vars->{prolog} = $prolog;
-        $Vars->{increment} = $increment;
-        $Vars->{loop} = $loop;
-        $Vars->{skip} = $skip;
-        $Vars->{multi} = $multi;
-
-#print Dumper($Vars);
-
-        $tpl->process('bench.tt', $Vars, "$OutputDirectory/$name.pas");
-        push(@Testcases,{name    => $name,
-                         streams => $streams,
-                         type    => $type,
-                         stride  => $increment,
-                         flops   => $flops, 
-                         bytes   => $bytes});
-    }
-}
-#print Dumper(@Testcases);
-my @TestcasesSorted = sort {$a->{name} cmp $b->{name}} @Testcases;
-
-my $Vars;
-$Vars->{Testcases} = \@TestcasesSorted;
-$Vars->{numKernels} = $#TestcasesSorted+1;
-$Vars->{allTests} = join('\n',map {$_->{name}} @TestcasesSorted);
-$tpl->process('testcases.tt', $Vars, "$OutputDirectory/testcases.h");
-
-
diff --git a/perl/likwid-mpirun b/perl/likwid-mpirun
deleted file mode 100755
index b922359..0000000
--- a/perl/likwid-mpirun
+++ /dev/null
@@ -1,456 +0,0 @@
-#!/usr/bin/perl
-# =======================================================================================
-#
-#      Filename:  likwid-mpirun
-#
-#      Description:  Wrapper application to mpi startup mechanisms. Builds on
-#                    likwid to control affinity and has integrated perfctr support.
-#
-#      Version:   <VERSION>
-#      Released:  <DATE>
-#
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
-#      Project:  likwid
-#
-#      Copyright (C) 2014 Jan Treibig
-#
-#      This program is free software: you can redistribute it and/or modify it under
-#      the terms of the GNU General Public License as published by the Free Software
-#      Foundation, either version 3 of the License, or (at your option) any later
-#      version.
-#
-#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
-#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
-#
-#      You should have received a copy of the GNU General Public License along with
-#      this program.  If not, see <http://www.gnu.org/licenses/>.
-#
-# =======================================================================================
-
-use Getopt::Long;
-##############################
-#       CONFIGURATION        #
-##############################
-my $LIKWIDPIN  = '<PREFIX>/bin/likwid-pin';
-my $LIKWIDPERF = '<PREFIX>/bin/likwid-perfctr';
-my $MPIROOT_openmpi  =  $ENV{'MPIHOME'};
-my $MPIROOT_intelmpi =  $ENV{'MPIHOME'};
-my $MPIEXEC_openmpi  = "$MPIROOT_openmpi/bin/mpiexec";
-my $MPIEXEC_intelmpi = "$MPIROOT_intelmpi/bin/mpiexec";
-my $MPIEXEC_mvapich2 = "mpirun";
-##############################
-
-my $OMPType = '';
-my $MPIType = '';
-my $WrapperScript = "mpiexec.$$";
-my %Domains;
-my $NP = 0;
-my $PPN = 0;
-my $NperNode = 0;
-my %NodeList;
-my $NumberOfNodes = 0;
-my $NumberOfUsedNodes = 0;
-my $Hostfilename = 0;
-my $Hostfile = '';
-my $PerformanceGroup = '';
-my $LikwidCall = "$LIKWIDPIN -c ";
-my $debug = 0;
-my $marker = '';
-
-sub readHostfile
-{
-    open FILE, "<$Hostfilename";
-
-    while (<FILE>) {
-        chomp;
-        if (not exists $NodeList{$host}) {
-            $NodeList{$_} = 1;
-        }
-    }
-    close FILE;
-
-    $NumberOfNodes = keys %NodeList;
-}
-
-# MPI implementations
-# OpenMPI  #<# 
-sub generateNodelist_openmpi
-{
-    open FILE, ">$Hostfilename-openmpi";
-
-    #FIXME  Order may be different
-    foreach my $node (keys %NodeList) {
-        print FILE "$node slots=$PPN\n"
-    }
-
-    close FILE;
-
-    $Hostfile = "-hostfile $Hostfilename-openmpi";
-}
-
-sub setEnvironment_openmpi
-{
-}
-
-sub executeMPI_openmpi
-{
-    if ($debug) {
-        print "$MPIEXEC_openmpi $Hostfile -np $NP -npernode $NperNode ./$WrapperScript";
-    }
-
-    system ("$MPIEXEC_openmpi $Hostfile -np $NP -npernode $NperNode ./$WrapperScript");
-}
-#>#
-
-# mvapich2  #<# 
-sub generateNodelist_mvapich2
-{
-}
-
-sub setEnvironment_mvapich2
-{
-    $ENV{'MV2_ENABLE_AFFINITY'}='0';
-}
-
-#tw
-#mvapich2: pinning aus
-# Hybrid programming options:
-#    -ranks-per-proc                  assign so many ranks to each process
-#
-#  Processor topology options:
-#    -binding                         process-to-core binding mode
-#    -topolib                         processor topology library ( hwloc plpa)
-
-sub executeMPI_mvapich2
-{    
-    if ($debug) {
-        print "$MPIEXEC_mvapich2 $Hostfile -np $NP -npernode $NperNode ./$WrapperScript";
-    }
-
-    system ("$MPIEXEC_mvapich2 $Hostfile -np $NP -ppn $NperNode ./$WrapperScript");
-
-}
-
-#generate wrapper script
-#mpirank
-#mpitype = mvapich
-
-#>#
-
-# Intel MPI  #<# 
-sub generateNodelist_intelmpi
-{
-    open FILE, ">$Hostfilename-intelmpi";
-
-    #FIXME  Order may be different
-    foreach my $node (keys %NodeList) {
-        print FILE "$node\:$NperNode\n"
-    }
-
-    close FILE;
-
-    $Hostfile = "-f $Hostfilename-intelmpi";
-}
-
-sub setEnvironment_intelmpi
-{
-    $ENV{'I_MPI_PIN'}='off';
-    $ENV{'KMP_AFFINITY'}='disabled';
-}
-
-sub executeMPI_intelmpi
-{
-    if ($debug) {
-        print "$MPIROOT_intelmpi/bin/mpdboot -r ssh -n $NumberOfNodes $Hostfile \n";
-        print "$MPIROOT_intelmpi/bin/mpiexec -np $NP $WrapperScript \n";
-        print "$MPIROOT_intelmpi/bin/mpdallexit \n";
-    }
-
-    system ("$MPIROOT_intelmpi/bin/mpdboot -r ssh -n $NumberOfNodes $Hostfile ");
-    system ("$MPIROOT_intelmpi/bin/mpiexec  -perhost $NperNode -np $NP ./$WrapperScript");
-    system ("$MPIROOT_intelmpi/bin/mpdallexit");
-}
-#>#
-
-sub generateHostlist  #<# 
-{
-    $ppnHost = '';
-    open FILE, "<$ENV{'PBS_NODEFILE'}";
-    my @hostArray = <FILE>;
-    close FILE;
-
-    $ppnhost = $hostArray[0];
-    chomp $ppnhost;
-
-    # generate unique host list
-    foreach my $host (@hostArray) {
-        chomp $host;
-        if ($ppnhost eq $host) {
-            $PPN++;
-        }
-        if (not exists $NodeList{$host}) {
-            $NodeList{$host} = 1;
-        }
-    }
-
-    $NumberOfNodes = keys %NodeList;
-}
-#>#
-
-sub usage  #<# 
-{
-    print <<END;
-usage: $0 -np <NUMPROC>
-
-Required:
--np <NUMPROC> : number of MPI processes
-
-Optional:
--h                     : this (help) message
--d                     : debug run
--hostfile <argument>   : Specify nodes if not in in a scheduler
--nperdomain <argument> : Run specified number of processes per domain.
-                         Supported domains are:
-                         N Node
-                         S Socket
-                         C last level cache group
-                         M NUMA domain
--pin <argument>        : Specify pinning for hybrid execution.
-                         Processes are separated by underscore.
-                         The threaded pinning must be a valid likwid-pin list.
--omp <argument>        : Enables support for specific hybrid setup. Use only 
-                         together with -pin option. Currently recognized values: intel
--mpi <argument>        : Specify which mpi implementation should be used. Current recognized 
-                         values: intelmpi, openmpi, mvapich2
---                     : Stop the likwid-mpirun parser. Useful for saving options to
-                         the MPI application.
-
-You can either use -nperdomain OR -pin for specifying pinning.
-For pure MPI pinning use only the nperdomain option. For hybrid use the pin option.
-
-Example: 
-$0 -np 32 ./a.out
-
-$0 will use as many processes per node as available in ppn 
-
-Example with pinning:
-$0 -np 32 -nperdomain S:2 ./a.out
-starts 2 processes per socket.
-
-Example for hybrid run:
-$0 -np 32 -pin M0:0-3_M1:0-3
-starts 2 processes per node. Threads of first process are pinned to first four
-cores in NUMA domain 0. Threads of second process are pinned to first four cores 
-in NUMA domain 1.
-END
-
-exit(0);
-}
-#>#
-
-sub generateDomains  #<# 
-{
-    my $output = `$LIKWIDPIN -p`;
-
-    foreach my $line (split("\n",$output)) {
-        if ($line =~ /Tag ([NSCM])[0-9]*: ([0-9 ]+)/) {
-            if (exists $Domains{$1}) {
-                $Domains{$1}++;
-            } else {
-                $Domains{$1} = 1;
-            }
-
-            if ($1 eq 'N') {
-                $PPN =  split(/ /,$2);
-            }
-        }
-    }
-}
-#>#
-
-sub generateWrapperScript  #<# 
-{
-    my $pinStrings = shift;
-    my $mpiType = shift;
-    open FILE, ">$WrapperScript";
-    my $environment = '';
-    my $doRest = '';
-
-    if ($mpiType eq 'openmpi') {
-        $environment = 'OMPI_COMM_WORLD_RANK';
-    } elsif ($mpiType eq 'intelmpi') {
-        $environment = 'PMI_RANK';
-    } elsif ($mpiType eq 'mvapich2') {
-        $environment = 'PMI_RANK'; #tw maybe????
-    } 
-
-    if ($NP % $NperNode) {
-        my $rest = $NP-($NP % $NperNode);
-        $doRest = "if (\$myRank >= $rest) {\$localId = \$myRank - $rest;}\n";
-    }
-
-    print FILE <<END;
-#!/usr/bin/perl 
-use strict;
-use warnings;
-
-my \$args = join \@ARGV;
-my \$myRank = \$ENV{$environment};
-
-my \$localId = \$myRank \% $NperNode  ;
-
-$doRest
-
-if (\$localId == 0) {
-    system ("$LikwidCall $pinStrings->[0] $PerformanceGroup $OMPType  $cmdline \$args ");
-} 
-END
-
-    foreach my $process ( 1 .. ($NperNode-1) ) {
-    print FILE <<END;
-elsif (\$localId == $process) {
-    system ("$LikwidCall $pinStrings->[$process] $PerformanceGroup $OMPType  $cmdline \$args ");
-} 
-END
-    }
-
-    close FILE;
-}
-#>#
-
-my $pinString = '';
-my $domain = '';
-my @pinStrings;
-
-GetOptions ('np=i'         => \$NP,
-            'nperdomain=s' => \$NperDomain,
-            'hostfile=s'   => \$Hostfilename,
-            'pin=s'        => \$pinString,
-            'mpi=s'        => \$MPIType,
-            'omp=s'        => \$OMPType,
-            'perf=s'       => \$PerformanceGroup,
-            'debug'        => \$debug,
-            'marker'       => sub { $marker = ' -m '; },
-            'help'         => \&usage);
-
-# MPI implementation switch
-$generateNodelist = "generateNodelist_$MPIType";
-$setEnvironment = "setEnvironment_$MPIType";
-$executeMPI = "executeMPI_$MPIType";
-
-generateDomains();
-
-# check for PBS batch system
-if (not defined ($ENV{'PBS_JOBID'})) {
-    readHostfile();
-} else {
-    $NumberOfNodes = `uniq \$PBS_NODEFILE | wc -l`;
-}
-
-if ($pinString) {
-    @pinStrings = split('_',$pinString);
-    $NperNode = ($#pinStrings+1);
-
-    if ($MPIType eq 'openmpi') {
-        if ($OMPType eq 'intel') {
-            $OMPType = '';
-            $OMPType = '-s 0xF';
-        }
-    } elsif ($MPIType eq 'intelmpi') {
-        if ($OMPType eq 'intel' and ($NumberOfNodes == 1)) {
-            $OMPType = '-t intel';
-        } elsif ($OMPType eq 'intel') {
-            $OMPType = '-s 0x7';
-        }
-    }elsif ($MPIType eq 'mvapich2') {
-        if ($OMPType eq 'intel' and ($NumberOfNodes == 1)) {
-            $OMPType = '-t intel';
-        } elsif ($OMPType eq 'intel') {
-            $OMPType = '-s 0x7';
-        }
-    }
-
-} elsif ($NperDomain) {
-
-    $OMPType = '';
-    if ($NperDomain =~ /([NSCM]):([0-9]+)/) {
-        $domain = $1;
-        $NperDomain = $2;
-    } else {
-        die "Parse Error \n";
-    }
-
-    $NperNode = $NperDomain * $Domains{$domain};
-
-    if (not $domain eq 'N') {
-        foreach my $currentDomain ( 0 .. ($Domains{$domain}-1)) {
-            foreach my $currentProcess ( 0 .. ($NperDomain-1)) {
-                push @pinStrings, "$domain"."$currentDomain".":$currentProcess";
-            }
-        }
-    } else {
-        foreach my $currentProcess ( 0 .. ($NperDomain-1)) {
-            push @pinStrings, "$domain".":$currentProcess";
-        }
-    }
-} elsif ($NP) {
-    print "PPN = $PPN\n";
-    $NperNode = $PPN;
-    $OMPType = '';
-
-    foreach my $currentProcess ( 0 .. ($PPN-1)) {
-        push @pinStrings, "N".":$currentProcess";
-    }
-} else {
-    usage();
-}
-
-if (not defined ($ENV{'PBS_JOBID'})) {
-    $Hostfilename .= $$;
-    &{$generateNodelist}();
-} else {
-    if ($MPIType eq 'intelmpi') {
-        $Hostfilename = "pbshosts$$";
-        generateHostlist();
-        &{$generateNodelist}();
-    }
-}
-
-map {$cmdline .= "$_ " ;}  @ARGV;
-$NumberOfUsedNodes = $NP / $NperNode;
-
-if ($NumberOfUsedNodes > $NumberOfNodes) {
-    die "ERROR: Require $NumberOfUsedNodes nodes, but only $NumberOfNodes available!";
-}
-
-if ($NumberOfUsedNodes < 1) {
-    die "ERROR: Requested $NperNode processes per Node with only $NP total processes!";
-}
-
-if ($PerformanceGroup) {
-    $LikwidCall = "$LIKWIDPERF -C";
-    $PerformanceGroup  = ' -g '.$PerformanceGroup ;
-    $PerformanceGroup .= " $marker -o perf_%h_%r.txt ";
-} else {
-    $PerformanceGroup  = ' -q ';
-}
-
-generateWrapperScript(\@pinStrings,$MPIType);
-chmod 0755,$WrapperScript;
-&{$setEnvironment}();
-
-if ($debug) {
-    print  "Number of nodes: $NumberOfNodes \n";
-    $NumberOfUsedNodes = $NP / $NperNode;
-    print  "Number of used nodes: $NumberOfUsedNodes \n";
-    print  "Number of processes per node: $NperNode \n";
-}
-&{$executeMPI}();
-
-if (-e $WrapperScript and not $debug) {
-    unlink ($WrapperScript);
-    unlink ($Hostfilename);
-}
-
-# vim: foldmethod=marker foldmarker=#<#,#>#
diff --git a/perl/likwid-perfscope b/perl/likwid-perfscope
deleted file mode 100755
index 84f99da..0000000
--- a/perl/likwid-perfscope
+++ /dev/null
@@ -1,110 +0,0 @@
-#!/usr/bin/perl
-use strict;
-use warnings;
-
-use Getopt::Long;
-
-sub usage  #<# 
-{
-    print <<END;
-usage: $0 --group <Performance Group> --cores <physical core list>
-
-Required:
--cores <CORELIST> : list of physical cores
-
-Optional:
--h                     : this (help) message
--freq                  : frequency of updates, in ms or s (e.g. 500ms), default: 1s
--group <PERFGROUP>     : Specify what to plot, default FLOPS_DP
-
-Example:
-$0 -group FLOPS_DP -cores 0-3 
-END
-
-exit(0);
-}
-#>#
-
-my $CONFIG = {   #<# 
-    "FLOPS_DP" => {
-        "group" => 'FLOPS_DP',
-        "expr" => 'DP MFlops/s',
-        "title" => 'Double Precision Flop Rate',
-        "yaxis" => 'MFlops/s'},
-    "FLOPS_SP" => {
-        "group" => 'FLOPS_SP',
-        "expr" => 'SP MFlops/s',
-        "title" => 'Single Precision Flop Rate',
-        "yaxis" => 'MFlops/s'},
-    "L2" => {
-        "group" => 'L2',
-        "expr" => 'L2 bandwidth [MBytes/s]',
-        "title" => 'L2 cache bandwidth',
-        "yaxis" => 'bandwidth [MB/s]'},
-    "L3" => {
-        "group" => 'L3',
-        "expr" => 'L3 bandwidth [MBytes/s]',
-        "title" => 'L3 cache bandwidth',
-        "yaxis" => 'bandwidth [MB/s]'},
-    "CLOCK" => {
-        "group" => 'CLOCK',
-        "expr"  => 'Clock [MHz]',
-        "title" => 'Clock rate',
-        "yaxis" => 'MHz'},
-    "NUMA" => {
-        "group" => 'MEM',
-        "expr" => 'Remote BW [MBytes/s]',
-        "title" => 'Remote NUMA bandwidth',
-        "yaxis" => 'bandwidth [MB/s]'},
-    "MEM" => {
-        "group" => 'MEM',
-        "expr" => 'MBytes/s',
-        "title" => 'Main memory bandwidth',
-        "yaxis" => 'bandwidth [MB/s]'}};
-#>#
-
-my $FREQ = '1s';
-my $CORES = '';
-my $optGroup = 'FLOPS_DP';
-my $optPlot;
-
-GetOptions ('group=s' => \$optGroup, 'freq=s' => \$FREQ, 'cores=s' => \$CORES, 'plot=s' => \$optPlot, 'help' => \&usage);
-
-my $GROUP = $CONFIG->{$optGroup}->{'group'};
-my $yaxis = $CONFIG->{$optGroup}->{'yaxis'};
-my $title = $CONFIG->{$optGroup}->{'title'};
-my $expr  = $CONFIG->{$optGroup}->{'expr'};
-my $legend = '';
-
-open (INPUT, "likwid-perfctr -g $GROUP -d $FREQ -c $CORES |");
-
-select((select(INPUT), $| = 1)[0]);
-
-while (<INPUT>) {
-    if (/CORES: ([0-9 ]+)/) {
-        my @cores = split ' ',$1;
-        my $coreNumber = 0;
-
-        foreach my $core (@cores) {
-            $legend .= " --legend $coreNumber=\"core $core\" ";
-            $coreNumber++;
-        }
-        last;
-    }
-}
-
-open (OUTPUT, "| feedGnuplot --lines  --domain --stream --xlabel \"seconds\" --ylabel \"$yaxis\" --title \"$title\" $legend");
-
-select((select(OUTPUT), $| = 1)[0]);
-
-while (<INPUT>) {
-    if (/$expr/) {
-        s/$expr//;
-        print OUTPUT;
-    }
-}
-close(INPUT);
-close(OUTPUT);
-
-
-# vim: foldmethod=marker foldmarker=#<#,#># 
diff --git a/perl/likwid-setFrequencies b/perl/likwid-setFrequencies
deleted file mode 100755
index 5834441..0000000
--- a/perl/likwid-setFrequencies
+++ /dev/null
@@ -1,185 +0,0 @@
-#!/usr/bin/perl
-# =======================================================================================
-#
-#      Filename:  likwid-setFrequencies
-#
-#      Description:  Application allowing to change core frequencies
-#
-#      Version:   <VERSION>
-#      Released:  <DATE>
-#
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
-#      Project:  likwid
-#
-#      Copyright (C) 2014 Jan Treibig
-#
-#      This program is free software: you can redistribute it and/or modify it under
-#      the terms of the GNU General Public License as published by the Free Software
-#      Foundation, either version 3 of the License, or (at your option) any later
-#      version.
-#
-#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
-#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
-#
-#      You should have received a copy of the GNU General Public License along with
-#      this program.  If not, see <http://www.gnu.org/licenses/>.
-#
-# =======================================================================================
-
-use Getopt::Std;
-
-my $LIKWIDPIN  = '<PREFIX>/bin/likwid-pin';
-my $SYSPATH = '/sys/devices/system/cpu';
-my $SYSCMD = '<PREFIX>/sbin/likwid-setFreq';
-my $domain = 'N';
-my $governor = 'ondemand';
-my @processors;
-my %frequencies;
-my $freq_string;
-use vars qw/ %opt /;
-
-sub init
-{
-    my $opt_string = 'g:c:f:lph';
-    getopts( $opt_string, \%opt ) or usage();
-    usage() if $opt{h};
-    if (scalar(keys %opt) == 0)
-    {
-    	usage();
-    }
-}
-
-sub usage
-{
-    print STDERR << "EOF";
-
-This script allows to switch governors and set fixed
-frequencies on Linux system.
-
-usage: $0 [-hlp] [-g governor] [-c domain] [-f frequency]
--h          : this (help) message
--p          : print current frequencies
--l          : list available frequencies
--c domain   : likwid thread domain which to apply settings
-              (set to N if omitted)
--g governor : set governor (ondemand, performance, turbo)
-              (set to ondemand if omitted)
--f frequency: set fixed frequency, implicitly sets userspace
-              governor
-
-example: $0 -c S0 -f 2.7 (set all CPUs on socket 0 to 2.7 GHz)
-EOF
-    exit;
-}
-
-sub extractAvailableFrequencies
-{
-    my @tmp_keys;
-    open FILE, "<$SYSPATH/cpu0/cpufreq/scaling_available_frequencies";
-    my $tmp = <FILE>;
-    my @list = split(/ /,$tmp);
-    close FILE;
-    $frequencies{'turbo'} = $list[0];
-
-    foreach my $item ( @list ) {
-        if( not $item =~ /\n/ ) {
-            my $key = $item/1000000.0;
-            push @tmp_keys, $key;
-            $frequencies{$key} = $item;
-        }
-    }
-
-    $freq_string = join(' ', sort @tmp_keys);
-}
-
-sub extractProcessorList
-{
-    my $output = `$LIKWIDPIN -p`;
-    my $found = 0;
-
-    foreach my $line (split("\n",$output)) {
-        if ($line =~ /Tag ([NSCM][0-9]*): ([0-9 ]+)/) {
-            if ($domain eq $1) {
-                $found = 1;
-                @processors =  split(/ /,$2);
-                last;
-            }
-        }
-    }
-
-    if ( not $found ) {
-		print "Domain $domain not available!\n";
-        exit;
-    }
-}
-
-
-init();
-
-if (! -s $SYSCMD) {
-    die "ERROR Binary $SYSCMD not existing!\n\n";
-}
-
-if ( defined $opt{c}) {
-    $domain = $opt{c};
-}
-
-extractProcessorList();
-extractAvailableFrequencies();
-
-if ($opt{f}) {
-    $freq = $opt{f};
-
-	if (not exists($frequencies{$freq})) {
-		print "Frequency $freq not available!\nPlease select one of $freq_string\n";
-		exit;
-	}
-
-	foreach my $processID (@processors) {
-#		print "$SYSCMD $processID $frequencies{$freq}\n";
-        system("$SYSCMD $processID $frequencies{$freq}");
-	}
-}
-
-if ($opt{p}) {
-	foreach my $processID (@processors) {
-		open FILE,"<$SYSPATH/cpu".$processID."/cpufreq/scaling_governor";
-		my $gov = <FILE>;
-		chomp $gov;
-		close FILE;
-		open FILE,"<$SYSPATH/cpu".$processID."/cpufreq/scaling_cur_freq";
-		my $freq = <FILE>;
-		chomp $freq;
-		close FILE;
-		print "CPU $processID: governor $gov frequency $freq\n"
-	}
-	exit;
-}
-
-if ($opt{l}) {
-    print "Available frequencies: $freq_string\n";
-    exit;
-}
-
-if ($opt{g} eq 'turbo') {
-    foreach my $processID (@processors) {
-#        print "$SYSCMD $processID $frequencies{turbo}\n";
-        system("$SYSCMD $processID $frequencies{turbo}");
-    }
-    exit;
-}
-
-if ($opt{g}) {
-    $governor = $opt{g};
-    if (($governor ne "ondemand") and ($governor ne "performance")) {
-        print "Governor $governor not valid\n";
-    } else {
-        print "Set governor in domain $domain to $governor \n";
-        foreach my $processID (@processors) {
-            system("$SYSCMD $processID 0 $governor");
-        }
-    }
-}
-
-# vim: foldmethod=marker foldmarker=#<#,#>#
diff --git a/perl/set_license.pl b/perl/set_license.pl
index f80326d..b14801d 100755
--- a/perl/set_license.pl
+++ b/perl/set_license.pl
@@ -8,15 +8,18 @@ use File::Copy;
 my $mc = '#';
 my $cc = ' *';
 my $fc = '!';
+my $lc = ' *';
 
 #my $VERSION   = '<VERSION>';
 #my $DATE   = '<DATE>';
-my $VERSION   = '3.1.3';
-my $DATE   = '4.11.2014';
-my $YEAR  = '2014';
-my $AUTHOR = 'Jan Treibig';
+my $VERSION   = '4.1';
+my $DATE   = '19.5.2016';
+my $YEAR  = '2016';
+my $AUTHOR = 'RRZE, University Erlangen-Nuremberg';
 my $LICENSE = 'gpl';
 
+my @SKIPLIST = ('ghash.c','ghash.h','loadData.S','bstrlib.c','bstrlib.h', 'calculator_stack.h', 'calculator_stack.c');
+
 sub print_copyright
 {
     my $fh = shift;
@@ -72,108 +75,143 @@ END
     }
 }
 
-sub wanted 
+sub wanted
 {
-	my $filename;
-
-	if (scalar(@_)) {
-		$filename = shift;
-	} else {
-		$filename = $_;
-	}
-
-	if (($filename =~ /^\./) or (-d $_)) {
-		return;
-	}
-
-	my $in_copyright = 0;
-	my $in_header = 0;
-	my $style = $cc;
-	my $enter = 0;
-	open INFILE, "< $filename";
-	open OUTFILE, "> $filename.tmp";
-	print "Process $filename\n";
-
-	while( <INFILE> ) {
-
-		if (/\/\*/ and !$enter) {
-			$style = $cc;
-			$enter = 1;
-			$in_header = 1;
-			print  OUTFILE "/*\n";
-			print  OUTFILE "$style =======================================================================================\n";
-			next;
-		} elsif (/# =/ and !$enter) {
-			$style = $mc;
-			$enter = 1;
-			$in_header = 1;
-			print  OUTFILE "$style =======================================================================================\n";
-			next;
-		} elsif (/! =/ and !$enter) {
-			$style = $fc;
-			$enter = 1;
-			$in_header = 1;
-			print  OUTFILE "$style =======================================================================================\n";
-			next;
-		} elsif (!$enter) {
-			print "Skip $filename: No header found!\n";
-			return;
-		}
-
-		if ($in_header) {
-			if(/Filename:[ ]+([A-za-z0-9._\-]+)/) {
-				if ($1 ne $filename) {
-					print "File name mismatch: $filename header says $1\n";
-				}
-				print  OUTFILE "$_";
-			} elsif(/Version:/) {
-				print OUTFILE  "$style      Version:   $VERSION\n";
-			} elsif(/Released:/) {
-				print  OUTFILE "$style      Released:  $DATE\n";
-			} elsif(/Company:/) {
-				#Skip company from header
-			} elsif(/Copyright/) {
-				$in_copyright = 1;
-#				print  OUTFILE "$style\n";
-				print_copyright(\*OUTFILE, $style);
-			} elsif(/# =/ or /! =/) {
-				$in_copyright = 0;
-				$in_header = 0;
-			} elsif (/\*\//) {
-				$in_copyright = 0;
-				$in_header = 0;
-				print  OUTFILE " */\n";
-			} elsif (/\* =/) {
-				# Skip initial hline
-			} else {
-				if($in_copyright eq 0) {
-					print  OUTFILE "$_";
-				}
-			}
-
-		} else {
-			print  OUTFILE "$_";
-		}
-	}
-
-	close INFILE;
-	close OUTFILE;
-
-	unlink $filename or die  "Failed to delete file $filename\n";
-	copy ("$filename.tmp", $filename) or die "Copy failed\n";
-	unlink "$filename.tmp" or die  "Failed to delete file $filename\n";
+    my $filename;
+
+    if (scalar(@_)) {
+        $filename = shift;
+    } else {
+        $filename = $_;
+    }
+
+    if (($filename =~ /^\./) or (-d $filename)) {
+        return;
+    }
+
+    foreach my $filter ( @SKIPLIST ) {
+        if ( $filename eq $filter ) {
+            print "SKIP $filename\n";
+            return;
+        }
+    }
+
+    my $in_copyright = 0;
+    my $in_header = 0;
+    my $style = $cc;
+    my $enter = 0;
+    open INFILE, "< $filename";
+    open OUTFILE, "> $filename.tmp";
+    print "Process $filename\n";
+
+    while( <INFILE> ) {
+        # Ensure UNIX line ending
+        $_ =~ s/\cM\cJ|\cM|\cJ/\n/g;
+
+        if (/\/\*/ and !$enter) {
+            $style = $cc;
+            $enter = 1;
+            $in_header = 1;
+            print  OUTFILE "/*\n";
+            print  OUTFILE "$style =======================================================================================\n";
+            next;
+        } elsif (/# =/ and !$enter) {
+            $style = $mc;
+            $enter = 1;
+            $in_header = 1;
+            print  OUTFILE "$style =======================================================================================\n";
+            next;
+        } elsif (/! =/ and !$enter) {
+            $style = $fc;
+            $enter = 1;
+            $in_header = 1;
+            print  OUTFILE "$style =======================================================================================\n";
+            next;
+        } elsif (/#!/ and !$enter) {
+            $style = $lc;
+            $enter = 1;
+            $in_header = 1;
+            print  OUTFILE "$_";
+            print  OUTFILE "--[[\n";
+            print  OUTFILE "$style =======================================================================================\n";
+            next;
+        } elsif (/\-\-\[\[/ and !$enter) {
+            $style = $lc;
+            $enter = 1;
+            $in_header = 1;
+            print  OUTFILE "--[[\n";
+            print  OUTFILE "$style =======================================================================================\n";
+            next;
+        } elsif (!$enter) {
+            print "Skip $filename: No header found!\n";
+            unlink "$filename.tmp" or die  "Failed to delete file $filename\n";
+            return;
+        }
+
+        if ($in_header) {
+            if(/Filename:[ ]+([A-za-z0-9._\-]+)/) {
+                if ($1 ne $filename) {
+                    print "File name mismatch: $filename header says $1\n";
+                }
+                print  OUTFILE "$_";
+            } elsif(/Version:/) {
+                print OUTFILE  "$style      Version:   $VERSION\n";
+            } elsif(/Released:/) {
+                print  OUTFILE "$style      Released:  $DATE\n";
+            } elsif(/Copyright/) {
+                $in_copyright = 1;
+                print_copyright(\*OUTFILE, $style);
+            } elsif(/# =/ or /! =/) {
+                $in_copyright = 0;
+                $in_header = 0;
+            } elsif (/\*\//) {
+                $in_copyright = 0;
+                $in_header = 0;
+                print  OUTFILE " */\n";
+            } elsif (/\]\]$/) {
+                $in_copyright = 0;
+                $in_header = 0;
+                print  OUTFILE "]]\n";
+            } elsif (/\* =/ or /\-\-\[\[/) {
+                # Skip initial hline
+            } else {
+                if($in_copyright eq 0) {
+                    print  OUTFILE "$_";
+                }
+            }
+        } else {
+            print  OUTFILE "$_";
+        }
+    }
+
+    close INFILE;
+    close OUTFILE;
+
+    unlink $filename or die  "Failed to delete file $filename\n";
+    copy ("$filename.tmp", $filename) or die "Copy failed\n";
+    unlink "$filename.tmp" or die  "Failed to delete file $filename\n";
 }
 
 
 if (defined $ARGV[0]) {
     my $filename = $ARGV[0];
     wanted($filename);
-	exit (0);
+    exit (0);
 }
 
 my @directories;
 push @directories, 'src';
+push @directories, 'bench/src';
+push @directories, 'bench/includes';
+push @directories, 'examples';
 
 find(\&wanted,  @directories);
 
+# single files
+wanted('Makefile');
+chdir 'bench';
+wanted('Makefile');
+wanted('likwid-bench.c');
+
+
 
diff --git a/perl/templates/group.tt b/perl/templates/group.tt
deleted file mode 100644
index 2122caf..0000000
--- a/perl/templates/group.tt
+++ /dev/null
@@ -1,208 +0,0 @@
-/* GENERATED FILE: DO NOTE EDIT */
-
-#define NUM_GROUPS_[% arch FILTER upper %] [% numGroups %]
-
-[% FOREACH group IN groups %]
-static const char* group_names_[% arch FILTER ucfirst %]_[% group.name %] [] = {[% FOREACH metric IN group.metrics %] "[% metric.label %]", [% END %] NULL};
-[% END %]
-
-static PerfmonGroupMap [% arch %]_group_map[NUM_GROUPS_[% arch FILTER upper %]] = {
-[% FOREACH group IN groups %]
-    {"[% group.name %]",[% group.name %],[% group.isUncore %],"[% group.shortHelp %]","[% group.eventSet %]", 0 [% FOREACH metric IN group.metrics %] +1 [% END %], group_names_[% arch FILTER ucfirst %]_[% group.name %]
-    },
-[% END %]
-};
-
-void perfmon_getDerivedCounterValues[% arch FILTER ucfirst %](PerfmonGroup group, float * values, float * out_max, float * out_min){
-    double time = rdtscTime;
-    double inverseClock = 1.0 /(double) timer_getCpuClock();
-
-    values[0] = time;
-    out_min[0] = time;
-    out_max[0] = time;
-
-    switch ( group ) {
-    [% FOREACH group IN groups %]
-        case [% group.name %]:{
-            int threadId;
-            int counter = 0;
-            double sum,min,max;
-
-        [% FOREACH metric IN group.metrics %]
-            sum = 0;
-            min = 1e300;
-            max = 0;
-
-            for(threadId=0; threadId < perfmon_numThreads; threadId++)
-            {
-                double cur = [% metric.rule %];
-                cur = isnan(cur) ? 0.0 : cur;
-                sum += cur;
-                max = max > cur ? max : cur;
-                min = min < cur ? min : cur;                        
-            }
-
-            values[counter] = (float) sum / perfmon_numThreads;
-            out_min[counter] = (float) min;
-            out_max[counter] = (float) max;
-            counter++;
-        [% END %]
-        return;        
-        }
-    [% END %]
-
-        default:
-            fprintf (stderr, "perfmon_getDerivedCounterValues[% arch %]: Unknown group! Exiting!\n" );
-            exit (EXIT_FAILURE);
-            break;
-    }
-}
-
-
-void
-perfmon_printDerivedMetrics[% arch FILTER ucfirst %](PerfmonGroup groupId)
-{
-    int threadId;
-    double time = rdtscTime;
-    double inverseClock = 1.0 /(double) timer_getCpuClock();
-    PerfmonResultTable tableData;
-    int numRows;
-    int numColumns = perfmon_numThreads;
-    bstring label;
-    bstrList* fc;
-    double** stat;
-    double tmpValue;
-    uint64_t cpi_instr = 0;
-    uint64_t cpi_cyc  = 0;
-    int cpi_index = 0;
-
-    switch ( groupId ) 
-    {
-[% FOREACH group IN groups %]
-        case [% group.name %]:
-            numRows = [% group.numRows %];
-            stat = (double**) malloc(numRows * sizeof(double*));
-            for (int i=0; i<numRows; i++)
-            {
-                stat[i] = (double*) malloc(4 * sizeof(double));
-                stat[i][0] = 0;
-                stat[i][1] = 0;
-                stat[i][2] = DBL_MAX;
-            }
-            INIT_BASIC;
-[% FOREACH metric IN group.metrics %]
-            bstrListAdd(fc,[% loop.count %],[% metric.label %]);
-[% END %]
-            initResultTable(&tableData, fc, numRows, numColumns);
-
-            for(threadId=0; threadId < perfmon_numThreads; threadId++)
-            {
-[% FOREACH metric IN group.metrics %]
-                tmpValue = [% metric.rule %];
-                if (!isnan(tmpValue))
-                {
-                    tableData.rows[[% loop.index %]].value[threadId] = tmpValue;
-                }
-                else
-                {
-                    tableData.rows[[% loop.index %]].value[threadId] = 0.0;
-                }
-[% IF metric.label == 'CPI' && arch == 'westmere' %]
-                cpi_instr += perfmon_getResult(threadId,"FIXC0");
-                cpi_cyc += perfmon_getResult(threadId,"FIXC1");
-                cpi_index = [% loop.index %];
-[% ELSE %]
-                stat[[% loop.index %]][0] += (double) tableData.rows[[% loop.index %]].value[threadId];
-[% END %]
-                stat[[% loop.index %]][1] =  MAX(stat[[% loop.index %]][1],(double) tableData.rows[[% loop.index %]].value[threadId]);
-                stat[[% loop.index %]][2] =  MIN(stat[[% loop.index %]][2],(double) tableData.rows[[% loop.index %]].value[threadId]);
-[% END %]
-            }
-
-            if (cpi_instr)
-            {
-                stat[cpi_index][0] = (double) cpi_cyc / (double) cpi_instr;
-            }
-                
-            break;
-[% END %]
-
-        default:
-            fprintf (stderr, "perfmon_printDerivedMetrics[% arch %]: Unknown group! Exiting!\n" );
-            exit (EXIT_FAILURE);
-            break;
-    }
-
-    printResultTable(&tableData);
-    freeResultTable(&tableData);
-
-    /* for threaded results print sum, max, min and avg */
-    if (perfmon_numThreads > 1)
-    {
-        initStatisticTable(&tableData, fc, numRows);
-        for (int i=0; i<numRows; i++)
-        {
-            stat[i][3] =  stat[i][0]/perfmon_numThreads;
-            for (int j=0; j<4; j++)
-            {
-                tableData.rows[i].value[j] = stat[i][j];
-            }
-        }
-        printResultTable(&tableData);
-        freeResultTable(&tableData);
-    }
-
-    for (int i=0; i<numRows; i++)
-    {
-        free(stat[i]);
-    }
-    free(stat);
-    bstrListDestroy(fc);
-}
-
-void
-perfmon_logDerivedMetrics[% arch FILTER ucfirst %](PerfmonGroup group, double time,double timeStamp)
-{
-    int threadId;
-    double tmpValue;
-    double inverseClock = 1.0 /(double) timer_getCpuClock();
-
-    switch ( group ) 
-    {
-        [% FOREACH group IN groups %]
-        case [% group.name %]:
-
-                    [% FOREACH metric IN group.metrics %]
-                        printf("[% metric.label %] %e ",timeStamp);
-                        for(threadId=0; threadId < perfmon_numThreads; threadId++)
-                        {
-                            tmpValue = [% metric.rule %];
-                            if (!isnan(tmpValue))
-                            {
-                                printf(" %e  ", tmpValue);
-                            }
-                            else
-                            {
-                                printf(" 0.0  ");
-                            }
-                        }
-                        printf("\n");
-                    [% END %]
-            break;
-            [% END %]
-
-        default:
-                fprintf (stderr, "perfmon_printDerivedMetrics[% arch %]: Unknown group! Exiting!\n" );
-                exit (EXIT_FAILURE);
-                break;
-    }
-}
-
-
-
-static PerfmonGroupHelp [% arch %]_group_help[NUM_GROUPS_[% arch FILTER upper %]] = {
-[% FOREACH group IN groups %]
-    {"[% group.name %]","[% group.longHelp %]"},
-[% END %]
-};
-
diff --git a/perl/templates/testcases.tt b/perl/templates/testcases.tt
deleted file mode 100644
index 1f03a85..0000000
--- a/perl/templates/testcases.tt
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef TESTCASES_H
-#define TESTCASES_H
-
-#include <test_types.h>
-
-[% FOREACH test IN Testcases %]
-extern void [% test.name %]();
-[% END %]
-
-#define TESTS  "[% allTests %]"
-#define NUMKERNELS [% numKernels %]
-
-static const TestCase kernels[NUMKERNELS] = {
-    [% FOREACH test IN Testcases %]
-    {"[% test.name %]" , [% test.streams %], [% test.type %], [% test.stride %], &[% test.name %], [% test.flops %], [% test.bytes %]},
-    [% END %]
-};
-
-#endif /* TESTCASES_H */
diff --git a/src/access-daemon/Makefile b/src/access-daemon/Makefile
index afd751b..5af6941 100644
--- a/src/access-daemon/Makefile
+++ b/src/access-daemon/Makefile
@@ -4,13 +4,13 @@
 #
 #      Description:  accessDaemon Makefile
 #
-#      Version:   3.1.3
-#      Released:  4.11.2014
+#      Version:   4.1
+#      Released:  19.5.2016
 #
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -31,20 +31,22 @@ include  ../../make/include_$(COMPILER).mk
 
 DAEMON_TARGET = likwid-accessD
 SETFREQ_TARGET = likwid-setFreq
+Q         ?= @
 
-DEFINES   = -D_GNU_SOURCE -DMAX_NUM_THREADS=$(MAX_NUM_THREADS)
+DEFINES   += -D_GNU_SOURCE -DMAX_NUM_THREADS=$(MAX_NUM_THREADS) -DMAX_NUM_NODES=$(MAX_NUM_NODES)
 INCLUDES  = -I../includes
-ifeq ($(COMPILER),GCC)
-CFLAGS    +=  -pedantic -Wall -Wextra -std=c99
+CFLAGS    += -std=c99 -fPIC -pie -fPIE -fstack-protector
+ifeq ($(COMPILER),GCCX86)
+CFLAGS    +=  -m32
 endif
 CPPFLAGS :=  $(DEFINES) $(INCLUDES)
-Q=
 
 all: $(DAEMON_TARGET) $(SETFREQ_TARGET)
 
 $(DAEMON_TARGET): accessDaemon.c
-	$(CC) $(ANSI_CFLAGS) $(CFLAGS) $(CPPFLAGS) -o ../../$(DAEMON_TARGET) accessDaemon.c
+	$(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -o ../../$(DAEMON_TARGET) accessDaemon.c
 
 $(SETFREQ_TARGET): setFreq.c
-	$(CC) $(ANSI_CFLAGS) $(CFLAGS) $(CPPFLAGS) -o ../../$(SETFREQ_TARGET) setFreq.c
+	$(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -o ../../$(SETFREQ_TARGET) setFreq.c
+
 
diff --git a/src/access-daemon/accessDaemon.c b/src/access-daemon/accessDaemon.c
index 5679a92..ee875fb 100644
--- a/src/access-daemon/accessDaemon.c
+++ b/src/access-daemon/accessDaemon.c
@@ -5,14 +5,15 @@
  *
  *      Description:  Implementation of access daemon.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
  *      Authors:  Michael Meier, michael.meier at rrze.fau.de
- *                Jan Treibig (jt), jan.treibig at gmail.com
+ *                Jan Treibig (jt), jan.treibig at gmail.com,
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -46,135 +47,53 @@
 #include <sys/fsuid.h>
 #include <getopt.h>
 
-#include <pci_types.h>
+#include <types.h>
+#include <registers.h>
+#include <perfmon_haswellEP_counters.h>
+#include <perfmon_ivybridgeEP_counters.h>
+#include <perfmon_sandybridgeEP_counters.h>
+#include <perfmon_broadwelld_counters.h>
+#include <perfmon_broadwellEP_counters.h>
+#include <topology.h>
+#include <cpuid.h>
 #include <lock.h>
-#include <accessClient_types.h>
+
 
 /* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
 #define SA struct sockaddr
 #define str(x) #x
 
-#define CHECK_ERROR(func, msg)  \
-    if ((func) < 0) { \
-        syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); \
-    }
-
 #define CHECK_FILE_ERROR(func, msg)  \
-    if ((func) == 0) { \
-        syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); \
-    }
-
-
-#define EXIT_IF_ERROR(func, msg)  \
-    if ((func) < 0) { \
-        syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); \
-        stop_daemon(); \
-        exit(EXIT_FAILURE); \
-    }
-
-
-#define CPUID \
-    __asm__ volatile ("cpuid" \
-            : "=a" (eax), "=b" (ebx) \
-            : "0" (eax))
-
-
-/* Intel P6 */
-#define PENTIUM_M_BANIAS     0x09U
-#define PENTIUM_M_DOTHAN     0x0DU
-#define CORE_DUO             0x0EU
-#define CORE2_65             0x0FU
-#define CORE2_45             0x17U
-#define ATOM                 0x1CU
-#define ATOM_45              0x26U
-#define ATOM_32              0x36U
-#define ATOM_22              0x27U
-#define ATOM_SILVERMONT      0x4DU
-#define NEHALEM              0x1AU
-#define NEHALEM_BLOOMFIELD   0x1AU
-#define NEHALEM_LYNNFIELD    0x1EU
-#define NEHALEM_LYNNFIELD_M  0x1FU
-#define NEHALEM_WESTMERE     0x2CU
-#define NEHALEM_WESTMERE_M   0x25U
-#define SANDYBRIDGE          0x2AU
-#define SANDYBRIDGE_EP       0x2DU
-#define HASWELL              0x3CU
-#define HASWELL_EX           0x3FU
-#define HASWELL_M1           0x45U
-#define HASWELL_M2           0x46U
-#define IVYBRIDGE            0x3AU
-#define IVYBRIDGE_EP         0x3EU
-#define NEHALEM_EX           0x2EU
-#define WESTMERE_EX          0x2FU
-#define XEON_MP              0x1DU
-
-/* Intel MIC */
-#define XEON_PHI           0x01U
-
-/* AMD K10 */
-#define BARCELONA      0x02U
-#define SHANGHAI       0x04U
-#define ISTANBUL       0x08U
-#define MAGNYCOURS     0x09U
-
-/* AMD K8 */
-#define OPTERON_SC_1MB  0x05U
-#define OPTERON_DC_E    0x21U
-#define OPTERON_DC_F    0x41U
-#define ATHLON64_X2     0x43U
-#define ATHLON64_X2_F   0x4BU
-#define ATHLON64_F1     0x4FU
-#define ATHLON64_F2     0x5FU
-#define ATHLON64_X2_G   0x6BU
-#define ATHLON64_G1     0x6FU
-#define ATHLON64_G2     0x7FU
-
-
-#define  P6_FAMILY        0x6U
-#define  MIC_FAMILY       0xBU
-#define  NETBURST_FAMILY  0xFFU
-#define  K15_FAMILY       0x15U
-#define  K16_FAMILY       0x16U
-#define  K10_FAMILY       0x10U
-#define  K8_FAMILY        0xFU
+    if ((func) == 0) { syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); }
+
+
+
+
+
 
 #define PCI_ROOT_PATH    "/proc/bus/pci/"
-#define MAX_PATH_LENGTH   60
-#define MAX_NUM_NODES    4
+#define MAX_PATH_LENGTH   80
+//#define MAX_NUM_NODES    4
 
 /* Lock file controlled from outside which prevents likwid to start.
  * Can be used to synchronize access to the hardware counters
  * with an external monitoring system. */
 
 /* #####   TYPE DEFINITIONS   ########### */
-typedef int (*FuncPrototype)(uint32_t);
+typedef int (*AllowedPrototype)(uint32_t);
+typedef int (*AllowedPciPrototype)(PciDeviceType, uint32_t);
 
 /* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
 static int sockfd = -1;
 static int connfd = -1; /* temporary in to make it compile */
 static char* filepath;
 static const char* ident = "accessD";
-static FuncPrototype allowed = NULL;
+static AllowedPrototype allowed = NULL;
+static AllowedPciPrototype allowedPci = NULL;
 static int FD_MSR[MAX_NUM_THREADS];
-static int FD_PCI[MAX_NUM_NODES][MAX_NUM_DEVICES];
+static int FD_PCI[MAX_NUM_NODES][MAX_NUM_PCI_DEVICES];
 static int isPCIUncore = 0;
-
-static char* pci_DevicePath[MAX_NUM_DEVICES] = {
- "13.5",   /* PCI_R3QPI_DEVICE_LINK_0 */
- "13.6",   /* PCI_R3QPI_DEVICE_LINK_1 */
- "13.1",   /* PCI_R2PCIE_DEVICE */
- "10.0",   /* PCI_IMC_DEVICE_CH_0 */
- "10.1",   /* PCI_IMC_DEVICE_CH_1 */
- "10.4",   /* PCI_IMC_DEVICE_CH_2 */
- "10.5",   /* PCI_IMC_DEVICE_CH_3 */
- "0e.1",   /* PCI_HA_DEVICE */
- "08.2",   /* PCI_QPI_DEVICE_PORT_0 */
- "09.2",   /* PCI_QPI_DEVICE_PORT_1 */
- "08.6",   /* PCI_QPI_MASK_DEVICE_PORT_0 */
- "09.6",   /* PCI_QPI_MASK_DEVICE_PORT_1 */
- "08.0",   /* PCI_QPI_MISC_DEVICE_PORT_0 */
- "09.0" }; /* PCI_QPI_MISC_DEVICE_PORT_1 */
-
+static PciDevice* pci_devices_daemon = NULL;
 static char pci_filepath[MAX_PATH_LENGTH];
 
 /* Socket to bus mapping -- will be determined at runtime;
@@ -185,7 +104,7 @@ static char pci_filepath[MAX_PATH_LENGTH];
  *   2                  0xbf
  *   3                  0xff
  */
-static char* socket_bus[MAX_NUM_NODES];
+static char* socket_bus[MAX_NUM_NODES] = { [0 ... (MAX_NUM_NODES-1)] = NULL};
 
 
 /* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
@@ -198,12 +117,15 @@ static int allowed_intel(uint32_t reg)
             ((reg & 0xF00U) == 0xC00U) ||
             ((reg & 0xF00U) == 0xD00U) ||
             ((reg & 0xF00U) == 0xE00U) ||
+            ((reg & 0xF00U) == 0xF00U) ||
             (reg == 0x1A0)  ||
+            (reg == 0x1A4)  ||
             (reg == 0x0CE)  ||
             (reg == 0x19C)  ||
             (reg == 0x1A2)  ||
             (reg == 0x1AD)  ||
-            (reg == 0x1A6))
+            (reg == 0x1A6)  ||
+            (reg == 0x1A7))
     {
         return 1;
     }
@@ -213,58 +135,159 @@ static int allowed_intel(uint32_t reg)
     }
 }
 
-static int allowed_silvermont(uint32_t reg)
+static int allowed_sandybridge(uint32_t reg) /* returns 1 if register address reg passes allowed_intel() or has bits 8..11 equal to 0x6, otherwise 0 */
 {
-    if ( ((reg & 0x0F8U) == 0x0C0U) ||
-            ((reg & 0xFF0U) == 0x180U) ||
-            ((reg & 0xF00U) == 0x300U) ||
-            ((reg & 0xF00U) == 0x600U) ||
-            ((reg & 0xF00U) == 0xC00U) ||
-            ((reg & 0xF00U) == 0xD00U) ||
-            (reg == 0x1A0)  ||
-            (reg == 0x0CE)  ||
-            (reg == 0x1AD)  ||
-            (reg == 0x19C)  ||
-            (reg == 0x1A2)  ||
-            (reg == 0x1A6) ||
-            (reg == 0x1A6) ||
-            (reg == 0x1A7))
+    if ((allowed_intel(reg)) || /* shared Intel whitelist is consulted first */
+        (((reg & 0xF00U) == 0x600U))) /* mask keeps only bits 8..11; higher address bits are ignored by this test */
     {
         return 1;
     }
-    else
-    {
-        return 0;
-    }
+    return 0; /* not whitelisted */
 }
 
-static int allowed_westmereEX(uint32_t reg)
+static int allowed_pci_sandybridge(PciDeviceType type, uint32_t reg) /* whitelist check: returns 1 if register offset reg is permitted for a PCI device of the given type, otherwise 0 */
 {
-    if (allowed_intel(reg) == 1)
+    switch (type) /* each device type has its own explicit list of permitted register offsets */
     {
-        return 1;
-    }
-    else if ((reg & 0xF00) == 0xF00)
-    {
-        return 1;
+        case NODEVTYPE: /* no device type: every register offset is accepted */
+            return 1;
+            break;
+        case R3QPI: /* BOX_CTL/BOX_STATUS, CTL_0..2 and CTR_0..2 (_A and _B registers) */
+            if ((reg == PCI_UNC_R3QPI_PMON_BOX_CTL) ||
+                (reg == PCI_UNC_R3QPI_PMON_BOX_STATUS) ||
+                (reg == PCI_UNC_R3QPI_PMON_CTL_0) ||
+                (reg == PCI_UNC_R3QPI_PMON_CTL_1) ||
+                (reg == PCI_UNC_R3QPI_PMON_CTL_2) ||
+                (reg == PCI_UNC_R3QPI_PMON_CTR_0_A) ||
+                (reg == PCI_UNC_R3QPI_PMON_CTR_1_A) ||
+                (reg == PCI_UNC_R3QPI_PMON_CTR_2_A) ||
+                (reg == PCI_UNC_R3QPI_PMON_CTR_0_B) ||
+                (reg == PCI_UNC_R3QPI_PMON_CTR_1_B) ||
+                (reg == PCI_UNC_R3QPI_PMON_CTR_2_B))
+            {
+                return 1;
+            }
+            return 0;
+            break;
+        case R2PCIE: /* BOX_CTL/BOX_STATUS, CTL_0..3 and CTR_0..3 (_A and _B registers) */
+            if ((reg == PCI_UNC_R2PCIE_PMON_BOX_CTL) ||
+                (reg == PCI_UNC_R2PCIE_PMON_BOX_STATUS) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTL_0) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTL_1) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTL_2) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTL_3) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTR_0_A) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTR_1_A) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTR_2_A) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTR_3_A) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTR_0_B) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTR_1_B) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTR_2_B) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTR_3_B))
+            {
+                return 1;
+            }
+            return 0;
+            break;
+        case IMC: /* BOX_CTL/BOX_STATUS, CTL_0..3, CTR_0..3 (_A and _B), plus FIXED_CTL and FIXED_CTR_A/_B */
+            if ((reg == PCI_UNC_MC_PMON_BOX_CTL) ||
+                (reg == PCI_UNC_MC_PMON_BOX_STATUS) ||
+                (reg == PCI_UNC_MC_PMON_CTL_0) ||
+                (reg == PCI_UNC_MC_PMON_CTL_1) ||
+                (reg == PCI_UNC_MC_PMON_CTL_2) ||
+                (reg == PCI_UNC_MC_PMON_CTL_3) ||
+                (reg == PCI_UNC_MC_PMON_CTR_0_A) ||
+                (reg == PCI_UNC_MC_PMON_CTR_1_A) ||
+                (reg == PCI_UNC_MC_PMON_CTR_2_A) ||
+                (reg == PCI_UNC_MC_PMON_CTR_3_A) ||
+                (reg == PCI_UNC_MC_PMON_CTR_0_B) ||
+                (reg == PCI_UNC_MC_PMON_CTR_1_B) ||
+                (reg == PCI_UNC_MC_PMON_CTR_2_B) ||
+                (reg == PCI_UNC_MC_PMON_CTR_3_B) ||
+                (reg == PCI_UNC_MC_PMON_FIXED_CTL) ||
+                (reg == PCI_UNC_MC_PMON_FIXED_CTR_A) ||
+                (reg == PCI_UNC_MC_PMON_FIXED_CTR_B))
+            {
+                return 1;
+            }
+            return 0;
+            break;
+        case HA: /* BOX_CTL/BOX_STATUS, CTL_0..3, CTR_0..3 (_A and _B), plus OPCODEMATCH and ADDRMATCH0/1 */
+            if ((reg == PCI_UNC_HA_PMON_BOX_CTL) ||
+                (reg == PCI_UNC_HA_PMON_BOX_STATUS) ||
+                (reg == PCI_UNC_HA_PMON_CTL_0) ||
+                (reg == PCI_UNC_HA_PMON_CTL_1) ||
+                (reg == PCI_UNC_HA_PMON_CTL_2) ||
+                (reg == PCI_UNC_HA_PMON_CTL_3) ||
+                (reg == PCI_UNC_HA_PMON_CTR_0_A) ||
+                (reg == PCI_UNC_HA_PMON_CTR_1_A) ||
+                (reg == PCI_UNC_HA_PMON_CTR_2_A) ||
+                (reg == PCI_UNC_HA_PMON_CTR_3_A) ||
+                (reg == PCI_UNC_HA_PMON_CTR_0_B) ||
+                (reg == PCI_UNC_HA_PMON_CTR_1_B) ||
+                (reg == PCI_UNC_HA_PMON_CTR_2_B) ||
+                (reg == PCI_UNC_HA_PMON_CTR_3_B) ||
+                (reg == PCI_UNC_HA_PMON_OPCODEMATCH) ||
+                (reg == PCI_UNC_HA_PMON_ADDRMATCH0) ||
+                (reg == PCI_UNC_HA_PMON_ADDRMATCH1))
+            {
+                return 1;
+            }
+            return 0;
+            break;
+        case QPI: /* BOX_CTL/BOX_STATUS, CTL_0..3, CTR_0..3 (_A and _B), MASK_0/1, MATCH_0/1 and RATE_STATUS */
+            if ((reg == PCI_UNC_QPI_PMON_BOX_CTL) ||
+                (reg == PCI_UNC_QPI_PMON_BOX_STATUS) ||
+                (reg == PCI_UNC_QPI_PMON_CTL_0) ||
+                (reg == PCI_UNC_QPI_PMON_CTL_1) ||
+                (reg == PCI_UNC_QPI_PMON_CTL_2) ||
+                (reg == PCI_UNC_QPI_PMON_CTL_3) ||
+                (reg == PCI_UNC_QPI_PMON_CTR_0_A) ||
+                (reg == PCI_UNC_QPI_PMON_CTR_1_A) ||
+                (reg == PCI_UNC_QPI_PMON_CTR_2_A) ||
+                (reg == PCI_UNC_QPI_PMON_CTR_3_A) ||
+                (reg == PCI_UNC_QPI_PMON_CTR_0_B) ||
+                (reg == PCI_UNC_QPI_PMON_CTR_1_B) ||
+                (reg == PCI_UNC_QPI_PMON_CTR_2_B) ||
+                (reg == PCI_UNC_QPI_PMON_CTR_3_B) ||
+                (reg == PCI_UNC_QPI_PMON_MASK_0) ||
+                (reg == PCI_UNC_QPI_PMON_MASK_1) ||
+                (reg == PCI_UNC_QPI_PMON_MATCH_0) ||
+                (reg == PCI_UNC_QPI_PMON_MATCH_1) ||
+                (reg == PCI_UNC_QPI_RATE_STATUS))
+            {
+                return 1;
+            }
+            return 0;
+            break;
+        case IRP: /* BOX_STATUS/BOX_CTL plus CTL_0/1 and CTR_0/1 of both IRP0 and IRP1 */
+            if ((reg == PCI_UNC_IRP_PMON_BOX_STATUS) ||
+                (reg == PCI_UNC_IRP_PMON_BOX_CTL) ||
+                (reg == PCI_UNC_IRP0_PMON_CTL_0) ||
+                (reg == PCI_UNC_IRP0_PMON_CTL_1) ||
+                (reg == PCI_UNC_IRP0_PMON_CTR_0) ||
+                (reg == PCI_UNC_IRP0_PMON_CTR_1) ||
+                (reg == PCI_UNC_IRP1_PMON_CTL_0) ||
+                (reg == PCI_UNC_IRP1_PMON_CTL_1) ||
+                (reg == PCI_UNC_IRP1_PMON_CTR_0) ||
+                (reg == PCI_UNC_IRP1_PMON_CTR_1))
+            {
+                return 1;
+            }
+            return 0;
+            break;
+        default: /* any device type without an explicit list is rejected */
+            return 0;
+            break;
     }
     return 0;
 }
 
-static int allowed_sandybridge(uint32_t reg)
+static int allowed_haswell(uint32_t reg)
 {
-    if ( ((reg & 0x0F8U) == 0x0C0U) ||
-            ((reg & 0xFF0U) == 0x180U) ||
-            ((reg & 0xF00U) == 0x300U) ||
-            ((reg & 0xF00U) == 0x600U) ||
-            ((reg & 0xF00U) == 0xC00U) ||
-            ((reg & 0xF00U) == 0xD00U) ||
-            (reg == 0x1A0)  ||
-            (reg == 0x0CE)  ||
-            (reg == 0x1AD)  ||
-            (reg == 0x19C)  ||
-            (reg == 0x1A2)  ||
-            (reg == 0x1A6))
+    if ((allowed_intel(reg)) ||
+        (allowed_sandybridge(reg)) ||
+        (((reg & 0xF00U) == 0x700U)))
     {
         return 1;
     }
@@ -274,22 +297,150 @@ static int allowed_sandybridge(uint32_t reg)
     }
 }
 
-static int allowed_haswell(uint32_t reg)
+static int allowed_pci_haswell(PciDeviceType type, uint32_t reg) /* whitelist check: returns 1 if register offset reg is permitted for a PCI device of the given type, otherwise 0 */
 {
+    switch (type) /* each device type has its own explicit list of permitted register offsets */
+    {
+        case NODEVTYPE: /* no device type: every register offset is accepted */
+            return 1;
+            break;
+        case R3QPI: /* BOX_CTL/BOX_STATUS, CTL_0..2 and CTR_0..2 (_A and _B registers) */
+            if ((reg == PCI_UNC_R3QPI_PMON_BOX_CTL) ||
+                (reg == PCI_UNC_R3QPI_PMON_BOX_STATUS) ||
+                (reg == PCI_UNC_R3QPI_PMON_CTL_0) ||
+                (reg == PCI_UNC_R3QPI_PMON_CTL_1) ||
+                (reg == PCI_UNC_R3QPI_PMON_CTL_2) ||
+                (reg == PCI_UNC_R3QPI_PMON_CTR_0_A) ||
+                (reg == PCI_UNC_R3QPI_PMON_CTR_1_A) ||
+                (reg == PCI_UNC_R3QPI_PMON_CTR_2_A) ||
+                (reg == PCI_UNC_R3QPI_PMON_CTR_0_B) ||
+                (reg == PCI_UNC_R3QPI_PMON_CTR_1_B) ||
+                (reg == PCI_UNC_R3QPI_PMON_CTR_2_B))
+            {
+                return 1;
+            }
+            return 0;
+            break;
+        case R2PCIE: /* BOX_CTL/BOX_STATUS, CTL_0..3 and CTR_0..3 (_A and _B registers) */
+            if ((reg == PCI_UNC_R2PCIE_PMON_BOX_CTL) ||
+                (reg == PCI_UNC_R2PCIE_PMON_BOX_STATUS) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTL_0) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTL_1) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTL_2) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTL_3) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTR_0_A) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTR_1_A) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTR_2_A) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTR_3_A) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTR_0_B) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTR_1_B) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTR_2_B) ||
+                (reg == PCI_UNC_R2PCIE_PMON_CTR_3_B))
+            {
+                return 1;
+            }
+            return 0;
+            break;
+        case IMC: /* BOX_CTL/BOX_STATUS, CTL_0..3, CTR_0..3 (_A and _B), plus FIXED_CTL and FIXED_CTR_A/_B */
+            if ((reg == PCI_UNC_MC_PMON_BOX_CTL) ||
+                (reg == PCI_UNC_MC_PMON_BOX_STATUS) ||
+                (reg == PCI_UNC_MC_PMON_CTL_0) ||
+                (reg == PCI_UNC_MC_PMON_CTL_1) ||
+                (reg == PCI_UNC_MC_PMON_CTL_2) ||
+                (reg == PCI_UNC_MC_PMON_CTL_3) ||
+                (reg == PCI_UNC_MC_PMON_CTR_0_A) ||
+                (reg == PCI_UNC_MC_PMON_CTR_1_A) ||
+                (reg == PCI_UNC_MC_PMON_CTR_2_A) ||
+                (reg == PCI_UNC_MC_PMON_CTR_3_A) ||
+                (reg == PCI_UNC_MC_PMON_CTR_0_B) ||
+                (reg == PCI_UNC_MC_PMON_CTR_1_B) ||
+                (reg == PCI_UNC_MC_PMON_CTR_2_B) ||
+                (reg == PCI_UNC_MC_PMON_CTR_3_B) ||
+                (reg == PCI_UNC_MC_PMON_FIXED_CTL) ||
+                (reg == PCI_UNC_MC_PMON_FIXED_CTR_A) ||
+                (reg == PCI_UNC_MC_PMON_FIXED_CTR_B))
+            {
+                return 1;
+            }
+            return 0;
+            break;
+        case HA: /* BOX_CTL/BOX_STATUS, CTL_0..3, CTR_0..3 (_A and _B), plus OPCODEMATCH and ADDRMATCH0/1 */
+            if ((reg == PCI_UNC_HA_PMON_BOX_CTL) ||
+                (reg == PCI_UNC_HA_PMON_BOX_STATUS) ||
+                (reg == PCI_UNC_HA_PMON_CTL_0) ||
+                (reg == PCI_UNC_HA_PMON_CTL_1) ||
+                (reg == PCI_UNC_HA_PMON_CTL_2) ||
+                (reg == PCI_UNC_HA_PMON_CTL_3) ||
+                (reg == PCI_UNC_HA_PMON_CTR_0_A) ||
+                (reg == PCI_UNC_HA_PMON_CTR_1_A) ||
+                (reg == PCI_UNC_HA_PMON_CTR_2_A) ||
+                (reg == PCI_UNC_HA_PMON_CTR_3_A) ||
+                (reg == PCI_UNC_HA_PMON_CTR_0_B) ||
+                (reg == PCI_UNC_HA_PMON_CTR_1_B) ||
+                (reg == PCI_UNC_HA_PMON_CTR_2_B) ||
+                (reg == PCI_UNC_HA_PMON_CTR_3_B) ||
+                (reg == PCI_UNC_HA_PMON_OPCODEMATCH) ||
+                (reg == PCI_UNC_HA_PMON_ADDRMATCH0) ||
+                (reg == PCI_UNC_HA_PMON_ADDRMATCH1))
+            {
+                return 1;
+            }
+            return 0;
+            break;
+        case QPI: /* checked against the PCI_UNC_V3_QPI_* names: box/CTL/CTR registers, RX and TX MASK/MATCH 0/1, RATE_STATUS, LINK_LLR, LINK_IDLE */
+            if ((reg == PCI_UNC_V3_QPI_PMON_BOX_CTL) ||
+                (reg == PCI_UNC_V3_QPI_PMON_BOX_STATUS) ||
+                (reg == PCI_UNC_V3_QPI_PMON_CTL_0) ||
+                (reg == PCI_UNC_V3_QPI_PMON_CTL_1) ||
+                (reg == PCI_UNC_V3_QPI_PMON_CTL_2) ||
+                (reg == PCI_UNC_V3_QPI_PMON_CTL_3) ||
+                (reg == PCI_UNC_V3_QPI_PMON_CTR_0_A) ||
+                (reg == PCI_UNC_V3_QPI_PMON_CTR_1_A) ||
+                (reg == PCI_UNC_V3_QPI_PMON_CTR_2_A) ||
+                (reg == PCI_UNC_V3_QPI_PMON_CTR_3_A) ||
+                (reg == PCI_UNC_V3_QPI_PMON_CTR_0_B) ||
+                (reg == PCI_UNC_V3_QPI_PMON_CTR_1_B) ||
+                (reg == PCI_UNC_V3_QPI_PMON_CTR_2_B) ||
+                (reg == PCI_UNC_V3_QPI_PMON_CTR_3_B) ||
+                (reg == PCI_UNC_V3_QPI_PMON_RX_MASK_0) ||
+                (reg == PCI_UNC_V3_QPI_PMON_RX_MASK_1) ||
+                (reg == PCI_UNC_V3_QPI_PMON_RX_MATCH_0) ||
+                (reg == PCI_UNC_V3_QPI_PMON_RX_MATCH_1) ||
+                (reg == PCI_UNC_V3_QPI_PMON_TX_MASK_0) ||
+                (reg == PCI_UNC_V3_QPI_PMON_TX_MASK_1) ||
+                (reg == PCI_UNC_V3_QPI_PMON_TX_MATCH_0) ||
+                (reg == PCI_UNC_V3_QPI_PMON_TX_MATCH_1) ||
+                (reg == PCI_UNC_V3_QPI_RATE_STATUS) ||
+                (reg == PCI_UNC_V3_QPI_LINK_LLR) ||
+                (reg == PCI_UNC_V3_QPI_LINK_IDLE))
+            {
+                return 1;
+            }
+            return 0;
+            break;
+        default: /* any device type without an explicit list is rejected; unlike allowed_pci_sandybridge there is no IRP case here */
+            return 0;
+            break;
+    }
+    return 0; /* not reached: every switch branch above returns */
+}
+
+static int allowed_silvermont(uint32_t reg)
+{
+
     if ( ((reg & 0x0F8U) == 0x0C0U) ||
             ((reg & 0xFF0U) == 0x180U) ||
             ((reg & 0xF00U) == 0x300U) ||
+            ((reg & 0xF00U) == 0x600U) ||
             ((reg & 0xF00U) == 0xC00U) ||
             ((reg & 0xF00U) == 0xD00U) ||
-            ((reg & 0xF00U) == 0xE00U) ||
-            ((reg & 0xF00U) == 0x600U) ||
-            ((reg & 0xF00U) == 0x700U) ||
             (reg == 0x1A0)  ||
             (reg == 0x0CE)  ||
+            (reg == 0x1AD)  ||
             (reg == 0x19C)  ||
             (reg == 0x1A2)  ||
-            (reg == 0x1AD)  ||
-            (reg == 0x1A6))
+            (reg == 0x1A6) ||
+            (reg == 0x1A7))
     {
         return 1;
     }
@@ -347,25 +498,25 @@ static void msr_read(AccessDataRecord * dRecord)
     dRecord->errorcode = ERR_NOERROR;
     dRecord->data = 0;
 
-    if (FD_MSR[cpu] == -2)
+    if (FD_MSR[cpu] <= 0)
     {
         dRecord->errorcode = ERR_NODEV;
         return;
     }
+
     if (!allowed(reg))
     {
-        syslog(LOG_ERR, "attempt to read from restricted register 0x%x", reg);
         dRecord->errorcode = ERR_RESTREG;
         return;
     }
 
     if (pread(FD_MSR[cpu], &data, sizeof(data), reg) != sizeof(data))
     {
-        syslog(LOG_ERR, "Failed to read data from msr device file on core %u", cpu);
+        syslog(LOG_ERR, "Failed to read data to register 0x%x on core %u", reg, cpu);
+        syslog(LOG_ERR, "%s", strerror(errno));
         dRecord->errorcode = ERR_RWFAIL;
         return;
     }
-
     dRecord->data = data;
 }
 
@@ -376,8 +527,8 @@ static void msr_write(AccessDataRecord * dRecord)
     uint64_t data = dRecord->data;
 
     dRecord->errorcode = ERR_NOERROR;
-
-    if (FD_MSR[cpu] == -2)
+    
+    if (FD_MSR[cpu] <= 0)
     {
         dRecord->errorcode = ERR_NODEV;
         return;
@@ -385,19 +536,33 @@ static void msr_write(AccessDataRecord * dRecord)
 
     if (!allowed(reg))
     {
-        syslog(LOG_ERR, "attempt to write to restricted register %x", reg);
+        syslog(LOG_ERR, "Attempt to write to restricted register 0x%x on core %u", reg, cpu);
         dRecord->errorcode = ERR_RESTREG;
         return;
     }
 
     if (pwrite(FD_MSR[cpu], &data, sizeof(data), reg) != sizeof(data))
     {
-        syslog(LOG_ERR, "Failed to write data to msr device file on core %u", cpu);
+        syslog(LOG_ERR, "Failed to write data to register 0x%x on core %u", reg, cpu);
+        syslog(LOG_ERR, "%s", strerror(errno));
         dRecord->errorcode = ERR_RWFAIL;
         return;
     }
 }
 
+/* Availability probe: sets dRecord->errorcode to ERR_NODEV when no usable MSR descriptor exists for dRecord->cpu, ERR_NOERROR otherwise. */
+static void msr_check(AccessDataRecord * dRecord)
+{
+    uint32_t cpu = dRecord->cpu;
+    dRecord->errorcode = ERR_NOERROR;
+    /* cpu is taken from the request record; FD_MSR has MAX_NUM_THREADS entries, so guard the index before reading it. */
+    /* NOTE(review): msr_read/msr_write reject FD_MSR[cpu] <= 0 while this probe only rejects < 0 -- confirm which is intended. */
+    if (cpu >= (uint32_t) MAX_NUM_THREADS || FD_MSR[cpu] < 0)
+    {
+        dRecord->errorcode = ERR_NODEV;
+    }
+}
+
 static void pci_read(AccessDataRecord* dRecord)
 {
     uint32_t socketId = dRecord->cpu;
@@ -413,26 +578,35 @@ static void pci_read(AccessDataRecord* dRecord)
         dRecord->errorcode = ERR_NODEV;
         return;
     }
-    else if ( !FD_PCI[socketId][device] )
-    {
-        strncpy(pci_filepath, PCI_ROOT_PATH, 30);
-        strncat(pci_filepath, socket_bus[socketId], 10);
-        strncat(pci_filepath, pci_DevicePath[device], 20);
 
+    if (allowedPci)
+    {
+        if (!allowedPci(pci_devices_daemon[device].type, reg))
+        {
+        dRecord->errorcode = ERR_RESTREG;
+        return;
+        }
+    }
+    if ( !FD_PCI[socketId][device] )
+    {
+        snprintf(pci_filepath, MAX_PATH_LENGTH-1, "%s%s%s", PCI_ROOT_PATH, socket_bus[socketId], pci_devices_daemon[device].path);
         FD_PCI[socketId][device] = open( pci_filepath, O_RDWR);
 
         if ( FD_PCI[socketId][device] < 0)
         {
-            syslog(LOG_ERR, "Failed to open device file %s on socket %u", pci_filepath, socketId);
+            syslog(LOG_ERR, "Failed to open device file %s for device %s (%s) on socket %u", pci_filepath,
+                    pci_types[pci_devices_daemon[device].type].name, pci_devices_daemon[device].name, socketId);
             dRecord->errorcode = ERR_OPENFAIL;
             return;
         }
+        syslog(LOG_ERR, "Open device file %s for device %s (%s) on socket %u", pci_filepath,
+                    pci_types[pci_devices_daemon[device].type].name, pci_devices_daemon[device].name, socketId);
     }
 
-    if ( pread(FD_PCI[socketId][device], &data, sizeof(data), reg) != sizeof(data))
+    if (FD_PCI[socketId][device] > 0 && pread(FD_PCI[socketId][device], &data, sizeof(data), reg) != sizeof(data))
     {
-        syslog(LOG_ERR, "Failed to read data from pci device file on socket %u device %u",
-                socketId, device);
+        syslog(LOG_ERR, "Failed to read data from pci device file %s for device %s (%s) on socket %u",
+                pci_filepath,pci_types[pci_devices_daemon[device].type].name, pci_devices_daemon[device].name,socketId);
         dRecord->errorcode = ERR_RWFAIL;
         return;
     }
@@ -450,40 +624,65 @@ static void pci_write(AccessDataRecord* dRecord)
     uint32_t data = (uint32_t) dRecord->data;
 
     dRecord->errorcode = ERR_NOERROR;
+
     if (FD_PCI[socketId][device] == -2)
     {
         dRecord->errorcode = ERR_NODEV;
         return;
     }
-    else if ( !FD_PCI[socketId][device] )
+
+    if (allowedPci)
+    {
+        if (!allowedPci(pci_devices_daemon[device].type, reg))
+        {
+        dRecord->errorcode = ERR_RESTREG;
+        return;
+        }
+    }
+
+    if ( !FD_PCI[socketId][device] )
     {
-        strncpy(pci_filepath, PCI_ROOT_PATH, 30);
-        strncat(pci_filepath, socket_bus[socketId], 10);
-        strncat(pci_filepath, pci_DevicePath[device], 20);
+        snprintf(pci_filepath, MAX_PATH_LENGTH-1, "%s%s%s", PCI_ROOT_PATH, socket_bus[socketId], pci_devices_daemon[device].path);
 
         FD_PCI[socketId][device] = open( pci_filepath, O_RDWR);
 
         if ( FD_PCI[socketId][device] < 0)
         {
-            syslog(LOG_ERR, "Failed to open device file %s on socket %u", pci_filepath, socketId);
+            syslog(LOG_ERR, "Failed to open device file %s for device %s (%s) on socket %u", pci_filepath,
+                        pci_types[pci_devices_daemon[device].type].name, pci_devices_daemon[device].name, socketId);
             dRecord->errorcode = ERR_OPENFAIL;
             return;
         }
+        syslog(LOG_ERR, "Open device file %s for device %s (%s) on socket %u", pci_filepath,
+                    pci_types[pci_devices_daemon[device].type].name, pci_devices_daemon[device].name, socketId);
     }
 
-    if (pwrite(FD_PCI[socketId][device], &data, sizeof data, reg) != sizeof data)
+    if (FD_PCI[socketId][device] > 0 && pwrite(FD_PCI[socketId][device], &data, sizeof data, reg) != sizeof data)
     {
-        syslog(LOG_ERR, "Failed to write data to pci device file on socket %u", socketId);
+        syslog(LOG_ERR, "Failed to write data to pci device file %s for device %s (%s) on socket %u",pci_filepath,
+                pci_types[pci_devices_daemon[device].type].name, pci_devices_daemon[device].name, socketId);
         dRecord->errorcode = ERR_RWFAIL;
         return;
     }
 }
 
 
-static void kill_client(void)
+/* Check request: report ERR_NODEV when the PCI device is marked absent (-2) for the socket. */
+static void pci_check(AccessDataRecord* dRecord)
 {
-    syslog(LOG_NOTICE, "daemon dropped client");
+    uint32_t socketId = dRecord->cpu;
+    uint32_t device = dRecord->device;
+    dRecord->errorcode = ERR_NOERROR;
 
+    /* Both indices come from the client record; range-check them before indexing FD_PCI. */
+    if ((socketId >= MAX_NUM_NODES) || (device >= MAX_NUM_PCI_DEVICES) || (FD_PCI[socketId][device] == -2))
+    {
+        dRecord->errorcode = ERR_NODEV;
+    }
+}
+
+static void kill_client(void)
+{
     if (connfd != -1)
     {
         CHECK_ERROR(close(connfd), socket close failed);
@@ -495,7 +694,13 @@ static void kill_client(void)
 static void stop_daemon(void)
 {
     kill_client();
-    syslog(LOG_NOTICE, "daemon exiting");
+    for (int i=0;i<MAX_NUM_NODES;i++)
+    {
+        if (socket_bus[i] != NULL)
+        {
+            free(socket_bus[i]);
+        }
+    }
 
     if (sockfd != -1)
     {
@@ -507,6 +712,41 @@ static void stop_daemon(void)
     exit(EXIT_SUCCESS);
 }
 
+/* Return the PCI bus number of the given socket, or -1 if it cannot be determined.
+ * Walks the buses starting at 0: device 05.0 of each visited bus is opened below
+ * PCI_ROOT_PATH and the bus number is taken from bits 8-15 of the dword at 0x108. */
+int getBusFromSocket(const uint32_t socket)
+{
+    int cur_bus = 0;
+    uint32_t cur_socket = 0;
+    char pci_filepath[1024];
+    int fp;
+    int ret = 0;
+    while(cur_socket <= socket)
+    {
+        /* Bound the write by the local buffer, not by the unrelated MAX_PATH_LENGTH. */
+        snprintf(pci_filepath, sizeof(pci_filepath), "%s%02x/05.0", PCI_ROOT_PATH, cur_bus);
+        fp = open(pci_filepath, O_RDONLY);
+        if (fp < 0)
+            return -1;
+        uint32_t cpubusno = 0;
+        ret = pread(fp, &cpubusno, sizeof(uint32_t), 0x108);
+        close(fp);
+        if (ret != sizeof(uint32_t))
+            return -1;
+        cur_bus = (cpubusno >> 8) & 0x0ff;
+        if(socket == cur_socket)
+            return cur_bus;
+        /* The next socket is searched for on the bus following this one. */
+        ++cur_socket;
+        ++cur_bus;
+        if(cur_bus > 0x0ff)
+           return -1;
+    }
+
+    return -1;
+}
+
 static void Signal_Handler(int sig)
 {
     if (sig == SIGPIPE)
@@ -516,7 +756,7 @@ static void Signal_Handler(int sig)
     }
 
     /* For SIGALRM we just return - we're just here to create a EINTR */
-    if ((sig == SIGTERM))
+    if (sig == SIGTERM)
     {
         stop_daemon();
     }
@@ -543,7 +783,7 @@ static void daemonize(int* parentPid)
     /* If we got a good PID, then we can exit the parent process. */
     if (pid > 0)
     {
-        exit(ERR_NOERROR);
+        exit(EXIT_SUCCESS);
     }
 
     /* At this point we are executing as the child process */
@@ -585,90 +825,107 @@ int main(void)
     mode_t oldumask;
     uint32_t numHWThreads = sysconf(_SC_NPROCESSORS_CONF);
     uint32_t model;
-    int isIntel = 1;
-
-    if (!lock_check())
-    {
-        fprintf(stderr,"Access to performance counters is locked. Exiting!\n");
-        exit(EXIT_FAILURE);
-    }
-
-    for ( uint32_t i=0; i < numHWThreads; i++ )
+    for (int i=0;i<MAX_NUM_THREADS;i++)
     {
         FD_MSR[i] = -1;
     }
 
-    uint32_t  eax = 0x00;
-    uint32_t  ebx = 0x00;
-    
-    CPUID;
-    if (ebx == 0x68747541U)
+    openlog(ident, 0, LOG_USER);
+
+    if (!lock_check())
     {
-        isIntel = 0;
+        syslog(LOG_ERR,"Access to performance counters is locked.\n");
+        stop_daemon();
     }
 
-    eax = 0x01;
-    CPUID;
-    uint32_t family = ((eax >> 8) & 0xFU) + ((eax >> 20) & 0xFFU);
-    model  = (((eax >> 16) & 0xFU) << 4) + ((eax >> 4) & 0xFU);
+    daemonize(&pid);
 
-    switch (family)
     {
-        case P6_FAMILY:
-            allowed = allowed_intel;
+        uint32_t  eax = 0x00;
+        uint32_t  ebx = 0x00;
+        uint32_t  ecx = 0x00;
+        uint32_t  edx = 0x00;
+        /*int isIntel = 1;
+        CPUID(eax, ebx, ecx, edx);
+        if (ebx == 0x68747541U)
+        {
+            isIntel = 0;
+        }*/
 
-            if (isIntel && ((model == SANDYBRIDGE)    ||
-                            (model == SANDYBRIDGE_EP) ||
-                            (model == IVYBRIDGE)      ||
-                            (model == IVYBRIDGE_EP) ))
-            {
-                allowed = allowed_sandybridge;
-                isPCIUncore = 1;
-            }
-            else if (isIntel && ((model == HASWELL)    ||
-                                 (model == HASWELL_M1) ||
-                                 (model == HASWELL_M2) ||
-                                 (model == HASWELL_EX)))
-            {
-                allowed = allowed_haswell;
-            }
-            else if (isIntel && (model == ATOM_SILVERMONT))
-            {
-                allowed = allowed_silvermont;
-            }
-            else if (isIntel && (model == WESTMERE_EX))
-            {
-                allowed = allowed_westmereEX;
-            }
-            break;
-        case K8_FAMILY:
-        case K10_FAMILY:
-            if (!isIntel) 
-            {
+        eax = 0x01;
+        CPUID(eax, ebx, ecx, edx);
+        uint32_t family = ((eax >> 8) & 0xFU) + ((eax >> 20) & 0xFFU);
+        model  = (((eax >> 16) & 0xFU) << 4) + ((eax >> 4) & 0xFU);
+
+        switch (family)
+        {
+            case P6_FAMILY:
+                allowed = allowed_intel;
+
+                if ((model == SANDYBRIDGE) || (model == IVYBRIDGE))
+                {
+                    allowed = allowed_sandybridge;
+                }
+                else if ((model == SANDYBRIDGE_EP) || (model == IVYBRIDGE_EP))
+                {
+                    allowed = allowed_sandybridge;
+                    allowedPci = allowed_pci_sandybridge;
+                    isPCIUncore = 1;
+                }
+                else if ((model == HASWELL) ||
+                         (model == HASWELL_M1) ||
+                         (model == HASWELL_M2) ||
+                         (model == BROADWELL) ||
+                         (model == SKYLAKE1) ||
+                         (model == SKYLAKE2))
+                {
+                    allowed = allowed_haswell;
+                }
+                else if (model == BROADWELL_D)
+                {
+                    allowed = allowed_haswell;
+                    isPCIUncore = 1;
+                    allowedPci = allowed_pci_haswell;
+                }
+                else if (model == HASWELL_EP)
+                {
+                    isPCIUncore = 1;
+                    allowed = allowed_haswell;
+                    allowedPci = allowed_pci_haswell;
+                }
+                else if (model == BROADWELL_E)
+                {
+                    isPCIUncore = 1;
+                    allowed = allowed_haswell;
+                    allowedPci = allowed_pci_haswell;
+                }
+                else if ((model == ATOM_SILVERMONT_C) ||
+                         (model == ATOM_SILVERMONT_E) ||
+                         (model == ATOM_SILVERMONT_Z1) ||
+                         (model == ATOM_SILVERMONT_Z2) ||
+                         (model == ATOM_SILVERMONT_F) ||
+                         (model == ATOM_SILVERMONT_AIR))
+                {
+                    allowed = allowed_silvermont;
+                }
+                break;
+            case K8_FAMILY:
+            case K10_FAMILY:
                 allowed = allowed_amd;
-            }
-            break;
-        case K15_FAMILY:
-            if (!isIntel) 
-            {
+                break;
+            case K15_FAMILY:
                 allowed = allowed_amd15;
-            }
-            break;
-        case K16_FAMILY:
-            if (!isIntel) 
-            {
+                break;
+            case K16_FAMILY:
                 allowed = allowed_amd16;
-            }
             break;
-        default:
-            fprintf(stderr, "ERROR - [%s:%d] - Unsupported processor. Exiting!\n",
-                    __FILE__, __LINE__);
-            exit(EXIT_FAILURE);
+            default:
+                syslog(LOG_ERR, "ERROR - [%s:%d] - Unsupported processor. Exiting!  \n",
+                        __FILE__, __LINE__);
+                exit(EXIT_FAILURE);
+        }
     }
 
-    openlog(ident, 0, LOG_USER);
-    daemonize(&pid);
-
     /* setup filename for socket */
     filepath = (char*) calloc(sizeof(addr1.sun_path), 1);
     snprintf(filepath, sizeof(addr1.sun_path), "/tmp/likwid-%d", pid);
@@ -691,10 +948,6 @@ int main(void)
     EXIT_IF_ERROR(listen(sockfd, 1), listen failed);
     EXIT_IF_ERROR(chmod(filepath, S_IRUSR|S_IWUSR), chmod failed);
 
-    /* Restore the old umask and fs ids. */
-    (void) umask(oldumask);
-    CHECK_ERROR(setfsuid(geteuid()), setfsuid failed);
-
     socklen = sizeof(addr1);
 
     { /* Init signal handler */
@@ -707,8 +960,6 @@ int main(void)
         sigaction(SIGTERM, &sia, NULL);
     }
 
-    syslog(LOG_NOTICE, "daemon started");
-
     /* setup an alarm to stop the daemon if there is no connect.*/
     alarm(15U);
 
@@ -728,7 +979,10 @@ int main(void)
 
     alarm(0);
     CHECK_ERROR(unlink(filepath), unlink of socket failed);
-    syslog(LOG_NOTICE, "daemon accepted client");
+
+    /* Restore the old umask and fs ids. */
+    (void) umask(oldumask);
+    CHECK_ERROR(setfsuid(geteuid()), setfsuid failed);
 
     {
         char* msr_file_name = (char*) malloc(MAX_PATH_LENGTH * sizeof(char));
@@ -737,75 +991,77 @@ int main(void)
          * NOTICE: This assumes consecutive processor Ids! */
         for ( uint32_t i=0; i < numHWThreads; i++ )
         {
-#ifdef __MIC
-            sprintf(msr_file_name,"/dev/msr%d",i);
-            if (access(msr_file_name, F_OK))
-            {
-                sprintf(msr_file_name,"/dev/cpu/%d/msr",i);
-            }
-#else
             sprintf(msr_file_name,"/dev/cpu/%d/msr",i);
-#endif
             FD_MSR[i] = open(msr_file_name, O_RDWR);
 
             if ( FD_MSR[i] < 0 )
             {
-                syslog(LOG_ERR, "Failed to open device file %s.",msr_file_name);
-                FD_MSR[i] = -2;
+                syslog(LOG_ERR, "Failed to open device file %s: %s, trying /dev/msr%d", msr_file_name, strerror(errno), i);
+                sprintf(msr_file_name,"/dev/msr%d",i);
+                FD_MSR[i] = open(msr_file_name, O_RDWR);
+                if ( FD_MSR[i] < 0 )
+                {
+                    syslog(LOG_ERR, "Failed to open device file %s: %s.", msr_file_name, strerror(errno));
+                }
             }
         }
 
         free(msr_file_name);
-
         if (isPCIUncore)
         {
-            for (int j=0; j<MAX_NUM_NODES; j++)
-            {
-                socket_bus[j] = "N-A";
-                for (int i=0; i<MAX_NUM_DEVICES; i++)
-                {
-                    FD_PCI[j][i] = -2;
-                }
-            }
-
-            /* determine PCI-BUSID mapping ... */
-            FILE *fptr;
-            char buf[1024];
-            uint32_t testDevice;
-            uint32_t sbus, sdevfn, svend;
             int cntr = 0;
             int socket_count = 0;
-
             if (model == SANDYBRIDGE_EP)
             {
-                testDevice = 0x80863c44;
+                //testDevice = 0x80863c44;
+                pci_devices_daemon = sandybridgeEP_pci_devices;
             }
             else if (model == IVYBRIDGE_EP)
             {
-                testDevice = 0x80860e36;
+                //testDevice = 0x80860e36;
+                pci_devices_daemon = ivybridgeEP_pci_devices;
             }
-            else
+            else if (model == HASWELL_EP)
             {
-                testDevice = 0;
-                syslog(LOG_NOTICE, "PCI Uncore not supported on this system");
+                //testDevice = 0x80862f30;
+                pci_devices_daemon = haswellEP_pci_devices;
             }
-
-            if ( ((fptr = fopen("/proc/bus/pci/devices", "r")) == NULL) || !testDevice)
+            else if (model == BROADWELL_D)
+            {
+                //testDevice = 0x80862f30;
+                pci_devices_daemon = broadwelld_pci_devices;
+            }
+            else if (model == BROADWELL_E)
             {
-                syslog(LOG_NOTICE, "Unable to open /proc/bus/pci/devices");
+                //testDevice = 0x80862f30;
+                pci_devices_daemon = broadwellEP_pci_devices;
             }
             else
             {
-                while( fgets(buf, sizeof(buf)-1, fptr) )
+                //testDevice = 0;
+                syslog(LOG_NOTICE, "PCI Uncore not supported on this system");
+                goto LOOP;
+            }
+
+            for (int j=0; j<MAX_NUM_NODES; j++)
+            {
+                socket_bus[j] = (char*)malloc(4);
+                sprintf(socket_bus[j], "N-A");
+                for (int i=0; i<MAX_NUM_PCI_DEVICES; i++)
                 {
-                    if ( sscanf(buf, "%2x%2x %8x", &sbus, &sdevfn, &svend) == 3 &&
-                            svend == testDevice )
-                    {
-                        socket_bus[cntr] = (char*)malloc(4);
-                        sprintf(socket_bus[cntr++], "%02x/", sbus);
-                    }
+                    FD_PCI[j][i] = -2;
                 }
-                fclose(fptr);
+            }
+
+            /* determine PCI-BUSID mapping ... */
+            int sbus = -1;
+            cntr = 0;
+            sbus = getBusFromSocket(cntr);
+            while (sbus != -1)
+            {
+                sprintf(socket_bus[cntr], "%02x/", sbus);
+                cntr++;
+                sbus = getBusFromSocket(cntr);
             }
 
             if ( cntr == 0 )
@@ -815,38 +1071,41 @@ int main(void)
             else
             {
                 socket_count = cntr;
-
+                int fd;
                 for (int j=0; j<socket_count; j++)
                 {
-                    for (int i=0; i<MAX_NUM_DEVICES; i++)
+                    for (int i=1; i<MAX_NUM_PCI_DEVICES; i++)
                     {
-                        sprintf(pci_filepath, "%s%s%s",PCI_ROOT_PATH,socket_bus[j],pci_DevicePath[i]);
-
-                        if (!access(pci_filepath,F_OK))
-                        {
-                            FD_PCI[j][i] = 0;
-                        }
-                        else
+                        if (pci_devices_daemon[i].path)
                         {
-                            syslog(LOG_NOTICE, "Device %s not found, excluded it from device list\n",pci_filepath);
+                            sprintf(pci_filepath, "%s%s%s", PCI_ROOT_PATH, socket_bus[j], pci_devices_daemon[i].path);
+                            fd = open(pci_filepath, O_RDWR);
+                            if (fd > 0)
+                            {
+                                FD_PCI[j][i] = 0;
+                                pci_devices_daemon[i].online = 1;
+                                close(fd);
+                            }
+                            else if (j==0)
+                            {
+                                syslog(LOG_NOTICE, "Device %s for socket %d not found at path %s, excluded it from device list: %s\n",pci_devices_daemon[i].name,j, pci_filepath, strerror(errno));
+                            }
                         }
                     }
                 }
             }
         }
     }
-
+LOOP:
     while (1)
     {
         ret = read(connfd, (void*) &dRecord, sizeof(AccessDataRecord));
 
         if (ret < 0)
         {
-            syslog(LOG_ERR, "ERROR - [%s:%d] read from client failed  - %s \n",
-                    __FILE__, __LINE__, strerror(errno));
             stop_daemon();
         }
-        else if (ret == 0)
+        else if ((ret == 0) && (dRecord.type != DAEMON_EXIT))
         {
             syslog(LOG_ERR, "ERROR - [%s:%d] zero read", __FILE__, __LINE__);
             stop_daemon();
@@ -860,7 +1119,7 @@ int main(void)
 
         if (dRecord.type == DAEMON_READ)
         {
-            if (dRecord.device == DAEMON_AD_MSR)
+            if (dRecord.device == MSR_DEV)
             {
                 msr_read(&dRecord);
             }
@@ -871,7 +1130,7 @@ int main(void)
         }
         else if (dRecord.type == DAEMON_WRITE)
         {
-            if (dRecord.device == DAEMON_AD_MSR)
+            if (dRecord.device == MSR_DEV)
             {
                 msr_write(&dRecord);
                 dRecord.data = 0x0ULL;
@@ -882,6 +1141,17 @@ int main(void)
                 dRecord.data = 0x0ULL;
             }
         }
+        else if (dRecord.type == DAEMON_CHECK)
+        {
+            if (dRecord.device == MSR_DEV)
+            {
+                msr_check(&dRecord);
+            }
+            else
+            {
+                pci_check(&dRecord);
+            }
+        }
         else if (dRecord.type == DAEMON_EXIT)
         {
             stop_daemon();
diff --git a/src/access-daemon/setFreq.c b/src/access-daemon/setFreq.c
index 967dbbf..6802449 100644
--- a/src/access-daemon/setFreq.c
+++ b/src/access-daemon/setFreq.c
@@ -1,18 +1,18 @@
 /*
  * =======================================================================================
- *
- *      Filename:  setFreq.c
- *
- *      Description:  Wrapper for accessing setfreq kernel FS files
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Authors:  Michael Meier, michael.meier at rrze.fau.de
- *                Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
+ *
+ *      Filename:  setFreq.c
+ *
+ *      Description:  Implementation of frequency daemon
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Authors:  Jan Treibig (jt), jan.treibig at gmail.com,
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -28,103 +28,168 @@
  *
  * =======================================================================================
  */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-static int get_numCPUs()
-{
-    int cpucount = 0;
-    char line[1024];
-    FILE* fp = fopen("/proc/cpuinfo","r");
-    if (fp != NULL)
-    {
-        while( fgets(line,1024,fp) )
-        {
-            if (strncmp(line, "processor", 9) == 0)
-            {
-                cpucount++;
-            }
-        }
-    }
-    return cpucount;
-}
-
-int main (int argn, char** argv)
-{
-    int cpuid;
-    int freq;
-    int numCPUs = 0;
-    char* gov;
-    char* gpath = malloc(100);
-    char* fpath = malloc(100);
-    FILE* f;
-
-    if (argn < 3 || argn > 4)
-    {
-        fprintf(stderr, "Usage: %s <processorID> <frequency> [<governor>] \n",argv[0]);
-        exit(EXIT_FAILURE);
-    }
-
-    cpuid = atoi(argv[1]);
-    numCPUs = get_numCPUs();
-    if (cpuid < 0 || cpuid > numCPUs)
-    {
-        fprintf(stderr, "CPU %d not a valid CPU ID. Range from 0 to %d.\n",cpuid,numCPUs);
-        exit(EXIT_FAILURE);
-    }
-    freq  = atoi(argv[2]);
-    if (freq < 0)
-    {
-        fprintf(stderr, "Frequency must be greater than 0.\n");
-        exit(EXIT_FAILURE);
-    }
-    snprintf(gpath, 60, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", cpuid);
-    snprintf(fpath, 60, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_setspeed", cpuid);
-
-    if (argn == 4)
-    {
-        gov = argv[3];
-
-        if ((strncmp(gov,"ondemand",12)) && (strncmp(gov,"performance",12)))
-        {
-            fprintf(stderr, "Invalid governor %s!\n",gov);
-            return (EXIT_FAILURE);
-        }
-
-        f = fopen(gpath, "w");
-        if (f == NULL)
-        {
-            fprintf(stderr, "Unable to open path for writing\n");
-            return (EXIT_FAILURE);
-        }
-        fprintf(f,"%s",gov);
-        fclose(f);
-        return(EXIT_SUCCESS);
-    }
-    else
-    {
-        f = fopen(gpath, "w");
-        if (f == NULL)
-        {
-            fprintf(stderr, "Unable to open path for writing\n");
-            return (EXIT_FAILURE);
-        }
-        fprintf(f,"userspace");
-        fclose(f);
-    }
-
-    f = fopen(fpath, "w");
-    if (f == NULL)
-    {
-        fprintf(stderr, "Unable to open path for writing\n");
-        return (EXIT_FAILURE);
-    }
-    fprintf(f,"%d",freq);
-    fclose(f);
-
-    return(EXIT_SUCCESS);
-}
-
-
+/* #####   HEADER FILE INCLUDES   ######################################### */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+char setfiles[3][100] = {"scaling_min_freq", "scaling_max_freq", "scaling_setspeed"}; /* cpufreq files that main() writes, selected by index */
+char getfiles[3][100] = {"cpuinfo_min_freq", "cpuinfo_max_freq", "cpuinfo_cur_freq"}; /* cpufreq files that main() reads; entries 0 and 1 are copied into setfiles[i] */
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+/* Count the "processor" lines in /proc/cpuinfo (0 if unreadable); the stream is closed before returning. */
+static int get_numCPUs()
+{
+    int cpucount = 0;
+    char line[1024];
+    FILE* fp = fopen("/proc/cpuinfo","r");
+    if (fp != NULL)
+    {
+        while( fgets(line,sizeof(line),fp) )
+        {
+            if (strncmp(line, "processor", 9) == 0)
+                cpucount++;
+        }
+        fclose(fp);
+    }
+    return cpucount;
+}
+
+/* #####  MAIN FUNCTION DEFINITION   ################## */
+/* Usage: <prog> <processorID> <frequency> [<governor>]
+ *
+ * With a governor: copy cpuinfo_min_freq/cpuinfo_max_freq of the CPU into
+ * scaling_min_freq/scaling_max_freq and then write the governor name.
+ * Without a governor: select the "userspace" governor and write <frequency>
+ * to scaling_min_freq, scaling_max_freq and scaling_setspeed.
+ * Returns EXIT_SUCCESS, or EXIT_FAILURE after printing a message to stderr. */
+int main (int argn, char** argv)
+{
+    int i = 0;
+    int cpuid;
+    int freq = 0;
+    int numCPUs = 0;
+    char* gov;
+    char* gpath = malloc(100);
+    char* fpath = malloc(100);
+    FILE* f;
+
+    if (argn < 3 || argn > 4)
+    {
+        fprintf(stderr, "Usage: %s <processorID> <frequency> [<governor>] \n",argv[0]);
+        free(gpath);
+        free(fpath);
+        exit(EXIT_FAILURE);
+    }
+
+    cpuid = atoi(argv[1]);
+    numCPUs = get_numCPUs();
+    if (cpuid < 0 || cpuid > numCPUs)
+    {
+        fprintf(stderr, "CPU %d not a valid CPU ID. Range from 0 to %d.\n", cpuid, numCPUs);
+        free(gpath);
+        free(fpath);
+        exit(EXIT_FAILURE);
+    }
+    freq  = atoi(argv[2]);
+    if (freq <= 0)
+    {
+        fprintf(stderr, "Frequency must be greater than 0.\n");
+        free(gpath);
+        free(fpath);
+        exit(EXIT_FAILURE);
+    }
+
+    if (argn == 4)
+    {
+        gov = argv[3];
+
+        if ((strncmp(gov,"ondemand",8) != 0) &&
+            (strncmp(gov,"performance",11) != 0) &&
+            (strncmp(gov,"conservative",12) != 0) &&
+            (strncmp(gov,"powersave",9) != 0)) {
+            fprintf(stderr, "Invalid governor %s!\n",gov);
+            free(gpath);
+            free(fpath);
+            return (EXIT_FAILURE);
+        }
+
+        /* Copy the limits reported by cpuinfo_{min,max}_freq into scaling_{min,max}_freq. */
+        for (i=0; i<2; i++)
+        {
+            snprintf(fpath, 99, "/sys/devices/system/cpu/cpu%d/cpufreq/%s", cpuid, getfiles[i]);
+            f = fopen(fpath, "r");
+            if (f == NULL) {
+                fprintf(stderr, "Unable to open path %s for reading\n", fpath);
+                free(gpath);
+                free(fpath);
+                return (EXIT_FAILURE);
+            }
+            /* Parse the number straight from the stream. The old code fread() it
+             * into fpath without terminating the buffer before calling atoi(). */
+            if (fscanf(f, "%d", &freq) != 1) {
+                fprintf(stderr, "Unable to read frequency from %s\n", fpath);
+                fclose(f);
+                free(gpath);
+                free(fpath);
+                return (EXIT_FAILURE);
+            }
+            fclose(f);
+            snprintf(fpath, 99, "/sys/devices/system/cpu/cpu%d/cpufreq/%s", cpuid, setfiles[i]);
+            f = fopen(fpath, "w");
+            if (f == NULL) {
+                fprintf(stderr, "Unable to open path %s for writing\n",fpath);
+                free(gpath);
+                free(fpath);
+                return (EXIT_FAILURE);
+            }
+            fprintf(f,"%d",freq);
+            fclose(f);
+        }
+        snprintf(gpath, 99, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", cpuid);
+        f = fopen(gpath, "w");
+        if (f == NULL) {
+            fprintf(stderr, "Unable to open path %s for writing\n", gpath);
+            free(gpath);
+            free(fpath);
+            return (EXIT_FAILURE);
+        }
+        fprintf(f,"%s",gov);
+        fclose(f);
+        free(gpath);
+        free(fpath);
+        return(EXIT_SUCCESS);
+    }
+
+    /* No governor given: select the userspace governor, then write the requested frequency. */
+    snprintf(gpath, 99, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", cpuid);
+    f = fopen(gpath, "w");
+    if (f == NULL) {
+        fprintf(stderr, "Unable to open path %s for writing\n", gpath);
+        free(gpath);
+        free(fpath);
+        return (EXIT_FAILURE);
+    }
+    fprintf(f, "%s", "userspace");
+    fclose(f);
+
+    /* Write the frequency to scaling_min_freq, scaling_max_freq and scaling_setspeed. */
+    for (i=0; i<3; i++)
+    {
+        snprintf(fpath, 99, "/sys/devices/system/cpu/cpu%d/cpufreq/%s", cpuid, setfiles[i]);
+        f = fopen(fpath, "w");
+        if (f == NULL) {
+            fprintf(stderr, "Unable to open path %s for writing\n",fpath);
+            free(gpath);
+            free(fpath);
+            return (EXIT_FAILURE);
+        }
+        fprintf(f,"%d",freq);
+        fclose(f);
+    }
+    free(gpath);
+    free(fpath);
+    return(EXIT_SUCCESS);
+}
+
+
diff --git a/src/access.c b/src/access.c
new file mode 100644
index 0000000..1102909
--- /dev/null
+++ b/src/access.c
@@ -0,0 +1,221 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  access.c
+ *
+ *      Description:  Interface for the different register access modules.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <pthread.h>
+
+#include <types.h>
+#include <error.h>
+#include <topology.h>
+#include <configuration.h>
+#include <perfmon.h>
+#include <registers.h>
+#include <access.h>
+#include <access_client.h>
+#include <access_x86.h>
+
+
+
+/* Number of CPUs for which the backend init function succeeded (see HPMaddThread). */
+static int registeredCpus = 0;
+/* Per-CPU flag: 1 once the CPU was registered by HPMaddThread, 0 otherwise. */
+static int registeredCpuList[MAX_NUM_THREADS] = { [0 ... (MAX_NUM_THREADS-1)] = 0 };
+
+
+/*
+ * Backend entry points. They are NULL until HPMinit selects a backend and
+ * are reset to NULL by HPMfinalize.
+ */
+static int (*access_read)(PciDeviceIndex dev, const int cpu, uint32_t reg, uint64_t *data) = NULL;
+static int (*access_write)(PciDeviceIndex dev, const int cpu, uint32_t reg, uint64_t data) = NULL;
+static int (*access_init) (int cpu_id) = NULL;
+static void (*access_finalize) (int cpu_id) = NULL;
+static int (*access_check) (PciDeviceIndex dev, int cpu_id) = NULL;
+
+/*
+ * Select the register access mode.
+ *
+ * Only ACCESSMODE_DIRECT and ACCESSMODE_DAEMON are accepted; any other
+ * value leaves the configured mode untouched.
+ */
+void HPMmode(int mode)
+{
+    if ((mode != ACCESSMODE_DIRECT) && (mode != ACCESSMODE_DAEMON))
+    {
+        return;
+    }
+    config.daemonMode = mode;
+}
+
+/*
+ * Select the backend used for register access.
+ *
+ * Does nothing when a backend is already installed. On x86 builds an unset
+ * access mode (-1) defaults to the access daemon; afterwards the function
+ * pointers are set to either the daemon client or the direct x86
+ * implementation, depending on config.daemonMode.
+ *
+ * Returns 0 in all cases, also when no backend could be selected. In that
+ * situation HPMaddThread reports -ENODEV.
+ */
+int HPMinit(void)
+{
+    if (access_init != NULL)
+    {
+        /* A backend is already installed. */
+        return 0;
+    }
+#if defined(__x86_64__) || defined(__i386__)
+    if (config.daemonMode == -1)
+    {
+        config.daemonMode = ACCESSMODE_DAEMON;
+    }
+    if (config.daemonMode == ACCESSMODE_DAEMON)
+    {
+        DEBUG_PLAIN_PRINT(DEBUGLEV_DEVELOP, Adjusting functions for x86 architecture in daemon mode);
+        access_init = &access_client_init;
+        access_read = &access_client_read;
+        access_write = &access_client_write;
+        access_finalize = &access_client_finalize;
+        access_check = &access_client_check;
+    }
+    else if (config.daemonMode == ACCESSMODE_DIRECT)
+    {
+        DEBUG_PLAIN_PRINT(DEBUGLEV_DEVELOP, Adjusting functions for x86 architecture in direct mode);
+        access_init = &access_x86_init;
+        access_read = &access_x86_read;
+        access_write = &access_x86_write;
+        access_finalize = &access_x86_finalize;
+        access_check = &access_x86_check;
+    }
+#endif
+
+    return 0;
+}
+
+
+/*
+ * Report whether any CPU is registered with the access layer.
+ *
+ * Returns the number of registered CPUs, i.e. 0 when none is registered.
+ */
+int HPMinitialized(void)
+{
+    const int numRegistered = registeredCpus;
+    return numRegistered;
+}
+
+/*
+ * Register a CPU with the access layer by running the backend's init
+ * function for it. Registering an already registered CPU is a no-op.
+ *
+ * Returns 0 on success (or if the CPU was registered before),
+ * -ERANGE if cpu_id does not fit the registration table,
+ * -ENODEV if no backend is installed (HPMinit not called or no backend
+ * available), or the error code returned by the backend's init function.
+ */
+int HPMaddThread(int cpu_id)
+{
+    int ret;
+
+    /* registeredCpuList has MAX_NUM_THREADS entries; reject anything else. */
+    if ((cpu_id < 0) || (cpu_id >= MAX_NUM_THREADS))
+    {
+        return -ERANGE;
+    }
+    if (registeredCpuList[cpu_id] != 0)
+    {
+        return 0;
+    }
+    if (access_init == NULL)
+    {
+        return -ENODEV;
+    }
+
+    ret = access_init(cpu_id);
+    if (ret != 0)
+    {
+        return ret;
+    }
+    DEBUG_PRINT(DEBUGLEV_DETAIL, Adding CPU %d to access module, cpu_id);
+    registeredCpus++;
+    registeredCpuList[cpu_id] = 1;
+    return 0;
+}
+
+/*
+ * Shut down the access layer: run the backend's finalize function for
+ * every registered CPU among the first cpuid_topology.numHWThreads entries,
+ * clear the registration state and drop the backend function pointers.
+ */
+void HPMfinalize()
+{
+    int cpu;
+
+    if (registeredCpus != 0)
+    {
+        for (cpu = 0; cpu < cpuid_topology.numHWThreads; cpu++)
+        {
+            if (registeredCpuList[cpu] != 1)
+            {
+                continue;
+            }
+            access_finalize(cpu);
+            registeredCpus--;
+            registeredCpuList[cpu] = 0;
+        }
+    }
+
+    /* Forget the backend so that a later HPMinit selects it again. */
+    access_init = NULL;
+    access_finalize = NULL;
+    access_read = NULL;
+    access_write = NULL;
+    access_check = NULL;
+}
+
+/*
+ * Read register 'reg' of device 'dev' as seen from CPU 'cpu_id' through the
+ * installed backend.
+ *
+ * *data is set to 0 before any further validation and receives the value
+ * delivered by the backend afterwards.
+ *
+ * Returns -EFAULT if data is NULL or dev is not a valid device index,
+ * -ERANGE if cpu_id is outside [0, numHWThreads), -ENODEV if the CPU was
+ * not registered with HPMaddThread, otherwise the backend's return value.
+ */
+int HPMread(int cpu_id, PciDeviceIndex dev, uint32_t reg, uint64_t* data)
+{
+    uint64_t tmp = 0x0ULL;
+    int err = 0;
+
+    /* The output pointer must be validated before it is written to. */
+    if (data == NULL)
+    {
+        return -EFAULT;
+    }
+    *data = 0x0ULL;
+    if (dev >= MAX_NUM_PCI_DEVICES)
+    {
+        return -EFAULT;
+    }
+    if ((cpu_id < 0) || (cpu_id >= cpuid_topology.numHWThreads))
+    {
+        return -ERANGE;
+    }
+    if (registeredCpuList[cpu_id] == 0)
+    {
+        return -ENODEV;
+    }
+    err = access_read(dev, cpu_id, reg, &tmp);
+    *data = tmp;
+    return err;
+}
+
+/*
+ * Write 'data' to register 'reg' of device 'dev' as seen from CPU 'cpu_id'
+ * through the installed backend.
+ *
+ * Returns -EFAULT for an invalid device index, -ERANGE (after printing an
+ * error) for a cpu_id outside [0, numHWThreads), -ENODEV if the CPU was not
+ * registered, otherwise the backend's return value.
+ */
+int HPMwrite(int cpu_id, PciDeviceIndex dev, uint32_t reg, uint64_t data)
+{
+    if (dev >= MAX_NUM_PCI_DEVICES)
+    {
+        return -EFAULT;
+    }
+    if (!((cpu_id >= 0) && (cpu_id < cpuid_topology.numHWThreads)))
+    {
+        ERROR_PRINT(MSR WRITE C %d OUT OF RANGE, cpu_id);
+        return -ERANGE;
+    }
+    if (registeredCpuList[cpu_id] == 0)
+    {
+        return -ENODEV;
+    }
+    return access_write(dev, cpu_id, reg, data);
+}
+
+/*
+ * Ask the installed backend whether device 'dev' is accessible from CPU
+ * 'cpu_id'.
+ *
+ * Returns -ERANGE if cpu_id does not fit the registration table, -ENODEV if
+ * the CPU was not registered with HPMaddThread, otherwise the backend's
+ * answer. Note that the error codes are non-zero: callers that only test
+ * the result for truth cannot tell them apart from a positive answer.
+ */
+int HPMcheck(PciDeviceIndex dev, int cpu_id)
+{
+    /* registeredCpuList has MAX_NUM_THREADS entries; reject anything else. */
+    if ((cpu_id < 0) || (cpu_id >= MAX_NUM_THREADS))
+    {
+        return -ERANGE;
+    }
+    if (registeredCpuList[cpu_id] == 0)
+    {
+        return -ENODEV;
+    }
+    return access_check(dev, cpu_id);
+}
diff --git a/src/accessClient.c b/src/accessClient.c
deleted file mode 100644
index ba4cb59..0000000
--- a/src/accessClient.c
+++ /dev/null
@@ -1,257 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  accessClient.c
- *
- *      Description:  Implementation of client to the access daemon.
- *                   Provides API to read and write values to MSR or
- *                   PCI Cfg Adresses. This module is used by the
- *                   msr and pci modules.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-
-/* #####   HEADER FILE INCLUDES   ######################################### */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <fcntl.h>
-#include <string.h>
-#include <unistd.h>
-#include <signal.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-
-#include <types.h>
-#include <error.h>
-#include <cpuid.h>
-#include <accessClient.h>
-
-int accessClient_mode = ACCESSMODE;
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-#define STRINGIFY(x) #x
-#define TOSTRING(x) STRINGIFY(x)
-
-/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
-
-/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-static char* accessClient_strerror(AccessErrorType det)
-{
-    switch (det)
-    {
-        case ERR_NOERROR:    return "No error";
-        case ERR_UNKNOWN:    return "unknown command";
-        case ERR_RESTREG:    return "access to this register is not allowed";
-        case ERR_OPENFAIL:   return "failed to open device file";
-        case ERR_RWFAIL:     return "failed to read/write register";
-        case ERR_DAEMONBUSY: return "daemon already has a same/higher priority client";
-        case ERR_LOCKED:     return "access to HPM is locked";
-        case ERR_UNSUPPORTED: return "unsupported processor";
-        case ERR_NODEV:      return "no such device";
-        default:             return "UNKNOWN errorcode";
-    }
-}
-
-static int startDaemon(void)
-{
-    /* Check the function of the daemon here */
-    char* filepath;
-    char *newargv[] = { NULL };
-    char *newenv[] = { NULL };
-    char *exeprog = TOSTRING(ACCESSDAEMON);
-    struct sockaddr_un address;
-    size_t address_length;
-    int  ret;
-    pid_t pid;
-    int timeout = 1000;
-    int socket_fd = -1;
-
-    if (accessClient_mode == DAEMON_AM_ACCESS_D)
-    {
-        if (access(exeprog, F_OK))
-        {
-            fprintf(stderr, "Daemon '%s' cannot be found\n", exeprog);
-            exit(EXIT_FAILURE);
-        }
-        if (access(exeprog, X_OK))
-        {
-            fprintf(stderr, "Daemon '%s' not executable\n", exeprog);
-            exit(EXIT_FAILURE);
-        }
-        pid = fork();
-
-        if (pid == 0)
-        {
-            ret = execve (exeprog, newargv, newenv);
-            ERRNO_PRINT;
-            fprintf(stderr, "Failed to execute the daemon '%s' (see error above)\n", exeprog);
-            exit(EXIT_FAILURE);
-        }
-        else if (pid < 0)
-        {
-            ERROR_PLAIN_PRINT(Failed to fork);
-        }
-    }
-
-    EXIT_IF_ERROR(socket_fd = socket(AF_LOCAL, SOCK_STREAM, 0), socket() failed);
-
-    address.sun_family = AF_LOCAL;
-    address_length = sizeof(address);
-    snprintf(address.sun_path, sizeof(address.sun_path), "/tmp/likwid-%d", pid);
-    filepath = strdup(address.sun_path);
-    DEBUG_PRINT(0, "%ssocket pathname is %s\n",
-            ((accessClient_mode == DAEMON_AM_ACCESS_D) ? "Generated " : ""),
-            filepath);
-
-    while (timeout > 0)
-    {
-        int res;
-        usleep(1000);
-        res = connect(socket_fd, (struct sockaddr *) &address, address_length);
-
-        if (res == 0)
-        {
-            break;
-        }
-
-        timeout--;
-        DEBUG_PRINT(1, "%s\n", "Still waiting for socket...");
-    }
-
-    if (timeout <= 0)
-    {
-        ERRNO_PRINT;  /* should hopefully still work, as we make no syscalls in between. */
-        fprintf(stderr, "Exiting due to timeout: The socket file at '%s' \
-                could not be opened within 10 seconds.\n", filepath);
-        fprintf(stderr, "Consult the error message above this to find out why.\n");
-        fprintf(stderr, "If the error is 'no such file or directoy', \
-                it usually means that likwid-accessD just failed to start.\n");
-        fprintf(stderr, "In case the daemon itself output an error', \
-                ignore this.\n");
-        exit(EXIT_FAILURE);
-    }
-
-    DEBUG_PRINT(0, "%s\n", "Successfully opened socket to daemon.");
-    free(filepath);
-
-    return socket_fd;
-}
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-void accessClient_setaccessmode(int mode)
-{
-    if ((accessClient_mode > DAEMON_AM_ACCESS_D) || (accessClient_mode < DAEMON_AM_DIRECT))
-    {
-        fprintf(stderr, "Invalid accessmode %d\n", accessClient_mode);
-        exit(EXIT_FAILURE);
-    }
-
-    accessClient_mode = mode;
-}
-
-void accessClient_init(int* socket_fd)
-{
-    if ((accessClient_mode == DAEMON_AM_ACCESS_D))
-    {
-        (*socket_fd) = startDaemon();
-    }
-}
-
-void accessClient_finalize(int socket_fd)
-{
-    if ( socket_fd != -1 )
-    { /* Only if a socket is actually open */
-        AccessDataRecord data;
-        data.type = DAEMON_EXIT;
-        CHECK_ERROR(write(socket_fd, &data, sizeof(AccessDataRecord)),socket write failed);
-        CHECK_ERROR(close(socket_fd),socket close failed);
-    }
-}
-
-
-uint64_t accessClient_read(
-        int socket_fd,
-        const int cpu,
-        const int device,
-        uint32_t reg)
-{
-    AccessDataRecord data;
-
-    data.cpu = cpu;
-    data.reg = reg;
-    data.data = 0x00;
-    data.type = DAEMON_READ;
-    data.device = device;
-
-    CHECK_ERROR(write(socket_fd, &data, sizeof(AccessDataRecord)), socket write failed);
-    CHECK_ERROR(read(socket_fd, &data, sizeof(AccessDataRecord)), socket read failed);
-
-    if (data.errorcode != ERR_NOERROR)
-    {
-        fprintf(stderr, "Failed to read data through daemon: "
-                "daemon returned error %d '%s' for cpu %d reg 0x%x\n",
-                data.errorcode, accessClient_strerror(data.errorcode), cpu, reg);
-        //exit(EXIT_FAILURE);
-    }
-
-    return data.data;
-}
-
-void accessClient_write(
-        int socket_fd,
-        const int cpu,
-        const int device,
-        uint32_t reg,
-        uint64_t sdata)
-{
-    AccessDataRecord data;
-
-    data.cpu = cpu;
-    data.reg = reg;
-    data.data = sdata;
-    data.type = DAEMON_WRITE;
-    data.device = device;
-    CHECK_ERROR(write(socket_fd, &data, sizeof(AccessDataRecord)), socket write failed);
-    CHECK_ERROR(read(socket_fd, &data, sizeof(AccessDataRecord)), socket read failed);
-
-    if (data.errorcode != ERR_NOERROR)
-    {
-        fprintf(stderr, "Failed to write data through daemon: "
-                "daemon returned error %d '%s' for cpu %d reg 0x%x\n",
-                data.errorcode, accessClient_strerror(data.errorcode), cpu, reg);
-        //exit(EXIT_FAILURE);
-    }
-
-    if (data.data != 0x00ULL)
-    {
-        ERROR_PLAIN_PRINT(daemon write failed);
-    }
-}
-
-
diff --git a/src/access_client.c b/src/access_client.c
new file mode 100644
index 0000000..93623f0
--- /dev/null
+++ b/src/access_client.c
@@ -0,0 +1,343 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <pthread.h>
+
+#include <types.h>
+#include <error.h>
+#include <topology.h>
+#include <access.h>
+#include <access_client.h>
+#include <configuration.h>
+#include <affinity.h>
+
+/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
+/* Socket used for CPUs that have no connection of their own; -1 while unset. */
+static int globalSocket = -1;
+/* Number of entries in cpuSockets that currently hold an open socket. */
+static int cpuSockets_open = 0;
+/* Per-CPU socket to the access daemon started for that CPU; -1 = not connected. */
+static int cpuSockets[MAX_NUM_THREADS] = { [0 ... MAX_NUM_THREADS-1] = -1};
+/* Serialises request/response pairs sent over globalSocket. */
+static pthread_mutex_t globalLock = PTHREAD_MUTEX_INITIALIZER;
+/* Serialises request/response pairs sent over the matching cpuSockets entry. */
+static pthread_mutex_t cpuLocks[MAX_NUM_THREADS] = { [0 ... MAX_NUM_THREADS-1] = PTHREAD_MUTEX_INITIALIZER };
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+/*
+ * Map an error code reported by the access daemon to a human-readable
+ * message. Codes without a dedicated text yield "UNKNOWN errorcode".
+ */
+static char*
+access_client_strerror(AccessErrorType det)
+{
+    char* message = "UNKNOWN errorcode";
+
+    switch (det)
+    {
+        case ERR_NOERROR:
+            message = "No error";
+            break;
+        case ERR_UNKNOWN:
+            message = "unknown command";
+            break;
+        case ERR_RESTREG:
+            message = "access to this register is not allowed";
+            break;
+        case ERR_OPENFAIL:
+            message = "failed to open device file";
+            break;
+        case ERR_RWFAIL:
+            message = "failed to read/write register";
+            break;
+        case ERR_DAEMONBUSY:
+            message = "daemon already has a same/higher priority client";
+            break;
+        case ERR_NODEV:
+            message = "no such pci device";
+            break;
+        default:
+            break;
+    }
+    return message;
+}
+
+/*
+ * Map an error code reported by the access daemon to a negative errno
+ * value (0 for ERR_NOERROR). Unknown codes map to -EFAULT.
+ */
+static int
+access_client_errno(AccessErrorType det)
+{
+    int code = -EFAULT;
+
+    switch (det)
+    {
+        case ERR_NOERROR:
+            code = 0;
+            break;
+        case ERR_RESTREG:
+            code = -EPERM;
+            break;
+        case ERR_OPENFAIL:
+            code = -ENXIO;
+            break;
+        case ERR_RWFAIL:
+            code = -EIO;
+            break;
+        case ERR_DAEMONBUSY:
+            code = -EBUSY;
+            break;
+        case ERR_NODEV:
+            code = -ENODEV;
+            break;
+        case ERR_UNKNOWN:
+        default:
+            /* Both share the initial value -EFAULT. */
+            break;
+    }
+    return code;
+}
+
+/*
+ * Start an access daemon process and connect to its UNIX socket.
+ *
+ * The daemon binary is config.daemonPath if set, otherwise the compiled-in
+ * ACCESSDAEMON path. The child is pinned to 'cpu_id' (if non-negative)
+ * before executing the daemon. The parent then tries to connect to
+ * "/tmp/likwid-<pid of the child>", retrying up to 1000 times with a 1 ms
+ * pause before each attempt.
+ *
+ * Returns the connected socket descriptor. Every failure (path too long,
+ * daemon not executable, fork failure, socket() failure, connect timeout)
+ * terminates the calling process with EXIT_FAILURE.
+ */
+static int
+access_client_startDaemon(int cpu_id)
+{
+    char *newargv[] = { NULL };
+    char *newenv[] = { NULL };
+    char *safeexeprog = TOSTRING(ACCESSDAEMON);
+    const char *daemonPath = safeexeprog;
+    const char *filepath;
+    char exeprog[1024];
+    struct sockaddr_un address;
+    size_t address_length;
+    int  ret;
+    pid_t pid;
+    int timeout = 1000;
+    int socket_fd = -1;
+
+    if (config.daemonPath != NULL)
+    {
+        daemonPath = config.daemonPath;
+    }
+    /* Bounded copy: the configured path is not guaranteed to fit into exeprog. */
+    ret = snprintf(exeprog, sizeof(exeprog), "%s", daemonPath);
+    if ((ret < 0) || ((size_t)ret >= sizeof(exeprog)))
+    {
+        ERROR_PRINT(Path to the daemon '%s' is too long, daemonPath);
+        exit(EXIT_FAILURE);
+    }
+
+    if (access(exeprog, X_OK))
+    {
+        ERROR_PRINT(Failed to find the daemon '%s'\n, exeprog);
+        exit(EXIT_FAILURE);
+    }
+
+    pid = fork();
+
+    if (pid == 0)
+    {
+        if (cpu_id >= 0)
+        {
+            cpu_set_t cpuset;
+            CPU_ZERO(&cpuset);
+            CPU_SET(cpu_id, &cpuset);
+            sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
+        }
+        /* execve() only returns if it failed. */
+        execve (exeprog, newargv, newenv);
+        ERROR_PRINT(Failed to execute the daemon '%s'\n, exeprog);
+        /* _exit(): do not run the parent's exit handlers or flush its
+         * stdio buffers a second time from the forked child. */
+        _exit(EXIT_FAILURE);
+    }
+    else if (pid < 0)
+    {
+        /* Without a child there is no socket to wait for; give up right
+         * away instead of running into the connect timeout below. */
+        ERROR_PLAIN_PRINT(Failed to fork);
+        exit(EXIT_FAILURE);
+    }
+
+    EXIT_IF_ERROR(socket_fd = socket(AF_LOCAL, SOCK_STREAM, 0), socket() failed);
+
+    memset(&address, 0, sizeof(address));
+    address.sun_family = AF_LOCAL;
+    address_length = sizeof(address);
+    snprintf(address.sun_path, sizeof(address.sun_path), "/tmp/likwid-%d", pid);
+    filepath = address.sun_path;
+
+    while (timeout > 0)
+    {
+        int res;
+        usleep(1000);
+        res = connect(socket_fd, (struct sockaddr *) &address, address_length);
+
+        if (res == 0)
+        {
+            break;
+        }
+
+        timeout--;
+        DEBUG_PRINT(DEBUGLEV_INFO, Still waiting for socket %s ..., filepath);
+    }
+
+    if (timeout <= 0)
+    {
+        ERRNO_PRINT;  /* should hopefully still work, as we make no syscalls in between. */
+        /* NOTE(review): the loop above sleeps 1000 x 1 ms, which does not
+         * match the "10 seconds" in the message - confirm the intended
+         * timeout. */
+        fprintf(stderr, "Exiting due to timeout: The socket file at '%s' "
+                "could not be opened within 10 seconds.\n", filepath);
+        fprintf(stderr, "Consult the error message above this to find out why.\n");
+        fprintf(stderr, "If the error is 'no such file or directory', "
+                "it usually means that likwid-accessD just failed to start.\n");
+        exit(EXIT_FAILURE);
+    }
+    DEBUG_PRINT(DEBUGLEV_INFO, Successfully opened socket %s to daemon for CPU %d, filepath, cpu_id);
+
+    return socket_fd;
+}
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+
+/*
+ * Make sure a daemon connection exists for CPU 'cpu_id'.
+ *
+ * If the CPU has no socket yet, a daemon is started for it and the
+ * resulting socket is stored. The first socket opened also becomes the
+ * global socket.
+ *
+ * The "already connected" test is done while holding the per-CPU lock so
+ * that two threads initialising the same CPU cannot both start a daemon.
+ * The open-socket counter and the global socket are updated under the
+ * global lock.
+ *
+ * Returns 0 on success and -ERANGE if cpu_id does not fit the socket table.
+ */
+int access_client_init(int cpu_id)
+{
+    int opened = 0;
+
+    if ((cpu_id < 0) || (cpu_id >= MAX_NUM_THREADS))
+    {
+        return -ERANGE;
+    }
+
+    pthread_mutex_lock(&cpuLocks[cpu_id]);
+    if (cpuSockets[cpu_id] < 0)
+    {
+        cpuSockets[cpu_id] = access_client_startDaemon(cpu_id);
+        opened = 1;
+    }
+    pthread_mutex_unlock(&cpuLocks[cpu_id]);
+
+    if (opened)
+    {
+        pthread_mutex_lock(&globalLock);
+        cpuSockets_open++;
+        if (globalSocket == -1)
+        {
+            globalSocket = cpuSockets[cpu_id];
+        }
+        pthread_mutex_unlock(&globalLock);
+    }
+    return 0;
+}
+
+/*
+ * Read register 'reg' of device 'dev' through the access daemon.
+ *
+ * The request is sent over the CPU's own socket if it has one that differs
+ * from the global socket, otherwise over the global socket; the matching
+ * lock is held for the request/response pair. For MSR_DEV the request
+ * addresses CPU 'cpu_id', for all other devices it addresses
+ * affinity_core2node_lookup[cpu_id].
+ *
+ * Returns 0 and stores the value in *data on success. Returns -ENOENT if
+ * no socket is open at all, -EBADFD if no usable socket exists, or the
+ * errno equivalent of the daemon's error code; *data is 0 for the latter
+ * two.
+ */
+int access_client_read(PciDeviceIndex dev, const int cpu_id, uint32_t reg, uint64_t *data)
+{
+    int sockfd = globalSocket;
+    int target = cpu_id;
+    pthread_mutex_t* lockptr = &globalLock;
+    AccessDataRecord record;
+
+    if (cpuSockets_open == 0)
+    {
+        return -ENOENT;
+    }
+
+    if ((cpuSockets[cpu_id] >= 0) && (cpuSockets[cpu_id] != sockfd))
+    {
+        sockfd = cpuSockets[cpu_id];
+        lockptr = &cpuLocks[cpu_id];
+    }
+    if (sockfd == -1)
+    {
+        *data = 0;
+        return -EBADFD;
+    }
+
+    record.device = MSR_DEV;
+    if (dev != MSR_DEV)
+    {
+        target = affinity_core2node_lookup[cpu_id];
+        record.device = dev;
+    }
+    record.cpu = target;
+    record.reg = reg;
+    record.data = 0x00;
+    record.type = DAEMON_READ;
+
+    pthread_mutex_lock(lockptr);
+    CHECK_ERROR(write(sockfd, &record, sizeof(AccessDataRecord)), socket write failed);
+    CHECK_ERROR(read(sockfd, &record, sizeof(AccessDataRecord)), socket read failed);
+    *data = record.data;
+    pthread_mutex_unlock(lockptr);
+
+    if (record.errorcode != ERR_NOERROR)
+    {
+        if (dev == MSR_DEV)
+        {
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, Got error '%s' from access daemon reading reg 0x%X at CPU %d,
+                        access_client_strerror(record.errorcode), reg, cpu_id);
+        }
+        else
+        {
+            /* Report the id the request was addressed to, not the CPU id. */
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, Got error '%s' from access daemon reading reg 0x%X on socket %d,
+                        access_client_strerror(record.errorcode), reg, target);
+        }
+        *data = 0;
+        return access_client_errno(record.errorcode);
+    }
+    return 0;
+}
+
+/*
+ * Write 'data' to register 'reg' of device 'dev' through the access daemon.
+ *
+ * The request is sent over the CPU's own socket if it has one that differs
+ * from the global socket, otherwise over the global socket; the matching
+ * lock is held for the request/response pair. For MSR_DEV the request
+ * addresses CPU 'cpu_id', for all other devices it addresses
+ * affinity_core2node_lookup[cpu_id].
+ *
+ * Returns 0 on success, -ENOENT if no socket is open at all, -EBADFD if no
+ * usable socket exists, or the errno equivalent of the daemon's error code.
+ */
+int access_client_write(PciDeviceIndex dev, const int cpu_id, uint32_t reg, uint64_t data)
+{
+    int sockfd = globalSocket;
+    int target = cpu_id;
+    pthread_mutex_t* lockptr = &globalLock;
+    AccessDataRecord record;
+
+    if (cpuSockets_open == 0)
+    {
+        return -ENOENT;
+    }
+
+    if ((cpuSockets[cpu_id] >= 0) && (cpuSockets[cpu_id] != sockfd))
+    {
+        sockfd = cpuSockets[cpu_id];
+        lockptr = &cpuLocks[cpu_id];
+    }
+    if (sockfd == -1)
+    {
+        return -EBADFD;
+    }
+
+    record.device = MSR_DEV;
+    if (dev != MSR_DEV)
+    {
+        target = affinity_core2node_lookup[cpu_id];
+        record.device = dev;
+    }
+    record.cpu = target;
+    record.reg = reg;
+    record.data = data;
+    record.type = DAEMON_WRITE;
+
+    pthread_mutex_lock(lockptr);
+    CHECK_ERROR(write(sockfd, &record, sizeof(AccessDataRecord)), socket write failed);
+    CHECK_ERROR(read(sockfd, &record, sizeof(AccessDataRecord)), socket read failed);
+    pthread_mutex_unlock(lockptr);
+
+    if (record.errorcode != ERR_NOERROR)
+    {
+        if (dev == MSR_DEV)
+        {
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, Got error '%s' from access daemon writing reg 0x%X at CPU %d,
+                        access_client_strerror(record.errorcode), reg, cpu_id);
+        }
+        else
+        {
+            /* Report the id the request was addressed to, not the CPU id. */
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, Got error '%s' from access daemon writing reg 0x%X on socket %d,
+                        access_client_strerror(record.errorcode), reg, target);
+        }
+        return access_client_errno(record.errorcode);
+    }
+    return 0;
+}
+
+/*
+ * Close the daemon connection of CPU 'cpu_id', if it has one: send a
+ * DAEMON_EXIT request, close the socket and mark the slot as unused (-1).
+ *
+ * If the closed socket was also the global socket, the global socket is
+ * moved to another CPU's open socket (or to -1 if none is left) so that it
+ * never refers to a closed descriptor.
+ *
+ * No locks are taken here.
+ */
+void access_client_finalize(int cpu_id)
+{
+    AccessDataRecord record;
+
+    /* -1 marks an unused slot, so every non-negative value is a socket. */
+    if (cpuSockets[cpu_id] >= 0)
+    {
+        int closedSocket = cpuSockets[cpu_id];
+
+        record.type = DAEMON_EXIT;
+        CHECK_ERROR(write(closedSocket, &record, sizeof(AccessDataRecord)),socket write failed);
+        CHECK_ERROR(close(closedSocket),socket close failed);
+        cpuSockets[cpu_id] = -1;
+        cpuSockets_open--;
+
+        if (globalSocket == closedSocket)
+        {
+            globalSocket = -1;
+            for (int i = 0; i < MAX_NUM_THREADS; i++)
+            {
+                if (cpuSockets[i] >= 0)
+                {
+                    globalSocket = cpuSockets[i];
+                    break;
+                }
+            }
+        }
+    }
+    if (cpuSockets_open == 0)
+    {
+        globalSocket = -1;
+    }
+}
+
+/*
+ * Ask the access daemon whether device 'dev' is accessible.
+ *
+ * For MSR_DEV the request addresses CPU 'cpu_id', for all other devices it
+ * addresses affinity_core2node_lookup[cpu_id]. The request is only sent if
+ * the CPU has its own socket, or if exactly one socket is open and it is
+ * the global one.
+ *
+ * Returns 1 if the daemon answered without error, 0 otherwise (including
+ * the case that no request was sent).
+ */
+int access_client_check(PciDeviceIndex dev, int cpu_id)
+{
+    int sockfd = globalSocket;
+    pthread_mutex_t* lockptr = &globalLock;
+    AccessDataRecord record;
+
+    record.cpu = cpu_id;
+    record.device = dev;
+    record.type = DAEMON_CHECK;
+    if (dev != MSR_DEV)
+    {
+        record.cpu = affinity_core2node_lookup[cpu_id];
+    }
+    /* -1 marks an unused slot, so every non-negative value is a socket. */
+    if ((cpuSockets[cpu_id] >= 0) && (cpuSockets[cpu_id] != sockfd))
+    {
+        sockfd = cpuSockets[cpu_id];
+        lockptr = &cpuLocks[cpu_id];
+    }
+    if ((cpuSockets[cpu_id] >= 0) || ((cpuSockets_open == 1) && (sockfd >= 0)))
+    {
+        pthread_mutex_lock(lockptr);
+        CHECK_ERROR(write(sockfd, &record, sizeof(AccessDataRecord)), socket write failed);
+        CHECK_ERROR(read(sockfd, &record, sizeof(AccessDataRecord)), socket read failed);
+        pthread_mutex_unlock(lockptr);
+        if (record.errorcode == ERR_NOERROR )
+        {
+            return 1;
+        }
+    }
+    return 0;
+}
diff --git a/src/access_x86.c b/src/access_x86.c
new file mode 100644
index 0000000..4cda3a7
--- /dev/null
+++ b/src/access_x86.c
@@ -0,0 +1,91 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <signal.h>
+
+
+#include <types.h>
+#include <error.h>
+#include <topology.h>
+#include <access.h>
+#include <access_x86.h>
+#include <access_x86_msr.h>
+#include <access_x86_pci.h>
+#include <affinity.h>
+
+
+
+/*
+ * Initialise direct register access for CPU 'cpu_id': first the MSR part,
+ * then - only if that succeeded and the processor supports Uncore - the PCI
+ * part for affinity_core2node_lookup[cpu_id].
+ *
+ * Returns the result of the last initialisation step that was run.
+ */
+int access_x86_init(int cpu_id)
+{
+    const int msrResult = access_x86_msr_init(cpu_id);
+
+    if ((msrResult != 0) || (!cpuid_info.supportUncore))
+    {
+        return msrResult;
+    }
+    return access_x86_pci_init(affinity_core2node_lookup[cpu_id]);
+}
+
+/*
+ * Read register 'reg' of device 'dev' directly.
+ *
+ * MSR_DEV is read through the MSR module for CPU 'cpu_id'; every other
+ * device is read through the PCI module for
+ * affinity_core2node_lookup[cpu_id], provided the PCI module reports the
+ * device as accessible.
+ *
+ * Returns the result of the read, or -ENODEV (with *data set to 0) if the
+ * PCI device is not accessible.
+ */
+int access_x86_read(PciDeviceIndex dev, const int cpu_id, uint32_t reg, uint64_t *data)
+{
+    /* Covers the path on which no read is attempted at all. */
+    int err = -ENODEV;
+    uint64_t tmp = 0x0ULL;
+
+    if (dev == MSR_DEV)
+    {
+        err = access_x86_msr_read(cpu_id, reg, &tmp);
+    }
+    else if (access_x86_pci_check(dev, affinity_core2node_lookup[cpu_id]))
+    {
+        err = access_x86_pci_read(dev, affinity_core2node_lookup[cpu_id], reg, &tmp);
+    }
+    *data = tmp;
+    return err;
+}
+
+/*
+ * Write 'data' to register 'reg' of device 'dev' directly.
+ *
+ * MSR_DEV is written through the MSR module for CPU 'cpu_id'; every other
+ * device is written through the PCI module for
+ * affinity_core2node_lookup[cpu_id], provided the PCI module reports the
+ * device as accessible.
+ *
+ * Returns the result of the write, or -ENODEV if the PCI device is not
+ * accessible.
+ */
+int access_x86_write(PciDeviceIndex dev, const int cpu_id, uint32_t reg, uint64_t data)
+{
+    /* Covers the path on which no write is attempted at all. */
+    int err = -ENODEV;
+
+    if (dev == MSR_DEV)
+    {
+        err = access_x86_msr_write(cpu_id, reg, data);
+    }
+    else if (access_x86_pci_check(dev, affinity_core2node_lookup[cpu_id]))
+    {
+        err = access_x86_pci_write(dev, affinity_core2node_lookup[cpu_id], reg, data);
+    }
+    return err;
+}
+
+/*
+ * Release direct register access for CPU 'cpu_id': the MSR part always,
+ * the PCI part for affinity_core2node_lookup[cpu_id] only if the processor
+ * supports Uncore.
+ */
+void access_x86_finalize(int cpu_id)
+{
+    access_x86_msr_finalize(cpu_id);
+    if (!cpuid_info.supportUncore)
+    {
+        return;
+    }
+    access_x86_pci_finalize(affinity_core2node_lookup[cpu_id]);
+}
+
+/*
+ * Report whether device 'dev' is accessible in direct mode. MSR_DEV is
+ * answered by the MSR module for CPU 'cpu_id', every other device by the
+ * PCI module for affinity_core2node_lookup[cpu_id].
+ */
+int access_x86_check(PciDeviceIndex dev, int cpu_id)
+{
+    if (dev != MSR_DEV)
+    {
+        return access_x86_pci_check(dev, affinity_core2node_lookup[cpu_id]);
+    }
+    return access_x86_msr_check(dev, cpu_id);
+}
diff --git a/src/access_x86_msr.c b/src/access_x86_msr.c
new file mode 100644
index 0000000..08a082d
--- /dev/null
+++ b/src/access_x86_msr.c
@@ -0,0 +1,288 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  access_x86_msr.c
+ *
+ *      Description:  Implementation of msr module.
+ *                   Provides API to read and write values to the model
+ *                   specific registers on x86 processors using the msr
+ *                   sys interface of the Linux 2.6 kernel. This module
+ *                   is based on the msr-util tools.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com.
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+/* #####   HEADER FILE INCLUDES   ######################################### */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include <types.h>
+#include <error.h>
+#include <topology.h>
+#include <access_x86_msr.h>
+#include <registers.h>
+#ifdef LIKWID_PROFILE_COUNTER_READ
+#include <timer.h>
+#endif
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+/* Size in bytes of the buffer that holds the MSR device file name. */
+#define MAX_LENGTH_MSR_DEV_NAME  20
+/* Two-step stringification so that macro arguments are expanded first. */
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+
+/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
+/* Per-CPU file descriptor of the opened MSR device file; starts out as -1. */
+static int FD[MAX_NUM_THREADS] = { [0 ... MAX_NUM_THREADS-1] = -1 };
+/*
+ * Result of the RDPMC probe for the PMC and the FIXED counters. Both start
+ * as -1, receive the result of test_rdpmc() in access_x86_msr_init() and
+ * are set to 0 by access_x86_msr_read() when an RDPMC read fails.
+ */
+static int rdpmc_works_pmc = -1;
+static int rdpmc_works_fixed = -1;
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+
+/*
+ * Read performance counter 'counter' with the RDPMC instruction on CPU
+ * 'cpu_id'.
+ *
+ * The calling thread is pinned to 'cpu_id' for the read and its previous
+ * affinity mask is restored afterwards. If the current mask cannot be
+ * saved or the thread cannot be pinned, no read is attempted: the
+ * instruction would otherwise run on whatever CPU the thread happens to be
+ * on.
+ *
+ * Returns 0 on success with the 64 bit counter value in *value, or the
+ * negated errno of the failed affinity call.
+ */
+static inline int __rdpmc(int cpu_id, int counter, uint64_t* value)
+{
+    unsigned low, high;
+    cpu_set_t cpuset, current;
+
+    if (sched_getaffinity(0, sizeof(cpu_set_t), &current) < 0)
+    {
+        return -errno;
+    }
+    CPU_ZERO(&cpuset);
+    CPU_SET(cpu_id, &cpuset);
+    if (sched_setaffinity(0, sizeof(cpu_set_t), &cpuset) < 0)
+    {
+        return -errno;
+    }
+    /* RDPMC: counter index in ECX, result in EDX:EAX. */
+    __asm__ volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
+    *value = ((low) | ((uint64_t )(high) << 32));
+    sched_setaffinity(0, sizeof(cpu_set_t), &current);
+    return 0;
+}
+
+/*
+ * SIGSEGV handler installed by test_rdpmc() in its forked child: terminate
+ * the child with exit status 1.
+ *
+ * _exit() is used because exit() is not async-signal-safe; it would also
+ * run exit handlers and flush stdio buffers inherited from the parent.
+ */
+void segfault_sigaction(int signal, siginfo_t *si, void *arg)
+{
+    (void) signal;
+    (void) si;
+    (void) arg;
+    _exit(1);
+}
+
+/*
+ * Probe in a forked child whether the RDPMC instruction can be used for
+ * counter index 'value' on CPU 'cpu_id'.
+ *
+ * The child installs segfault_sigaction() for SIGSEGV and, if 'flag' is 0,
+ * executes the read. The probe counts as passed only if the child
+ * terminated normally with exit status 0; a child killed by a signal counts
+ * as failed.
+ *
+ * Returns 1 if the probe passed, 0 if it failed, -1 if fork() failed.
+ */
+int test_rdpmc(int cpu_id, uint64_t value, int flag)
+{
+    int ret;
+    pid_t pid;
+
+    pid = fork();
+
+    if (pid < 0)
+    {
+        return -1;
+    }
+    if (pid == 0)
+    {
+        uint64_t tmp;
+        struct sigaction sa;
+        memset(&sa, 0, sizeof(struct sigaction));
+        sigemptyset(&sa.sa_mask);
+        sa.sa_sigaction = segfault_sigaction;
+        sa.sa_flags   = SA_SIGINFO;
+        sigaction(SIGSEGV, &sa, NULL);
+        if (flag == 0)
+        {
+            if (__rdpmc(cpu_id, value, &tmp) != 0)
+            {
+                _exit(1);
+            }
+            usleep(100);
+        }
+        /* _exit(): leave the parent's exit handlers and stdio buffers alone. */
+        _exit(0);
+    }
+    else
+    {
+        int status = 0;
+        pid_t waiting = waitpid(pid, &status, 0);
+
+        /* WEXITSTATUS is only meaningful for a normally terminated child. */
+        if ((waiting < 0) || (!WIFEXITED(status)) || (WEXITSTATUS(status) != 0))
+        {
+            ret = 0;
+        }
+        else
+        {
+            ret = 1;
+        }
+    }
+    return ret;
+}
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+
+
+/*
+ * Open the MSR device file of CPU 'cpu_id' for direct access.
+ *
+ * Does nothing if the device is already open. Otherwise "/dev/msr<cpu>" is
+ * tried first and "/dev/cpu/<cpu>/msr" second. The descriptor from this
+ * probe is closed again; the RDPMC probes for PMC and FIXED counters run
+ * (once, as long as their result is still unknown) and the device file is
+ * then opened for good and stored in the per-CPU descriptor table.
+ *
+ * Returns 0 on success, -ERANGE if cpu_id does not fit the descriptor
+ * table and -EPERM if the device file cannot be opened.
+ */
+int
+access_x86_msr_init(const int cpu_id)
+{
+    int fd = -1;
+    char msr_file_name[MAX_LENGTH_MSR_DEV_NAME];
+
+    if ((cpu_id < 0) || (cpu_id >= MAX_NUM_THREADS))
+    {
+        return -ERANGE;
+    }
+    if (FD[cpu_id] > 0)
+    {
+        return 0;
+    }
+
+    /* Find out which of the two device file layouts is usable. */
+    snprintf(msr_file_name, sizeof(msr_file_name), "/dev/msr%d", cpu_id);
+    fd = open(msr_file_name, O_RDWR);
+    if (fd < 0)
+    {
+        snprintf(msr_file_name, sizeof(msr_file_name), "/dev/cpu/%d/msr", cpu_id);
+        fd = open(msr_file_name, O_RDWR);
+    }
+    if (fd < 0)
+    {
+        ERROR_PRINT(Cannot access MSR device file %s: %s.,msr_file_name , strerror(errno));
+        ERROR_PLAIN_PRINT(Please check if 'msr' module is loaded and device files have correct permissions);
+        ERROR_PLAIN_PRINT(Alternatively you might want to look into (sys)daemonmode);
+        return -EPERM;
+    }
+    /* The probe descriptor is not kept; the file is reopened below. */
+    close(fd);
+
+    if (rdpmc_works_pmc < 0)
+    {
+        rdpmc_works_pmc = test_rdpmc(cpu_id, 0, 0);
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, Test for RDPMC for PMC counters returned %d, rdpmc_works_pmc);
+    }
+    if (rdpmc_works_fixed < 0)
+    {
+        rdpmc_works_fixed = test_rdpmc(cpu_id, (1<<30), 0);
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, Test for RDPMC for FIXED counters returned %d, rdpmc_works_fixed);
+    }
+
+    /* msr_file_name still holds the path that could be opened above. */
+    FD[cpu_id] = open(msr_file_name, O_RDWR);
+    if ( FD[cpu_id] < 0 )
+    {
+        ERROR_PRINT(Cannot access MSR device file %s in direct mode, msr_file_name);
+        return -EPERM;
+    }
+    DEBUG_PRINT(DEBUGLEV_DEVELOP, Opened MSR device %s for CPU %d,msr_file_name, cpu_id);
+
+    return 0;
+}
+
+/*
+ * Close the MSR device file of CPU 'cpu_id', if it is open, and put the
+ * table slot back to -1, the value the table is initialised with.
+ * A cpu_id that does not fit the descriptor table is ignored.
+ */
+void
+access_x86_msr_finalize(const int cpu_id)
+{
+    if ((cpu_id < 0) || (cpu_id >= MAX_NUM_THREADS))
+    {
+        return;
+    }
+    if (FD[cpu_id] > 0)
+    {
+        close(FD[cpu_id]);
+        FD[cpu_id] = -1;
+    }
+}
+
+
+/*
+ * Read MSR 'reg' on CPU 'cpu_id' into *data.
+ *
+ * Registers in [MSR_PMC0, MSR_PMC7] and [MSR_PERF_FIXED_CTR0,
+ * MSR_PERF_FIXED_CTR2] are read with RDPMC as long as the matching probe
+ * flag is 1. If that read fails, the flag is cleared and the register is
+ * read from the MSR device file instead, like every other register.
+ *
+ * Returns 0 on success, a negated errno if the device read fails and -EIO
+ * if it delivers fewer bytes than requested. If the device file of the CPU
+ * is not open, nothing is read and 0 is returned.
+ */
+int
+access_x86_msr_read( const int cpu_id, uint32_t reg, uint64_t *data)
+{
+    ssize_t ret;
+
+    if ((rdpmc_works_pmc == 1) && (reg >= MSR_PMC0) && (reg <=MSR_PMC7))
+    {
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, Read PMC counter with RDPMC instruction with index %d, reg - MSR_PMC0);
+        if (__rdpmc(cpu_id, reg - MSR_PMC0, data) == 0)
+        {
+            return 0;
+        }
+        /* RDPMC failed: stop using it and read through the device file. */
+        rdpmc_works_pmc = 0;
+    }
+    else if ((rdpmc_works_fixed == 1) && (reg >= MSR_PERF_FIXED_CTR0) && (reg <= MSR_PERF_FIXED_CTR2))
+    {
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, Read FIXED counter with RDPMC instruction with index %d, (1<<30) + (reg - MSR_PERF_FIXED_CTR0));
+        if (__rdpmc(cpu_id, (1<<30) + (reg - MSR_PERF_FIXED_CTR0), data) == 0)
+        {
+            return 0;
+        }
+        /* RDPMC failed: stop using it and read through the device file. */
+        rdpmc_works_fixed = 0;
+    }
+
+    if (FD[cpu_id] > 0)
+    {
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, Read MSR counter 0x%X with RDMSR instruction on CPU %d, reg, cpu_id);
+        ret = pread(FD[cpu_id], data, sizeof(*data), reg);
+        if (ret < 0)
+        {
+            return -errno;
+        }
+        if ((size_t)ret != sizeof(*data))
+        {
+            /* A short read (including 0 bytes) must not look like success. */
+            return -EIO;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Write 'data' to MSR 'reg' on CPU 'cpu_id' through the MSR device file.
+ *
+ * Returns 0 on success, a negated errno if the write fails and -EIO if
+ * fewer bytes than requested were written. If the device file of the CPU
+ * is not open, nothing is written and 0 is returned.
+ */
+int
+access_x86_msr_write( const int cpu_id, uint32_t reg, uint64_t data)
+{
+    ssize_t ret;
+
+    if (FD[cpu_id] > 0)
+    {
+        /* data is 64 bit wide, so it needs a matching conversion. */
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, Write MSR counter 0x%X with WRMSR instruction on CPU %d data 0x%llX, reg, cpu_id, (unsigned long long) data);
+        ret = pwrite(FD[cpu_id], &data, sizeof(data), reg);
+        if (ret < 0)
+        {
+            return -errno;
+        }
+        if ((size_t)ret != sizeof(data))
+        {
+            /* A short write (including 0 bytes) must not look like success. */
+            return -EIO;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Report whether MSR access is available for CPU 'cpu_id'.
+ *
+ * Returns 1 if 'dev' is MSR_DEV and the MSR device file of the CPU is
+ * open, 0 otherwise.
+ */
+int access_x86_msr_check(PciDeviceIndex dev, int cpu_id)
+{
+    if (dev != MSR_DEV)
+    {
+        return 0;
+    }
+    return (FD[cpu_id] > 0) ? 1 : 0;
+}
diff --git a/src/access_x86_pci.c b/src/access_x86_pci.c
new file mode 100644
index 0000000..c96f775
--- /dev/null
+++ b/src/access_x86_pci.c
@@ -0,0 +1,313 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  access_x86_pci.c
+ *
+ *      Description:  Implementation of pci module.
+ *                   Provides API to read and write values to the hardware
+ *                   performance monitoring registers in PCI Cfg space
+ *                   for Intel Sandy Bridge Processors.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com,
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+/* #####   HEADER FILE INCLUDES   ######################################### */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include <types.h>
+#include <bstrlib.h>
+#include <error.h>
+#include <topology.h>
+
+#include <access_x86_pci.h>
+
+#ifdef LIKWID_USE_HWLOC
+#include <pci_hwloc.h>
+#else
+#include <pci_proc.h>
+#endif
+
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+
+#define PCI_ROOT_PATH  "/proc/bus/pci/"
+#define PCM_PCI_CLASS  0x1101
+
+/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
+
+static int FD[MAX_NUM_NODES][MAX_NUM_PCI_DEVICES]; /* per socket and device (index >= 1): -2 = not probed or no device file, 0 = device file exists but is not opened yet, >0 = open descriptor */
+static int access_x86_initialized = 0; /* becomes 1 at the end of the first completed access_x86_pci_init() */
+static int nr_sockets = 0; /* filled in by hwloc_pci_init()/proc_pci_init(); presumably the number of sockets found -- TODO confirm */
+
+/* Socket to bus mapping -- will be determined at runtime;
+ * typical mappings are:
+ * Socket  Bus (2S)  Bus (4s)
+ *   0        0xff      0x3f
+ *   1        0x7f      0x7f
+ *   2                  0xbf
+ *   3                  0xff
+ */
+static char* socket_bus[MAX_NUM_NODES] = { [0 ... (MAX_NUM_NODES-1)] = "N-A"};
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+/* Dirty hack to avoid nonnull warnings */
+int (*ownaccess)(const char*, int);
+int (*ownopen)(const char*, int, ...);
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+
+/* Prepare PCI config space access for one socket. The first call additionally checks
+ * CPU vendor/model and root privileges and resolves the socket-to-bus mapping; every
+ * call probes which device files exist for socket. Returns 0, -ENODEV or -EPERM. */
+int access_x86_pci_init(const int socket)
+{
+    int ret = 0;
+
+    if (access_x86_initialized == 0)
+    {
+        uint16_t testDevice;
+        ownaccess = &access;
+        ownopen = &open;
+
+        /* PCI is only provided by Intel systems */
+        if (!cpuid_info.isIntel)
+        {
+            DEBUG_PLAIN_PRINT(DEBUGLEV_DETAIL, PCI based Uncore performance monitoring only supported on Intel systems);
+            return -ENODEV;
+        }
+        switch (cpuid_info.model)
+        {
+            case SANDYBRIDGE_EP:
+                testDevice = 0x3c44;
+                break;
+            case IVYBRIDGE_EP:
+                testDevice = 0x0e36;
+                break;
+            case HASWELL_EP:
+                testDevice = 0x2f30;
+                break;
+            case BROADWELL_D:
+                testDevice = 0x6f30;
+                break;
+            default:
+                DEBUG_PRINT(DEBUGLEV_INFO,CPU model %s does not support PCI based Uncore performance monitoring, cpuid_info.name);
+                return -ENODEV;
+        }
+        if(geteuid() != 0)
+        {
+            fprintf(stderr, "WARNING\n");
+            fprintf(stderr, "Direct access to the PCI Cfg Adressspace is only allowed for uid root!\n");
+            fprintf(stderr, "This means you can use performance groups as MEM only as root in direct mode.\n");
+            fprintf(stderr, "Alternatively you might want to look into (sys)daemonmode.\n\n");
+            return -EPERM;
+        }
+
+        for(int i=0; i<MAX_NUM_NODES; i++)
+        {
+            for(int j=1;j<MAX_NUM_PCI_DEVICES;j++)
+            {
+                FD[i][j] = -2; /* -2 marks "not probed yet" */
+            }
+        }
+
+#ifdef LIKWID_USE_HWLOC
+        DEBUG_PLAIN_PRINT(DEBUGLEV_DETAIL, Using hwloc to find pci devices);
+        ret = hwloc_pci_init(testDevice, socket_bus, &nr_sockets);
+        if (ret)
+        {
+            ERROR_PLAIN_PRINT(Using hwloc to find pci devices failed);
+            return -ENODEV;
+        }
+#else
+        DEBUG_PLAIN_PRINT(DEBUGLEV_DETAIL, Using procfs to find pci devices);
+        ret = proc_pci_init(testDevice, socket_bus, &nr_sockets);
+        if (ret)
+        {
+            ERROR_PLAIN_PRINT(Using procfs to find pci devices failed);
+            return -ENODEV;
+        }
+#endif
+    }
+
+    for(int j=1;j<MAX_NUM_PCI_DEVICES;j++)
+    {
+        if ((pci_devices[j].path != NULL) && (FD[socket][j] == -2))
+        {
+            bstring filepath = bformat("%s%s%s",PCI_ROOT_PATH,
+                                                socket_bus[socket],
+                                                pci_devices[j].path);
+            if (!ownaccess(bdata(filepath),F_OK))
+            {
+                FD[socket][j] = 0; /* 0: device file exists, it is opened on first read/write */
+                pci_devices[j].online = 1;
+                if (access_x86_initialized == 0)
+                {
+                    DEBUG_PRINT(DEBUGLEV_DETAIL, PCI device %s (%d) online for socket %d at path %s, pci_devices[j].name,j, socket,bdata(filepath));
+                    if (ownaccess(bdata(filepath),R_OK|W_OK))
+                    {
+                        ERROR_PRINT(PCI device %s (%d) online for socket %d at path %s but not accessible, pci_devices[j].name,j, socket,bdata(filepath));
+                    }
+                }
+            }
+            else
+            {
+                pci_devices[j].online = 0;
+            }
+            bdestroy(filepath); /* the path string is only needed for the probe; it was leaked before */
+        }
+    }
+
+    access_x86_initialized = 1;
+    return 0;
+}
+
+
+/* Close every PCI device file opened for socket and forget the descriptors. */
+void access_x86_pci_finalize(const int socket)
+{
+    for (int j=1; j<MAX_NUM_PCI_DEVICES; j++)
+    {
+        if (FD[socket][j] <= 0) continue;   /* nothing was opened for this device */
+        close(FD[socket][j]);
+        /* Back to the "not probed" state so the stale descriptor is never reused */
+        FD[socket][j] = -2;
+    }
+}
+
+
+/* Read the 32-bit PCI config register reg of device dev on socket into *data. The device
+ * file is opened on first use. Returns 0, or -ENODEV, -EACCES, -EIO on failure. */
+int access_x86_pci_read(PciDeviceIndex dev, const int socket, uint32_t reg, uint64_t *data)
+{
+    uint32_t tmp = 0;
+
+    if (dev == MSR_DEV)
+    {
+        return -ENODEV;
+    }
+
+    if (FD[socket][dev] < 0)
+    {
+        *data = 0ULL;
+        return -ENODEV;
+    }
+    else if ( !FD[socket][dev] )
+    {
+        bstring filepath = bfromcstr ( PCI_ROOT_PATH );
+        bcatcstr(filepath, socket_bus[socket]);
+        bcatcstr(filepath, pci_devices[dev].path);
+        FD[socket][dev] = ownopen( bdata(filepath), O_RDWR);
+        if ( FD[socket][dev] < 0)
+        {
+            ERROR_PRINT(Failed to open PCI device %s at path %s\n, 
+                            pci_devices[dev].name,
+                            bdata(filepath));
+            bdestroy(filepath);
+            *data = 0ULL;
+            return -EACCES;
+        }
+        DEBUG_PRINT(DEBUGLEV_DETAIL, Opened PCI device %s, pci_devices[dev].name);
+        bdestroy(filepath); /* path is no longer needed once the descriptor is stored */
+    }
+
+    if ( FD[socket][dev] > 0 &&
+         pread(FD[socket][dev], &tmp, sizeof(tmp), reg) != sizeof(tmp) ) 
+    {
+        ERROR_PRINT(Read from PCI device %s at register 0x%x failed, pci_devices[dev].name, reg);
+        *data = 0ULL;
+        return -EIO;
+    }
+    *data = (uint64_t)tmp;
+    return 0;
+}
+
+
+
+/* Write the low 32 bits of data to PCI config register reg of device dev on socket. The
+ * device file is opened on first use. Returns 0, or -ENODEV, -EACCES, -EIO on failure. */
+int access_x86_pci_write(PciDeviceIndex dev, const int socket, uint32_t reg, uint64_t data)
+{
+    uint32_t tmp = (uint32_t)data;
+
+    if (dev == MSR_DEV)
+    {
+        return -ENODEV;
+    }
+    if (FD[socket][dev] < 0)
+    {
+        return -ENODEV;
+    }
+    else if ( !FD[socket][dev] )
+    {
+        bstring filepath = bfromcstr ( PCI_ROOT_PATH );
+        bcatcstr(filepath, socket_bus[socket]);
+        bcatcstr(filepath, pci_devices[dev].path );
+        FD[socket][dev] = ownopen( bdata(filepath), O_RDWR);
+
+        if ( FD[socket][dev] < 0)
+        {
+            ERROR_PRINT(Failed to open PCI device %s at path %s\n, 
+                                pci_devices[dev].name,
+                                bdata(filepath));
+            bdestroy(filepath);
+            return -EACCES;
+        }
+        DEBUG_PRINT(DEBUGLEV_DETAIL, Opened PCI device %s, pci_devices[dev].name);
+        bdestroy(filepath); /* path is no longer needed once the descriptor is stored */
+    }
+
+    if ( FD[socket][dev] > 0 &&
+         pwrite(FD[socket][dev], &tmp, sizeof tmp, reg) != sizeof tmp)
+    {
+        ERROR_PRINT(Write to PCI device %s at register 0x%x failed, pci_devices[dev].name, reg);
+        return -EIO;
+    }
+    return 0;
+}
+
+/* Tell whether dev may be used on socket: MSR_DEV is always reported as usable,
+ * a PCI device when it is flagged online or its descriptor slot is not negative. */
+int access_x86_pci_check(PciDeviceIndex dev, int socket)
+{
+    int usable = (dev == MSR_DEV);
+    if (!usable)
+    {
+        /* only consult the PCI tables for real PCI devices */
+        usable = (pci_devices[dev].online == 1) || (FD[socket][dev] >= 0);
+    }
+    return usable;
+}
+
diff --git a/src/affinity.c b/src/affinity.c
index 59b05da..40f9e83 100644
--- a/src/affinity.c
+++ b/src/affinity.c
@@ -5,13 +5,14 @@
  *
  *      Description:  Implementation of affinity module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com,
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -33,7 +34,6 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
-#include <math.h>
 #include <sys/types.h>
 #include <sys/syscall.h>
 #include <sys/time.h>
@@ -42,13 +42,15 @@
 #include <sched.h>
 #include <time.h>
 #include <pthread.h>
+#include <math.h>
 
-#include <error.h>
 #include <types.h>
+#include <error.h>
+#include <likwid.h>
 #include <numa.h>
 #include <affinity.h>
-#include <cpuid.h>
 #include <tree.h>
+#include <topology.h>
 
 /* #####   EXPORTED VARIABLES   ########################################### */
 
@@ -63,6 +65,9 @@ int affinity_core2node_lookup[MAX_NUM_THREADS];
 
 static int  affinity_numberOfDomains = 0;
 static AffinityDomain*  domains;
+static int affinity_initialized = 0;
+
+AffinityDomains affinityDomains;
 
 /* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
 
@@ -81,7 +86,7 @@ getProcessorID(cpu_set_t* cpu_set)
     return processorId;
 }
 
-static void
+static int
 treeFillNextEntries(
     TreeNode* tree,
     int* processorIds,
@@ -101,8 +106,7 @@ treeFillNextEntries(
 
         if ( node == NULL )
         {
-          printf("ERROR: Socket %d not existing!",i);
-          exit(EXIT_FAILURE);
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, Cannot find socket %d in topology tree, i);
         }
     }
 
@@ -114,10 +118,10 @@ treeFillNextEntries(
 
         if ( node == NULL )
         {
-          printf("ERROR: Core %d on socket %d not existing!",i,socketId);
-          exit(EXIT_FAILURE);
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, Cannot find core %d in topology tree, i);
         }
     }
+
     /* Traverse horizontal */
     while ( node != NULL )
     {
@@ -127,12 +131,20 @@ treeFillNextEntries(
 
         while ( thread != NULL )
         {
-            processorIds[numberOfEntries-counter] = thread->id;
-            thread = tree_getNextNode(thread);
-            counter--;
+            if (cpuid_topology.threadPool[thread->id].inCpuSet)
+            {
+                processorIds[numberOfEntries-counter] = thread->id;
+                thread = tree_getNextNode(thread);
+                counter--;
+            }
+            else
+            {
+                thread = tree_getNextNode(thread);
+            }
         }
         node = tree_getNextNode(node);
     }
+    return numberOfEntries-counter;
 }
 
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
@@ -144,149 +156,250 @@ affinity_init()
     int currentDomain;
     int subCounter = 0;
     int offset = 0;
-    int numberOfSocketDomains = cpuid_topology.numSockets;;
+    int tmp;
+    if (affinity_initialized == 1)
+    {
+        return;
+    }
+    topology_init();
+    int numberOfSocketDomains = cpuid_topology.numSockets;
+    DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: Socket domains %d, numberOfSocketDomains);
+    numa_init();
     int numberOfNumaDomains = numa_info.numberOfNodes;
+    DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: NUMA domains %d, numberOfNumaDomains);
     int numberOfProcessorsPerSocket =
         cpuid_topology.numCoresPerSocket * cpuid_topology.numThreadsPerCore;
+    DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: CPUs per socket %d, numberOfProcessorsPerSocket);
     int numberOfCacheDomains;
 
     int numberOfCoresPerCache =
         cpuid_topology.cacheLevels[cpuid_topology.numCacheLevels-1].threads/
         cpuid_topology.numThreadsPerCore;
+    DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: CPU cores per LLC %d, numberOfCoresPerCache);
 
     int numberOfProcessorsPerCache =
         cpuid_topology.cacheLevels[cpuid_topology.numCacheLevels-1].threads;
-
+    DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: CPUs per LLC %d, numberOfProcessorsPerCache);
     /* for the cache domain take only into account last level cache and assume
      * all sockets to be uniform. */
 
     /* determine how many last level shared caches exist per socket */
     numberOfCacheDomains = cpuid_topology.numSockets *
         (cpuid_topology.numCoresPerSocket/numberOfCoresPerCache);
-
+    DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: Cache domains %d, numberOfCacheDomains);
     /* determine total number of domains */
-    if ( numberOfNumaDomains > 1 )
-    {
-        numberOfDomains += numberOfSocketDomains + numberOfCacheDomains + numberOfNumaDomains;
-    }
-    else
-    {
-        numberOfDomains += numberOfSocketDomains + numberOfCacheDomains;
-    }
+    numberOfDomains += numberOfSocketDomains + numberOfCacheDomains + numberOfNumaDomains;
+    DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: All domains %d, numberOfDomains);
     domains = (AffinityDomain*) malloc(numberOfDomains * sizeof(AffinityDomain));
     if (!domains)
     {
-        fprintf(stderr, "Cannot allocate affinity domain memory\n");
+        fprintf(stderr,"No more memory for %ld bytes for array of affinity domains\n",numberOfDomains * sizeof(AffinityDomain));
         return;
     }
 
     /* Node domain */
-    domains[0].numberOfProcessors = cpuid_topology.numHWThreads;
+    domains[0].numberOfProcessors = cpuid_topology.activeHWThreads;
     domains[0].numberOfCores = cpuid_topology.numSockets * cpuid_topology.numCoresPerSocket;
-    domains[0].processorList = (int*) malloc(cpuid_topology.numHWThreads*sizeof(int));
+    DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity domain N: %d HW threads on %d cores, domains[0].numberOfProcessors, domains[0].numberOfCores);
     domains[0].tag = bformat("N");
+    domains[0].processorList = (int*) malloc(cpuid_topology.numHWThreads*sizeof(int));
+    if (!domains[0].processorList)
+    {
+        fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n",
+                cpuid_topology.numHWThreads*sizeof(int), 
+                bdata(domains[0].tag));
+        return;
+    }
     offset = 0;
 
-    for (int i=0; i<numberOfSocketDomains; i++)
+    if (numberOfSocketDomains > 1)
     {
-      treeFillNextEntries(
-          cpuid_topology.topologyTree,
-          domains[0].processorList + offset,
-          i, 0, numberOfProcessorsPerSocket);
-
-      offset += numberOfProcessorsPerSocket;
+        for (int i=0; i<numberOfSocketDomains; i++)
+        {
+          tmp = treeFillNextEntries(cpuid_topology.topologyTree,
+                                    domains[0].processorList + offset,
+                                    i, 0, numberOfProcessorsPerSocket);
+          offset += tmp;
+        }
+    }
+    else
+    {
+        tmp = treeFillNextEntries(cpuid_topology.topologyTree,
+                                  domains[0].processorList,
+                                  0, 0, domains[0].numberOfProcessors);
+        domains[0].numberOfProcessors = tmp;
     }
 
     /* Socket domains */
     currentDomain = 1;
-
     for (int i=0; i < numberOfSocketDomains; i++ )
     {
-      domains[currentDomain + i].numberOfProcessors = numberOfProcessorsPerSocket;
-      domains[currentDomain + i].numberOfCores =  cpuid_topology.numCoresPerSocket;
-      domains[currentDomain + i].processorList = (int*) malloc( domains[currentDomain + i].numberOfProcessors * sizeof(int));
-      domains[currentDomain + i].tag = bformat("S%d", i);
-
-      treeFillNextEntries(
-          cpuid_topology.topologyTree,
-          domains[currentDomain + i].processorList,
-          i, 0, domains[currentDomain + i].numberOfProcessors);
+        domains[currentDomain + i].numberOfProcessors = numberOfProcessorsPerSocket;
+        domains[currentDomain + i].numberOfCores =  cpuid_topology.numCoresPerSocket;
+        domains[currentDomain + i].tag = bformat("S%d", i);
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity domain S%d: %d HW threads on %d cores, i, domains[currentDomain + i].numberOfProcessors, domains[currentDomain + i].numberOfCores);
+        domains[currentDomain + i].processorList = (int*) malloc( domains[currentDomain + i].numberOfProcessors * sizeof(int));
+        if (!domains[currentDomain + i].processorList)
+        {
+            fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n",
+                    domains[currentDomain + i].numberOfProcessors * sizeof(int),
+                    bdata(domains[currentDomain + i].tag));
+            return;
+        }
+
+        tmp = treeFillNextEntries(cpuid_topology.topologyTree,
+                                  domains[currentDomain + i].processorList,
+                                  i, 0, domains[currentDomain + i].numberOfProcessors);
+        tmp = MIN(tmp, domains[currentDomain + i].numberOfProcessors);
+        for ( int j = 0; j < tmp; j++ )
+        {
+            affinity_core2node_lookup[domains[currentDomain + i].processorList[j]] = i;
+        }
+        domains[currentDomain + i].numberOfProcessors = tmp;
     }
 
     /* Cache domains */
     currentDomain += numberOfSocketDomains;
     subCounter = 0;
-
     for (int i=0; i < numberOfSocketDomains; i++ )
     {
-      offset = 0;
-
-      for ( int j=0; j < (numberOfCacheDomains/numberOfSocketDomains); j++ )
-      {
-        domains[currentDomain + subCounter].numberOfProcessors = numberOfProcessorsPerCache;
-        domains[currentDomain + subCounter].numberOfCores =  numberOfCoresPerCache;
-        domains[currentDomain + subCounter].processorList = (int*) malloc(numberOfProcessorsPerCache*sizeof(int));
-        domains[currentDomain + subCounter].tag = bformat("C%d", subCounter);
-
-        treeFillNextEntries(
-            cpuid_topology.topologyTree,
-            domains[currentDomain + subCounter].processorList,
-            i, offset, domains[currentDomain + subCounter].numberOfProcessors);
-
-        offset += numberOfCoresPerCache;
-        subCounter++;
-      }
-    }
+        offset = 0;
 
-    if ( numberOfNumaDomains > 1 )
-    {
-        /* Memory domains */
-        currentDomain += numberOfCacheDomains;
-        subCounter = 0;
+        for ( int j=0; j < (numberOfCacheDomains/numberOfSocketDomains); j++ )
+        {
+            domains[currentDomain + subCounter].numberOfProcessors = numberOfProcessorsPerCache;
+            domains[currentDomain + subCounter].numberOfCores =  numberOfCoresPerCache;
+            domains[currentDomain + subCounter].tag = bformat("C%d", subCounter);
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity domain C%d: %d HW threads on %d cores, subCounter, domains[currentDomain + subCounter].numberOfProcessors, domains[currentDomain + subCounter].numberOfCores);
+            domains[currentDomain + subCounter].processorList = (int*) malloc(numberOfProcessorsPerCache*sizeof(int));
+            if (!domains[currentDomain + subCounter].processorList)   
+            {
+                fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n",
+                        numberOfProcessorsPerCache*sizeof(int),
+                        bdata(domains[currentDomain + subCounter].tag));
+                return;
+            }
 
+            tmp = treeFillNextEntries(cpuid_topology.topologyTree,
+                                      domains[currentDomain + subCounter].processorList,
+                                      i, offset,
+                                      domains[currentDomain + subCounter].numberOfProcessors);
+            domains[currentDomain + subCounter].numberOfProcessors = tmp;
+            offset += (tmp < numberOfCoresPerCache ? tmp : numberOfCoresPerCache);
+            subCounter++;
+        }
+    }
+    /* Memory domains */
+    currentDomain += numberOfCacheDomains;
+    subCounter = 0;
+    if ((numberOfNumaDomains >= numberOfSocketDomains) && (numberOfNumaDomains > 1))
+    {
         for (int i=0; i < numberOfSocketDomains; i++ )
         {
             offset = 0;
-            for ( int j=0; j < (int)ceil((double)numberOfNumaDomains/numberOfSocketDomains); j++ )
+            for ( int j=0; j < (int)ceil((double)(numberOfNumaDomains)/numberOfSocketDomains); j++ )
             {
-                domains[currentDomain + subCounter].numberOfProcessors = numa_info.nodes[subCounter].numberOfProcessors;
-                domains[currentDomain + subCounter].numberOfCores =  numberOfCoresPerCache;
-                domains[currentDomain + subCounter].processorList = (int*) malloc(numa_info.nodes[subCounter].numberOfProcessors*sizeof(int));
+                domains[currentDomain + subCounter].numberOfProcessors =
+                                numa_info.nodes[subCounter].numberOfProcessors;
+                domains[currentDomain + subCounter].numberOfCores =
+                                numa_info.nodes[subCounter].numberOfProcessors/cpuid_topology.numThreadsPerCore;
                 domains[currentDomain + subCounter].tag = bformat("M%d", subCounter);
-
-                treeFillNextEntries(
-                        cpuid_topology.topologyTree,
-                        domains[currentDomain + subCounter].processorList,
-                        i, offset, domains[currentDomain + subCounter].numberOfProcessors);
-
+                DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity domain M%d: %d HW threads on %d cores, subCounter, domains[currentDomain + subCounter].numberOfProcessors, domains[currentDomain + subCounter].numberOfCores);
+                domains[currentDomain + subCounter].processorList =
+                                (int*) malloc(numa_info.nodes[subCounter].numberOfProcessors*sizeof(int));
+                if (!domains[currentDomain + subCounter].processorList)
+                {
+                    fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n",
+                            numa_info.nodes[subCounter].numberOfProcessors*sizeof(int),
+                            bdata(domains[currentDomain + subCounter].tag));
+                    return;
+                }
+
+                tmp = treeFillNextEntries(cpuid_topology.topologyTree,
+                                          domains[currentDomain + subCounter].processorList,
+                                          i, offset,
+                                          domains[currentDomain + subCounter].numberOfProcessors);
+                domains[currentDomain + subCounter].numberOfProcessors = tmp;
                 offset += domains[currentDomain + subCounter].numberOfCores;
-
                 subCounter++;
             }
         }
-
-        /* This is redundant ;-). Create thread to node lookup */
-        for ( uint32_t i = 0; i < numa_info.numberOfNodes; i++ )
+    }
+    else
+    {
+        offset = 0;
+        int NUMAthreads = numberOfProcessorsPerSocket * numberOfSocketDomains;
+        domains[currentDomain + subCounter].numberOfProcessors = NUMAthreads;
+        domains[currentDomain + subCounter].numberOfCores =  NUMAthreads/cpuid_topology.numThreadsPerCore;
+        domains[currentDomain + subCounter].tag = bformat("M%d", subCounter);
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity domain M%d: %d HW threads on %d cores, subCounter, domains[currentDomain + subCounter].numberOfProcessors, domains[currentDomain + subCounter].numberOfCores);
+        domains[currentDomain + subCounter].processorList = (int*) malloc(NUMAthreads*sizeof(int));
+        if (!domains[currentDomain + subCounter].processorList)
         {
-            for ( int j = 0; j < numa_info.nodes[i].numberOfProcessors; j++ )
-            {
-                affinity_core2node_lookup[numa_info.nodes[i].processors[j]] = i;
-            }
+            fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n",
+                    NUMAthreads*sizeof(int), 
+                    bdata(domains[currentDomain + subCounter].tag));
+            return;
+        }
+        tmp = 0;
+        for (int i=0; i < numberOfSocketDomains; i++ )
+        {
+            /* Pack entries contiguously; a socket may yield fewer threads than requested. */
+            tmp += treeFillNextEntries(cpuid_topology.topologyTree,
+                &(domains[currentDomain + subCounter].processorList[offset]),
+                i, 0, numberOfProcessorsPerSocket);
+            offset = tmp;
+        }
+        domains[currentDomain + subCounter].numberOfProcessors = tmp;
     }
 
     affinity_numberOfDomains = numberOfDomains;
+    affinityDomains.numberOfAffinityDomains = numberOfDomains;
+    affinityDomains.numberOfSocketDomains = numberOfSocketDomains;
+    affinityDomains.numberOfNumaDomains = numberOfNumaDomains;
+    affinityDomains.numberOfProcessorsPerSocket = numberOfProcessorsPerSocket;
+    affinityDomains.numberOfCacheDomains = numberOfCacheDomains;
+    affinityDomains.numberOfCoresPerCache = numberOfCoresPerCache;
+    affinityDomains.numberOfProcessorsPerCache = numberOfProcessorsPerCache;
+    affinityDomains.domains = domains;
+    affinity_initialized = 1;
 }
 
 
 void
 affinity_finalize()
 {
-    for ( int i=0; i < affinity_numberOfDomains; i++ )
+    if (affinity_initialized == 0)
+    {
+        return;
+    }
+    if (!affinityDomains.domains)
+    {
+        return;
+    }
+    for ( int i=0; i < affinityDomains.numberOfAffinityDomains; i++ )
     {
-        free(domains[i].processorList);
+        bdestroy(affinityDomains.domains[i].tag);
+        if (affinityDomains.domains[i].processorList != NULL)
+        {
+            free(affinityDomains.domains[i].processorList);
+        }
+        affinityDomains.domains[i].processorList = NULL;
+    }
+    if (affinityDomains.domains != NULL)
+    {
+        free(affinityDomains.domains);
     }
-    free(domains);
+    affinityDomains.domains = NULL;
+    affinity_numberOfDomains = 0;
+    affinityDomains.numberOfAffinityDomains = 0;
+    affinityDomains.numberOfSocketDomains = 0;
+    affinityDomains.numberOfNumaDomains = 0;
+    affinityDomains.numberOfProcessorsPerSocket = 0;
+    affinityDomains.numberOfCacheDomains = 0;
+    affinityDomains.numberOfCoresPerCache = 0;
+    affinityDomains.numberOfProcessorsPerCache = 0;
+    affinity_initialized = 0;
 }
 
 
@@ -347,6 +460,20 @@ affinity_pinProcess(int processorId)
     sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
 }
 
+/* Restrict the caller (pid 0) to the cpu_count CPUs listed in processorIds.
+ * The result of sched_setaffinity() is not checked. */
+void affinity_pinProcesses(int cpu_count, int* processorIds)
+{
+    cpu_set_t mask;
+    int remaining = cpu_count;
+    CPU_ZERO(&mask);
+    while (remaining-- > 0)
+    {
+        CPU_SET(processorIds[remaining], &mask);
+    }
+    sched_setaffinity(0, sizeof(mask), &mask);
+}
+
 
 const AffinityDomain*
 affinity_getDomain(bstring domain)
@@ -364,22 +491,24 @@ affinity_getDomain(bstring domain)
 }
 
 void
-affinity_printDomains(FILE* OUTSTREAM)
+affinity_printDomains()
 {
-    if (OUTSTREAM)
+    for ( int i=0; i < affinity_numberOfDomains; i++ )
     {
-        for ( int i=0; i < affinity_numberOfDomains; i++ )
-        {
-            fprintf(OUTSTREAM, "Domain %d:\n", i);
-            fprintf(OUTSTREAM, "\tTag %s:", bdata(domains[i].tag));
+        printf("Domain %d:\n",i);
+        printf("\tTag %s:",bdata(domains[i].tag));
 
-            for ( uint32_t j=0; j < domains[i].numberOfProcessors; j++ )
-            {
-                fprintf(OUTSTREAM, " %d", domains[i].processorList[j]);
-            }
-            fprintf(OUTSTREAM, "\n");
-            fflush(OUTSTREAM);
+        for ( uint32_t j=0; j < domains[i].numberOfProcessors; j++ )
+        {
+            printf(" %d",domains[i].processorList[j]);
         }
+        printf("\n");
     }
 }
 
+/* Accessor: hands out the address of the global affinityDomains structure. */
+AffinityDomains_t get_affinityDomains(void)
+{
+    return &affinityDomains;
+}
+
diff --git a/src/allocator.c b/src/allocator.c
deleted file mode 100644
index 83e8164..0000000
--- a/src/allocator.c
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  allocator.c
- *
- *      Description:  Implementation of allocator module.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-/* #####   HEADER FILE INCLUDES   ######################################### */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#include <error.h>
-#include <types.h>
-#include <allocator.h>
-#include <affinity.h>
-
-/* #####   EXPORTED VARIABLES   ########################################### */
-
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-
-
-/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
-
-static int numberOfAllocatedVectors = 0;
-static void** allocations;
-
-
-/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-void
-allocator_init(int numVectors)
-{
-    allocations = (void**) malloc(numVectors * sizeof(void*));
-}
-
-
-void
-allocator_finalize()
-{
-    int i;
-
-    for (i=0; i<numberOfAllocatedVectors; i++)
-    {
-        free(allocations[i]);
-    }
-}
-
-void
-allocator_allocateVector(
-        FILE* OUTSTREAM,
-        void** ptr,
-        int alignment,
-        uint64_t size,
-        int offset,
-        DataType type,
-        bstring domainString)
-{
-    size_t bytesize = 0;
-    const AffinityDomain* domain;
-    int errorCode;
-
-    switch ( type )
-    {
-        case SINGLE:
-        case SINGLE_RAND:
-            bytesize = (size+offset) * sizeof(float);
-            break;
-
-        case DOUBLE_RAND:
-        case DOUBLE:
-            bytesize = (size+offset) * sizeof(double);
-            break;
-    }
-
-    errorCode =  posix_memalign(ptr, alignment, bytesize);
-
-    if (errorCode)
-    {
-        if (errorCode == EINVAL) 
-        {
-            fprintf(stderr,
-                    "Alignment parameter is not a power of two\n");
-            exit(EXIT_FAILURE);
-        }
-        if (errorCode == ENOMEM) 
-        {
-            fprintf(stderr,
-                    "Insufficient memory to fulfill the request\n");
-            exit(EXIT_FAILURE);
-        }
-    }
-
-    if ((*ptr) == NULL)
-    {
-            fprintf(stderr, "posix_memalign failed!\n");
-            exit(EXIT_FAILURE);
-
-    }
-
-    allocations[numberOfAllocatedVectors] = *ptr;
-    numberOfAllocatedVectors++;
-    domain = affinity_getDomain(domainString);
-    affinity_pinProcess(domain->processorList[0]);
-
-    if (OUTSTREAM)
-    {
-        fprintf(OUTSTREAM, "Allocate: Process running on core %d - Vector length %llu Offset %d\n",
-            affinity_processGetProcessorId(),
-            LLU_CAST size,
-            offset);
-    }
-
-    switch ( type )
-    {
-        case SINGLE:
-            {
-                float* sptr = (float*) (*ptr);
-                sptr += offset;
-
-                for ( uint64_t i=0; i < size; i++ )
-                {
-                    sptr[i] = 1.0;
-                }
-                *ptr = (void*) sptr;
-
-            }
-            break;
-
-        case DOUBLE:
-            {
-                double* dptr = (double*) (*ptr);
-                dptr += offset;
-
-                for ( uint64_t i=0; i < size; i++ )
-                {
-                    dptr[i] = 1.0;
-                }
-                *ptr = (void*) dptr;
-            }
-            break;
-        case SINGLE_RAND:
-            {
-                srand((uint64_t)ptr);
-                float* sptr = (float*) (*ptr);
-                sptr += offset;
-
-                for ( uint64_t i=0; i < size; i++ )
-                {
-                    sptr[i] = rand()/((float)RAND_MAX)*2.0-1.0;
-                }
-                *ptr = (void*) sptr;
-            }
-            break;
-        case DOUBLE_RAND:
-            {
-                srand((uint64_t)ptr);
-                double* dptr = (double*) (*ptr);
-                dptr += offset;
-
-                for ( uint64_t i=0; i < size; i++ )
-                {
-                    dptr[i] = rand()/((double)RAND_MAX)*2.0-1.0;
-                }
-                *ptr = (void*) dptr;
-            }
-            break;
-        
-    }
-}
-
diff --git a/src/applications/likwid-agent.lua b/src/applications/likwid-agent.lua
new file mode 100644
index 0000000..3f3e59a
--- /dev/null
+++ b/src/applications/likwid-agent.lua
@@ -0,0 +1,559 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-agent.lua
+ *
+ *      Description:  A monitoring daemon for hardware performance counters.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+local likwid = require("likwid")
+
+local base_groupPath = "<INSTALLED_PREFIX>/share/likwid/mongroups"
+dconfig = {} -- global daemon settings; the defaults below are overwritten by read_daemon_config()
+dconfig["groupStrings"] ={} -- event set strings collected from EVENTSET lines
+dconfig["groupData"] ={} -- filled by the main script, indexed by the group ID returned from likwid.addEventSet()
+dconfig["accessmode"] = 1 -- handed to likwid.setAccessClientMode(); the main script resets values outside 0..1 to 1
+dconfig["duration"] = 1 -- measurement time per group in seconds
+dconfig["groupPath"] = "" -- empty string: the main script falls back to base_groupPath
+dconfig["logPath"] = nil -- nil disables the logfile backend
+dconfig["logStyle"] = "log" -- "log" appends to the logfile, "update" rewrites it for every sample
+dconfig["gmetric"] = false -- enable the gmetric backend
+dconfig["gmetricPath"] = "gmetric" -- gmetric executable
+dconfig["gmetricConfig"] = nil -- optional file passed to gmetric with -c
+dconfig["gmetricHasUnit"] = false -- set by check_gmetric() when the help text mentions units=
+dconfig["gmetricHasGroup"] = false -- set by check_gmetric() when the help text mentions group=
+dconfig["rrd"] = false -- enable the rrdtool backend
+dconfig["rrdPath"] = "." -- directory for the .rrd files
+dconfig["syslog"] = false -- enable the logger (syslog) backend
+dconfig["syslogPrio"] = "local0.notice" -- priority string meant for logger -p
+dconfig["stdout"] = false -- switched on by the main script when no other backend is active
+
+rrdconfig = {} -- per group string: list of metric descriptions, filled by create_rrd()
+
+
+local function read_daemon_config(filename) -- parse the daemon config file into the global dconfig; exits on an unusable file
+    if filename == nil or filename == "" then
+        print("Not a valid config filename")
+        os.exit(1)
+    end
+    local f = io.open(filename, "r")
+    if f == nil then
+        print("Cannot open config file "..filename)
+        os.exit(1)
+    end
+    local t = f:read("*all")
+    f:close()
+
+    for i, line in pairs(likwid.stringsplit(t,"\n")) do
+
+        if not line:match("^#") then -- lines starting with '#' are comments
+            if line:match("^GROUPPATH%a*") ~= nil then
+                local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+                table.remove(linelist, 1) -- drop the keyword, keep its arguments
+                dconfig["groupPath"] = linelist[1] or dconfig["groupPath"] -- keyword without value keeps the default
+            end
+
+            if line:match("^EVENTSET%a*") ~= nil then
+                local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+                table.remove(linelist, 1)
+                for i=#linelist,1,-1 do -- stop at index 1: the list has no element 0
+                    if linelist[i] == "" then
+                        table.remove(linelist, i)
+                    else
+                        table.insert(dconfig["groupStrings"], linelist[i])
+                    end
+                end
+            end
+
+            if line:match("^DURATION%a*") ~= nil then
+                local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+                table.remove(linelist, 1)
+                dconfig["duration"] = tonumber(linelist[1]) or dconfig["duration"] -- non-numeric value keeps the previous setting
+            end
+
+            if line:match("^ACCESSMODE%a*") ~= nil then
+                local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+                table.remove(linelist, 1)
+                dconfig["accessmode"] = tonumber(linelist[1]) or dconfig["accessmode"] -- nil here would break the later range check
+            end
+
+            if line:match("^LOGPATH%a*") ~= nil then
+                local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+                table.remove(linelist, 1)
+                dconfig["logPath"] = linelist[1]
+            end
+
+            if line:match("^LOGSTYLE%a*") ~= nil then
+                local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+                table.remove(linelist, 1)
+                if linelist[1] ~= "log" and linelist[1] ~= "update" then
+                    print("LOGSTYLE argument not valid, available are log and update. Fallback to log.")
+                else
+                    dconfig["logStyle"] = linelist[1]
+                end
+            end
+
+            if line:match("^GMETRIC%s%a*") ~= nil then -- %s keeps GMETRICPATH/GMETRICCONFIG out of this branch
+                local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+                table.remove(linelist, 1)
+                if linelist[1] == "True" then
+                    dconfig["gmetric"] = true
+                end
+            end
+
+            if line:match("^GMETRICPATH%a*") ~= nil then
+                local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+                table.remove(linelist, 1)
+                dconfig["gmetricPath"] = linelist[1]
+            end
+
+            if line:match("^GMETRICCONFIG%a*") ~= nil then
+                local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+                table.remove(linelist, 1)
+                dconfig["gmetricConfig"] = linelist[1]
+            end
+
+            if line:match("^RRD%s%a*") ~= nil then -- %s keeps RRDPATH lines out of this branch, like GMETRIC and SYSLOG
+                local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+                table.remove(linelist, 1)
+                if linelist[1] == "True" then
+                    dconfig["rrd"] = true
+                end
+            end
+
+            if line:match("^RRDPATH%a*") ~= nil then
+                local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+                table.remove(linelist, 1)
+                dconfig["rrdPath"] = linelist[1] or dconfig["rrdPath"] -- keyword without value keeps the default
+            end
+
+            if line:match("^SYSLOG%s%a*") ~= nil then -- %s keeps SYSLOGPRIO lines out of this branch
+                local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+                table.remove(linelist, 1)
+                if linelist[1] == "True" then
+                    dconfig["syslog"] = true
+                end
+            end
+
+            if line:match("^SYSLOGPRIO%a*") ~= nil then
+                local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+                table.remove(linelist, 1)
+                dconfig["syslogPrio"] = linelist[1]
+            end
+        end
+    end
+end
+
+local function calc_sum(key, results) -- sum of results[thread][key] over all likwid threads
+    local sum = 0.0
+    local numThreads = likwid.getNumberOfThreads()
+    for thread=1, numThreads do -- results is indexed 1..numThreads
+        sum = sum + results[thread][key]
+    end
+    return sum
+end
+
+local function calc_avg(key, results) -- arithmetic mean of results[thread][key] over all likwid threads
+    local sum = 0.0
+    local numThreads = likwid.getNumberOfThreads()
+    for thread=1, numThreads do -- results is indexed 1..numThreads
+        sum = sum + results[thread][key]
+    end
+    return sum/numThreads -- no guard for numThreads == 0
+end
+
+local function calc_min(key, results) -- smallest results[thread][key] over all likwid threads
+    local min = math.huge -- start above every number so the first value always replaces it
+    local numThreads = likwid.getNumberOfThreads()
+    for thread=1, numThreads do -- results is indexed 1..numThreads
+        if results[thread][key] < min then
+            min = results[thread][key]
+        end
+    end
+    return min -- stays math.huge when there are no threads
+end
+
+local function calc_max(key, results) -- largest results[thread][key] over all likwid threads
+    local max = -math.huge -- start below every number so all-negative metrics are not reported as 0 (mirrors calc_min)
+    local numThreads = likwid.getNumberOfThreads()
+    for thread=1, numThreads do -- results is indexed 1..numThreads
+        if results[thread][key] > max then
+            max = results[thread][key]
+        end
+    end
+    return max -- stays -math.huge when there are no threads
+end
+
+local function check_logfile() -- true when the shell can 'cd' into dconfig["logPath"]
+    local g = os.execute("cd "..dconfig["logPath"]) -- os.execute takes only the command string
+    if g ~= true and g ~= 0 then -- success is true (Lua 5.2+) or 0 (Lua 5.1); a failure is never the value false
+        print("Logfile path ".. dconfig["logPath"].. " does not exist.")
+        return false
+    end
+    return true
+end
+
+local function logfile(groupID, results) -- write one "timestamp,key,value" line per result into the group's logfile
+    -- "update" style rewrites the file for every sample, everything else appends.
+    local open_function = (dconfig["logStyle"] == "update") and "w" or "a"
+    -- groupData entries may be plain group strings (as stored by the main script) or tables with a GroupString field.
+    local gdata = dconfig["groupData"][groupID]
+    local gstring = type(gdata) == "table" and gdata["GroupString"] or tostring(gdata)
+    local filename = "likwid."..tostring(groupID)..".log"
+    if not gstring:find(":") then -- no ':' in the group string: name the file after the string instead of the ID
+        filename = "likwid."..gstring..".log"
+    end
+    local f = io.open(dconfig["logPath"].."/"..filename, open_function)
+    if f == nil then
+        print("Cannot open logfile ".. dconfig["logPath"].."/"..filename)
+        return
+    end
+    local timestamp = results["Timestamp"]
+    for k,v in pairs(results) do
+        if k ~= "Timestamp" then -- the timestamp prefixes every line instead of getting its own
+            f:write(timestamp..","..k:gsub("%(",""):gsub("%)","").. ","..v.."\n")
+        end
+    end
+    f:close()
+end
+
+local function check_logger() -- true when 'which logger' prints a path
+    local f = io.popen("which logger") -- popen yields a handle even if the command fails, so nil alone proves nothing
+    if f == nil then
+        return false
+    end
+    local found = f:read("*a") -- empty output means 'which' found no logger
+    f:close()
+    return found ~= nil and found:match("%S") ~= nil
+end
+
+local function logger(results) -- send every entry of results (except the timestamp) to syslog through the logger tool
+    local cmd = "logger -t LIKWID "
+    if dconfig["syslogPrio"] ~= nil then
+        cmd = cmd .."-p "..dconfig["syslogPrio"].." " -- was assigned to the misspelled 'cm', so the priority was never used
+    end
+    -- The timestamp is not forwarded; one logger call is issued per result.
+    for k,v in pairs(results) do
+        if k ~= "Timestamp" then
+            local resultcmd = cmd .. k:gsub("%(",""):gsub("%)","") .. " " ..v -- parentheses are stripped from the key
+            local f = io.popen(resultcmd)
+            if f == nil then
+                print("Cannot use logger, maybe not in $PATH")
+                return
+            end
+            f:close()
+        end
+    end
+    
+end
+
+local function check_gmetric() -- probe the configured gmetric tool and record which options its help text mentions
+    if dconfig["gmetricPath"] == nil then
+        return false
+    end
+    local f = io.popen(dconfig["gmetricPath"].." -h","r")
+    if f == nil then -- NOTE(review): popen may return a handle even when the tool is missing - confirm this detects absence
+        return false
+    end
+    local msg = f:read("*a") -- captured output of 'gmetric -h'
+    if msg:match("units=") then
+        dconfig["gmetricHasUnit"] = true -- lets gmetric() add --units
+    end
+    if msg:match("group=") then
+        dconfig["gmetricHasGroup"] = true -- lets gmetric() add -g
+    end
+    f:close()
+    return true
+end
+
+local function gmetric(gdata, results) -- run one gmetric command per entry of results (except the timestamp)
+    local execList = {} -- common command prefix; local so it does not leak into the global namespace
+    if dconfig["gmetricPath"] == nil then
+        return
+    end
+    table.insert(execList, dconfig["gmetricPath"])
+    if dconfig["gmetricConfig"] ~= nil then
+        table.insert(execList, "-c")
+        table.insert(execList, dconfig["gmetricConfig"])
+    end
+    if dconfig["gmetricHasGroup"] and gdata["GroupString"] ~= gdata["EventString"] then
+        table.insert(execList, "-g")
+        table.insert(execList, gdata["GroupString"])
+    end
+    for k,v in pairs(results) do
+        local execStr = table.concat(execList, " ")
+        if k ~= "Timestamp" then
+            execStr = execStr .. " -t double "
+
+            local name = k
+            local unit = nil
+            local s,e = k:find("%[") -- a '[' in the key starts the unit part
+            if s ~= nil then
+                name = k:sub(0,s-2):gsub("^%s*(.-)%s*$", "%1") -- text before the bracket, trimmed
+                unit = k:sub(s+1,k:len()-1):gsub("^%s*(.-)%s*$", "%1") -- text after '[' without the key's last character
+            end
+            execStr = execStr .. " --name=\"" .. name .."\""
+            if dconfig["gmetricHasUnit"] and unit ~= nil then
+                execStr = execStr .. " --units=\"" .. unit .."\""
+            end
+            local value = tonumber(v)
+            if v ~= nil and value ~= nil then -- numeric values are sent in %f notation
+                execStr = execStr .. " --value=\"" .. string.format("%f", value) .."\""
+            elseif v ~= nil then
+                execStr = execStr .. " --value=\"" .. tostring(v) .."\""
+            else
+                execStr = execStr .. " --value=\"0\""
+            end
+            os.execute(execStr)
+        end
+    end
+end
+
+local function normalize_rrd_string(str) -- turn a metric description into a short name without blanks or brackets
+    str = str:gsub(" ","_") -- blanks become underscores
+    str = str:gsub("%(","") -- the next five calls delete ( ) [ ] and /
+    str = str:gsub("%)","")
+    str = str:gsub("%[","")
+    str = str:gsub("%]","")
+    str = str:gsub("%/","")
+    str = str:sub(1,19) -- keep at most the first 19 characters
+    return str
+end
+
+local function check_rrd() -- reports whether a pipe to 'rrdtool' could be opened
+    local f = io.popen("rrdtool")
+    if f == nil then -- NOTE(review): popen may return a handle even when rrdtool is missing - confirm this detects absence
+        return false
+    end
+    f:close()
+    return true
+end
+
+local function create_rrd(numGroups, duration, groupData) -- create <rrdPath>/<GroupString>.rrd with one GAUGE per metric
+    local rrdname = dconfig["rrdPath"].."/".. groupData["GroupString"] .. ".rrd"
+    local rrdstring = "rrdtool create "..rrdname.." --step ".. tostring(numGroups*duration)
+    if rrdconfig[groupData["GroupString"]] == nil then
+        rrdconfig[groupData["GroupString"]] = {} -- remembers the metric order for rrd()
+    end
+    for i, metric in pairs(groupData["Metrics"]) do -- was the misspelled global 'groupdata', which is nil
+        rrdstring = rrdstring .. " DS"..":" .. normalize_rrd_string(metric["description"]) ..":GAUGE:"
+        rrdstring = rrdstring ..tostring(numGroups*duration) ..":0:U"
+        table.insert(rrdconfig[groupData["GroupString"]], metric["description"])
+    end
+    rrdstring = rrdstring .." RRA:AVERAGE:0.5:" .. tostring(60/duration)..":10" -- 60/duration steps per row, 10 rows
+    rrdstring = rrdstring .." RRA:MIN:0.5:" .. tostring(60/duration)..":10"
+    rrdstring = rrdstring .." RRA:MAX:0.5:" .. tostring(60/duration)..":10"
+    --Average, min and max of hours of last day
+    rrdstring = rrdstring .." RRA:AVERAGE:0.5:" .. tostring(3600/duration)..":24"
+    rrdstring = rrdstring .." RRA:MIN:0.5:" .. tostring(3600/duration)..":24"
+    rrdstring = rrdstring .." RRA:MAX:0.5:" .. tostring(3600/duration)..":24"
+    --Average, min and max of day of last month
+    rrdstring = rrdstring .." RRA:AVERAGE:0.5:" .. tostring(86400/duration)..":31"
+    rrdstring = rrdstring .." RRA:MIN:0.5:" .. tostring(86400/duration)..":31"
+    rrdstring = rrdstring .." RRA:MAX:0.5:" .. tostring(86400/duration)..":31"
+    os.execute(rrdstring)
+end
+
+local function rrd(groupData, results) -- append one sample ("N" = now) to the group's .rrd file
+    local rrdstring = "rrdtool update "..dconfig["rrdPath"].."/".. groupData["GroupString"] .. ".rrd".." N"
+    for i, id in ipairs(rrdconfig[groupData["GroupString"]]) do -- values are positional, so keep the creation order
+        local value = results[id] or results[(id:gsub(" ","_"))] -- the main loop stores keys with blanks replaced by '_'
+        rrdstring = rrdstring .. ":" .. (value ~= nil and tostring(value) or "U") -- "U" is rrdtool's unknown marker
+    end
+    os.execute(rrdstring)
+end
+
+-- Read commandline arguments
+if #arg ~= 1 then -- exactly one argument, the daemon config file, is expected
+    print("Usage:")
+    print(arg[0] .. " <configFile>")
+    os.exit(1)
+end
+
+-- Get architectural information for the current system
+local cpuinfo = likwid.getCpuInfo() -- not referenced again in this script
+local cputopo = likwid.getCpuTopology()
+local affinity = likwid.getAffinityInfo() -- not referenced again in this script
+-- Read LIKWID configuration file, mainly to avoid topology lookup
+local config = likwid.getConfiguration() -- not referenced again in this script
+-- Read LIKWID daemon configuration file
+read_daemon_config(arg[1])
+
+-- Set force mode, we are monitoring exclusively
+likwid.setenv("LIKWID_FORCE","1")
+
+if dconfig["groupPath"] ~= "" then
+    likwid.setGroupPath(dconfig["groupPath"])
+else
+    dconfig["groupPath"] = base_groupPath -- no GROUPPATH configured: use the installed mongroups directory
+end
+
+if #dconfig["groupStrings"] == 0 then
+    print("No monitoring groups defined, exiting...")
+    os.exit(1)
+end
+if dconfig["duration"] == 0 then
+    print("Invalid value 0 for duration. Sanitizing to 1 second.")
+    dconfig["duration"] = 1
+end
+
+if dconfig["syslog"] then -- each requested backend is probed and switched off again if its tool is unusable
+    if check_logger() == false then
+        print("Cannot find tool logger, disabling syslog output.")
+        dconfig["syslog"] = false
+    end
+end
+if dconfig["logPath"] then
+    if check_logfile() == false then
+        print("Cannot create logfile path "..dconfig["logPath"]..". Deactivating logfile output.")
+        dconfig["logPath"] = nil
+    end
+end
+if dconfig["gmetric"] then
+    if check_gmetric() == false then
+        print("Cannot find gmetric using path "..dconfig["gmetricPath"]..". Deactivating gmetric output.")
+        dconfig["gmetric"] = false
+    end
+end
+if dconfig["rrd"] then
+    if check_rrd() == false then
+        print("Cannot find rrdtool. Deactivating rrd output.")
+        dconfig["rrd"] = false
+    end
+end
+
+-- Activate output to stdout only if no other backend is set
+if dconfig["logPath"] == nil and dconfig["rrd"] == false and dconfig["gmetric"] == false and dconfig["syslog"] == false then
+    dconfig["stdout"] = true
+end
+
+-- Add all cpus to the cpulist
+local cpulist = {}
+for i=0, cputopo["numHWThreads"]-1 do -- threadPool is read with indices 0..numHWThreads-1
+    table.insert(cpulist, cputopo["threadPool"][i]["apicId"])
+end
+
+-- Select access mode to msr devices, try configuration file first
+access_mode = dconfig["accessmode"] -- global variable
+if access_mode < 0 or access_mode > 1 then -- anything outside 0..1 is replaced by 1
+    access_mode = 1
+end
+if likwid.setAccessClientMode(access_mode) ~= 0 then
+    os.exit(1)
+end
+
+-- Select group directory for monitoring
+likwid.groupfolder = dconfig["groupPath"]
+
+power = likwid.getPowerInfo() -- global; not referenced again in this script
+-- Initialize likwid perfctr
+likwid.init(cputopo["numHWThreads"], cpulist)
+for k,v in pairs(dconfig["groupStrings"]) do
+    local groupID = likwid.addEventSet(v)
+    table.insert(dconfig["groupData"], groupID, v) -- stores the plain group string under its group ID
+    if dconfig["rrd"] then
+        create_rrd(#dconfig["groupStrings"], dconfig["duration"], v) -- NOTE(review): v is a string, create_rrd indexes it like a table - confirm
+    end
+end
+
+likwid.catchSignal()
+while likwid.getSignalState() == 0 do -- measure all groups round-robin until the signal state changes
+
+    for groupID,gname in pairs(dconfig["groupData"]) do
+        local old_mtime = likwid_getRuntimeOfGroup(groupID) -- value is not used below
+        local cur_time = os.time()
+        likwid.setupCounters(groupID)
+
+        -- Perform the measurement
+        likwid.startCounters()
+        likwid.sleep(dconfig["duration"] * 1E6) -- duration is scaled by 1E6 before sleeping
+        likwid.stopCounters()
+
+
+        if likwid.getNumberOfMetrics(groupID) > 0 then
+            local threadOutput = {} -- threadOutput[thread][metric name] = last metric value
+            for i=1, likwid.getNumberOfMetrics(groupID) do
+                local metricdesc = likwid.getNameOfMetric(groupID, i)
+                for thread=1, likwid.getNumberOfThreads() do
+                    if threadOutput[thread] == nil then
+                        threadOutput[thread] = {}
+                    end
+                    --local result = likwid.calculate_metric(metric["formula"], threadResults[thread])
+                    threadOutput[thread][metricdesc] = likwid.getLastMetric(groupID, i, thread)
+                end
+            end
+            output = {} -- global; key -> value table handed to the backends
+            output["Timestamp"] = os.date("%m/%d/%Y_%X",cur_time)
+            for i=1, likwid.getNumberOfMetrics(groupID) do
+                local metricdesc = likwid.getNameOfMetric(groupID, i)
+                itemlist = likwid.stringsplit(metricdesc, "%s+", nil, "%s+")
+                func = itemlist[1] -- first word of the metric name selects the reduction over threads
+                table.remove(itemlist, 1)
+                desc = table.concat(itemlist," ") -- value is not used below
+                if func == "AVG" then
+                    output[metricdesc:gsub(" ","_")] = calc_avg(metricdesc, threadOutput)
+                elseif func == "SUM" then
+                    output[metricdesc:gsub(" ","_")] = calc_sum(metricdesc, threadOutput)
+                elseif func == "MIN" then
+                    output[metricdesc:gsub(" ","_")] = calc_min(metricdesc, threadOutput)
+                elseif func == "MAX" then
+                    output[metricdesc:gsub(" ","_")] = calc_max(metricdesc, threadOutput)
+                elseif func == "ONCE" then
+                    output[metricdesc:gsub(" ","_")] = threadOutput[1][metricdesc] -- value of the first thread only
+                else
+                    for thread=1, likwid.getNumberOfThreads() do -- no reduction keyword: one entry per thread
+                        output["T"..cpulist[thread] .. "_" .. metricdesc] = threadOutput[thread][metricdesc]
+                    end
+                end
+            end
+            if dconfig["logPath"] ~= nil then
+                logfile(groupID, output)
+            end
+            if dconfig["syslog"] then
+                logger(output)
+            end
+            if dconfig["gmetric"] then
+                gmetric(gdata, output) -- NOTE(review): gdata is never assigned in this script, so it is nil here - confirm intended source
+            end
+            if dconfig["rrd"] then
+                rrd(gdata, output) -- NOTE(review): same undefined gdata as above
+            end
+            if dconfig["stdout"] then
+                for i,o in pairs(output) do
+                    print(i,o)
+                end
+                print(likwid.hline)
+            end
+        end
+    end
+end
+
+-- Finalize likwid perfctr
+likwid.catchSignal() -- NOTE(review): called a second time here after the loop - confirm this is intended
+likwid.finalize()
+likwid.putConfiguration()
+likwid.putTopology()
diff --git a/src/applications/likwid-bench.c b/src/applications/likwid-bench.c
deleted file mode 100644
index 15f6f0d..0000000
--- a/src/applications/likwid-bench.c
+++ /dev/null
@@ -1,536 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  likwid-bench.c
- *
- *      Description:  A flexible and extensible benchmarking toolbox
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <errno.h>
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <bstrlib.h>
-#include <types.h>
-#include <error.h>
-#include <cpuid.h>
-#include <numa.h>
-#include <affinity.h>
-#include <timer.h>
-#include <threads.h>
-#include <barrier.h>
-#include <testcases.h>
-#include <strUtil.h>
-#include <allocator.h>
-
-#include <likwid.h>
-#ifdef PAPI
-#include <papi.h>
-#include <omp.h>
-#endif
-
-extern void* runTest(void* arg);
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-#define HELP_MSG \
-    fprintf(stdout, "Threaded Memory Hierarchy Benchmark --  Version  %d.%d \n\n",VERSION,RELEASE); \
-    fprintf(stdout, "\n"); \
-    fprintf(stdout, "Supported Options:\n"); \
-    fprintf(stdout, "-h\t Help message\n"); \
-    fprintf(stdout, "-v\t Version information\n"); \
-    fprintf(stdout, "-q\t Silent without output\n"); \
-    fprintf(stdout, "-a\t list available benchmarks \n"); \
-    fprintf(stdout, "-p\t list available thread domains\n"); \
-    fprintf(stdout, "-l <TEST>\t list properties of benchmark \n"); \
-    fprintf(stdout, "-i <INT>\t number of iterations \n"); \
-    fprintf(stdout, "-g <INT>\t number of workgroups (mandatory)\n"); \
-    fprintf(stdout, "-t <TEST>\t type of test \n"); \
-    fprintf(stdout, "-w\t <thread_domain>:<size>[:<num_threads>[:<chunk size>:<stride>][-<streamId>:<domain_id>[:<offset>]], size in kB, MB or GB  (mandatory)\n"); \
-    fprintf(stdout, "Processors are in compact ordering. Optionally every stream can be placed. Either no stream or all streams must be placed. Multiple streams are separated by commas.\n"); \
-    fprintf(stdout, "Usage: likwid-bench -t copy -i 1000 -g 1 -w S0:100kB:10:1:2 \n"); \
-    fprintf(stdout, "\tRun 10 threads on socket 0 using physical cores only (presuming SMT2 system).\n"); \
-    fprintf(stdout, "Example with data placement: likwid-bench -t copy -i 1000 -g 1 -w S0:100kB:20-0:S1,1:S1 \n"); \
-    fprintf(stdout, "\tRun 20 threads on socket 0 and place both arrays of the copy test case on socket 1.\n"); \
-    fflush(stdout);
-
-#define VERSION_MSG \
-    fprintf(stdout, "likwid-bench   %d.%d \n\n",VERSION,RELEASE); \
-    fflush(stdout);
-
-/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE  ############ */
-
-void copyThreadData(ThreadUserData* src,ThreadUserData* dst)
-{
-    uint32_t i;
-
-    *dst = *src;
-    dst->processors = (int*) malloc(src->numberOfThreads*sizeof(int));
-    dst->streams = (void**) malloc(src->test->streams*sizeof(void*));
-
-    for (i=0; i<  src->test->streams; i++)
-    {
-        dst->streams[i] = src->streams[i];
-    }
-
-    for (i=0; i<src->numberOfThreads; i++)
-    {
-        dst->processors[i] = src->processors[i];
-    }
-}
-
-
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-int main(int argc, char** argv)
-{
-    int iter = 100;
-    uint32_t i;
-    uint32_t j;
-    int globalNumberOfThreads = 0;
-    int optPrintDomains = 0;
-    int c;
-    ThreadUserData myData;
-    bstring testcase = bfromcstr("none");
-    uint32_t numberOfWorkgroups = 0;
-    int tmp = 0;
-    double time;
-    const TestCase* test = NULL;
-    Workgroup* currentWorkgroup = NULL;
-    Workgroup* groups = NULL;
-    FILE* OUTSTREAM = stdout;
-
-    if (cpuid_init() == EXIT_FAILURE)
-    {
-        ERROR_PLAIN_PRINT(Unsupported processor!);
-    }
-    numa_init();
-    affinity_init();
-
-    /* Handling of command line options */
-    if (argc ==  1)
-    {
-        HELP_MSG;
-        affinity_finalize();
-        exit(EXIT_SUCCESS);
-    }
-    opterr = 0;
-    while ((c = getopt (argc, argv, "g:w:t:i:l:aphvq")) != -1) {
-        switch (c)
-        {
-            case 'h':
-                HELP_MSG;
-                affinity_finalize();
-                if (groups)
-                {
-                    free(groups);
-                }
-                exit (EXIT_SUCCESS);
-            case 'v':
-                VERSION_MSG;
-                affinity_finalize();
-                if (groups)
-                {
-                    free(groups);
-                }
-                exit (EXIT_SUCCESS);
-            case 'a':
-                if (OUTSTREAM)
-                {
-                    fprintf(OUTSTREAM, TESTS"\n");
-                    fflush(OUTSTREAM);
-                }
-                affinity_finalize();
-                if (groups)
-                {
-                    free(groups);
-                }
-                exit (EXIT_SUCCESS);
-            case 'q':
-                OUTSTREAM = NULL;
-                break;
-            case 'w':
-                tmp--;
-
-                if (tmp == -1)
-                {
-                    fprintf (stderr, "More workgroups configured than allocated!\n"
-                        "Did you forget to set the number of workgroups with -g?\n");
-                    affinity_finalize();
-                    if (groups)
-                    {
-                        free(groups);
-                    }
-                    return EXIT_FAILURE;
-                }
-                if (!test)
-                {
-                    fprintf (stderr, "You need to specify a test case first!\n");
-                    affinity_finalize();
-                    if (groups)
-                    {
-                        free(groups);
-                    }
-                    return EXIT_FAILURE;
-                }
-                testcase = bfromcstr(optarg);
-                currentWorkgroup = groups+tmp;  /*FIXME*/
-                bstr_to_workgroup(currentWorkgroup, testcase, test->type, test->streams);
-                bdestroy(testcase);
-
-                for (i=0; i<  test->streams; i++)
-                {
-                    if (currentWorkgroup->streams[i].offset%test->stride)
-                    {
-                        fprintf (stderr, "Stream %d: offset is not a multiple of stride!\n",i);
-                        affinity_finalize();
-                        if (groups)
-                        {
-                            free(groups);
-                        }
-                        return EXIT_FAILURE;
-                    }
-
-                    allocator_allocateVector(OUTSTREAM,
-                            &(currentWorkgroup->streams[i].ptr),
-                            PAGE_ALIGNMENT,
-                            currentWorkgroup->size,
-                            currentWorkgroup->streams[i].offset,
-                            test->type,
-                            currentWorkgroup->streams[i].domain);
-                }
-
-                break;
-            case 'i':
-                iter =  atoi(optarg);
-                if (iter <= 0)
-                {
-                    fprintf(stderr, "Iterations must be greater than 0.\n");
-                    exit(EXIT_FAILURE);
-                }
-                break;
-            case 'l':
-                testcase = bfromcstr(optarg);
-                for (i=0; i<NUMKERNELS; i++)
-                {
-                    if (biseqcstr(testcase, kernels[i].name))
-                    {
-                        test = kernels+i;
-                        break;
-                    }
-                }
-
-                if (biseqcstr(testcase,"none") || !test)
-                {
-                    fprintf (stderr, "Unknown test case %s\n",optarg);
-                    if (OUTSTREAM)
-                    {
-                        fprintf(OUTSTREAM, "Available test cases:\n");
-                        fprintf(OUTSTREAM, TESTS"\n");
-                        fflush(OUTSTREAM);
-                    }
-                    affinity_finalize();
-                    if (groups)
-                    {
-                        free(groups);
-                    }
-                    return EXIT_FAILURE;
-                }
-                else
-                {
-                    if (OUTSTREAM)
-                    {
-                        fprintf(OUTSTREAM, "Name: %s\n",test->name);
-                        fprintf(OUTSTREAM, "Number of streams: %d\n",test->streams);
-                        fprintf(OUTSTREAM, "Loop stride: %d\n",test->stride);
-                        fprintf(OUTSTREAM, "Flops: %d\n", (int) test->flops);
-                        fprintf(OUTSTREAM, "Bytes: %d\n",test->bytes);
-                        switch (test->type)
-                        {
-                            case SINGLE:
-                                fprintf(OUTSTREAM, "Data Type: Single precision float\n");
-                                break;
-                            case DOUBLE:
-                                fprintf(OUTSTREAM, "Data Type: Double precision float\n");
-                                break;
-                        }
-                        fflush(OUTSTREAM);
-                    }
-                }
-                bdestroy(testcase);
-                affinity_finalize();
-                if (groups)
-                {
-                    free(groups);
-                }
-                exit (EXIT_SUCCESS);
-
-                break;
-            case 'p':
-                optPrintDomains = 1;
-                break;
-            case 'g':
-                numberOfWorkgroups =  atoi(optarg);
-                if (numberOfWorkgroups <= 0)
-                {
-                    fprintf(stderr, "Number of Workgroups must be 1 or greater.\n");
-                    exit(EXIT_FAILURE);
-                }
-                allocator_init(numberOfWorkgroups * MAX_STREAMS);
-                tmp = numberOfWorkgroups;
-                groups = (Workgroup*) malloc(numberOfWorkgroups*sizeof(Workgroup));
-                break;
-            case 't':
-                testcase = bfromcstr(optarg);
-
-                for (i=0; i<NUMKERNELS; i++)
-                {
-                    if (biseqcstr(testcase, kernels[i].name))
-                    {
-                        test = kernels+i;
-                        break;
-                    }
-                }
-                if (biseqcstr(testcase,"none"))
-                {
-                    fprintf (stderr, "Unknown test case %s\n",optarg);
-                    affinity_finalize();
-                    if (groups)
-                    {
-                        free(groups);
-                    }
-                    return EXIT_FAILURE;
-                }
-                bdestroy(testcase);
-                break;
-            case '?':
-                if (optopt == 'l' || optopt == 'g' || optopt == 'w' || 
-                        optopt == 't' || optopt == 'i')
-                    fprintf (stderr, "Option `-%c' requires an argument.\n", optopt);
-                else if (isprint (optopt))
-                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
-                else
-                    fprintf (stderr,
-                            "Unknown option character `\\x%x'.\n",
-                            optopt);
-                affinity_finalize();
-                if (groups)
-                {
-                    free(groups);
-                }
-                return EXIT_FAILURE;
-            default:
-                HELP_MSG;
-        }
-    }
-
-    if (numberOfWorkgroups == 0 && !optPrintDomains)
-    {
-        fprintf(stderr, "Number of Workgroups must be 1 or greater.\n");
-        affinity_finalize();
-        allocator_finalize();
-        if (groups)
-        {
-            free(groups);
-        }
-        exit(EXIT_FAILURE);
-    }
-    if (tmp > 0 && iter > 0)
-    {
-        fprintf(stderr, "%d workgroups requested but only %d given on commandline\n",numberOfWorkgroups,numberOfWorkgroups-tmp);
-        affinity_finalize();
-        allocator_finalize();
-        if (groups)
-        {
-            free(groups);
-        }
-        exit(EXIT_FAILURE);
-    }
-    if (iter <= 0)
-    {
-        fprintf(stderr,"Iterations must be greater than 0\n");
-        affinity_finalize();
-        allocator_finalize();
-        if (groups)
-        {
-            free(groups);
-        }
-        exit(EXIT_FAILURE);
-    }
-    if (test && !(currentWorkgroup || groups))
-    {
-        fprintf(stderr, "Workgroups must be set on commandline\n");
-        affinity_finalize();
-        allocator_finalize();
-        if (groups)
-        {
-            free(groups);
-        }
-        exit(EXIT_FAILURE);
-    }
-
-    if (optPrintDomains)
-    {
-        affinity_printDomains(OUTSTREAM);
-        affinity_finalize();
-        allocator_finalize();
-        if (groups)
-        {
-            free(groups);
-        }
-        exit (EXIT_SUCCESS);
-    }
-    timer_init();
-
-    /* :WARNING:05/04/2010 08:58:05 AM:jt: At the moment the thread
-     * module only allows equally sized thread groups*/
-    for (i=0; i<numberOfWorkgroups; i++)
-    {
-        globalNumberOfThreads += groups[i].numberOfThreads;
-    }
-
-    threads_init(OUTSTREAM, globalNumberOfThreads);
-    threads_createGroups(numberOfWorkgroups);
-
-    /* we configure global barriers only */
-    barrier_init(1);
-    barrier_registerGroup(globalNumberOfThreads);
-
-#ifdef PERFMON
-    if (OUTSTREAM)
-    {
-        fprintf(OUTSTREAM, "Using likwid\n");
-        fflush(OUTSTREAM);
-    }
-    likwid_markerInit();
-#endif
-#ifdef PAPI
-    if (OUTSTREAM)
-    {
-        fprintf(OUTSTREAM, "Using PAPI\n");
-    }
-    PAPI_library_init (PAPI_VER_CURRENT);
-    PAPI_thread_init((unsigned long (*)(void))(omp_get_thread_num));
-#endif
-
-
-    /* initialize data structures for threads */
-    for (i=0; i<numberOfWorkgroups; i++)
-    {
-        myData.iter = iter;
-        myData.size = groups[i].size;
-        myData.test = test;
-        myData.numberOfThreads = groups[i].numberOfThreads;
-        myData.processors = (int*) malloc(myData.numberOfThreads * sizeof(int));
-        myData.streams = (void**) malloc(test->streams * sizeof(void*));
-
-        for (j=0; j<groups[i].numberOfThreads; j++)
-        {
-            myData.processors[j] = groups[i].processorIds[j];
-        }
-
-        for (j=0; j<  test->streams; j++)
-        {
-            myData.streams[j] = groups[i].streams[j].ptr;
-        }
-        threads_registerDataGroup(i, &myData, copyThreadData);
-
-        free(myData.processors);
-        free(myData.streams);
-    }
-
-    if (OUTSTREAM)
-    {
-        fprintf(OUTSTREAM, HLINE);
-        fprintf(OUTSTREAM, "LIKWID MICRO BENCHMARK\n");
-        fprintf(OUTSTREAM, "Test: %s\n",test->name);
-        fprintf(OUTSTREAM, HLINE);
-        fprintf(OUTSTREAM, "Using %d work groups\n",numberOfWorkgroups);
-        fprintf(OUTSTREAM, "Using %d threads\n",globalNumberOfThreads);
-        fprintf(OUTSTREAM, HLINE);
-        fflush(OUTSTREAM);
-    }
-
-    threads_create(runTest);
-    threads_join();
-    allocator_finalize();
-
-    uint32_t realSize = 0;
-    uint64_t realCycles = 0;
-    int current_id = 0;
-
-    if (OUTSTREAM)
-    {
-        fprintf(OUTSTREAM, HLINE);
-        for(j=0;j<numberOfWorkgroups;j++)
-        {
-            current_id = j*groups[j].numberOfThreads;
-            realCycles += threads_data[current_id].cycles;
-            realSize += groups[j].numberOfThreads * threads_data[current_id].data.size;
-        }
-        time = (double) realCycles / (double) timer_getCpuClock();
-        fprintf(OUTSTREAM, "Cycles: %llu \n", LLU_CAST realCycles);
-        fprintf(OUTSTREAM, "Iterations: %llu \n", LLU_CAST iter);
-        fprintf(OUTSTREAM, "Size %d \n",  realSize );
-        fprintf(OUTSTREAM, "Vectorlength: %llu \n", LLU_CAST threads_data[current_id].data.size);
-        fprintf(OUTSTREAM, "Time: %e sec\n", time);
-        fprintf(OUTSTREAM, "Number of Flops: %llu \n", LLU_CAST (iter * realSize *  test->flops));
-        fprintf(OUTSTREAM, "MFlops/s: %.2f\n",
-                1.0E-06 * ((double) iter * realSize *  test->flops/  time));
-        fprintf(OUTSTREAM, "MByte/s: %.2f\n",
-                1.0E-06 * ( (double) iter * realSize *  test->bytes/ time));
-        fprintf(OUTSTREAM, "Cycles per update: %f\n",
-                ((double) realCycles / (double) (iter * numberOfWorkgroups * threads_data[current_id].numberOfThreads *  threads_data[current_id].data.size)));
-
-        switch ( test->type )
-        {
-            case SINGLE:
-                fprintf(OUTSTREAM, "Cycles per cacheline: %f\n",
-                        (16.0 * (double) realCycles / (double) (iter * realSize)));
-                break;
-            case DOUBLE:
-                fprintf(OUTSTREAM, "Cycles per cacheline: %f\n",
-                        (8.0 * (double) realCycles / (double) (iter * realSize)));
-                break;
-        }
-
-        fprintf(OUTSTREAM, HLINE);
-        fflush(OUTSTREAM);
-    }
-    threads_destroy(numberOfWorkgroups);
-    barrier_destroy();
-    
-    affinity_finalize();
-#ifdef PERFMON
-    likwid_markerClose();
-#endif
-
-    return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-features.c b/src/applications/likwid-features.c
deleted file mode 100644
index 6fe5477..0000000
--- a/src/applications/likwid-features.c
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  likwid-features.c
- *
- *      Description:  An application to read out and set the feature flag
- *                  register on Intel Core 2 processors.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <types.h>
-#include <error.h>
-#include <strUtil.h>
-#include <accessClient.h>
-#include <msr.h>
-#include <cpuid.h>
-#include <cpuFeatures.h>
-
-#define HELP_MSG \
-    fprintf(stdout, "\nlikwid-features --  Version  %d.%d \n\n",VERSION,RELEASE); \
-    fprintf(stdout, "A tool to print and toggle the feature flag msr on Intel CPUS.\n"); \
-    fprintf(stdout, "Supported Features: HW_PREFETCHER, CL_PREFETCHER, DCU_PREFETCHER, IP_PREFETCHER.\n\n"); \
-    fprintf(stdout, "Options:\n"); \
-    fprintf(stdout, "-h\t Help message\n"); \
-    fprintf(stdout, "-v\t Version information\n"); \
-    fprintf(stdout, "-s <FEATURE>\t set cpu feature \n"); \
-    fprintf(stdout, "-u <FEATURE>\t unset cpu feature \n"); \
-    fprintf(stdout, "-c <ID>\t core id\n\n"); \
-    fflush(stdout);
-
-#define VERSION_MSG \
-    fprintf(stdout, "likwid-features  %d.%d \n\n",VERSION,RELEASE); \
-    fflush(stdout);
-
-int main (int argc, char** argv)
-{ 
-    int socket_fd = -1;
-    int optSetFeature = 0;
-    int cpuId = 0;
-    int c;
-    bstring argString;
-    CpuFeature feature = HW_PREFETCHER ;
-
-    while ((c = getopt (argc, argv, "c:s:u:hv")) != -1)
-    {
-        switch (c)
-        {
-            case 'h':
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-            case 'v':
-                VERSION_MSG;
-                exit (EXIT_SUCCESS);
-            case 'u':
-                optSetFeature = 2;
-            case 's':
-                if (! (argString = bSecureInput(40,optarg)))
-                {
-                    fprintf(stderr,"Failed to read argument string!\n");
-                    exit(EXIT_FAILURE);
-                }
-
-                if (biseqcstr(argString,"HW_PREFETCHER"))
-                {
-                    feature = HW_PREFETCHER;
-                }
-                else if (biseqcstr(argString,"CL_PREFETCHER"))
-                {
-                    feature = CL_PREFETCHER;
-                }
-                else if (biseqcstr(argString,"DCU_PREFETCHER"))
-                {
-                    feature = DCU_PREFETCHER;
-                }
-                else if (biseqcstr(argString,"IP_PREFETCHER"))
-                {
-                    feature = IP_PREFETCHER;
-                }
-                else
-                {
-                    fprintf(stderr,"Feature not supported!\n");
-                    exit(EXIT_FAILURE);
-                }
-
-
-                if (!optSetFeature)
-                {
-                    optSetFeature = 1;
-                }
-                break;
-            case 'c':
-                if (! (argString = bSecureInput(20,optarg)))
-                {
-                    fprintf(stderr,"Failed to read argument string!\n");
-                    exit(EXIT_FAILURE);
-                }
-
-                cpuId = str2int((char*) argString->data);
-
-                break;
-            case '?':
-                if (isprint (optopt))
-                {
-                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
-                }
-                else
-                {
-                    fprintf (stderr,
-                            "Unknown option character `\\x%x'.\n",
-                            optopt);
-                }
-                return EXIT_FAILURE;
-            default:
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-        }
-    }
-
-    if (cpuid_init() == EXIT_FAILURE)
-    {
-        ERROR_PLAIN_PRINT(Unsupported processor!);
-    }
-
-    fprintf(stdout, HLINE);
-    fprintf(stdout, "CPU name:\t%s \n",cpuid_info.name);
-    fprintf(stdout, "CPU core id:\t%d \n", cpuId);
-    fflush(stdout);
-
-    if (cpuid_info.family != P6_FAMILY)
-    {
-        fprintf (stderr, "likwid-features only supports Intel P6 based processors!\n");
-        exit(EXIT_FAILURE);
-    }
-
-    if (cpuId >= (int) cpuid_topology.numHWThreads)
-    {
-        fprintf (stderr, "This processor has only %d HWthreads! \n",cpuid_topology.numHWThreads);
-        exit(EXIT_FAILURE);
-    }
-
-    accessClient_init(&socket_fd);
-    msr_init(socket_fd);
-    cpuFeatures_init(cpuId);
-    cpuFeatures_print(cpuId);
-
-    if (optSetFeature == 1)
-    {
-        fprintf(stdout, SLINE);
-        cpuFeatures_enable(cpuId, feature);
-        fprintf(stdout, SLINE);
-    }
-    else if (optSetFeature == 2)
-    {
-        fprintf(stdout, SLINE);
-        cpuFeatures_disable(cpuId, feature);
-        fprintf(stdout, SLINE);
-    }
-    fflush(stdout);
-
-    msr_finalize();
-    return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-features.lua b/src/applications/likwid-features.lua
new file mode 100644
index 0000000..37d765d
--- /dev/null
+++ b/src/applications/likwid-features.lua
@@ -0,0 +1,191 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-features.lua
+ *
+ *      Description:  An application to retrieve and manipulate CPU features.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+function version() -- prints the tool name followed by likwid.version and likwid.release
+    print(("likwid-features --  Version %d.%d"):format(likwid.version, likwid.release))
+end
+
+function usage() -- prints the version banner, the option summary and the modifiable features
+    version()
+    print("A tool to list and modify the states of CPU features.\n")
+    print("Options:")
+    print("-h, --help\t\t Help message")
+    print("-v, --version\t\t Version information")
+    print("-a, --all\t\t List all available features")
+    print("-l, --list\t\t List features and state for given CPUs")
+    print("-c, --cpus <list>\t Perform operations on given CPUs")
+    print("-e, --enable <list>\t List of features that should be enabled")
+    print("-d, --disable <list>\t List of features that should be disabled")
+    print()
+    print("Currently modifiable features:")
+    print("HW_PREFETCHER, CL_PREFETCHER, DCU_PREFETCHER, IP_PREFETCHER")
+end
+
+if #arg == 0 then -- no command line arguments at all: show the help text and quit
+    usage()
+    os.exit(0)
+end
+
+-- Parser state; these are globals that the option loop fills and the later sections read.
+listFeatures = false
+num_cpus, cpulist = 0, {}
+enableList = {}
+disableList = {}
+skipList = {}
+
+-- Walk over all command line options; the loop variable 'arg' shadows the global 'arg' table.
+for opt,arg in likwid.getopt(arg, {"h","v","l","c:","e:","d:","a","help","version","list", "enable:", "disable:","all", "cpus:"}) do
+    if (type(arg) == "string") then
+        if arg:find("-", 1, true) == 1 then -- value begins with '-': most likely the next option
+            print(string.format("Argument %s to option -%s starts with invalid character -.", arg, opt))
+            print("Did you forget an argument to an option?")
+            os.exit(1)
+        end
+    end
+    if opt == "h" or opt == "help" then
+        usage()
+        os.exit(0)
+    elseif opt == "v" or opt == "version" then
+        version()
+        os.exit(0)
+    elseif opt == "c" or opt == "cpus" then
+        num_cpus, cpulist = likwid.cpustr_to_cpulist(arg)
+    elseif opt == "l" or opt == "list" then
+        listFeatures = true
+    elseif opt == "a" or opt == "all" then
+        print("Available features:")
+        for idx=0,likwid.tablelength(likwid.cpuFeatures)-1 do
+            if likwid.cpuFeatures[idx]:match("PREFETCHER") then
+                print(string.format("\t%s*",likwid.cpuFeatures[idx]))
+            else
+                print(string.format("\t%s",likwid.cpuFeatures[idx]))
+            end
+        end
+        print("Modifiable features are marked with *")
+        os.exit(0)
+    elseif opt == "e" or opt == "enable" then
+        local names = likwid.stringsplit(arg, ",")
+        for _, f in pairs(names) do -- names without a match in likwid.cpuFeatures add nothing
+            for idx=0,likwid.tablelength(likwid.cpuFeatures)-1 do
+                if likwid.cpuFeatures[idx] == f then
+                    table.insert(enableList, idx)
+                end
+            end
+        end
+    elseif opt == "d" or opt == "disable" then
+        local names = likwid.stringsplit(arg, ",")
+        for _, f in pairs(names) do -- same lookup as for --enable, collected in disableList
+            for idx=0,likwid.tablelength(likwid.cpuFeatures)-1 do
+                if likwid.cpuFeatures[idx] == f then
+                    table.insert(disableList, idx)
+                end
+            end
+        end
+    elseif opt == "?" then
+        print("Invalid commandline option -"..arg)
+        os.exit(1)
+    elseif opt == "!" then
+        print("Option requires an argument")
+        os.exit(1)
+    end
+end
+
+likwid.initCpuFeatures()
+
+if listFeatures and #cpulist > 0 then
+    -- The name column is as wide as "BRANCH_TRACE_STORAGE" plus two blanks.
+    local width = #"BRANCH_TRACE_STORAGE" + 2
+    local header = "Feature"..string.rep(" ", width - #"Feature")
+    for _, cpu in pairs(cpulist) do
+        header = header..string.format("CPU %d\t", cpu)
+    end
+    print(header)
+    for idx=0,likwid.tablelength(likwid.cpuFeatures)-1 do
+        local name = likwid.cpuFeatures[idx]
+        local row = name..string.rep(" ", width - #name)
+        for _, cpu in pairs(cpulist) do
+            local state = (likwid.getCpuFeatures(cpu, idx) == 1) and "on\t" or "off\t"
+            row = row..state
+        end
+        print(row)
+    end
+elseif #cpulist == 0 then
+    -- Reached whenever the CPU list is empty, whether or not listing was requested.
+    print("Need CPU to list current feature state")
+    os.exit(1)
+end
+
+if #enableList > 0 and #disableList > 0 then -- features named in both lists are dropped from both
+    for _, e in pairs(enableList) do
+        for _, d in pairs(disableList) do
+            if (e == d) then
+                print(string.format("Feature %s is in enable and disable list, doing nothing for feature", likwid.cpuFeatures[e]))
+                table.insert(skipList, e)
+            end
+        end
+    end
+    for _, s in pairs(skipList) do
+        for j = #enableList, 1, -1 do -- backwards, so table.remove() cannot make the scan miss an entry
+            if (s == enableList[j]) then table.remove(enableList, j) end
+        end
+        for j = #disableList, 1, -1 do
+            if (s == disableList[j]) then table.remove(disableList, j) end
+        end
+    end
+end
+
+if #enableList > 0 then
+    -- Call likwid.enableCpuFeatures for every (CPU, feature) pair; a return value of 0 is reported as success.
+    for _, cpu in pairs(cpulist) do
+        for _, feat in pairs(enableList) do
+            local fmt = "Failed %s for CPU %d"
+            if likwid.enableCpuFeatures(cpu, feat, 1) == 0 then
+                fmt = "Enabled %s for CPU %d"
+            end
+            print(string.format(fmt, likwid.cpuFeatures[feat], cpu))
+        end
+    end
+end
+if #disableList > 0 then
+    -- Call likwid.disableCpuFeatures for every (CPU, feature) pair; a return value of 0 is reported as success.
+    for _, cpu in pairs(cpulist) do
+        for _, feat in pairs(disableList) do
+            local fmt = "Failed %s for CPU %d"
+            if likwid.disableCpuFeatures(cpu, feat, 1) == 0 then
+                fmt = "Disabled %s for CPU %d"
+            end
+            print(string.format(fmt, likwid.cpuFeatures[feat], cpu))
+        end
+    end
+end
diff --git a/src/applications/likwid-genCfg.c b/src/applications/likwid-genCfg.c
deleted file mode 100644
index 97147fd..0000000
--- a/src/applications/likwid-genCfg.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  likwid-genCfg.c
- *
- *      Description:  An application to dump the cpu topology information to
- *      a config file.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <types.h>
-#include <error.h>
-#include <cpuid.h>
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-#define HELP_MSG \
-    fprintf(stdout, "\nlikwid-genCfg --  Version  %d.%d \n\n",VERSION,RELEASE); \
-    fprintf(stdout, "A tool to dump node topology information into a file.\n"); \
-    fprintf(stdout, "Options:\n"); \
-    fprintf(stdout, "-h\t Help message\n"); \
-    fprintf(stdout, "-v\t Version information\n"); \
-    fprintf(stdout, "-o\t output file path (optional)\n\n"); \
-    fflush(stdout);
-
-#define VERSION_MSG \
-    fprintf(stdout, "likwid-genCfg  %d.%d \n\n",VERSION,RELEASE); \
-    fflush(stdout);
-
-
-int main (int argc, char** argv)
-{
-    FILE *file;
-    char *filepath = TOSTRING(CFGFILE);
-    size_t size;
-    int c;
-
-    while ((c = getopt (argc, argv, "ho:v")) != -1)
-    {
-        switch (c)
-        {
-            case 'h':
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-            case 'o':
-                filepath = optarg;
-                break;
-            case 'v':
-                VERSION_MSG;
-                exit (EXIT_SUCCESS);
-            case '?':
-                if (isprint (optopt))
-                {
-                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
-                }
-                else
-                {
-                    fprintf (stderr,
-                            "Unknown option character `\\x%x'.\n",
-                            optopt);
-                }
-                return EXIT_FAILURE;
-            default:
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-        }
-    }
-
-    cpuid_init();
-    fprintf(stdout, HLINE);
-    fprintf(stdout, "CPU name:\t%s \n",cpuid_info.name);
-    fflush(stdout);
-
-    if ((file = fopen(filepath, "wb")) != NULL) 
-    {
-        size = fwrite((void*) &cpuid_topology, sizeof(CpuTopology), 1, file);
-
-        size = fwrite((void*) cpuid_topology.threadPool,
-                sizeof(HWThread), cpuid_topology.numHWThreads, file);
-
-        size = fwrite((void*) cpuid_topology.cacheLevels,
-                sizeof(CacheLevel), cpuid_topology.numCacheLevels, file);
-
-        fclose(file);
-    }
-    else
-    {
-        fprintf(stderr,"Cfg file could not be written to %s\n", filepath);
-        ERROR;
-    }
-
-    return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-genTopoCfg.lua b/src/applications/likwid-genTopoCfg.lua
new file mode 100644
index 0000000..fdd4d69
--- /dev/null
+++ b/src/applications/likwid-genTopoCfg.lua
@@ -0,0 +1,153 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-genTopoCfg.lua
+ *
+ *      Description:  An application to create a file of the underlying system configuration
+ *                    that is used by likwid to avoid reading the system's architecture at
+ *                    each start.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+local filename = "<INSTALLED_PREFIX>/etc/likwid_topo.cfg"
+
+function version() -- Prints the tool banner built from likwid.version and likwid.release.
+    print(("likwid-genTopoCfg --  Version %d.%d"):format(likwid.version, likwid.release))
+end
+
+function usage() -- Print the version banner followed by the option summary.
+    version()
+    print("A tool to store the system's architecture to a config file for LIKWID.\n")
+    print("Options:")
+    print("-h, --help\t\t Help message")
+    print("-v, --version\t\t Version information")
+    print("-o, --output <file>\t Use <file> instead of default "..filename) -- shows the current value of `filename`
+    print("\t\t\t Likwid searches at startup per default:")
+    print("\t\t\t /etc/likwid_topo.cfg and <INSTALLED_PREFIX>/etc/likwid_topo.cfg")
+    print("\t\t\t Another location can be configured in the configuration file /etc/likwid.cfg,")
+    print("\t\t\t <INSTALLED_PREFIX>/etc/likwid.cfg or the path defined at the build process of Likwid.")
+end
+
+for opt,arg in likwid.getopt(arg, {"h","v","help","version", "o:", "output:"}) do -- inside the loop `arg` is the option value and hides the outer `arg` passed to getopt
+    if opt == "h" or opt == "help" then
+        usage()
+        os.exit(0)
+    elseif opt == "v" or opt == "version" then
+        version()
+        os.exit(0)
+    elseif opt == "o" or opt == "output" then
+        filename = arg -- replaces the default output path
+    elseif opt == "?" then -- reported to the user as an invalid option
+        print("Invalid commandline option -"..arg)
+        os.exit(1)
+    elseif opt == "!" then -- reported to the user as a missing option argument
+        print("Option requires an argument")
+        os.exit(1)
+    end
+end
+local file = io.open(filename, "r") -- an existing file is never overwritten
+if file ~= nil then
+    print("File "..filename.." exists, please delete it first.")
+    file:close()
+    os.exit(1)
+end
+-- Query the topology before creating the output file, so that a failed
+-- initialization cannot leave an empty file behind that blocks the next run.
+local cpuinfo = likwid.getCpuInfo()
+local cputopo = likwid.getCpuTopology()
+local numainfo = likwid.getNumaInfo()
+local affinity = likwid.getAffinityInfo()
+if cpuinfo == nil or cputopo == nil or numainfo == nil or affinity == nil then
+    print("Cannot initialize topology module of LIKWID")
+    os.exit(1)
+end
+file = io.open(filename, "w")
+if file == nil then
+    print("Cannot open file "..filename.." for writing")
+    os.exit(1)
+end
+print(string.format("Writing new topology file %s", filename))
+cpuinfo["clock"] = likwid.getCpuClock()
+
+local threadPool_order = {"threadId", "coreId", "packageId", "apicId"}
+local cacheLevels_order = {"type", "associativity", "sets", "lineSize", "size", "threads", "inclusive"}
+
+for field, value in pairs(cpuinfo) do
+    file:write("cpuid_info " .. field .. " = " .. tostring(value).."\n")
+end
+
+for field, value in pairs(cputopo) do
+    if (field ~= "threadPool" and field ~= "cacheLevels" and field ~= "topologyTree") then
+        if field ~= "activeHWThreads" then
+            file:write("cpuid_topology " .. field .. " = " .. tostring(value).."\n")
+        end
+    elseif (field == "threadPool") then
+        -- One line per hardware thread and field: "cpuid_topology threadPool <id> <field> = <value>"
+        for id, tab in pairs(cputopo["threadPool"]) do
+            local prefix = "cpuid_topology threadPool "..tostring(id).." "
+            -- ipairs walks the order table in index order; pairs gives no such guarantee
+            for _, name in ipairs(threadPool_order) do
+                file:write(prefix..tostring(name).." = "..tostring(tab[name]).."\n")
+            end
+        end
+    elseif (field == "cacheLevels") then
+        -- Same layout for every cache level: "cpuid_topology cacheLevels <id> <field> = <value>"
+        for id, tab in pairs(cputopo["cacheLevels"]) do
+            local prefix = "cpuid_topology cacheLevels "..tostring(id).." "
+            for _, name in ipairs(cacheLevels_order) do
+                file:write(prefix..tostring(name).." = "..tostring(tab[name]).."\n")
+            end
+        end
+    end
+end
+
+file:write("numa_info numberOfNodes = "..tostring(numainfo["numberOfNodes"]).."\n")
+for field, value in pairs(numainfo["nodes"]) do
+    for id, tab in pairs(value) do
+        if id ~= "processors" and id ~= "distances" then
+            file:write("numa_info nodes "..tostring(field).." "..tostring(id).." = "..tostring(tab).."\n")
+        elseif id == "processors" then
+            for k,v in pairs(tab) do
+                -- one line per list entry: "numa_info nodes <node> processors <key> = <value>"
+                file:write("numa_info nodes "..tostring(field).." "..tostring(id).." "..tostring(k).." = "..tostring(v).."\n")
+            end
+        elseif id == "distances" then
+            for k,v in pairs(tab) do
+                for k1,v1 in pairs(v) do
+                    file:write("numa_info nodes "..tostring(field).." "..tostring(id).." "..tostring(k1).." = "..tostring(v1).."\n")
+                end
+            end
+        end
+    end
+end
+
+file:close() -- all configuration lines have been written at this point
+likwid.putAffinityInfo() -- NOTE(review): presumably releases what getAffinityInfo()/getNumaInfo()/getCpuTopology() set up; confirm against the likwid Lua API
+likwid.putNumaInfo()
+likwid.putTopology()
+
diff --git a/src/applications/likwid-memsweeper.c b/src/applications/likwid-memsweeper.c
deleted file mode 100644
index 4806763..0000000
--- a/src/applications/likwid-memsweeper.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  likwid-memsweeper.c
- *
- *      Description:  An application to clean up NUMA memory domains.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <types.h>
-#include <strUtil.h>
-#include <error.h>
-#include <cpuid.h>
-#include <numa.h>
-#include <affinity.h>
-#include <memsweep.h>
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-#define HELP_MSG \
-    fprintf(stdout, "\nlikwid-memsweeper --  Version  %d.%d \n\n",VERSION,RELEASE); \
-    fprintf(stdout, "A tool clean up NUMA memory domains and last level caches.\n"); \
-    fprintf(stdout, "Options:\n"); \
-    fprintf(stdout, "-h\t Help message\n"); \
-    fprintf(stdout, "-v\t Version information\n"); \
-    fprintf(stdout, "-q\t Silent without output\n"); \
-    fprintf(stdout, "-c\t Specify NUMA domain ID to clean up\n"); \
-    fprintf(stdout, "\t If no specific domain is set, all domains are swept.\n"); \
-    fprintf(stdout, "Usage:\n"); \
-    fprintf(stdout, "To clean specific domain: likwid-memsweeper -c 2 \n"); \
-    fflush(stdout);
-
-#define VERSION_MSG \
-    fprintf(stdout, "likwid-memsweeper  %d.%d \n\n",VERSION,RELEASE); \
-    fflush(stdout);
-
-
-int main (int argc, char** argv)
-{
-    int domainId = -1;
-    int c;
-    int optSilent = 0;
-    bstring argString;
-    FILE* OUTSTREAM = stdout;
-
-    while ((c = getopt (argc, argv, "+c:hvq")) != -1)
-    {
-        switch (c)
-        {
-            case 'h':
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-            case 'v':
-                VERSION_MSG;
-                exit (EXIT_SUCCESS);
-            case 'q':
-                optSilent = 1;
-                OUTSTREAM = NULL;
-                break;
-            case 'c':
-                if (! (argString = bSecureInput(10,optarg)))
-                {
-                    fprintf(stderr,"Failed to read argument string!\n");
-                    exit(EXIT_FAILURE);
-                }
-
-                domainId = str2int((char*) argString->data);
-
-                break;
-            case '?':
-                if (isprint (optopt))
-                {
-                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
-                }
-                else
-                {
-                    fprintf (stderr,
-                            "Unknown option character `\\x%x'.\n",
-                            optopt);
-                }
-                return EXIT_FAILURE;
-            default:
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-        }
-    }
-
-    if (cpuid_init() == EXIT_FAILURE)
-    {
-        ERROR_PLAIN_PRINT(Unsupported processor!);
-    }
-    numa_init();
-
-    if (domainId < 0) 
-    {
-        memsweep_node(OUTSTREAM);
-    }
-    else if (domainId < numa_info.numberOfNodes)
-    {
-        memsweep_domain(OUTSTREAM, domainId);
-    }
-    else
-    {
-        fprintf(stderr, "Unknown NUMA domain %d\n", domainId);
-        exit(EXIT_FAILURE);
-    }
-
-    return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-memsweeper.lua b/src/applications/likwid-memsweeper.lua
new file mode 100644
index 0000000..d3315ac
--- /dev/null
+++ b/src/applications/likwid-memsweeper.lua
@@ -0,0 +1,89 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-memsweeper.lua
+ *
+ *      Description:  An application to clean up NUMA memory domains.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+local likwid = require("likwid")
+
+local function version() -- Prints the tool banner built from likwid.version and likwid.release.
+    print(("likwid-memsweeper --  Version %d.%d"):format(likwid.version, likwid.release))
+end
+
+local function examples() -- Print sample command lines for the -c option.
+    print("Examples:")
+    print("To clean specific domain:")
+    print("likwid-memsweeper -c 2")
+    print("To clean a range of domains:")
+    print("likwid-memsweeper -c 1-2")
+    print("To clean specific domains:")
+    print("likwid-memsweeper -c 0,1-2")
+
+end
+
+local function usage() -- Print the version banner, the option summary and the examples.
+    version()
+    print("A tool clean up NUMA memory domains.\n")
+    print("Options:")
+    print("-h\t\t Help message")
+    print("-v\t\t Version information")
+    print("-c <list>\t Specify NUMA domain ID to clean up")
+    print("")
+    examples()
+end
+
+local numainfo = likwid.getNumaInfo()
+local nodes = {} -- default selection: every NUMA node whose id converts to a number
+for _, node in pairs(numainfo["nodes"]) do
+    if tonumber(node["id"]) ~= nil then
+        table.insert(nodes, node["id"])
+    end
+end
+
+for opt,arg in likwid.getopt(arg, {"c:", "h", "v", "help", "version"}) do
+    if opt == "h" or opt == "help" then
+        usage()
+        os.exit(0)
+    elseif opt == "v" or opt == "version" then
+        version()
+        os.exit(0)
+    elseif (opt == "c") then
+        nodes = select(2, likwid.nodestr_to_nodelist(arg)) -- the count returned first is not needed
+    elseif opt == "?" then
+        print("Invalid commandline option -"..arg)
+        os.exit(1)
+    elseif opt == "!" then
+        print("Option requires an argument")
+        os.exit(1)
+    end
+end
+
+for _, domain in pairs(nodes) do
+    likwid.memSweepDomain(domain)
+end
+likwid.putNumaInfo()
diff --git a/src/applications/likwid-mpirun.lua b/src/applications/likwid-mpirun.lua
new file mode 100644
index 0000000..07d6dc4
--- /dev/null
+++ b/src/applications/likwid-mpirun.lua
@@ -0,0 +1,1967 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-mpirun.lua
+ *
+ *      Description: A wrapper script to pin threads spawned by MPI processes and 
+ *                   measure hardware performance counters
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+local function version() -- Prints the tool banner built from likwid.version and likwid.release.
+    print(("likwid-mpirun --  Version %d.%d"):format(likwid.version, likwid.release))
+end
+
+local function examples() -- Print sample likwid-mpirun command lines with a short explanation each.
+    print("Examples:")
+    print("Run 32 processes on hosts in hostlist")
+    print("likwid-mpirun -np 32 ./a.out")
+    print("")
+    print("Run 1 MPI process on each socket")
+    print("likwid-mpirun -nperdomain S:1 ./a.out")
+    print("Total amount of MPI processes is calculated using the number of hosts in the hostfile")
+    print("")
+    print("For hybrid MPI/OpenMP jobs you need to set the -pin option")
+    print("Starts 2 MPI processes on each host, one on socket 0 and one on socket 1")
+    print("Each MPI processes may start 2 OpenMP threads pinned to the first two CPUs on each socket")
+    print("likwid-mpirun -pin S0:0-1_S1:0-1 ./a.out")
+    print("")
+    print("Run 2 processes on each socket and measure the MEM performance group")
+    print("likwid-mpirun -nperdomain S:2 -g MEM ./a.out")
+    print("Only one process on a socket measures the Uncore/RAPL counters, the other one(s) only core-local counters")
+    print("")
+end
+
+local function usage() -- Print the version banner, the option summary and the examples.
+    version()
+    print("A wrapper script to pin threads spawned by MPI processes and measure hardware performance counters.\n")
+    print("Options:")
+    print("-h, --help\t\t Help message")
+    print("-v, --version\t\t Version information")
+    print("-d, --debug\t\t Debugging output")
+    print("-n/-np <count>\t\t Set the number of processes")
+    print("-nperdomain <domain>\t Set the number of processes per node by giving an affinity domain and count")
+    print("-pin <list>\t\t Specify pinning of threads. CPU expressions like likwid-pin separated with '_'")
+    print("-s, --skip <hex>\t Bitmask with threads to skip")
+    print("-mpi <id>\t\t Specify which MPI should be used. Possible values: openmpi, intelmpi and mvapich2")
+    print("\t\t\t If not set, module system is checked")
+    print("-omp <id>\t\t Specify which OpenMP should be used. Possible values: gnu and intel")
+    print("\t\t\t Only required for statically linked executables.")
+    print("-hostfile\t\t Use custom hostfile instead of searching the environment")
+    print("-g/-group <perf>\t Set a likwid-perfctr conform event set for measuring on nodes")
+    print("-m/-marker\t\t Activate marker API mode")
+    print("-O\t\t\t Output easily parseable CSV instead of fancy tables")
+    print("-f\t\t\t Force overwrite of registers if they are in use. You can also use environment variable LIKWID_FORCE")
+    print("")
+    print("Processes are pinned to physical CPU cores first. For syntax questions see likwid-pin")
+    print("")
+    examples()
+end
+
+local np = 0 -- np and ppn are inserted into the launcher command lines by the execute* functions
+local ppn = 0
+local nperdomain = nil
+local npernode = 0
+local cpuexprs = {}
+local perfexprs = {}
+local hostfile = nil -- also overwritten by readHostfileSlurm with the name of its temporary file
+local hosts = {}
+local perf = {}
+local mpitype = nil
+local omptype = nil
+local skipStr = ""
+local executable = {}
+local debug = false -- NOTE: hides Lua's standard `debug` library for the rest of this file
+local use_marker = false
+local use_csv = false
+local force = false
+if os.getenv("LIKWID_FORCE") ~= nil then -- any value, even an empty one, enables force
+    force = true
+end
+
+local LIKWID_PIN="<INSTALLED_PREFIX>/bin/likwid-pin"
+local LIKWID_PERFCTR="<INSTALLED_PREFIX>/bin/likwid-perfctr"
+
+local readHostfile = nil -- the five names below start out unset; NOTE(review): presumably bound to an MPI-specific implementation later, confirm
+local writeHostfile = nil
+local getEnvironment = nil
+local executeCommand = nil
+local mpiexecutable = nil -- launcher binary used by the execute* functions
+
+
+local function readHostfileOpenMPI(filename)
+    -- Parse an OpenMPI style hostfile ("<host> [slots=<n>] [max-slots=<m>]") into a list of
+    -- {hostname=, slots=, maxslots=} tables. Lines naming an already known host are merged into it.
+    local hostlist = {}
+    if filename == nil or filename == "" then return {} end
+    local f = io.open(filename, "r")
+    if f == nil then
+        print("ERROR: Cannot open hostfile "..filename)
+        os.exit(1)
+    end
+    if debug then
+        print("DEBUG: Reading hostfile in openmpi style")
+    end
+    local t = f:read("*all")
+    f:close()
+    for _, line in pairs(likwid.stringsplit(t,"\n")) do
+        if line:match("^#") == nil and line:match("^%s*$") == nil then
+            local hostname, slots, maxslots = line:match("^([%.%a%d]+)%s+slots=(%d*)%s+max%-slots=(%d*)")
+            if not hostname then
+                hostname, slots = line:match("^([%.%a%d]+)%s+slots=(%d*)")
+                if not hostname then
+                    hostname, slots, maxslots = line:match("^([%.%a%d]+)"), 1, 1
+                end
+            end
+            slots, maxslots = tonumber(slots), tonumber(maxslots) -- "slots=" without digits captures "" and becomes nil
+            local found = false
+            for _, host in pairs(hostlist) do
+                if host["hostname"] == hostname then
+                    if slots and host["slots"] then
+                        host["slots"] = host["slots"] + slots
+                    end
+                    if maxslots and host["maxslots"] then
+                        host["maxslots"] = host["maxslots"] + maxslots
+                    end
+                    found = true -- merged above, so the host must not be appended a second time
+                    break
+                end
+            end
+            if not found then
+                table.insert(hostlist, {hostname=hostname, slots=slots, maxslots=maxslots})
+            end
+        end
+    end
+    local topo = likwid.getCpuTopology()
+    for i,host in pairs(hostlist) do
+        if host["slots"] == nil or host["slots"] == 0 then
+            host["slots"] = topo.numHWThreads
+        end
+        if host["maxslots"] == nil or host["maxslots"] == 0 then
+            host["maxslots"] = topo.numHWThreads
+        end
+        if debug then
+            print(string.format("DEBUG: Read host %s with %d slots and %d slots maximally", host["hostname"], host["slots"], host["maxslots"]))
+        end
+    end
+    return hostlist
+end
+
+local function writeHostfileOpenMPI(hostlist, filename)
+    -- Writes one "<hostname>[ slots=<n>][ max-slots=<m>]" line per hostlist entry.
+    -- Does nothing when filename is nil or empty.
+    if filename ~= nil and filename ~= "" then
+        local out = io.open(filename, "w")
+        if out == nil then
+            print("ERROR: Cannot open hostfile "..filename)
+            os.exit(1)
+        end
+        for _, entry in pairs(hostlist) do
+            str = entry["hostname"] -- NOTE: `str` is a global, it is not declared local in this function
+            if entry["slots"] then
+                str = str .. (" slots=%d"):format(entry["slots"])
+            end
+            if entry["maxslots"] then
+                str = str .. (" max-slots=%d"):format(entry["maxslots"])
+            end
+            out:write(str .. "\n")
+        end
+        out:close()
+    end
+end
+
+local function getEnvironmentOpenMPI() -- Returns an empty table: no extra environment settings for OpenMPI.
+    return {}
+end
+
+local function executeOpenMPI(wrapperscript, hostfile, env, nrNodes)
+    -- Start wrapperscript via mpiexecutable with np/ppn from the enclosing scope; env and nrNodes are not used here.
+    local bindstr = ""
+    if wrapperscript:sub(1,1) ~= "/" then -- method call: inspects the first character of wrapperscript itself
+        wrapperscript = os.getenv("PWD").."/"..wrapperscript
+    end
+    local f = io.popen(string.format("%s -V 2>&1", mpiexecutable), "r")
+    if f ~= nil then
+        local input = f:read("*a")
+        local ver1, ver2 = input:match("(%d+)%.(%d+)%.(%d+)")
+        if ver1 == "1" then -- only versions 1.6 and 1.7 get an explicit option; the spelling differs between them
+            if ver2 == "7" then
+                bindstr = "--bind-to none"
+            elseif ver2 == "6" then
+                bindstr = "--bind-to-none"
+            end
+        end
+        f:close()
+    end
+
+    local cmd = string.format("%s -hostfile %s %s -np %d -npernode %d %s",
+                                mpiexecutable, hostfile, bindstr,
+                                np, ppn, wrapperscript)
+    if debug then
+        print("EXEC: "..cmd)
+    end
+    os.execute(cmd)
+end
+
+local function readHostfileIntelMPI(filename) -- Parse "<host>[:<slots>]" lines into a list of {hostname=, slots=, maxslots=} tables.
+    local hostlist = {}
+    if filename == nil or filename == "" then
+        return {}
+    end
+    local f = io.open(filename, "r")
+    if f == nil then
+        print("ERROR: Cannot open hostfile "..filename)
+        os.exit(1)
+    end
+    if debug then
+        print("DEBUG: Reading hostfile in intelmpi style")
+    end
+    local topo = likwid.getCpuTopology()
+    local t = f:read("*all")
+    f:close()
+    for i, line in pairs(likwid.stringsplit(t,"\n")) do
+        if line:match("^#") == nil and line:match("^%s*$") == nil then -- skip lines starting with '#' and blank lines
+            hostname, slots = line:match("^([%.%a%d]+):(%d+)") -- NOTE(review): hostname/slots are not declared local (globals); a captured slots value stays a string
+            if not hostname then
+                hostname = line:match("^([%.%a%d]+)")
+                slots = topo["numHWThreads"] -- no ":<n>" suffix: use numHWThreads from likwid.getCpuTopology()
+            end
+            table.insert(hostlist, {hostname=hostname, slots=slots, maxslots=slots}) -- every line adds an entry; repeated hosts are not merged
+        end
+    end
+    if debug then
+        for i, host in pairs(hostlist) do
+            print(string.format("DEBUG: Read host %s with %d slots and %d slots maximally", host["hostname"], host["slots"], host["maxslots"]))
+        end
+    end
+    return hostlist
+end
+
+local function writeHostfileIntelMPI(hostlist, filename)
+    -- Writes one "<hostname>[:<slots>]" line per hostlist entry.
+    -- Does nothing when filename is nil or empty.
+    if filename ~= nil and filename ~= "" then
+        local out = io.open(filename, "w")
+        if out == nil then
+            print("ERROR: Cannot open hostfile "..filename)
+            os.exit(1)
+        end
+        for _, entry in pairs(hostlist) do
+            str = entry["hostname"] -- NOTE: `str` is a global, it is not declared local in this function
+            if entry["slots"] then
+                str = str .. (":%d"):format(entry["slots"])
+            end
+            out:write(str .. "\n")
+        end
+        out:close()
+    end
+end
+
+local function getEnvironmentIntelMPI() -- Environment settings for Intel MPI runs: I_MPI_PIN=off, KMP_AFFINITY=disabled.
+    local settings = {}
+    settings.I_MPI_PIN = 'off'
+    settings.KMP_AFFINITY = 'disabled'
+    return settings
+end
+
+local function executeIntelMPI(wrapperscript, hostfile, env, nrNodes)
+    -- Start wrapperscript with Intel MPI: via mpdboot/mpiexec/mpdallexit when an mpdboot binary is found, else via mpiexecutable.
+    local use_hydra = true
+    if wrapperscript:sub(1,1) ~= "/" then -- method call: inspects the first character of wrapperscript itself
+        wrapperscript = os.getenv("PWD").."/"..wrapperscript
+    end
+    if hostfile:sub(1,1) ~= "/" then -- same test for the hostfile path
+        hostfile = os.getenv("PWD").."/"..hostfile
+    end
+    local path = ""
+    local f = io.popen(string.format("dirname %s", mpiexecutable))
+    if f ~= nil then
+        path = f:read("*line")
+        f:close()
+    end
+    if likwid.access(string.format("%s/mpdboot", path), "x") == 0 then
+        use_hydra = false
+    end
+    for _, var in pairs({"MPIHOME", "MPI_HOME", "MPI_ROOT", "MPI_BASE"}) do
+        if likwid.access(string.format("%s/bin/mpdboot", os.getenv(var)), "x") == 0 then
+            use_hydra = false
+            path = string.format("%s/bin",os.getenv(var))
+            break
+        end
+    end
+
+    local envstr = ""
+    for i, e in pairs(env) do
+        if use_hydra then
+            envstr = envstr .. string.format("-genv %s %s ", i, e)
+        else
+            envstr = envstr .. string.format("-env %s %s ", i, e)
+        end
+    end
+
+    if debug then
+        if use_hydra == false then
+            print(string.format("EXEC: %s/mpdboot -r ssh -n %d -f %s", path, nrNodes, hostfile))
+            print(string.format("EXEC: %s/mpiexec -perhost %d %s -np %d %s", path, ppn, envstr, np, wrapperscript))
+            print(string.format("EXEC: %s/mpdallexit", path))
+        else
+            print(string.format("%s %s -f %s -np %d -perhost %d %s",mpiexecutable, envstr, hostfile, np, ppn, wrapperscript))
+        end
+    end
+
+    if use_hydra == false then
+        os.execute(string.format("%s/mpdboot -r ssh -n %d -f %s", path, nrNodes, hostfile))
+        os.execute(string.format("%s/mpiexec -perhost %d %s -np %d %s", path, ppn, envstr, np, wrapperscript))
+        os.execute(string.format("%s/mpdallexit", path))
+    else
+        os.execute(string.format("%s %s -f %s -np %d -perhost %d %s",mpiexecutable, envstr, hostfile, np, ppn, wrapperscript))
+    end
+end
+
+local function readHostfileMvapich2(filename) -- Parse "<host>[:<slots>[:<interface>]]" lines into a list of host tables.
+    local hostlist = {}
+    if filename == nil or filename == "" then
+        return {}
+    end
+    local f = io.open(filename, "r")
+    if f == nil then
+        print("ERROR: Cannot open hostfile "..filename)
+        os.exit(1)
+    end
+    if debug then
+        print("DEBUG: Reading hostfile in mvapich2 style")
+    end
+    local t = f:read("*all")
+    f:close()
+    for i, line in pairs(likwid.stringsplit(t,"\n")) do
+        if line:match("^#") == nil and line:match("^%s*$") == nil then -- skip lines starting with '#' and blank lines
+            hostname, slots, interface = line:match("^([%.%a%d]+):(%d+):([%a%d]+)") -- NOTE(review): not declared local, these three are globals
+            if not hostname then
+                hostname, slots = line:match("^([%.%a%d]+):(%d+)")
+                if not hostname then
+                    hostname = line:match("^([%.%a%d]+)")
+                    slots = 1 -- a bare host name counts as one slot
+                    interface = nil
+                else
+                    interface = nil
+                end
+            end
+            table.insert(hostlist, {hostname=hostname, slots=slots, maxslots=slots, interface=interface}) -- repeated hosts are not merged
+        end
+    end
+    if debug then
+        for i, host in pairs(hostlist) do
+            print(string.format("DEBUG: Read host %s with %d slots and %d slots maximally", host["hostname"], host["slots"], host["maxslots"]))
+        end
+    end
+    return hostlist
+end
+
+local function writeHostfileMvapich2(hostlist, filename)
+    -- Writes one "<hostname>[:<slots>][:<interface>]" line per hostlist entry.
+    -- Does nothing when filename is nil or empty.
+    if filename ~= nil and filename ~= "" then
+        local out = io.open(filename, "w")
+        if out == nil then
+            print("ERROR: Cannot open hostfile "..filename)
+            os.exit(1)
+        end
+        for _, entry in pairs(hostlist) do
+            str = entry["hostname"] -- NOTE: `str` is a global, it is not declared local in this function
+            if entry["slots"] then
+                str = str .. (":%d"):format(entry["slots"])
+            end
+            if entry["interface"] then
+                str = str .. (":%s"):format(entry["interface"])
+            end
+            out:write(str .. "\n")
+        end
+        out:close()
+    end
+end
+
+local function getEnvironmentMvapich2() -- Environment settings for MVAPICH2 runs: MV2_ENABLE_AFFINITY=0.
+    local settings = {}
+    settings.MV2_ENABLE_AFFINITY = "0"
+    return settings
+end
+
+local function executeMvapich2(wrapperscript, hostfile, env, nrNodes)
+    -- Start wrapperscript via mpiexecutable with np/ppn from the enclosing scope; env entries become NAME=value arguments. nrNodes is not used here.
+    if wrapperscript:sub(1,1) ~= "/" then -- method call: inspects the first character of wrapperscript itself
+        wrapperscript = os.getenv("PWD").."/"..wrapperscript
+    end
+    if hostfile:sub(1,1) ~= "/" then -- same test for the hostfile path
+        hostfile = os.getenv("PWD").."/"..hostfile
+    end
+    local envstr = ""
+    for i, e in pairs(env) do
+        envstr = envstr .. string.format("%s=%s ", i, e)
+    end
+
+    local cmd = string.format("%s -f %s -np %d -ppn %d %s %s",
+                                mpiexecutable, hostfile,
+                                np, ppn, envstr, wrapperscript)
+    if debug then
+        print("EXEC: "..cmd)
+    end
+    os.execute(cmd)
+end
+
+
+local function readHostfilePBS(filename) -- Parse a file with one host name per line; each occurrence of a host adds one slot.
+    local hostlist = {}
+    if filename == nil or filename == "" then
+        return {}
+    end
+    local f = io.open(filename, "r")
+    if f == nil then
+        print("ERROR: Cannot open hostfile "..filename)
+        os.exit(1)
+    end
+    if debug then
+        print("DEBUG: Reading hostfile from batch system")
+    end
+    local t = f:read("*all")
+    f:close()
+    for i, line in pairs(likwid.stringsplit(t,"\n")) do
+        if line:match("^#") == nil and line:match("^%s*$") == nil then -- skip lines starting with '#' and blank lines
+            hostname = line:match("^([%.%a%d]+)") -- NOTE(review): not declared local, `hostname` is a global
+            local found = false
+            for i, host in pairs(hostlist) do
+                if host["hostname"] == hostname then
+                    host["slots"] = host["slots"] + 1
+                    host["maxslots"] = host["maxslots"] + 1
+                    found = true
+                    break
+                end
+            end
+            if not found then
+                table.insert(hostlist, {hostname=hostname, slots=1, maxslots=1})
+            end
+        end
+    end
+    if debug then
+        for i, host in pairs(hostlist) do
+            print(string.format("DEBUG: Read host %s with %d slots and %d slots maximally", host["hostname"], host["slots"], host["maxslots"]))
+        end
+    end
+    return hostlist
+end
+
+local function readHostfileSlurm(hostlist) -- Expand a SLURM node list string into the file-scope `hosts` table and return it.
+    nperhost = tonumber((os.getenv("SLURM_TASKS_PER_NODE") or ""):match("(%d+)")) -- an unset variable yields nil instead of an error
+    if hostlist and hostlist ~= "" and nperhost then -- for "" the helper returns no file name, so there is nothing to read
+        hostfile = write_hostlist_to_file(hostlist, nperhost) -- NOTE(review): assigns the file-scope `hostfile`, which then names a removed file
+        hosts = readHostfilePBS(hostfile)
+        os.remove(hostfile)
+    end
+    return hosts
+end
+
+function write_hostlist_to_file(hostlist, nperhost)
+    -- Expand a comma separated host list, where items may contain a "name[a-b]suffix" range,
+    -- and write every resulting host nperhost times (one per line) to /tmp/hostlist.<pid>.
+    -- Returns the name of that file.
+    -- NOTE(review): for an empty host list a table is returned instead of a file name.
+    if hostlist == "" then
+        return {}
+    end
+
+    local outlist = {}
+    for _, item in pairs(likwid.stringsplit(hostlist, ",")) do
+        if not item:match("%[") then
+            -- plain host name without a range
+            table.insert(outlist, item)
+        else
+            local host, start, ende, remain = item:match("(%w+)%[(%d+)-(%d+)%]([%w%d%[%]-]*)")
+            if host and start and ende then
+                -- Zero-pad every number to the width of the range start,
+                -- e.g. "n[08-10]" expands to n08, n09, n10 and "n[8-10]" to n8, n9, n10.
+                local numfmt = "%0"..tostring(#start).."d"
+                for j = tonumber(start), tonumber(ende) do
+                    local newh = host..string.format(numfmt, j)
+                    if remain then
+                        newh = newh .. remain
+                    end
+                    table.insert(outlist, newh)
+                end
+            end
+        end
+    end
+
+    local fname = string.format("/tmp/hostlist.%d", likwid.getpid())
+    local f = io.open(fname, "w")
+    -- If the file cannot be opened nothing is written, but the name is still returned.
+    if f ~= nil then
+        for i=1,#outlist do
+            -- one line per task slot on that host
+            for j=1, nperhost do
+                f:write(outlist[i].."\n")
+            end
+        end
+        f:close()
+    end
+    return fname
+end
+
+-- Publish the scheduled hosts by setting the environment variable
+-- SLURM_NODELIST to the comma separated host names. Nothing is written to
+-- a file; the filename argument is accepted but not used.
+local function writeHostfileSlurm(hostlist, filename)
+    local names = {}
+    for _, h in pairs(hostlist) do
+        table.insert(names, h["hostname"])
+    end
+    local nodelist = table.concat(names, ",")
+    print("SLURM_NODELIST", nodelist)
+    likwid.setenv("SLURM_NODELIST", nodelist)
+end
+
+-- Environment settings for the SLURM backend: always a fresh, empty table.
+local function getEnvironmentSlurm()
+    local env = {}
+    return env
+end
+
+-- Start the wrapper script on nrNodes nodes through srun.
+-- wrapperscript: path of the script; a relative path is made absolute by
+-- prepending $PWD. hostfile and env are accepted but not used here.
+-- The tasks per node are taken from `ppn`, which is defined outside this
+-- function.
+local function executeSlurm(wrapperscript, hostfile, env, nrNodes)
+    -- Method call syntax is required here: wrapperscript.sub(1,1) would
+    -- evaluate string.sub(1, 1) and never look at the path itself.
+    if wrapperscript:sub(1,1) ~= "/" then
+        wrapperscript = (os.getenv("PWD") or ".").."/"..wrapperscript
+    end
+
+    local exec = string.format("srun -N %d --ntasks-per-node=%d --cpu_bind=none %s", nrNodes, ppn, wrapperscript)
+    if debug then
+        print("EXEC: "..exec)
+    end
+    os.execute(exec)
+end
+-- Count the distinct host names in hostlist.
+-- hostlist: list of tables with a "hostname" field.
+-- Entries without a hostname are counted together as one host.
+local function getNumberOfNodes(hostlist)
+    local n = 0
+    local seen = {}
+    -- nil cannot be used as a table key, so missing names share this marker
+    local unnamed = {}
+    for _, h in pairs(hostlist) do
+        local key = h["hostname"]
+        if key == nil then
+            key = unnamed
+        end
+        if not seen[key] then
+            seen[key] = true
+            n = n + 1
+        end
+    end
+    return n
+end
+
+-- Detect the MPI flavour to use.
+-- Returns "slurm" if SLURM_JOB_ID is set. Otherwise the list of loaded
+-- modules is scanned for Intel MPI, OpenMPI or MVAPICH2 and afterwards the
+-- --help output of mpiexec.hydra/mpiexec/mpirun is inspected; a match in
+-- the help output overrides the module based guess.
+-- Returns "slurm", "intelmpi", "openmpi", "mvapich2" or nil (with a warning).
+local function getMpiType()
+    local mpitype = nil
+    if os.getenv("SLURM_JOB_ID") ~= nil then
+        return "slurm"
+    end
+    local cmd = "bash -c 'tclsh /apps/modules/modulecmd.tcl sh list -t' 2>&1"
+    local f = io.popen(cmd, 'r')
+    if f == nil then
+        cmd = (os.getenv("SHELL") or "/bin/sh").." -c 'module -t list' 2>&1"
+        f = io.popen(cmd, 'r')
+    end
+    if f ~= nil then
+        local s = assert(f:read('*a'))
+        f:close()
+        s = string.gsub(s, '^%s+', '')
+        s = string.gsub(s, '%s+$', '')
+        for _, line in pairs(likwid.stringsplit(s, "\n")) do
+            if line:match("[iI]ntel[mM][pP][iI]") or (line:match("[iI]ntel") and line:match("[mM][pP][iI]")) then
+                mpitype = "intelmpi"
+                --libmpi%a*.so
+            elseif line:match("[oO]pen[mM][pP][iI]") or (line:match("[oO]pen") and line:match("[mM][pP][iI]")) then
+                mpitype = "openmpi"
+                --libmpi.so
+            elseif line:match("mvapich2") then
+                mpitype = "mvapich2"
+                --libmpich.so
+            end
+        end
+    end
+    -- Substrings of the launcher's --help output, checked in this order
+    local signatures = {
+        {"Intel", "intelmpi"},
+        {"OpenRTE", "openmpi"},
+        {"MPICH", "mvapich2"},
+        {"MVAPICH2", "mvapich2"}
+    }
+    for _, exec in ipairs({"mpiexec.hydra", "mpiexec", "mpirun"}) do
+        local detected = nil
+        local which = io.popen(string.format("which %s 2>/dev/null", exec), 'r')
+        if which ~= nil then
+            local path = which:read('*line')
+            which:close()
+            if path ~= nil then
+                local help = io.popen(string.format("%s --help 2>/dev/null", path), 'r')
+                if help ~= nil then
+                    local out = help:read("*a") or ""
+                    help:close()
+                    for _, sig in ipairs(signatures) do
+                        if out:find(sig[1], 1, true) ~= nil then
+                            detected = sig[2]
+                            break
+                        end
+                    end
+                end
+            end
+        end
+        -- The first launcher that identifies itself decides
+        if detected ~= nil then
+            mpitype = detected
+            break
+        end
+    end
+    if not mpitype then
+        print("WARN: No supported MPI loaded in module system")
+    end
+    return mpitype
+end
+
+-- Select the backend functions for the given MPI type and look up the
+-- launcher executable.
+-- Sets executeCommand, readHostfile, writeHostfile and getEnvironment to the
+-- implementation matching mpitype and stores the path found by `which` in
+-- mpiexecutable. For an unknown mpitype nothing is changed.
+local function getMpiExec(mpitype)
+    local testing = {}
+    if mpitype == "intelmpi" then
+        testing = {"mpiexec.hydra", "mpiexec"}
+        executeCommand = executeIntelMPI
+        readHostfile = readHostfileIntelMPI
+        writeHostfile = writeHostfileIntelMPI
+        getEnvironment = getEnvironmentIntelMPI
+    elseif mpitype == "openmpi" then
+        testing = {"mpiexec", "mpirun"}
+        executeCommand = executeOpenMPI
+        readHostfile = readHostfileOpenMPI
+        writeHostfile = writeHostfileOpenMPI
+        getEnvironment = getEnvironmentOpenMPI
+    elseif mpitype == "mvapich2" then
+        testing = {"mpiexec", "mpirun"}
+        executeCommand = executeMvapich2
+        readHostfile = readHostfileMvapich2
+        writeHostfile = writeHostfileMvapich2
+        getEnvironment = getEnvironmentMvapich2
+    elseif mpitype == "slurm" then
+        testing = {"srun"}
+        executeCommand = executeSlurm
+        readHostfile = readHostfileSlurm
+        writeHostfile = writeHostfileSlurm
+        getEnvironment = getEnvironmentSlurm
+    end
+
+    -- Every candidate is probed; a later hit replaces an earlier one, so the
+    -- last candidate found in PATH ends up in mpiexecutable.
+    for _, exec in pairs(testing) do
+        local f = io.popen(string.format("which %s 2>/dev/null", exec), 'r')
+        if f ~= nil then
+            local s = f:read('*line')
+            f:close()
+            if s ~= nil then
+                mpiexecutable = s
+            end
+        end
+    end
+end
+
+-- Determine the OpenMP runtime of executable[1] from its ldd output.
+-- Returns "gnu" (libgomp), "intel" (libiomp) or nil when no known runtime
+-- is found or ldd reports "not a dynamic executable".
+-- Also writes the result to `omptype` and the linkage to `dyn_linked`,
+-- which are not declared local here.
+local function getOmpType()
+    -- Run cmd and return its complete stdout (nil if it cannot be started).
+    -- The pipe is closed in every case.
+    local function readCommand(cmd)
+        local p = io.popen(cmd, 'r')
+        if p == nil then
+            return nil
+        end
+        local out = p:read('*a')
+        p:close()
+        return out
+    end
+
+    -- First resolve the executable through PATH; only if that yields no
+    -- output run ldd on the name as given.
+    local s = readCommand(string.format("ldd `which %s` 2>/dev/null", executable[1]))
+    if s == nil or s:match("^%s*$") then
+        s = readCommand(string.format("ldd %s", executable[1]))
+    end
+    omptype = nil
+    dyn_linked = true
+    if s ~= nil then
+        for _, line in pairs(likwid.stringsplit(s, "\n")) do
+            if line:match("libgomp.so") then
+                omptype = "gnu"
+                break
+            elseif line:match("libiomp%d*.so") then
+                omptype = "intel"
+                break
+            elseif line:match("not a dynamic executable") then
+                omptype = "none"
+                dyn_linked = false
+                break
+            end
+        end
+    end
+    -- NOTE(review): this condition cannot become true as written, because
+    -- the only place that sets dyn_linked to false also sets omptype to
+    -- "none". The module based fallback below is therefore never executed.
+    -- It is kept unchanged in behaviour; confirm the intended condition
+    -- before enabling it.
+    if not omptype and dyn_linked == false then
+        print("WARN: Cannot get OpenMP variant from executable, trying module system")
+        local cmd = "bash -c 'tclsh /apps/modules/modulecmd.tcl sh list -t' 2>&1"
+        local f = io.popen(cmd, 'r')
+        if f == nil then
+            cmd = (os.getenv("SHELL") or "/bin/sh").." -c 'module -t list' 2>&1"
+            f = io.popen(cmd, 'r')
+        end
+        if f ~= nil then
+            local mods = f:read('*a')
+            f:close()
+            mods = string.gsub(mods, '^%s+', '')
+            mods = string.gsub(mods, '%s+$', '')
+            for _, line in pairs(likwid.stringsplit(mods, "\n")) do
+                if line:match("[iI]ntel") or line:match("[iI][cC][cC]") then
+                    omptype = "intel"
+                elseif line:match("[gG][nN][uU]") or line:match("[gG][cC][cC]") then
+                    omptype = "gnu"
+                end
+            end
+        end
+        if not omptype then
+            print("WARN: No supported OpenMP loaded in module system")
+        end
+    end
+    if omptype == "none" then
+        return nil
+    end
+    return omptype
+end
+
+-- Distribute np processes over the given hosts with ppn processes per host.
+-- hosts: list of tables with the fields hostname, slots, maxslots and
+--        interface (slots/maxslots/interface may be missing).
+-- Returns the list of scheduled hosts (entries of the same host merged)
+-- and the resulting processes-per-node value.
+-- Note: entries of `hosts` are set to nil while scheduling, and the
+-- variables tmp, newhosts, current and str are not declared local.
+local function assignHosts(hosts, np, ppn)
+    -- tmp counts the processes that still have to be placed
+    tmp = np
+    newhosts = {}
+    current = 0
+    if debug then
+        print(string.format("Assign %d processes with %d per node to %d hosts", np, ppn, #hosts))
+        print("Available hosts for scheduling:")
+        print("Host", "Slots", "MaxSlots", "Interface")
+        for i, h in pairs(hosts) do
+            print (h["hostname"], h["slots"], h["maxslots"],"", h["interface"])
+        end
+    end
+    local break_while = false
+    while tmp > 0 and #hosts > 0 do
+        for i, host in pairs(hosts) do
+            -- Case 1: the host offers at least ppn slots
+            if host["slots"] and host["slots"] >= ppn then
+                if host["maxslots"] and host["maxslots"] < ppn then
+                    table.insert(newhosts, {hostname=host["hostname"],
+                                            slots=host["maxslots"],
+                                            maxslots=host["maxslots"],
+                                            interface=host["interface"]})
+                    if debug then
+                        print(string.format("DEBUG: Add Host %s with %d slots to host list", host["hostname"], host["maxslots"]))
+                    end
+                    current = host["maxslots"]
+                    -- host is used up, drop it from the input list
+                    hosts[i] = nil
+                else
+                    table.insert(newhosts, {hostname=host["hostname"],
+                                            slots=ppn,
+                                            maxslots=host["slots"],
+                                            interface=host["interface"]})
+                    if debug then
+                        print(string.format("DEBUG: Add Host %s with %d slots to host list", host["hostname"], ppn))
+                    end
+                    current = ppn
+                    hosts[i] = nil
+                end
+            -- Case 2: the host has fewer than ppn slots -> oversubscription
+            elseif host["slots"] then
+                if host["maxslots"] then
+                    if host["maxslots"] < ppn then
+                        print(string.format("WARN: Oversubscription for host %s needed, but max-slots set to %d.",
+                                                host["hostname"], host["maxslots"]))
+                        table.insert(newhosts, {hostname=host["hostname"],
+                                                slots=host["maxslots"],
+                                                maxslots=host["maxslots"],
+                                                interface=host["interface"]})
+                        current = host["maxslots"]
+                        host["maxslots"] = 0
+                        hosts[i] = nil
+                    else
+                        -- host stays in the input list and can be picked again
+                        print(string.format("WARN: Oversubscription for host %s.", host["hostname"]))
+                        table.insert(newhosts, {hostname=host["hostname"],
+                                            slots=ppn,
+                                            maxslots=host["maxslots"],
+                                            interface=host["interface"]})
+                        current = ppn
+                        
+                    end
+                else
+                    print(string.format("WARN: Oversubscription for host %s.", host["hostname"]))
+                    table.insert(newhosts, {hostname=host["hostname"],
+                                        slots=ppn,
+                                        maxslots=host["slots"],
+                                        interface=host["interface"]})
+                    current = ppn
+                end
+            -- Case 3: no slot information at all, simply place ppn processes
+            else
+                table.insert(newhosts, {hostname=host["hostname"],
+                                        slots=ppn,
+                                        maxslots=host["slots"],
+                                        interface=host["interface"]})
+                if debug then
+                    print(string.format("DEBUG: Add Host %s with %d slots to host list", host["hostname"], ppn))
+                end
+                current = ppn
+            end
+            tmp = tmp - current
+            if tmp < 1 then
+                -- everything is placed, leave both loops
+                break_while = true
+                break
+            elseif tmp < ppn then
+                -- fewer than ppn processes left: the next host gets the rest
+                ppn = tmp
+            end
+        end
+        if break_while then
+            break
+        end
+    end
+    -- Merge entries that refer to the same host name by summing up their
+    -- slots (and maxslots if both are set); ppn is raised to the largest
+    -- merged slot count.
+    for i=1, #newhosts do
+        if newhosts[i] then
+            for j=i+1,#newhosts do
+                if newhosts[j] then
+                    if newhosts[i]["hostname"] == newhosts[j]["hostname"] then
+                        newhosts[i]["slots"] = newhosts[i]["slots"] + newhosts[j]["slots"]
+                        if newhosts[i]["maxslots"] ~= nil and newhosts[j]["maxslots"] ~= nil then
+                            newhosts[i]["maxslots"] = newhosts[i]["maxslots"] + newhosts[j]["maxslots"]
+                        end
+                        if newhosts[i]["slots"] > ppn then
+                            ppn = newhosts[i]["slots"]
+                        end
+                        -- NOTE(review): removing while iterating by index
+                        -- shifts the following entries down by one
+                        table.remove(newhosts, j)
+                    end
+                end
+            end
+        end
+    end
+    if debug then
+        print("DEBUG: Scheduling on hosts:")
+        for i, h in pairs(newhosts) do
+            if h["maxslots"] ~= nil then
+                str = string.format("DEBUG: Host %s with %d slots (max. %d slots)",
+                                h["hostname"],h["slots"],h["maxslots"])
+            else
+                str = string.format("DEBUG: Host %s with %d slots", h["hostname"],h["slots"])
+            end
+            if h["interface"] then
+                str = str.. string.format(" using interface %s", h["interface"])
+            end
+            print(str)
+        end
+    end
+    return newhosts, ppn
+end
+
+-- Resolve every CPU expression to an explicit, comma separated CPU list.
+-- cpuexprs: list of CPU expression strings understood by
+-- likwid.cpustr_to_cpulist().
+-- Returns a list with one resolved string per input expression.
+local function calculatePinExpr(cpuexprs)
+    local newexprs = {}
+    for _, expr in pairs(cpuexprs) do
+        local strList = {}
+        -- first return value (number of CPUs) is not needed here
+        local _, cpus = likwid.cpustr_to_cpulist(expr)
+        for _, c in pairs(cpus) do
+            table.insert(strList, c)
+        end
+        table.insert(newexprs, table.concat(strList,","))
+    end
+    return newexprs
+end
+
+-- Translate an NperDomain string ("[E:]<domain>:<count>") into a list of
+-- single CPU IDs (as strings): for every affinity domain whose tag matches
+-- <domain>, the first <count> CPUs are taken. Within a domain all entries
+-- at thread offset 1 are used before the entries at offset 2 and so on.
+-- nperdomain: the NperDomain string; cpuexprs is accepted but not used.
+-- Exits with an error message if nperdomain cannot be parsed.
+local function calculateCpuExprs(nperdomain, cpuexprs)
+    local topo = likwid.getCpuTopology()
+    local affinity = likwid.getAffinityInfo()
+    local domainlist = {}
+    local newexprs = {}
+    local domainname, count = nperdomain:match("[E:]*(%g*):(%d+)")
+    if domainname == nil or count == nil then
+        print("ERROR: Cannot parse NperDomain string "..tostring(nperdomain))
+        os.exit(1)
+    end
+    count = tonumber(count)
+
+    for i, domain in pairs(affinity["domains"]) do
+        if domain["tag"]:match(domainname.."%d*") then
+            table.insert(domainlist, i)
+        end
+    end
+    if debug then
+        local str = "DEBUG: NperDomain string "..nperdomain.." covers the domains: "
+        for _, idx in pairs(domainlist) do
+            str = str .. affinity["domains"][idx]["tag"] .. " "
+        end
+        print(str)
+    end
+
+    local threadsPerCore = topo["numThreadsPerCore"]
+    for _, domidx in pairs(domainlist) do
+        local domain = affinity["domains"][domidx]
+        local sortedlist = {}
+        for off=1,threadsPerCore do
+            -- The upper bound is inclusive, so the last index can point
+            -- behind the processor list; such entries are skipped.
+            for c=0,domain["numberOfProcessors"]/threadsPerCore do
+                local cpu = domain["processorList"][off + (c*threadsPerCore)]
+                if cpu ~= nil then
+                    table.insert(sortedlist, cpu)
+                end
+            end
+        end
+        for j=1,count do
+            if #sortedlist == 0 then
+                -- Do not hand out the string "nil" as a CPU ID
+                print(string.format("WARN: Domain %s has fewer than %d CPUs", domain["tag"], count))
+                break
+            end
+            table.insert(newexprs, tostring(sortedlist[1]))
+            table.remove(sortedlist, 1)
+        end
+    end
+    if debug then
+        local str = "DEBUG: Resolved NperDomain string "..nperdomain.." to CPUs: "
+        for _, expr in pairs(newexprs) do
+            str = str .. expr .. " "
+        end
+        print(str)
+    end
+    return newexprs
+end
+
+-- Build an event set string "EVENT:COUNTER,EVENT:COUNTER,..." from a list
+-- of tables with the fields "Event" and "Counter".
+-- Entries missing one of the two fields are skipped.
+-- Returns "" (and prints an error) for a nil or empty list.
+local function createEventString(eventlist)
+    if eventlist == nil or #eventlist == 0 then
+        print("ERROR: Empty event list. Failed to create event set string")
+        return ""
+    end
+    -- Collect the valid pairs first so that a skipped first entry cannot
+    -- leave a leading comma in the result
+    local parts = {}
+    for i=1,#eventlist do
+        local e = eventlist[i]
+        if e ~= nil and e["Event"] ~= nil and e["Counter"] ~= nil then
+            table.insert(parts, e["Event"]..":"..e["Counter"])
+        end
+    end
+    return table.concat(parts, ",")
+end
+
+-- Create the event set strings for every performance group and process.
+-- perflist: list of group strings passed to likwid.get_groupdata().
+-- cpuexprs: list of comma separated CPU lists, one per process.
+-- For each group the events are split by counter name: counters matching
+-- FIXC<n>, PMC<n> or TMP<n> are treated as core events, all others as
+-- uncore events. Per socket only the first process found on it gets the
+-- uncore events in addition to the core events.
+-- Returns perfexprs (indexed like perflist, each a list of event strings
+-- in process order) and the list of group data tables.
+local function setPerfStrings(perflist, cpuexprs)
+    local uncore = false
+    local perfexprs = {}
+    local grouplist = {}
+    local cpuinfo = likwid.getCpuInfo()
+    local affinity = likwid.getAffinityInfo()
+    local socketList = {}
+    local socketListFlags = {}
+    -- Collect the CPU lists of all socket domains (tag S<n>); a flag of 1
+    -- marks a socket whose uncore events are not yet assigned
+    for i, d in pairs(affinity["domains"]) do
+        if d["tag"]:match("S%d+") then
+            local tmpList = {}
+            for j,cpu in pairs(d["processorList"]) do
+                table.insert(tmpList, cpu)
+            end
+            table.insert(socketList, tmpList)
+            table.insert(socketListFlags, 1)
+        end
+    end
+
+    for k, perfStr in pairs(perflist) do
+        local coreevents = {}
+        local uncoreevents = {}
+        local gdata = nil
+        gdata = likwid.get_groupdata(perfStr)
+        if gdata == nil then
+            print("Cannot get data for group "..perfStr..". Skipping...")
+        else
+            table.insert(grouplist, gdata)
+            if perfexprs[k] == nil then
+                perfexprs[k] = {}
+            end
+
+            -- Split the group's events into core and uncore events
+            for i, e in pairs(gdata["Events"]) do
+                if  not e["Counter"]:match("FIXC%d") and
+                    not e["Counter"]:match("^PMC%d") and
+                    not e["Counter"]:match("TMP%d") then
+                    table.insert(uncoreevents, e)
+                else
+                    table.insert(coreevents, e)
+                end
+            end
+            
+            -- Every group starts with a fresh copy of the socket flags
+            local tmpSocketFlags = {}
+            for _,e in pairs(socketListFlags) do
+                table.insert(tmpSocketFlags, e)
+            end
+
+            for i,cpuexpr in pairs(cpuexprs) do
+                for j, cpu in pairs(likwid.stringsplit(cpuexpr,",")) do
+                    -- set as soon as an event string was added for this process
+                    local uncore = false
+                    for sidx, socket in pairs(socketList) do
+                        local switchedFlag = false
+                        for _,c in pairs(socket) do
+                            if c == tonumber(cpu) then
+                                if tmpSocketFlags[sidx] == 1 then
+                                    -- first process on this socket: core + uncore events
+                                    local eventStr = createEventString(coreevents)
+                                    if #uncoreevents > 0 then
+                                        eventStr = eventStr .. ","..createEventString(uncoreevents)
+                                    end
+                                    table.insert(perfexprs[k], eventStr)
+                                    tmpSocketFlags[sidx] = 0
+                                    switchedFlag = true
+                                    uncore = true
+                                    break
+                                else
+                                    -- socket already covered: core events only
+                                    table.insert(perfexprs[k], createEventString(coreevents))
+                                    switchedFlag = true
+                                    uncore = true
+                                end
+                            end
+                        end
+                        if switchedFlag then break end
+                    end
+                    -- only the first CPU found in a socket decides for the process
+                    if uncore then break end
+                end
+            end
+
+            if debug then
+                for i, expr in pairs(perfexprs[k]) do
+                    print(string.format("DEBUG: Process %d measures with event set: %s", i-1, expr))
+                end
+            end
+        end
+    end
+    return perfexprs, grouplist
+end
+
+-- Refresh LIKWID_PIN and LIKWID_PERFCTR with the paths reported by `which`,
+-- if `which` prints a path that differs from the current value.
+local function checkLikwid()
+    -- First output line of cmd, or nil if cmd could not be started or
+    -- printed nothing
+    local function firstLineOf(cmd)
+        local pipe = io.popen(cmd, "r")
+        if pipe == nil then
+            return nil
+        end
+        local line = pipe:read("*line")
+        pipe:close()
+        return line
+    end
+
+    local pinPath = firstLineOf("which likwid-pin 2>/dev/null")
+    if pinPath ~= nil and pinPath ~= LIKWID_PIN then
+        LIKWID_PIN = pinPath
+    end
+
+    local perfctrPath = firstLineOf("which likwid-perfctr 2>/dev/null")
+    if perfctrPath ~= nil and perfctrPath ~= LIKWID_PERFCTR then
+        LIKWID_PERFCTR = perfctrPath
+    end
+end
+
+-- Write the bash wrapper script that every MPI process executes.
+-- The script derives the node-local rank of the process and starts execStr
+-- under likwid-pin (or likwid-perfctr if performance groups are set) with
+-- the CPU expression belonging to that local rank.
+-- scriptname: file to create (nothing happens for nil or "").
+-- execStr:    application command line appended to each likwid call.
+-- hosts:      accepted but not used.
+-- outputname: likwid-perfctr output file; made absolute with $PWD.
+-- Reads mpitype, np, ppn, cpuexprs, perf, perfexprs, use_marker, force,
+-- skipStr, LIKWID_PIN and LIKWID_PERFCTR from the enclosing scope.
+-- Exits the program if the script file cannot be created.
+local function writeWrapperScript(scriptname, execStr, hosts, outputname)
+    if scriptname == nil or scriptname == "" then
+        return
+    end
+    local commands = {}
+    local glsize_var, glrank_var, losize_var
+
+    -- Shell expressions for global size, global rank and local size
+    if mpitype == "openmpi" then
+        glsize_var = "$OMPI_COMM_WORLD_SIZE"
+        glrank_var = "${OMPI_COMM_WORLD_RANK:-$(($GLOBALSIZE * 2))}"
+        losize_var = "$OMPI_COMM_WORLD_LOCAL_SIZE"
+    elseif mpitype == "intelmpi" then
+        glrank_var = "${PMI_RANK:-$(($GLOBALSIZE * 2))}"
+        glsize_var = tostring(math.tointeger(np))
+        losize_var = tostring(math.tointeger(ppn))
+    elseif mpitype == "mvapich2" then
+        glrank_var = "${PMI_RANK:-$(($GLOBALSIZE * 2))}"
+        glsize_var = tostring(math.tointeger(np))
+        losize_var = tostring(math.tointeger(ppn))
+    elseif mpitype == "slurm" then
+        glrank_var = "${PMI_RANK:-$(($GLOBALSIZE * 2))}"
+        glsize_var = tostring(math.tointeger(np))
+        losize_var = "$MPI_LOCALNRANKS"
+    else
+        -- mpitype may be nil here, so it must not be concatenated directly
+        print("Invalid MPI vendor "..tostring(mpitype))
+        return
+    end
+
+    local f = io.open(scriptname, "w")
+    if f == nil then
+        print("ERROR: Cannot open wrapper script "..scriptname)
+        os.exit(1)
+    end
+
+    if outputname ~= nil and outputname:sub(1,1) ~= "/" then
+        outputname = (os.getenv("PWD") or ".").."/"..outputname
+    end
+
+    -- One likwid command line per CPU expression, i.e. per local rank
+    for i=1,#cpuexprs do
+        local cmd = {}
+        local cpuexpr_opt = "-c"
+        if #perf > 0 then
+            table.insert(cmd,LIKWID_PERFCTR)
+            if use_marker then
+                table.insert(cmd,"-m")
+            end
+            cpuexpr_opt = "-C"
+        else
+            table.insert(cmd,LIKWID_PIN)
+            table.insert(cmd,"-q")
+        end
+        if force and #perf > 0 then
+            table.insert(cmd,"-f")
+        end
+        table.insert(cmd,skipStr)
+        table.insert(cmd,cpuexpr_opt)
+        table.insert(cmd,cpuexprs[i])
+        if #perf > 0 then
+            for _, expr in pairs(perfexprs) do
+                table.insert(cmd,"-g")
+                table.insert(cmd,expr[i])
+            end
+            table.insert(cmd,"-o")
+            table.insert(cmd,outputname)
+        end
+        table.insert(cmd,execStr)
+        commands[i] = table.concat(cmd, " ")
+    end
+
+    f:write("#!/bin/bash -l\n")
+    f:write("GLOBALSIZE="..glsize_var.."\n")
+    f:write("GLOBALRANK="..glrank_var.."\n")
+    f:write("unset OMP_NUM_THREADS\n")
+    if mpitype == "intelmpi" then
+        f:write("export I_MPI_PIN=disable\n")
+    end
+    f:write("LOCALSIZE="..losize_var.."\n\n")
+
+    if mpitype == "openmpi" then
+        f:write("LOCALRANK=$OMPI_COMM_WORLD_LOCAL_RANK\n\n")
+    elseif mpitype  == "slurm" then
+        f:write("LOCALRANK=$MPI_LOCALRANKID\n\n")
+    else
+        -- No local rank is provided by the launcher: compute it from the
+        -- global rank. `full` is the number of ranks on completely filled
+        -- nodes.
+        local full = tostring(math.tointeger(np - (np % ppn)))
+        f:write("if [ \"$GLOBALRANK\" -lt "..full.." ]; then\n")
+        f:write("\tLOCALRANK=$(($GLOBALRANK % $LOCALSIZE))\n")
+        f:write("else\n")
+        f:write("\tLOCALRANK=$(($GLOBALRANK - ("..full.." - 1)))\n")
+        f:write("fi\n\n")
+    end
+
+    -- Load the likwid module on the node if the tool is not in PATH
+    if #perf > 0 then
+        f:write("which `basename "..LIKWID_PERFCTR.."` 1>/dev/null 2>&1\n")
+    else
+        f:write("which `basename "..LIKWID_PIN.."` 1>/dev/null 2>&1\n")
+    end
+    f:write("if [ $? -eq 1 ]; then\n")
+    f:write("\tmodule load likwid 1>/dev/null 2>&1\n")
+    f:write("fi\n\n")
+
+    -- Dispatch on the local rank
+    f:write("if [ \"$LOCALRANK\" -eq 0 ]; then\n")
+    if debug then
+        print("NODE_EXEC: "..commands[1])
+    end
+    f:write("\t"..commands[1].."\n")
+
+    for i=2,#commands do
+        f:write("elif [ \"$LOCALRANK\" -eq "..tostring(i-1).." ]; then\n")
+        if debug then
+            print("NODE_EXEC: "..commands[i])
+        end
+        f:write("\t"..commands[i].."\n")
+    end
+    f:write("else\n")
+    f:write("\techo \"Unknown local rank $LOCALRANK\"\n")
+    f:write("fi\n")
+
+    f:close()
+    os.execute("chmod +x "..scriptname)
+end
+
+
+-- List the regular files directly inside dir whose name contains
+-- infilepart (using `find -maxdepth 1`).
+-- Returns the sorted list of paths as printed by find; the list is empty
+-- if find could not be started.
+local function listdir(dir, infilepart)
+    local outlist = {}
+    local p = io.popen("find "..dir.." -maxdepth 1 -type f -name \"*"..infilepart.."*\"")
+    if p == nil then
+        return outlist
+    end
+    for file in p:lines() do
+        table.insert(outlist, file)
+    end
+    p:close()
+    table.sort(outlist)
+    return outlist
+end
+
+
+-- Parse a likwid-perfctr CSV output file of one MPI process.
+-- filename is expected to look like output_<pid>_<rank>_<host>.csv.
+-- Returns host, rank, results, cpulist where
+--   results[g][e][cpu] is the value of the e-th event row of group g,
+--   results[g]["time"][cpu] the value of the row whose name matches
+--   "Runtime", results["clock"] the CPU clock line scaled by 1.E09 and
+--   cpulist the CPU IDs in column order (each ID once).
+-- Exits the program if the file cannot be opened or is empty.
+local function parseOutputFile(filename)
+    local rank = 0
+    local host = nil
+    local cpulist = {}
+    local idx = 1
+    local gidx = 0
+    local results = {}
+    local f = io.open(filename, "r")
+    if f == nil then
+        print("ERROR: Cannot open output file "..filename)
+        os.exit(1)
+    end
+    rank, host = filename:match("output_%d+_(%d+)_(%g+).csv")
+
+    local t = f:read("*all")
+    f:close()
+    if t:len() == 0 then
+        print("Error Output file "..filename.." is empty")
+        os.exit(1)
+    end
+    for _, line in pairs(likwid.stringsplit(t, "\n")) do
+        if (not line:match("^-")) and
+           (not line:match("^CPU type:")) and
+           (not line:match("^CPU name:")) and
+           (not line:match("^TABLE")) and
+           (not line:match("^STRUCT")) and
+           (not line:match("^%s*$")) and
+           (not line:match("STAT")) then
+            if line:match("^Event") and not line:match("Sum,Min,Max,Avg") then
+                -- Header row of a group: columns 3.. name the CPUs
+                local linelist = likwid.stringsplit(line,",")
+                table.remove(linelist,1)
+                table.remove(linelist,1)
+                for _, cpustr in pairs(linelist) do
+                    local test = tonumber(cpustr:match("Core (%d+)"))
+                    if test ~= nil then
+                        -- every group repeats the header, add each CPU once
+                        local known = false
+                        for _, cpu in pairs(cpulist) do
+                            if tonumber(cpu) == test then
+                                known = true
+                                break
+                            end
+                        end
+                        if not known then
+                            table.insert(cpulist, test)
+                        end
+                    end
+                end
+                gidx = gidx + 1
+                idx = 1
+                if results[gidx] == nil then
+                    results[gidx] = {}
+                    results[gidx]["time"] = {}
+                end
+            elseif gidx > 0 and not line:match("^CPU clock:") and not line:match("Sum,Min,Max,Avg") then
+                -- Data row: event name, counter name, one value per CPU.
+                -- Rows before the first header (gidx == 0) have no group to
+                -- be stored in and are ignored.
+                local linelist = likwid.stringsplit(line,",")
+                local event = linelist[1]
+                table.remove(linelist,1)
+                table.remove(linelist,1)
+                for j=#linelist,1,-1 do
+                    if linelist[j] == "" then
+                        table.remove(linelist, j)
+                    end
+                end
+                local isRuntime = event:match("[Rr]untime") ~= nil
+                if results[gidx][idx] == nil then
+                    results[gidx][idx] = {}
+                end
+                for j, value in pairs(linelist) do
+                    if isRuntime then
+                        results[gidx]["time"][cpulist[j]] = tonumber(value)
+                    else
+                        results[gidx][idx][cpulist[j]] = tonumber(value)
+                    end
+                end
+                -- the runtime row does not consume an event index
+                if not isRuntime then
+                    idx = idx + 1
+                end
+            elseif line:match("^CPU clock:") then
+                local clock = tonumber(line:match("^CPU clock:,([%d.]+)"))
+                if clock ~= nil then
+                    results["clock"] = clock*1.E09
+                end
+            end
+        end
+    end
+    return host, tonumber(rank), results, cpulist
+end
+
+-- Parse a likwid-perfctr CSV output file written in marker mode.
+-- filename is expected to look like output_<pid>_<rank>_<host>.csv.
+-- Returns host, rank, results, cpulist where results is indexed by region
+-- name and group index: results[region][g] holds "name", "time" (from the
+-- RDTSC row), "calls" (from the call count row) and one table per event row
+-- with the values per CPU; results[region]["clock"] is the CPU clock line
+-- scaled by 1.E09. cpulist contains the CPU IDs in column order.
+-- Exits the program if the file cannot be opened.
+local function parseMarkerOutputFile(filename)
+    local rank = 0
+    local host = nil
+    local cpulist = {}
+    local eventlist = {}
+    local counterlist = {}
+    local idx = 1
+    
+    local results = {}
+    local f = io.open(filename, "r")
+    if f == nil then
+        print("ERROR: Cannot open output file "..filename)
+        os.exit(1)
+    end
+    rank, host = filename:match("output_%d+_(%d+)_(%g+).csv")
+    local t = f:read("*all")
+    f:close()
+    -- Parser state: parse_reg_info is set while inside a region info
+    -- section, parse_reg_output while inside the raw event table of a region
+    local parse_reg_info = false
+    local parse_reg_output = false
+    local current_region = nil
+    local gidx = 0
+    local gname = ""
+    local clock = 0
+
+    for i, line in pairs(likwid.stringsplit(t, "\n")) do
+        if (not line:match("^-")) and
+           (not line:match("^CPU type:")) and
+           (not line:match("^CPU name:")) and
+           (not line:match("STAT")) then
+
+            if line:match("^STRUCT,Info") and not parse_reg_info then
+                parse_reg_info = true
+            elseif line:match("^Event") and not line:match("Sum,Min,Max,Avg") then
+                -- header of the raw event table: switch to value parsing
+                parse_reg_info = false
+                parse_reg_output = true
+                idx = 1
+            elseif line:match("^Event") and line:match("Sum,Min,Max,Avg") then
+                -- header of the statistics table: its rows are not parsed
+                parse_reg_output = false
+            elseif line:match("^CPU clock:,") then
+                clock = line:match("^CPU clock:,([%d.]+)")
+                clock = tonumber(clock)*1.E09
+            elseif parse_reg_info and line:match("TABLE,Region (%g+),Group (%d+) Raw,(%g+),") then
+                current_region, gidx, gname  = line:match("TABLE,Region (%g+),Group (%d+) Raw,(%g+),")
+                -- the group number in the file is incremented to get the index
+                gidx = tonumber(gidx)+1
+                if results[current_region] == nil then
+                    results[current_region] = {}
+                end
+                if results[current_region][gidx] == nil then
+                    results[current_region][gidx] = {}
+                    results[current_region][gidx]["name"] = gname
+                    results[current_region][gidx]["time"] = {}
+                    results[current_region][gidx]["calls"] = {}
+                end
+            elseif parse_reg_info and line:match("^Region Info") then
+                -- column header naming the CPUs; each CPU is added once
+                linelist = likwid.stringsplit(line,",")
+                table.remove(linelist,1)
+                for _, cpustr in pairs(linelist) do
+                    if cpustr:match("Core %d+") then
+                        local test = tonumber(cpustr:match("Core (%d+)"))
+                        if test ~= nil then
+                            for _,cpu in pairs(cpulist) do
+                                if test == cpu then test = -1 end
+                            end
+                            if test >= 0 then
+                                table.insert(cpulist, test)
+                            end
+                        end
+                    end
+                end
+            elseif parse_reg_info and line:match("^RDTSC") then
+                linelist = likwid.stringsplit(line,",")
+                table.remove(linelist,1)
+                for i, time in pairs(linelist) do
+                    if time ~= "" then
+                        results[current_region][gidx]["time"][cpulist[i]] = tonumber(time)
+                    end
+                end
+            elseif parse_reg_info and line:match("^call count") then
+                linelist = likwid.stringsplit(line,",")
+                table.remove(linelist,1)
+                for j, calls in pairs(linelist) do
+                    if calls:match("%d+") then
+                        if calls ~= "" then
+                            results[current_region][gidx]["calls"][cpulist[j]] = tonumber(calls)
+                        end
+                    end
+                end
+            elseif parse_reg_output then
+                -- event row of the raw table; rows whose second column is
+                -- "TSC" are skipped
+                linelist = likwid.stringsplit(line,",")
+                if linelist[2] ~= "TSC" then
+                    table.remove(linelist,1)
+                    table.remove(linelist,1)
+                    for j=#linelist,1,-1 do
+                        if linelist[j] == "" then
+                            table.remove(linelist, j)
+                        end
+                    end
+                    if results[current_region][gidx][idx] == nil then
+                        results[current_region][gidx][idx] = {}
+                    end
+                    for j, value in pairs(linelist) do
+                        results[current_region][gidx][idx][cpulist[j]] = tonumber(value)
+                    end
+                    idx = idx + 1
+                end
+            end
+        end
+    end
+    -- attach the clock to every region
+    for region, data in pairs(results) do
+        results[region]["clock"] = clock
+    end
+
+    return host, tonumber(rank), results, cpulist
+end
+
+
+-- Percentile rows (25/50/75) over inputtable[column][line], ignoring the
+-- first skip_cols columns and skip_lines lines; {} if the table is empty.
+function percentile_table(inputtable, skip_cols, skip_lines)
+    local function percentile(sorted_valuelist, k)
+        local index = tonumber(k)/100.0 * #sorted_valuelist
+        if index - math.floor(index) >= 0.5 then
+            index = math.ceil(index)
+        else
+            index = math.floor(index)
+        end
+        -- clamp so short lists never index slot 0; 0 stands in for "no data"
+        return sorted_valuelist[math.max(index, 1)] or 0
+    end
+    local outputtable = {}
+    local ncols = #inputtable
+    if ncols == 0 then
+        return outputtable
+    end
+    local nlines = #inputtable[1]
+    if nlines == 0 then
+        return outputtable
+    end
+    local perc25 = {"%ile 25"}
+    local perc50 = {"%ile 50"}
+    local perc75 = {"%ile 75"}
+    for l=skip_lines+1,nlines do
+        local valuelist = {}
+        for c=skip_cols+1, ncols do
+            -- cells may be formatted strings; sort numerically, not lexically
+            local value = tonumber(inputtable[c][l])
+            if value ~= nil then table.insert(valuelist, value) end
+        end
+        table.sort(valuelist)
+        perc25[l-skip_lines+1] = likwid.num2str(percentile(valuelist, 25))
+        perc50[l-skip_lines+1] = likwid.num2str(percentile(valuelist, 50))
+        perc75[l-skip_lines+1] = likwid.num2str(percentile(valuelist, 75))
+    end
+    table.insert(outputtable, perc25)
+    table.insert(outputtable, perc50)
+    table.insert(outputtable, perc75)
+    return outputtable
+end
+
+-- Print the raw event table and, for groups that define metrics, the derived
+-- metric table of every group in group_list (as CSV if global use_csv is set).
+-- all_results is indexed by rank starting at 0; regionname is an optional label.
+function printMpiOutput(group_list, all_results, regionname)
+    local region = regionname or nil
+    if #group_list == 0 or likwid.tablelength(all_results) == 0 then
+        return
+    end
+    for gidx, gdata in pairs(group_list) do
+        local firsttab = {}
+        local firsttab_combined = {}
+        local secondtab = {}
+        local secondtab_combined = {}
+        local total_threads = 0
+        for rank = 0, #all_results do
+            total_threads = total_threads + #all_results[rank]["cpus"]
+        end
+        -- first label column: event names
+        local desc = {"Event"}
+        if total_threads == 1 or not gdata["Metrics"] then
+            table.insert(desc, "Runtime (RDTSC) [s]")
+        end
+        if all_results[0]["results"][1]["calls"] then
+            table.insert(desc, "Region calls")
+        end
+        for i=1,#gdata["Events"] do
+            table.insert(desc, gdata["Events"][i]["Event"])
+        end
+        table.insert(firsttab, desc)
+        -- second label column: counter names, row-aligned with the first
+        desc = {"Counter"}
+        if total_threads == 1 or not gdata["Metrics"] then
+            table.insert(desc, "TSC")
+        end
+        if all_results[0]["results"][1]["calls"] then
+            table.insert(desc, "CTR")
+        end
+        for i=1,#gdata["Events"] do
+            table.insert(desc, gdata["Events"][i]["Counter"])
+        end
+        table.insert(firsttab, desc)
+        -- one data column per CPU of each rank, labelled host:rank:cpu
+        for rank = 0, #all_results do
+            for i, cpu in pairs(all_results[rank]["cpus"]) do
+                local column = {all_results[rank]["hostname"]..":"..tostring(rank)..":"..tostring(cpu)}
+                if total_threads == 1 or not gdata["Metrics"] then
+                    table.insert(column, all_results[rank]["results"][gidx]["time"][cpu])
+                end
+                if all_results[0]["results"][1]["calls"] then
+                    table.insert(column, all_results[rank]["results"][gidx]["calls"][cpu])
+                end
+                for j=1,#gdata["Events"] do
+                    local value = "0"
+                    if all_results[rank]["results"][gidx][j] and
+                       all_results[rank]["results"][gidx][j][cpu] then
+                        value = likwid.num2str(all_results[rank]["results"][gidx][j][cpu])
+                    end
+                    table.insert(column, value)
+                end
+                table.insert(firsttab, column)
+            end
+        end
+        -- cross-thread statistics are only built when several threads were measured
+        if total_threads > 1 then
+            firsttab_combined = likwid.tableToMinMaxAvgSum(firsttab, 2, 1)
+        end
+        if gdata["Metrics"] then
+            secondtab[1] = {"Metric"}
+            for j=1,#gdata["Metrics"] do
+                table.insert(secondtab[1], gdata["Metrics"][j]["description"])
+            end
+            for rank = 0, #all_results do
+                for i, cpu in pairs(all_results[rank]["cpus"]) do
+                    local counterlist = {}
+                    for j=1,#gdata["Events"] do
+                        local counter = gdata["Events"][j]["Counter"]
+                        counterlist[counter] = 0
+                        if all_results[rank]["results"][gidx][j] ~= nil and
+                           all_results[rank]["results"][gidx][j][cpu] ~= nil then
+                            counterlist[counter] = all_results[rank]["results"][gidx][j][cpu]
+                        end
+                    end
+                    counterlist["time"] = all_results[rank]["results"][gidx]["time"][cpu]
+                    counterlist["inverseClock"] = 1.0/all_results[rank]["results"]["clock"]
+                    local tmpList = {all_results[rank]["hostname"]..":"..tostring(rank)..":"..tostring(cpu)}
+                    for j=1,#gdata["Metrics"] do -- same bound as the description loop above
+                        local tmp = likwid.num2str(likwid.calculate_metric(gdata["Metrics"][j]["formula"], counterlist))
+                        table.insert(tmpList, tmp)
+                    end
+                    table.insert(secondtab,tmpList)
+                end
+            end
+            if total_threads > 1 then
+                secondtab_combined = likwid.tableToMinMaxAvgSum(secondtab, 1, 1)
+                local tmp = percentile_table(secondtab, 1, 1)
+                for i, col in pairs(tmp) do
+                    table.insert(secondtab_combined, col)
+                end
+            end
+        end
+        if use_csv then
+            local maxLineFields = #firsttab
+            if #firsttab_combined > maxLineFields then maxLineFields = #firsttab_combined end
+            if gdata["Metrics"] then
+                if #secondtab > maxLineFields then maxLineFields = #secondtab end
+                if #secondtab_combined > maxLineFields then maxLineFields = #secondtab_combined end
+            end
+            if region then
+                print("Region,"..tostring(region).. string.rep(",", maxLineFields  - 2))
+            end
+            print("Group,"..tostring(gidx) .. string.rep(",", maxLineFields  - 2))
+            likwid.printcsv(firsttab, maxLineFields)
+            if total_threads > 1 then likwid.printcsv(firsttab_combined, maxLineFields) end
+            if gdata["Metrics"] then
+                likwid.printcsv(secondtab, maxLineFields)
+                if total_threads > 1 then likwid.printcsv(secondtab_combined, maxLineFields) end
+            end
+        else
+            if region then
+                print("Region: "..tostring(region))
+            end
+            print("Group: "..tostring(gidx))
+            likwid.printtable(firsttab)
+            if total_threads > 1 then likwid.printtable(firsttab_combined) end
+            if gdata["Metrics"] then
+                likwid.printtable(secondtab)
+                if total_threads > 1 then likwid.printtable(secondtab_combined) end
+            end
+        end
+    end
+end
+
+
+
+-- Return the "activeHWThreads" entry of the table from likwid.getCpuTopology().
+function cpuCount()
+    cputopo = likwid.getCpuTopology()
+    return cputopo["activeHWThreads"]
+end
+
+if #arg == 0 then
+    usage()
+    os.exit(0)
+end
+-- Walk the command line; every recognised option sets one of the script's settings.
+for opt,arg in likwid.getopt(arg, {"n:","np:", "nperdomain:","pin:","hostfile:","h","help","v","g:","group:","mpi:","omp:","d","m","O","debug","marker","version","s:","skip:","f"}) do
+    if (type(arg) == "string") then
+        -- a value that begins with "-" is almost certainly the next option
+        if arg:sub(1,1) == "-" then
+            print(string.format("ERROR: Argument %s to option -%s starts with invalid character -.", arg, opt))
+            print("ERROR: Did you forget an argument to an option?")
+            os.exit(1)
+        end
+    end
+    -- short and long spellings of an option share one branch
+    if opt == "h" or opt == "help" then
+        usage()
+        os.exit(0)
+    elseif opt == "v" or opt == "version"then
+        version()
+        os.exit(0)
+    elseif opt == "d" or opt == "debug" then
+        debug = true
+    elseif opt == "m" or opt == "marker" then
+        use_marker = true
+    elseif opt == "O" then
+        use_csv = true
+    elseif opt == "f" then
+        force = true
+    elseif opt == "n" or opt == "np" then
+        np = tonumber(arg)
+        if np == nil then
+            print("Argument for -n/-np must be a number")
+            os.exit(1)
+        end
+    elseif opt == "nperdomain" then
+        nperdomain = arg
+        local domain = nperdomain:match("([NSCM]%d*):(%d+)")
+        if domain == nil then
+            print("Invalid option to -nperdomain")
+            os.exit(1)
+        end
+    elseif opt == "hostfile" then
+        hostfile = arg
+    elseif opt == "pin" then
+        cpuexprs = likwid.stringsplit(arg, "_")
+    elseif opt == "g" or opt == "group" then
+        table.insert(perf, arg)
+    elseif opt == "mpi" then
+        mpitype = arg
+    elseif opt == "omp" then
+        omptype = arg
+    elseif opt == "s" or opt == "skip" then
+        skipStr = "-s "..arg
+    elseif opt == "?" then
+        print("Invalid commandline option -"..arg)
+        os.exit(1)
+    elseif opt == "!" then
+        print("Option requires an argument")
+        os.exit(1)
+    end
+end
+
+
+-- The process layout has to come from one of -n/-np, -nperdomain or -pin.
+if not (np ~= 0 or nperdomain ~= nil or #cpuexprs ~= 0) then
+    print("ERROR: No option -n/-np, -nperdomain or -pin")
+    os.exit(1)
+end
+if #perf == 0 and use_marker then
+    print("ERROR: You selected the MarkerAPI feature but didn't set any events on the commandline")
+    os.exit(1)
+end
+
+for i=1,#arg do table.insert(executable, arg[i]) end -- the remaining arguments are the application command line
+if #executable == 0 then
+    print("ERROR: No executable given on commandline")
+    os.exit(1)
+else
+    -- Prefer the path printed by `which`; it prints nothing for unknown commands.
+    local f = io.popen(string.format("which %s 2>/dev/null", executable[1]))
+    local resolved = f and f:read("*line")
+    if f then f:close() end
+    -- os.execute yields 0 (Lua 5.1) or true (Lua 5.2+) when the command succeeds
+    local found = os.execute(string.format("ls %s 1>/dev/null 2>&1", executable[1]))
+    if resolved and resolved ~= "" then
+        executable[1] = resolved
+    elseif not (found == 0 or found == true) then
+        print("ERROR: Cannot find executable given on commandline")
+        os.exit(1)
+    end
+    if debug then print("DEBUG: Executable given on commandline: "..table.concat(executable, " ")) end
+end
+
+-- MPI flavour: keep the value given with -mpi, otherwise ask getMpiType().
+if mpitype == nil then
+    mpitype = getMpiType()
+    if debug then
+        print("DEBUG: Using MPI implementation "..mpitype)
+    end
+end
+if not (mpitype == "intelmpi" or mpitype == "mvapich2" or mpitype == "openmpi" or mpitype == "slurm") then
+    print("ERROR: Cannot determine current MPI implementation. likwid-mpirun checks for openmpi, intelmpi and mvapich2 or if running in a SLURM environment")
+    os.exit(1)
+end
+-- getMpiExec() is expected to set the global mpiexecutable tested right below
+getMpiExec(mpitype)
+if mpiexecutable == nil then
+    print(string.format("Cannot find executable for determined MPI implementation %s", mpitype))
+    os.exit(1)
+end
+-- OpenMP flavour: same scheme, but a failed detection only produces a warning.
+if omptype == nil then
+    omptype = getOmpType()
+    if omptype == nil then
+        print("WARN: Cannot extract OpenMP vendor from executable or commandline, assuming no OpenMP")
+    elseif debug then
+        print("DEBUG: Using OpenMP implementation "..omptype)
+    end
+end
+
+-- Host list: an explicit hostfile first, then batch-system variables, else localhost.
+if hostfile then
+    hosts = readHostfile(hostfile)
+elseif os.getenv("PBS_NODEFILE") ~= nil then
+    hostfile = os.getenv("PBS_NODEFILE")
+    hosts = readHostfilePBS(hostfile)
+elseif os.getenv("LOADL_HOSTFILE") ~= nil then
+    hostfile = os.getenv("LOADL_HOSTFILE")
+    hosts = readHostfilePBS(hostfile)
+elseif mpitype == "slurm" and os.getenv("SLURM_NODELIST") ~= nil then
+    hostlist = os.getenv("SLURM_NODELIST")
+    hosts = readHostfileSlurm(hostlist)
+else
+    -- no host information at all: a single localhost entry sized by cpuCount()
+    local nslots = cpuCount()
+    table.insert(hosts, {hostname='localhost', slots=nslots, maxslots=nslots})
+end
+
+local givenNrNodes = getNumberOfNodes(hosts)
+
+-- Default skip options, applied only while skipStr is still empty. The table
+-- is keyed by MPI flavour, OpenMP flavour and node count class ("multi" for
+-- more than one node, "single" for exactly one); any other combination
+-- leaves skipStr untouched.
+if skipStr == "" then
+    local defaultSkip = {
+        intelmpi = {
+            intel = { multi = '-s 0x3', single = '-s 0x1' },
+            gnu   = { multi = '-s 0x1', single = '-s 0x0' },
+        },
+        mvapich2 = { intel = { multi = '-s 0x7' } },
+        openmpi = {
+            intel = { multi = '-s 0x7', single = '-s 0x1' },
+            gnu   = { multi = '-s 0x7', single = '-s 0x0' },
+        },
+    }
+    local byOmp = defaultSkip[mpitype] and defaultSkip[mpitype][omptype]
+    if byOmp then
+        local nodeClass = nil
+        if givenNrNodes > 1 then
+            nodeClass = "multi"
+        elseif givenNrNodes == 1 then
+            nodeClass = "single"
+        end
+        if byOmp[nodeClass] then skipStr = byOmp[nodeClass] end
+    end
+end
+if debug and skipStr ~= "" then
+    print(string.format("DEBUG: Using skip option %s to skip pinning of shepard threads", skipStr))
+end
+
+-- With event sets requested, the process count must fit the hosts' slot budget.
+if #perf > 0 then
+    local sum_maxslots = 0
+    local topo = likwid.getCpuTopology()
+    if debug then
+        print("DEBUG: Switch to perfctr mode, there are "..tostring(#perf).." eventsets given on the commandline")
+    end
+    for i, host in pairs(hosts) do
+        -- hosts without any slot information get this machine's numHWThreads
+        if host["maxslots"] == nil and host["slots"] == nil then
+            host["slots"] = topo["numHWThreads"]
+        end
+        if debug then
+            -- tostring() because "slots" may be absent when only "maxslots" is known
+            local str = string.format("DEBUG: Working on host %s with %s slots", host["hostname"], tostring(host["slots"]))
+            if host["maxslots"] ~= nil then
+                str = str .. string.format(" and %d slots maximally", host["maxslots"])
+            end
+            print(str)
+        end
+        sum_maxslots = sum_maxslots + (host["maxslots"] or host["slots"])
+    end
+    -- np is the process count taken from -n/-np (0 when not given)
+    if np > sum_maxslots then
+        print("ERROR: Processes requested exceeds maximally available slots of given hosts. Maximal processes: "..sum_maxslots)
+        os.exit(1)
+    end
+end
+
+-- Derive np (processes), ppn (processes per node), the per-process CPU
+-- expressions and the host assignment from -pin, -nperdomain or plain -np.
+if #cpuexprs > 0 then
+    cpuexprs = calculatePinExpr(cpuexprs)
+    ppn = #cpuexprs
+    if np == 0 then
+        if debug then
+            print(string.format("DEBUG: No -np given , setting according to pin expression and number of available hosts"))
+        end
+        np = givenNrNodes * #cpuexprs
+        ppn = #cpuexprs
+    elseif np < #cpuexprs*givenNrNodes then
+        while np < #cpuexprs*givenNrNodes and #cpuexprs > 1 do
+            table.remove(cpuexprs)
+        end
+        ppn = #cpuexprs
+    end
+    newhosts = assignHosts(hosts, np, ppn)
+    if np > #cpuexprs*#newhosts and #perf > 0 then
+        print("ERROR: Oversubscribing not allowed.")
+        print(string.format("ERROR: You want %d processes but the pinning expression has only expressions for %d processes. There are only %d hosts in the host list.", np, #cpuexprs*#newhosts, #newhosts))
+        os.exit(1)
+    end
+elseif nperdomain ~= nil then
+    cpuexprs = calculateCpuExprs(nperdomain, cpuexprs)
+    ppn = #cpuexprs
+    if np == 0 then
+        np = givenNrNodes * ppn
+    end
+    if np < ppn then
+        if debug then
+            print("WARN: Removing additional cpu expressions to get requested number of processes")
+        end
+        for i=np+1,ppn do
+            if debug then
+                print("WARN: Remove cpuexpr: "..cpuexprs[#cpuexprs])
+            end
+            table.remove(cpuexprs, #cpuexprs)
+        end
+        ppn = np
+    elseif np > (givenNrNodes * ppn) and #perf > 0 then
+        print("ERROR: Oversubscribing nodes not allowed!")
+        print(string.format("ERROR: You want %d processes with %d on each of the %d hosts", np, ppn, givenNrNodes))
+        os.exit(1)
+    end
+    newhosts, ppn = assignHosts(hosts, np, ppn)
+elseif ppn == 0 and np > 0 then
+    maxnp = 0
+    maxppn = 0
+    for i, host in pairs(hosts) do
+        maxnp = maxnp + host["slots"]
+        if host["slots"] > maxppn then
+            maxppn = host["slots"]
+        end
+    end
+    -- pick ppn from np and the largest slot count (maxppn) found above
+    if ppn == 0 then
+        ppn = 1
+    end
+    if ppn > maxppn and np > maxppn then
+        ppn = maxppn
+    elseif np < maxppn then
+        ppn = np
+    elseif maxppn == np then
+        ppn = maxppn
+    end
+    if (ppn * givenNrNodes) < np then
+        print("ERROR: Processes cannot be equally distributed")
+        if #perf == 0 then
+            print(string.format("WARN: You want %d processes on %d hosts.", np, givenNrNodes))
+            ppn = math.ceil(np/givenNrNodes) -- whole number so that ppn * nodes covers np
+            print(string.format("WARN: Sanitizing number of processes per node to %d", ppn))
+        else
+            -- with event sets (#perf > 0) the script gives up instead of adjusting ppn
+            os.exit(1)
+        end
+    end
+    local newexprs = calculateCpuExprs("E:N:"..tostring(ppn), cpuexprs)
+    local copynp = np
+    while copynp > 0 do
+        local before = copynp
+        for i, expr in pairs(newexprs) do
+            local exprlist = likwid.stringsplit(expr, ",")
+            local seclength = math.ceil(#exprlist/ppn)
+            for p=1, ppn do
+                local str = ""
+                for j=1, seclength do
+                    if exprlist[((p-1)*seclength) + j] then
+                        str = str .. exprlist[((p-1)*seclength) + j] ..","
+                    end
+                end
+                if str ~= "" then
+                    str = str:sub(1,#str-1)
+                    table.insert(cpuexprs, str)
+                    copynp = copynp - seclength
+                else
+                    break
+                end
+            end
+        end
+        if copynp == before then break end -- nothing was added: another pass cannot make progress
+    end
+    newhosts, ppn = assignHosts(hosts, np, ppn)
+    if np < ppn*#newhosts then
+        np = 0
+        for i, host in pairs(newhosts) do
+            np = np + host["slots"]
+        end
+    end
+else
+    print("ERROR: Commandline settings are not supported.")
+    os.exit(1)
+end
+
+local grouplist = {} -- stays empty unless event sets were given (#perf > 0)
+if #perf > 0 then
+    perfexprs, grouplist = setPerfStrings(perf, cpuexprs)
+end
+-- Node count of the final host assignment (newhosts), passed to executeCommand below.
+local nrNodes = getNumberOfNodes(newhosts)
+-- All temporary file names carry the value returned by likwid.getpid().
+local pid = likwid.getpid()
+local hostfilename = string.format(".hostfile_%s.txt", pid)
+local scriptfilename = string.format(".likwidscript_%s.txt", pid)
+local outfilename = string.format(os.getenv("PWD").."/.output_%s_%%r_%%h.csv", pid) -- "%%" keeps a literal "%"; %r/%h are presumably substituted later - TODO confirm
+
+checkLikwid()
+-- NOTE(review): these globals are presumably assigned per MPI type by getMpiExec() - confirm
+if writeHostfile == nil or getEnvironment == nil or executeCommand == nil then
+    print("ERROR: Initialization for MPI specific functions failed")
+    os.exit(1)
+end
+-- Write the host file and the wrapper script, then hand both to executeCommand.
+writeHostfile(newhosts, hostfilename)
+writeWrapperScript(scriptfilename, table.concat(executable, " "), newhosts, outfilename)
+local env = getEnvironment()
+executeCommand(scriptfilename, hostfilename, env, nrNodes)
+-- Remove the two helper files once executeCommand has returned.
+os.remove(scriptfilename)
+os.remove(hostfilename)
+
+-- Collect the result files whose names contain ".output_<pid>" and print them.
+infilepart = ".output_"..pid
+filelist = listdir(os.getenv("PWD"), infilepart)
+all_results = {}
+if not use_marker then
+    for i, file in pairs(filelist) do
+        local host, rank, results, cpulist = parseOutputFile(file)
+        if host ~= nil and rank ~= nil then
+            if all_results[rank] == nil then
+                all_results[rank] = {}
+            end
+            all_results[rank]["hostname"] = host
+            all_results[rank]["results"] = results
+            all_results[rank]["cpus"] = cpulist
+            os.remove(file)
+        end
+    end
+    if likwid.tablelength(all_results) > 0 then
+        printMpiOutput(grouplist, all_results)
+    end
+else
+    local tmpList = {} -- rank -> region name -> results of that region
+    for i, file in pairs(filelist) do
+        local host, rank, results, cpulist = parseMarkerOutputFile(file)
+        if host ~= nil and rank ~= nil then
+            if all_results[rank] == nil then
+                all_results[rank] = {}
+            end
+            all_results[rank]["hostname"] = host
+            all_results[rank]["cpus"] = cpulist
+            tmpList[rank] = results
+            os.remove(file)
+        end
+    end
+    -- regions are enumerated from rank 0, so without its file nothing can be printed
+    if likwid.tablelength(all_results) > 0 and tmpList[0] ~= nil then
+        for reg, _ in pairs(tmpList[0]) do
+            for rank,_ in pairs(all_results) do
+                all_results[rank]["results"] = tmpList[rank][reg]
+            end
+            printMpiOutput(grouplist, all_results, reg)
+        end
+    end
+end
diff --git a/src/applications/likwid-perfctr.c b/src/applications/likwid-perfctr.c
deleted file mode 100644
index 6c9f98f..0000000
--- a/src/applications/likwid-perfctr.c
+++ /dev/null
@@ -1,528 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  likwid-perfctr.c
- *
- *      Description:  An application to read out performance counter registers
- *                  on x86 processors
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-/* #####   HEADER FILE INCLUDES   ######################################### */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <time.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-#include <signal.h>
-
-#include <error.h>
-#include <types.h>
-#include <bitUtil.h>
-#include <accessClient.h>
-#include <msr.h>
-#include <timer.h>
-#include <cpuid.h>
-#include <affinity.h>
-#include <cpuFeatures.h>
-#include <perfmon.h>
-#include <daemon.h>
-#include <bstrlib.h>
-#include <numa.h>
-#include <strUtil.h>
-
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-#define HELP_MSG \
-fprintf(stdout, "likwid-perfctr --  Version  %d.%d \n\n",VERSION,RELEASE); \
-fprintf(stdout, "\n"); \
-fprintf(stdout, "Example Usage: likwid-perfctr -C 2  ./a.out \n"); \
-fprintf(stdout, "Supported Options:\n"); \
-fprintf(stdout, "-h\t Help message\n"); \
-fprintf(stdout, "-v\t Version information\n"); \
-fprintf(stdout, "-V\t verbose output\n"); \
-fprintf(stdout, "-g\t performance group or event set string\n"); \
-fprintf(stdout, "-H\t Get group help (together with -g switch) \n"); \
-fprintf(stdout, "-t\t timeline mode with frequency in s or ms, e.g. 300ms\n"); \
-fprintf(stdout, "-S\t stethoscope mode with duration in s\n"); \
-fprintf(stdout, "-m\t use markers inside code \n"); \
-fprintf(stdout, "-s\t bitmask with threads to skip\n"); \
-fprintf(stdout, "-o\t Store output to file, with output conversation according to file suffix\n"); \
-fprintf(stdout, "\t Conversation scripts can be supplied in %s\n",TOSTRING(LIKWIDFILTERPATH)); \
-fprintf(stdout, "-O\t Output easily parseable CSV instead of fancy tables\n"); \
-fprintf(stdout, "-M\t set how MSR registers are accessed: 0=direct, 1=msrd\n"); \
-fprintf(stdout, "-a\t list available performance groups\n"); \
-fprintf(stdout, "-e\t list available counters and events\n"); \
-fprintf(stdout, "-i\t print cpu info\n"); \
-fprintf(stdout, "-c\t processor ids to measure (required), e.g 0,3-4,8\n"); \
-fprintf(stdout, "-C\t processor ids to measure (this variant also cares for pinning of process/threads)\n"); \
-fprintf(stdout, "\t\t for -c and -C, see likwid-pin -h for details\n"); \
-fflush(stdout);
-
-
-#define VERSION_MSG \
-fprintf(stdout, "likwid-perfctr  %d.%d \n\n",VERSION,RELEASE); \
-fflush(stdout);
-
-/* To be able to give useful error messages instead of just dieing without a
- * comment. Mainly happens because we get a SIGPIPE if the daemon drops us. */
-static void Signal_Handler(int sig)
-{
-    fprintf(stderr, "ERROR - [%s:%d] Signal %d caught\n", __FILE__, __LINE__, sig);
-}
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-int main (int argc, char** argv)
-{
-    int optInfo = 0;
-    int optPrintGroups = 0;
-    int optPrintGroupHelp = 0;
-    int optPrintEvents = 0;
-    int optUseMarker = 0;
-    int optReport = 0;
-    int optTimeline = 0;
-    int optStethoscope = 0;
-    int optPin = 0;
-    int c;
-    bstring eventString = bfromcstr("_NOGROUP");
-    bstring  argString;
-    bstring  pinString;
-    bstring  skipString;
-    bstring  filterScript = bfromcstr("NO");
-    int skipMask = -1;
-    BitMask counterMask;
-    bstring filepath = bformat("/tmp/likwid_%u.txt", (uint32_t) getpid());
-    int numThreads = 0;
-    int threads[MAX_NUM_THREADS];
-    threads[0] = 0;
-    int i,j;
-    FILE* OUTSTREAM = stdout;
-    struct timespec interval;
-
-    if (argc ==  1)
-    {
-        HELP_MSG;
-        bdestroy(filepath);
-        bdestroy(eventString);
-        exit (EXIT_SUCCESS);
-    }
-
-    if (cpuid_init() == EXIT_FAILURE)
-    {
-        ERROR_PLAIN_PRINT(Unsupported processor!);
-    }
-    numa_init();
-    affinity_init();
-
-    while ((c = getopt (argc, argv, "+ac:C:d:eg:hHimM:o:OPs:S:t:vV")) != -1)
-    {
-        switch (c)
-        {
-            case 'a':
-                numThreads = 1; /*to get over the error message */
-                threads[0] = 0;
-                optPrintGroups = 1;
-                break;
-            case 'C':
-                optPin = 1;
-                CHECK_OPTION_STRING;
-                numThreads = bstr_to_cpuset(threads, argString);
-
-                if(!numThreads)
-                {
-                    ERROR_PLAIN_PRINT(Failed to parse cpu list.);
-                }
-
-                break;
-            case 'c':
-                CHECK_OPTION_STRING;
-                numThreads = bstr_to_cpuset(threads, argString);
-                if(!numThreads)
-                {
-                    ERROR_PLAIN_PRINT(Failed to parse cpu list.);
-                }
-
-                break;
-            case 'd':
-                fprintf(stdout, "Option -d for daemon mode is deprecated. Daemon mode has be renamed to timeline mode (Option -t)!\n");
-                fflush(stdout);
-                break;
-            case 'e':
-                numThreads=1; /*to get over the error message */
-                threads[0]=0;
-                optPrintEvents = 1;
-                break;
-            case 'g':
-                CHECK_OPTION_STRING;
-                eventString = bstrcpy(argString);
-                break;
-            case 'h':
-                HELP_MSG;
-                cpuid_print();
-                bdestroy(filepath);
-                bdestroy(eventString);
-                exit (EXIT_SUCCESS);
-            case 'H':
-                numThreads=1; /*to get over the error message */
-                threads[0]=0;
-                optPrintGroupHelp = 1;
-                break;
-            case 'i':
-                numThreads=1; /*to get over the error message */
-                threads[0]=0;
-                optInfo = 1;
-                perfmon_verbose = 1;
-                break;
-            case 'm':
-                optUseMarker = 1;
-                break;
-            case 'M':  /* Set MSR Access mode */
-                CHECK_OPTION_STRING;
-                accessClient_setaccessmode(str2int((char*) argString->data));
-                break;
-            case 'o':
-                CHECK_OPTION_STRING;
-                OUTSTREAM = bstr_to_outstream(argString, filterScript);
-
-                if(!OUTSTREAM)
-                {
-                    ERROR_PLAIN_PRINT(Failed to parse out file pattern.);
-                }
-                break;
-            case 'O':
-                perfmon_setCSVMode(1);
-                break;
-            case 's':
-                CHECK_OPTION_STRING;
-                skipMask = strtoul((char*) argString->data,NULL,16);
-                break;
-            case 'S':
-                CHECK_OPTION_STRING;
-                optStethoscope = str2int((char*) argString->data);
-                if (optStethoscope <= 0)
-                {
-                    fprintf(stderr, "The measurement time must be larger than 0\n\n");
-                    HELP_MSG;
-                    exit(EXIT_FAILURE);
-                }
-                break;
-            case 't':
-                CHECK_OPTION_STRING;
-                bstr_to_interval(argString, &interval);
-                optTimeline = 1;
-                break;
-            case 'v':
-                VERSION_MSG;
-                bdestroy(filepath);
-                bdestroy(eventString);
-                exit (EXIT_SUCCESS);
-            case 'V':
-                perfmon_verbose = 1;
-                break;
-            case '?':
-                if (optopt == 'S'||optopt == 't'||optopt == 'c'||optopt == 'C'||
-                    optopt == 'o'||optopt == 'M'||optopt == 'g')
-                {
-
-                }
-                else if (isprint (optopt))
-                {
-                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
-                }
-                else
-                {
-                    fprintf (stderr,
-                            "Unknown option character `\\x%x'.\n",
-                            optopt);
-                }
-                return EXIT_FAILURE;
-            default:
-                HELP_MSG;
-                bdestroy(filepath);
-                bdestroy(eventString);
-                exit (EXIT_SUCCESS);
-        }
-    }
-
-    if (!numThreads)
-    {
-        fprintf (stderr, "ERROR: Required -c. You must specify at least one processor.\n");
-        HELP_MSG;
-        exit(EXIT_FAILURE);
-    }
-
-    if (optPin)
-    {
-
-        if ( getenv("OMP_NUM_THREADS") == NULL )
-        {
-            argString = bformat("%d",numThreads);
-            setenv("OMP_NUM_THREADS",(char*) argString->data , 0);
-        }
-
-        if (numThreads > 1)
-        {
-            bstring ldPreload = bfromcstr(getenv("LD_PRELOAD"));
-
-            pinString = bformat("%d",threads[1]);
-
-            for (i=2; i < numThreads;i++)
-            {
-                bformata(pinString,",%d",threads[i]);
-            }
-
-            bformata(pinString,",%d",threads[0]);
-
-            if (skipMask > 0)
-            {
-                skipString = bformat("%d",skipMask);
-                setenv("LIKWID_SKIP",(char*) skipString->data , 1);
-            }
-            setenv("KMP_AFFINITY", "disabled", 1);
-            setenv("LIKWID_PIN",(char*) pinString->data , 1);
-
-            setenv("LIKWID_SILENT","true", 1);
-            if (ldPreload == NULL)
-            {
-                setenv("LD_PRELOAD",TOSTRING(LIBLIKWIDPIN), 1);
-            }
-            else
-            {
-                bconchar(ldPreload, ':');
-                bcatcstr(ldPreload, TOSTRING(LIBLIKWIDPIN));
-                setenv("LD_PRELOAD", bdata(ldPreload), 1);
-            }
-        }
-
-        affinity_pinProcess(threads[0]);
-    }
-
-
-    for (i = 0; i< numThreads;i++)
-    {
-        for (j = 0; j< numThreads;j++)
-        {
-            if(i != j && threads[i] == threads[j])
-            {
-                fprintf (stderr, "ERROR: Processor list (%d",threads[0]);
-                for (c=1;c<numThreads;c++)
-                {
-                    fprintf (stderr, ",%d",threads[c]);
-                }
-                fprintf (stderr, ") is not unique.\n");
-                exit(EXIT_FAILURE);
-            }
-        }
-    }
-
-    { /* Init signal handler */
-        struct sigaction sia;
-        sia.sa_handler = Signal_Handler;
-        sigemptyset(&sia.sa_mask);
-        sia.sa_flags = 0;
-        sigaction(SIGPIPE, &sia, NULL);
-    }
-
-    perfmon_init(numThreads, threads, OUTSTREAM);
-
-    if (perfmon_verbose)
-    {
-        fprintf(OUTSTREAM,"CPU family:\t%u \n",cpuid_info.family);
-        fprintf(OUTSTREAM,"CPU model:\t%u \n", cpuid_info.model);
-        fprintf(OUTSTREAM,"CPU stepping:\t%u \n", cpuid_info.stepping);
-        fprintf(OUTSTREAM,"CPU features:\t%s \n", cpuid_info.features);
-
-        if( cpuid_info.family == P6_FAMILY && cpuid_info.perf_version)
-        {
-            fprintf(OUTSTREAM,HLINE);
-            fprintf(OUTSTREAM,"PERFMON version:\t%u \n",cpuid_info.perf_version);
-            fprintf(OUTSTREAM,"PERFMON number of counters:\t%u \n",cpuid_info.perf_num_ctr);
-            fprintf(OUTSTREAM,"PERFMON width of counters:\t%u \n",cpuid_info.perf_width_ctr);
-            fprintf(OUTSTREAM,"PERFMON number of fixed counters:\t%u \n",cpuid_info.perf_num_fixed_ctr);
-        }
-    }
-    fprintf(OUTSTREAM,HLINE);
-    fflush(OUTSTREAM);
-
-    if (optInfo)
-    {
-        exit (EXIT_SUCCESS);
-    }
-    if (optPrintGroups)
-    {
-        perfmon_printAvailableGroups();
-        exit (EXIT_SUCCESS);
-    }
-    if (optPrintGroupHelp)
-    {
-        perfmon_printGroupHelp(eventString);
-        exit (EXIT_SUCCESS);
-    }
-    if (optPrintEvents)
-    {
-        perfmon_printCounters();
-        perfmon_printEvents();
-        exit (EXIT_SUCCESS);
-    }
-    if ((!optTimeline && !optStethoscope) && (optind == argc))
-    {
-        fprintf(OUTSTREAM,"NOTICE: You have to specify a program to measure as argument!\n");
-        exit (EXIT_SUCCESS);
-    }
-    argv +=  optind;
-    bstring exeString = bfromcstr(argv[0]);
-    for (i=1; i<(argc-optind); i++)
-        {
-            bconchar(exeString, ' ');
-            bcatcstr(exeString, argv[i]);
-        }
-    if (blength(exeString) == 0 && !optStethoscope)
-    {
-        fprintf(OUTSTREAM, "Executable must be given on commandline\n");
-        fflush(OUTSTREAM);
-        exit(EXIT_FAILURE);
-    }
-    if (biseqcstr(eventString,"_NOGROUP"))
-    {
-        fprintf(OUTSTREAM,"NOTICE: You have to specify a group or event set to measure using the -g option.\n");
-        fprintf(OUTSTREAM,"        Use likwid-perfctr -a to get a list of available groups and likwid-perfctr -e for supported events.\n\n");
-        exit (EXIT_SUCCESS);
-    }
-
-    timer_init();
-
-    fprintf(OUTSTREAM,HLINE);
-    fprintf(OUTSTREAM,"CPU type:\t%s \n",cpuid_info.name);
-    fprintf(OUTSTREAM,"CPU clock:\t%3.2f GHz \n",  (float) timer_getCpuClock() * 1.E-09);
-    fflush(OUTSTREAM);
-
-    fprintf(OUTSTREAM,HLINE);
-    fflush(OUTSTREAM);
-
-    if (optStethoscope)
-    {
-        perfmon_setupEventSet(eventString, &counterMask);
-        perfmon_startCounters();
-        sleep(optStethoscope);
-        perfmon_stopCounters();
-        perfmon_printCounterResults();
-    }
-    else if (optTimeline)
-    {
-        fprintf(OUTSTREAM,"CORES: %d", threads[0]);
-        for (int i=1; i<numThreads; i++)
-        {
-            fprintf(OUTSTREAM," %d", threads[i]);
-        }
-        fprintf(OUTSTREAM," \n");
-        fflush(OUTSTREAM);
-
-        daemon_start(eventString, interval);
-        if (system(bdata(exeString)) == EOF)
-        {
-            fprintf(stderr, "Failed to execute %s!\n", bdata(exeString));
-            exit(EXIT_FAILURE);
-        }
-        daemon_stop(SIGINT);
-    }
-    else
-    {
-        if (perfmon_verbose)
-        {
-            fprintf(OUTSTREAM,"Executing: %s \n",bdata(exeString));
-            fflush(OUTSTREAM);
-        }
-
-        if (optReport)
-        {
-            //        multiplex_start();
-        }
-        else if (!optUseMarker && !optTimeline)
-        {
-            perfmon_setupEventSet(eventString, &counterMask);
-            perfmon_startCounters();
-        }
-        else
-        {
-            if (getenv("LIKWID_FILEPATH") == NULL)
-                setenv("LIKWID_FILEPATH",(char*) filepath->data, 1);
-            perfmon_setupEventSet(eventString, &counterMask);
-            char* modeStr = (char*) malloc(40 * sizeof(char));
-            sprintf(modeStr,"%d",accessClient_mode);
-            setenv("LIKWID_MODE", modeStr, 1);
-            bitMask_toString(modeStr,counterMask);
-            setenv("LIKWID_MASK", modeStr, 1);
-            free(modeStr);
-
-            perfmon_startCounters();
-        }
-
-        if (system(bdata(exeString)) == EOF)
-        {
-            fprintf(stderr, "Failed to execute %s!\n", bdata(exeString));
-            exit(EXIT_FAILURE);
-        }
-
-        if (optReport)
-        {
-            //        multiplex_stop();
-            //        perfmon_printReport(&set);
-        }
-        else
-        {
-            if (optUseMarker)
-            {
-                perfmon_stopCounters();
-                perfmon_printMarkerResults(filepath);
-            }
-            else
-            {
-                perfmon_stopCounters();
-                perfmon_printCounterResults();
-            }
-        }
-    }
-
-    bdestroy(filepath);
-    bdestroy(exeString);
-    perfmon_finalize();
-    fflush(OUTSTREAM);
-    fclose(OUTSTREAM);
-    /* call filterscript if specified */
-    if (!biseqcstr(filterScript,"NO"))
-    {
-        bcatcstr(filterScript, " perfctr");
-        if (system(bdata(filterScript)) == EOF)
-        {
-            fprintf(stderr, "Failed to execute filter %s!\n", bdata(filterScript));
-            exit(EXIT_FAILURE);
-        }
-    }
-
-    return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-perfctr.lua b/src/applications/likwid-perfctr.lua
new file mode 100644
index 0000000..f49ecc7
--- /dev/null
+++ b/src/applications/likwid-perfctr.lua
@@ -0,0 +1,775 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-perfctr.lua
+ *
+ *      Description:  An application to read out performance counter registers
+ *                    on x86 processors
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+local function version()
+    print(string.format("likwid-perfctr --  Version %d.%d",likwid.version,likwid.release))
+end
+
+local function examples()
+    print("Examples:")
+    print("Run command on CPU 2 and measure performance group TEST:")
+    print("likwid-perfctr -C 2 -g TEST ./a.out")
+end
+
+local function usage()
+    version()
+    print("A tool to read out performance counter registers on x86 processors\n")
+    print("Options:")
+    print("-h, --help\t\t Help message")
+    print("-v, --version\t\t Version information")
+    print("-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)")
+    print("-c <list>\t\t Processor ids to measure (required), e.g. 1,2-4,8")
+    print("-C <list>\t\t Processor ids to pin threads and measure, e.g. 1,2-4,8")
+    print("\t\t\t For information about the <list> syntax, see likwid-pin")
+    print("-g, --group <string>\t Performance group or custom event set string")
+    print("-H\t\t\t Get group help (together with -g switch)")
+    print("-s, --skip <hex>\t Bitmask with threads to skip")
+    print("-M <0|1>\t\t Set how MSR registers are accessed, 0=direct, 1=accessDaemon")
+    print("-a\t\t\t List available performance groups")
+    print("-e\t\t\t List available events and counter registers")
+    print("-E <string>\t\t List available events and corresponding counters that match <string>")
+    print("-i, --info\t\t Print CPU info")
+    print("-T <time>\t\t Switch eventsets with given frequency")
+    print("-f, --force\t\t Force overwrite of registers if they are in use")
+    print("Modes:")
+    print("-S <time>\t\t Stethoscope mode with duration in s, ms or us, e.g 20ms")
+    print("-t <time>\t\t Timeline mode with frequency in s, ms or us, e.g. 300ms")
+    print("-m, --marker\t\t Use Marker API inside code")
+    print("Output options:")
+    print("-o, --output <file>\t Store output to file. (Optional: Apply text filter according to filename suffix)")
+    print("-O\t\t\t Output easily parseable CSV instead of fancy tables")
+    print("--stats\t\t\t Always print statistics table")
+    print("\n")
+    examples()
+end
+
+
+local config = likwid.getConfiguration()
+verbose = 0
+print_groups = false
+print_events = false
+print_event = nil
+print_info = false
+cpulist = nil
+num_cpus = 0
+pin_cpus = false
+group_string = nil
+event_string = nil
+event_string_list = {}
+avail_groups = {}
+num_avail_groups = 0
+group_list = {}
+group_ids = {}
+activeGroup = 0
+print_group_help = false
+skip_mask = nil
+counter_mask = {}
+access_flags = "e"
+if config["daemonMode"] < 0 then
+    access_mode = 1
+else
+    access_mode = config["daemonMode"]
+    if access_mode == 0 then
+        access_flags = "rw"
+    end
+end
+set_access_modes = false
+use_marker = false
+use_stethoscope = false
+use_timeline = false
+daemon_run = 0
+use_wrapper = false
+duration = 2.E06
+switch_interval = 5
+output = ""
+use_csv = false
+print_stats = false
+execString = nil
+outfile = nil
+forceOverwrite = 0
+gotC = false
+markerFile = string.format("/tmp/likwid_%d.txt",likwid.getpid())
+print_stdout = print
+cpuClock = 1
+likwid.catchSignal()
+
+if #arg == 0 then
+    usage()
+    os.exit(0)
+end
+
+for opt,arg in likwid.getopt(arg, {"a", "c:", "C:", "e", "E:", "g:", "h", "H", "i", "m", "M:", "o:", "O", "P", "s:", "S:", "t:", "v", "V:", "T:", "f", "group:", "help", "info", "version", "verbose:", "output:", "skip:", "marker", "force", "stats"}) do -- parse options into the globals above
+    if (type(arg) == "string") then -- an option argument that itself begins with '-' is rejected
+        local s,e = arg:find("-");
+        if s == 1 then
+            print_stdout(string.format("Argument %s to option -%s starts with invalid character -.", arg, opt))
+            print_stdout("Did you forget an argument to an option?")
+            os.exit(1)
+        end
+    end
+    if opt == "h" or opt == "help" then
+        usage()
+        os.exit(0)
+    elseif opt == "v" or opt == "version" then
+        version()
+        os.exit(0)
+    elseif opt == "V" or opt == "verbose" then
+        verbose = tonumber(arg)
+        likwid.setVerbosity(verbose)
+    elseif (opt == "c") then
+        num_cpus, cpulist = likwid.cpustr_to_cpulist(arg)
+        gotC = true
+    elseif (opt == "C") then
+        num_cpus, cpulist = likwid.cpustr_to_cpulist(arg)
+        pin_cpus = true
+        gotC = true
+    elseif (opt == "a") then
+        print_groups = true
+    elseif (opt == "e") then
+        print_events = true
+    elseif (opt == "E") then
+        print_event = arg
+    elseif opt == "f" or opt == "force" then
+        forceOverwrite = 1
+    elseif opt == "g" or opt == "group" then
+        table.insert(event_string_list, arg) -- -g may be repeated; each occurrence adds one entry
+    elseif (opt == "H") then
+        print_group_help = true
+    elseif opt == "s" or opt == "skip" then
+        if arg:match("^0[xX]%x+$") then -- whole argument must be a prefixed hex literal (either letter case)
+            skip_mask = arg
+        else
+            if arg:match("^%x+$") then -- bare hex digits only, so an already prefixed value is never prefixed twice
+                print("Given skip mask looks like hex, sanitizing arg to 0x"..arg)
+                skip_mask = "0x"..arg
+            else
+                print("Skip mask must be given in hex")
+            end
+        end
+    elseif (opt == "M") then
+        access_mode = tonumber(arg)
+        set_access_modes = true
+        if access_mode == 0 then
+            access_flags = "rw"
+        else
+            access_flags = "e"
+        end
+        if (access_mode ~= 0 and access_mode ~= 1) then -- also catches non-numeric input (tonumber returns nil)
+            print_stdout("Access mode must be 0 for direct access and 1 for access daemon")
+            os.exit(1)
+        end
+    elseif opt == "i" or opt == "info" then
+        print_info = true
+        verbose = true
+    elseif opt == "m" or opt == "marker" then
+        use_marker = true
+        use_wrapper = true
+    elseif (opt == "S") then
+        use_stethoscope = true
+        duration = likwid.parse_time(arg)
+    elseif (opt == "t") then
+        use_timeline = true
+        duration = likwid.parse_time(arg)
+    elseif (opt == "T") then
+        duration = likwid.parse_time(arg)
+    elseif opt == "o" or opt == "output" then
+        local suffix = ""
+        if string.match(arg, "%.") then
+            suffix = string.match(arg, ".-[^\\/]-%.?([^%.\\/]*)$")
+        end
+        if suffix ~= "txt" then
+            use_csv = true
+        end
+        outfile = arg:gsub("%%h", likwid.gethostname()) -- expand %h, %p, %j and %r placeholders in the file name
+        outfile = outfile:gsub("%%p", likwid.getpid())
+        outfile = outfile:gsub("%%j", likwid.getjid())
+        outfile = outfile:gsub("%%r", likwid.getMPIrank())
+        io.output(outfile..".tmp") -- results go to a temporary file that is renamed or filtered at the end
+        print = function(...) for k,v in pairs({...}) do io.write(tostring(v) .. "\n") end end -- tostring: non-string values must not raise
+    elseif (opt == "O") then
+        use_csv = true
+    elseif (opt == "stats") then
+        print_stats = true
+    elseif opt == "?" then
+        print("Invalid commandline option -"..arg)
+        os.exit(1)
+    elseif opt == "!" then
+        print("Option requires an argument")
+        os.exit(1)
+    end
+end
+
+io.stdout:setvbuf("no")
+cpuinfo = likwid.getCpuInfo()
+cputopo = likwid.getCpuTopology()
+
+if not likwid.msr_available(access_flags) then
+    if access_mode == 1 then
+        print_stdout("MSR device files not available")
+        print_stdout("Please load msr kernel module before retrying")
+        os.exit(1)
+    else
+        print_stdout("MSR device files not readable and writeable")
+        print_stdout("Be sure that you have enough permissions to access the MSR files directly")
+        os.exit(1)
+    end
+end
+
+if num_cpus == 0 and
+   not gotC and
+   not print_events and
+   print_event == nil and
+   not print_groups and
+   not print_group_help and
+   not print_info then
+    print_stdout("Option -c <list> or -C <list> must be given on commandline")
+    usage()
+    os.exit(1)
+elseif num_cpus == 0 and
+       gotC and
+       not print_events and
+       print_event == nil and
+       not print_groups and
+       not print_group_help and
+       not print_info then
+    print_stdout("CPUs given on commandline are not valid in current environment, maybe it's limited by a cpuset.")
+    os.exit(1)
+end
+
+
+if num_cpus > 0 then
+    for i,cpu1 in pairs(cpulist) do
+        for j, cpu2 in pairs(cpulist) do
+            if i ~= j and cpu1 == cpu2 then
+                print_stdout("List of CPUs is not unique, got two times CPU " .. tostring(cpu1))
+                os.exit(1)
+            end
+        end
+    end
+end
+
+
+
+if print_events == true then
+    local tab = likwid.getEventsAndCounters()
+    print_stdout(string.format("This architecture has %d counters.", #tab["Counters"]))
+    local outstr = "Counters names: "
+    print_stdout("Counter tags(name, type<, options>):")
+    for _, counter in pairs(tab["Counters"]) do
+        outstr = string.format("%s, %s", counter["Name"], counter["TypeName"]);
+        if counter["Options"]:len() > 0 then
+            outstr = outstr .. string.format(", %s",counter["Options"])
+        end
+        print_stdout(outstr)
+    end
+    print_stdout("\n\n")
+    print_stdout(string.format("This architecture has %d events.",#tab["Events"]))
+    print_stdout("Event tags (tag, id, umask, counters<, options>):")
+    for _, eventTab in pairs(tab["Events"]) do
+        outstr = eventTab["Name"] .. ", "
+        outstr = outstr .. string.format("0x%X, 0x%X, ",eventTab["ID"],eventTab["UMask"])
+        outstr = outstr .. eventTab["Limit"]
+        if #eventTab["Options"] > 0 then
+            outstr = outstr .. string.format(", %s",eventTab["Options"])
+        end
+        print_stdout(outstr)
+    end
+    os.exit(0)
+end
+
+if print_event ~= nil then
+    function case_insensitive_pattern(pattern)
+        local p = pattern:gsub("(%%?)(.)", function(percent, letter)
+            if percent ~= "" or not letter:match("%a") then
+              return percent .. letter
+            else
+                return string.format("[%s%s]", letter:lower(), letter:upper())
+            end
+        end)
+        return p
+    end
+    local tab = likwid.getEventsAndCounters()
+    local events = {}
+    local counters = {}
+    local outstr = ""
+    for _, eventTab in pairs(tab["Events"]) do
+        if eventTab["Name"]:match(case_insensitive_pattern(print_event)) then
+            table.insert(events, eventTab)
+        end
+    end
+    for _, counter in pairs(tab["Counters"]) do
+        for _, event in pairs(events) do
+            if counter["Name"]:match(event["Limit"]) then
+                counters[counter["Name"]] = counter
+            end
+        end
+    end
+    print_stdout(string.format("Found %d event(s) with search key %s:", #events, print_event))
+    for _, eventTab in pairs(events) do
+        outstr = eventTab["Name"] .. ", "
+        outstr = outstr .. string.format("0x%X, 0x%X, ",eventTab["ID"],eventTab["UMask"])
+        outstr = outstr .. eventTab["Limit"]
+        if #eventTab["Options"] > 0 then
+            outstr = outstr .. string.format(", %s",eventTab["Options"])
+        end
+        print_stdout(outstr)
+    end
+    print_stdout("\nUsable counter(s) for above event(s):")
+    for i, counter in pairs(counters) do
+        outstr = string.format("%s, %s", counter["Name"], counter["TypeName"]);
+        if counter["Options"]:len() > 0 then
+            outstr = outstr .. string.format(", %s",counter["Options"])
+        end
+        print_stdout(outstr)
+    end
+    likwid.putTopology()
+    likwid.putConfiguration()
+    os.exit(0)
+end
+
+avail_groups = likwid.getGroups()
+if print_groups == true then
+    print_stdout(string.format("%11s\t%s","Group name", "Description"))
+    print_stdout(likwid.hline)
+    for i,g in pairs(avail_groups) do
+        print_stdout(string.format("%11s\t%s",g["Name"], g["Info"]))
+    end
+    likwid.putTopology()
+    likwid.putConfiguration()
+    os.exit(0)
+end
+
+if print_group_help == true then
+    if #event_string_list == 0 then
+        print_stdout("Group(s) must be given on commandline to get group help")
+        os.exit(1)
+    end
+    for i,event_string in pairs(event_string_list) do
+        local s,e = event_string:find(":")
+        if s ~= nil then
+            print_stdout("Given string is no group")
+            os.exit(1)
+        end
+        for i,g in pairs(avail_groups) do
+            if event_string == g["Name"] then
+                print_stdout(string.format("Group %s:",g["Name"]))
+                print_stdout(g["Long"])
+            end
+        end
+    end
+    likwid.putTopology()
+    likwid.putConfiguration()
+    os.exit(0)
+end
+
+if #event_string_list == 0 and not print_info then
+    print_stdout("Option(s) -g <string> must be given on commandline")
+    usage()
+    likwid.putTopology()
+    likwid.putConfiguration()
+    os.exit(1)
+end
+
+if (cpuinfo["clock"] > 0) then
+    cpuClock = cpuinfo["clock"]
+else
+    cpuClock = likwid.getCpuClock()
+end
+
+if outfile == nil then
+    print_stdout(likwid.hline)
+    print_stdout(string.format("CPU name:\t%s",cpuinfo["osname"]))
+    print_stdout(string.format("CPU type:\t%s",cpuinfo["name"]))
+    print_stdout(string.format("CPU clock:\t%3.2f GHz",cpuClock * 1.E-09))
+end
+
+if print_info or verbose > 0 then
+    print_stdout(string.format("CPU family:\t%u", cpuinfo["family"]))
+    print_stdout(string.format("CPU model:\t%u", cpuinfo["model"]))
+    print_stdout(string.format("CPU short:\t%s", cpuinfo["short_name"]))
+    print_stdout(string.format("CPU stepping:\t%u", cpuinfo["stepping"]))
+    print_stdout(string.format("CPU features:\t%s", cpuinfo["features"]))
+    P6_FAMILY = 6
+    if cpuinfo["family"] == P6_FAMILY and cpuinfo["perf_version"] > 0 then
+        print_stdout(likwid.hline)
+        print_stdout(string.format("PERFMON version:\t%u",cpuinfo["perf_version"]))
+        print_stdout(string.format("PERFMON number of counters:\t%u",cpuinfo["perf_num_ctr"]))
+        print_stdout(string.format("PERFMON width of counters:\t%u",cpuinfo["perf_width_ctr"]))
+        print_stdout(string.format("PERFMON number of fixed counters:\t%u",cpuinfo["perf_num_fixed_ctr"]))
+    end
+    print_stdout(likwid.hline)
+    if print_info then
+        likwid.printSupportedCPUs()
+        likwid.putTopology()
+        likwid.putConfiguration()
+        os.exit(0)
+    end
+end
+
+if use_stethoscope == false and use_timeline == false and use_marker == false then
+    use_wrapper = true
+end
+
+if use_wrapper and likwid.tablelength(arg)-2 == 0 and print_info == false then
+    print_stdout("No Executable can be found on commandline")
+    usage()
+    likwid.putTopology()
+    likwid.putConfiguration()
+    os.exit(0)
+end
+
+if use_marker then
+    if likwid.access(markerFile, "rw") ~= -1 then
+        print_stdout(string.format("ERROR: MarkerAPI file %s not accessible. Maybe a remaining file of another user.", markerFile))
+        print_stdout("Please purge all MarkerAPI files from /tmp.")
+        os.exit(1)
+    end
+    if not pin_cpus then
+        print_stdout("Warning: The Marker API requires the application to run on the selected CPUs.")
+        print_stdout("Warning: likwid-perfctr pins the application only when using the -C command line option.")
+        print_stdout("Warning: LIKWID assumes that the application does it before the first instrumented code region is started.")
+        print_stdout("Warning: You can use the string in the environment variable LIKWID_THREADS to pin you application to")
+        print_stdout("Warning: to the CPUs specified after the -c command line option.")
+    end
+end
+
+if verbose == 0 then
+    likwid.setenv("LIKWID_SILENT","true")
+end
+
+if pin_cpus then
+    local omp_threads = os.getenv("OMP_NUM_THREADS")
+    if omp_threads == nil then
+        likwid.setenv("OMP_NUM_THREADS",tostring(math.tointeger(num_cpus)))
+    elseif num_cpus > tonumber(omp_threads) then
+        print_stdout(string.format("Environment variable OMP_NUM_THREADS already set to %s but %d cpus required", omp_threads,num_cpus))
+    end
+    if os.getenv("CILK_NWORKERS") == nil then
+        likwid.setenv("CILK_NWORKERS", tostring(math.tointeger(num_cpus)))
+    end
+    if skip_mask then
+        likwid.setenv("LIKWID_SKIP",skip_mask)
+    end
+    likwid.setenv("KMP_AFFINITY","disabled")
+
+    if num_cpus > 1 then
+        local pinString = tostring(math.tointeger(cpulist[2]))
+        for i=3,likwid.tablelength(cpulist) do
+            pinString = pinString .. "," .. tostring(math.tointeger(cpulist[i]))
+        end
+        pinString = pinString .. "," .. tostring(math.tointeger(cpulist[1]))
+        likwid.setenv("LIKWID_PIN", pinString)
+
+        local preload = os.getenv("LD_PRELOAD")
+        if preload == nil then
+            likwid.setenv("LD_PRELOAD",likwid.pinlibpath)
+        else
+            likwid.setenv("LD_PRELOAD",likwid.pinlibpath .. ":" .. preload)
+        end
+    elseif num_cpus == 1 then
+        likwid.setenv("LIKWID_PIN", tostring(math.tointeger(cpulist[1])))
+        if verbose > 0 then
+            likwid.pinProcess(cpulist[1], 0)
+        else
+            likwid.pinProcess(cpulist[1], 1)
+        end
+    end
+end
+
+
+
+--[[for i, event_string in pairs(event_string_list) do
+    local groupdata = likwid.get_groupdata(event_string)
+    if groupdata == nil then
+        print_stdout("Cannot read event string, it's neither a performance group nor a proper event string <event>:<counter>:<options>,...")
+        usage()
+        likwid.putTopology()
+        likwid.putConfiguration()
+        os.exit(1)
+    end
+    table.insert(group_list, groupdata)
+    event_string_list[i] = groupdata["EventString"]
+end]]
+
+
+if set_access_modes then
+    if likwid.setAccessClientMode(access_mode) ~= 0 then
+        likwid.putTopology()
+        likwid.putConfiguration()
+        os.exit(1)
+    end
+end
+if likwid.init(num_cpus, cpulist) < 0 then
+    likwid.putTopology()
+    likwid.putConfiguration()
+    os.exit(1)
+end
+
+likwid.setenv("LIKWID_FORCE", tostring(forceOverwrite))
+for i, event_string in pairs(event_string_list) do
+    if event_string:len() > 0 then
+        local gid = likwid.addEventSet(event_string)
+        if gid < 0 then
+            likwid.putTopology()
+            likwid.putConfiguration()
+            likwid.finalize()
+            os.exit(1)
+        end
+        table.insert(group_ids, gid)
+    end
+end
+if #group_ids == 0 then
+    print("ERROR: No valid eventset given on commandline. Exiting...")
+    likwid.putTopology()
+    likwid.putConfiguration()
+    likwid.finalize()
+    os.exit(1)
+end
+
+activeGroup = group_ids[1]
+likwid.setupCounters(activeGroup)
+if outfile == nil then
+    print_stdout(likwid.hline)
+end
+
+if use_marker == true then
+    likwid.setenv("LIKWID_FILEPATH", markerFile)
+    likwid.setenv("LIKWID_MODE", tostring(access_mode))
+    likwid.setenv("LIKWID_DEBUG", tostring(verbose))
+    local str = table.concat(event_string_list, "|")
+    likwid.setenv("LIKWID_EVENTS", str)
+    likwid.setenv("LIKWID_THREADS", table.concat(cpulist,","))
+    likwid.setenv("LIKWID_FORCE", "-1")
+end
+
+execString = table.concat(arg," ",1, likwid.tablelength(arg)-2) -- joined arg entries; later passed to startProgram/os.execute
+if verbose > 0 then -- verbose is the numeric -V level here; the old comparison with true never matched
+    print_stdout(string.format("Executing: %s",execString))
+end
+local ldpath = os.getenv("LD_LIBRARY_PATH")
+local libpath = likwid.pinlibpath:match("([/%g]+)/%g+.so")
+if ldpath == nil then
+    likwid.setenv("LD_LIBRARY_PATH", libpath)
+elseif not ldpath:match(libpath) then
+    likwid.setenv("LD_LIBRARY_PATH", libpath..":"..ldpath)
+end
+
+
+if use_timeline == true then
+    local cores_string = "CORES: "
+    for i, cpu in pairs(cpulist) do
+        cores_string = cores_string .. tostring(cpu) .. "|"
+    end
+    io.stderr:write("# "..cores_string:sub(1,cores_string:len()-1).."\n")
+    for gid, group in pairs(group_list) do
+        local strlist = {}
+        if group["Metrics"] == nil then
+            for i,e in pairs(group["Events"]) do
+                table.insert(strlist, e["Event"])
+            end
+        else
+            for i,e in pairs(group["Metrics"]) do
+                table.insert(strlist, e["description"])
+            end
+        end
+        io.stderr:write("# "..table.concat(strlist, "|").."\n")
+    end
+end
+
+
+
+io.stdout:flush()
+local groupTime = {}
+if use_wrapper or use_timeline then
+    local start = likwid.startClock()
+    local stop = 0
+    local alltime = 0
+    local nr_events = likwid.getNumberOfEvents(activeGroup)
+    local nr_threads = likwid.getNumberOfThreads()
+    local firstrun = true
+    
+    if use_wrapper and #group_ids == 1 then
+        duration = 30.E06
+    end
+
+    local ret = likwid.startCounters()
+    if ret < 0 then
+        print_stdout(string.format("Error starting counters for cpu %d.",cpulist[ret * (-1)]))
+        os.exit(1)
+    end
+
+    local pid = nil
+    if pin_cpus then
+        pid = likwid.startProgram(execString, #cpulist, cpulist)
+    else
+        pid = likwid.startProgram(execString, 0, cpulist)
+    end
+
+    if not pid then
+        print_stdout("Failed to execute command: ".. execString)
+    end
+    start = likwid.startClock()
+    groupTime[activeGroup] = 0
+    while true do
+        if likwid.getSignalState() ~= 0 then
+            likwid.killProgram()
+            break
+        end
+        local remain = likwid.sleep(duration)
+        if remain > 0 or not likwid.checkProgram(pid) then
+            io.stdout:flush()
+            break
+        end
+        if use_timeline == true then
+            stop = likwid.stopClock()
+            likwid.stopCounters()
+            
+            local time = likwid.getClock(start, stop)
+            if likwid.getNumberOfMetrics(activeGroup) == 0 then
+                results = likwid.getLastResults()
+            else
+                results = likwid.getLastMetrics()
+            end
+            str = tostring(math.tointeger(activeGroup)) .. " "..tostring(#results[activeGroup]).." "..tostring(#cpulist).." "..tostring(time)
+            for i,l1 in pairs(results[activeGroup]) do
+                for j, value in pairs(l1) do
+                    str = str .. " " .. tostring(value)
+                end
+            end
+            io.stderr:write(str.."\n")
+            groupTime[activeGroup] = time
+            likwid.startCounters()
+        else
+            likwid.readCounters()
+        end
+        if #group_ids > 1 then
+            likwid.switchGroup(activeGroup + 1)
+            activeGroup = likwid.getIdOfActiveGroup()
+            if groupTime[activeGroup] == nil then
+                groupTime[activeGroup] = 0
+            end
+            nr_events = likwid.getNumberOfEvents(activeGroup)
+        end
+        
+    end
+    stop = likwid.stopClock()
+elseif use_stethoscope then
+    local ret = likwid.startCounters()
+    if ret < 0 then
+        print_stdout(string.format("Error starting counters for cpu %d.",cpulist[ret * (-1)]))
+        os.exit(1)
+    end
+    likwid.sleep(duration)
+elseif use_marker then
+    local ret = likwid.startCounters()
+    if ret < 0 then
+        print_stdout(string.format("Error starting counters for cpu %d.",cpulist[ret * (-1)]))
+        os.exit(1)
+    end
+    local ret = os.execute(execString)
+    if ret == nil then
+        print_stdout("Failed to execute command: ".. execString)
+    end
+end
+
+local ret = likwid.stopCounters()
+if ret < 0 then
+    print_stdout(string.format("Error stopping counters for thread %d.",ret * (-1)))
+    likwid.finalize()
+    likwid.putTopology()
+    likwid.putConfiguration()
+    os.exit(1)
+end
+io.stdout:flush()
+if outfile == nil then
+    print_stdout(likwid.hline)
+end
+
+
+if use_marker == true then
+    results, metrics = likwid.getMarkerResults(markerFile, cpulist)
+    if #results == 0 then
+        print_stdout("No regions could be found in Marker API result file")
+    else
+        for r=1, #results do
+            likwid.printOutput(results[r], metrics[r], cpulist, r, print_stats)
+        end
+    end
+    os.remove(markerFile)
+elseif use_timeline == false then
+    results = likwid.getResults()
+    metrics = likwid.getMetrics()
+    likwid.printOutput(results, metrics, cpulist, nil, print_stats)
+end
+
+if outfile then -- results were written to outfile..".tmp"; move or filter them into place
+    local suffix = ""
+    if string.match(outfile,"%.") then
+        suffix = string.match(outfile, ".-[^\\/]-%.?([^%.\\/]*)$")
+    end
+    local command = "<INSTALLED_PREFIX>/share/likwid/filter/" .. suffix -- filter script is selected by file suffix
+    local tmpfile = outfile..".tmp"
+    if suffix == "" then
+        os.rename(tmpfile, outfile)
+    elseif suffix ~= "txt" and suffix ~= "csv" and likwid.access(command, "x") == -1 then -- access yields a number (compared with -1 elsewhere); every number is truthy in Lua
+        print_stdout("Cannot find filter script, save output in CSV format to file "..outfile)
+        os.rename(tmpfile, outfile)
+    else
+        if suffix ~= "txt" and suffix ~= "csv" then
+            command = command .." ".. tmpfile .. " perfctr"
+            local f = assert(io.popen(command))
+            if f ~= nil then
+                local o = f:read("*a") -- any output from the filter script is reported as a failure
+                if o:len() > 0 then
+                    print_stdout(string.format("Failed to executed filter script %s.",command))
+                end
+            else
+                print_stdout("Failed to call filter script, save output in CSV format to file "..outfile)
+                os.rename(tmpfile, outfile)
+                os.remove(tmpfile)
+            end
+        else
+            os.rename(tmpfile, outfile)
+            os.remove(tmpfile)
+        end
+    end
+end
+
+likwid.finalize()
+likwid.putTopology()
+likwid.putNumaInfo()
+likwid.putConfiguration()
+os.exit(0)
diff --git a/src/applications/likwid-perfscope.lua b/src/applications/likwid-perfscope.lua
new file mode 100644
index 0000000..c1165a7
--- /dev/null
+++ b/src/applications/likwid-perfscope.lua
@@ -0,0 +1,560 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-perfscope.lua
+ *
+ *      Description:  An application to use the timeline mode of likwid-perfctr to generate
+ *                    realtime plots using feedGnuplot
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+PERFCTR="<INSTALLED_BINPREFIX>/likwid-perfctr"
+FEEDGNUPLOT="<INSTALLED_BINPREFIX>/feedGnuplot"
+
+local predefined_plots = {
+    FLOPS_DP = {
+        perfgroup = "FLOPS_DP",
+        ymetricmatch = "MFlops/s",
+        title = "Double Precision Flop Rate",
+        ytitle = "MFlops/s",
+        y2title = nil,
+        xtitle = "Time"
+    },
+    FLOPS_SP = {
+        perfgroup = "FLOPS_SP",
+        ymetricmatch = "MFlops/s",
+        title = "Single Precision Flop Rate",
+        ytitle = "MFlops/s",
+        y2title = nil,
+        xtitle = "Time"
+    },
+    L2 = {
+        perfgroup = "L2",
+        ymetricmatch = "L2D load bandwidth [MBytes/s]",
+        title = "L2 cache bandwidth",
+        ytitle = "Load Bandwidth [MBytes/s]",
+        y2metricmatch = "L2D evict bandwidth [MBytes/s]",
+        y2title = "Evict Bandwidth [MBytes/s]",
+        xtitle = "Time"
+    },
+    L3 = {
+        perfgroup = "L3",
+        ymetricmatch = "L3 load bandwidth [MBytes/s]",
+        title = "L3 cache bandwidth",
+        ytitle = "Load Bandwidth [MBytes/s]",
+        y2title = "Evict Bandwidth [MBytes/s]",
+        y2metricmatch = "L3 evict bandwidth [MBytes/s]",
+        xtitle = "Time"
+    },
+    MEM = {
+        perfgroup = "MEM",
+        ymetricmatch = "Memory bandwidth [MBytes/s]",
+        title = "Memory bandwidth",
+        ytitle = "Bandwidth [MBytes/s]",
+        y2title = nil,
+        xtitle = "Time"
+    },
+    QPI = {
+        perfgroup = "QPI",
+        ymetricmatch = "QPI data bandwidth [MByte/s]",
+        title = "QPI bandwidth",
+        ytitle = "Bandwidth [MBytes/s]",
+        y2title = nil,
+        xtitle = "Time",
+        y2metricmatch = "QPI link bandwidth [MByte/s]"
+    },
+    ENERGY = {
+        perfgroup = "ENERGY",
+        ymetricmatch = "Power [W]",
+        title = "Consumed energy",
+        ytitle = "Power [W]",
+        y2title = "Power DRAM [W]",
+        y2metricmatch = "Power DRAM [W]",
+        xtitle = "Time"
+    },
+    TEMP = {
+        perfgroup = "ENERGY",
+        ymetricmatch = "Temperature [C]",
+        title = "Temperature",
+        ytitle = "Temperature [C]",
+        y2title = nil,
+        xtitle = "Time"
+    },
+    NUMA = {
+        perfgroup = "NUMA",
+        ymetricmatch = "Local DRAM bandwidth [MByte/s]",
+        title = "NUMA separated memory bandwidth",
+        ytitle = "Bandwidth [MBytes/s]",
+        y2metricmatch = "Remote DRAM bandwidth [MByte/s]",
+        y2title = nil,
+        xtitle = "Time"
+    },
+}
+
+local function version()
+    print(string.format("likwid-perfscope --  Version %d.%d",likwid.version,likwid.release))
+end
+
+local function examples() -- print one sample invocation below the option list
+    print("Examples:")
+    print("Run command on CPU 2 and measure performance group TEST:")
+    print("likwid-perfscope -C 2 -g TEST -t 1s ./a.out") -- -t sets the interval; -f is --force and takes no value
+end
+
+local function usage() -- print the version banner, the option summary and the examples
+    version()
+    print("A tool to generate pictures on-the-fly from likwid-perfctr measurements\n\nOptions:")
+    print("-h, --help\t\t Help message")
+    print("-v, --version\t\t Version information")
+    print("-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)")
+    print("-a\t\t\t Print all preconfigured plot configurations for the current system.")
+    print("-c <list>\t\t Processor ids to measure, e.g. 1,2-4,8")
+    print("-C <list>\t\t Processor ids to pin threads and measure, e.g. 1,2-4,8")
+    print("-g, --group <string>\t Preconfigured plot group or custom event set string with plot config. See man page for information.")
+    print("-t, --time <time>\t Frequency in s, ms or us, e.g. 300ms, for the timeline mode of likwid-perfctr")
+    print("-r, --range <num>\t Width of the moving x-axis window (feedGnuplot --xlen). Default: plot everything starting at 0") -- option is parsed by the getopt loop but was missing here
+    print("-f, --force\t\t Overwrite counter configuration although already in use")
+    print("-d, --dump\t\t Print output as it is sent to feedGnuplot.")
+    print("-p, --plotdump\t\t Use dump functionality of feedGnuplot. Plots out plot configurations plus data to directly submit to gnuplot")
+    print("--host <host>\t\t Run likwid-perfctr on the selected host using SSH. Evaluation and plotting is done locally.")
+    print("\t\t\t This can be used for machines that have no gnuplot installed. All paths must be similar to the local machine.")
+    print("\n")
+    examples()
+end
+
+local function test_gnuplot() -- returns true only if `which` prints a path for gnuplot
+    -- io.popen hands back a handle even when the command finds nothing, so a
+    -- non-nil handle proves nothing; look at what `which` wrote to stdout.
+    local f = io.popen("which gnuplot 2>/dev/null")
+    if f == nil then return false end
+    local path = f:read("*l")
+    io.close(f)
+    return path ~= nil and path ~= ""
+end
+
+local eventStrings = {}
+local terminal = "x11"
+local num_cpus = 0
+local cpulist = {}
+local matchstring = nil
+local group_list = {}
+local timeline = "1s"
+local print_configs = false
+local pinning = false
+local dump = false
+local plotdump = false
+local nrgroups, allgroups = likwid.get_groups()
+local mfreq = 1.0
+local plotrange = 0
+local host = nil
+local force = false
+
+if #arg == 0 then
+    usage()
+    os.exit(0)
+end
+
+for opt,arg in likwid.getopt(arg, {"h","v","g:","C:","c:","t:","r:","a","d","p","f","help", "version","group:","time:","dump","range:","plotdump","all", "host:", "force"}) do
+    if opt == "h" or opt == "help" then
+        usage()
+        os.exit(0)
+    elseif opt == "v" or opt == "version" then
+        version()
+        os.exit(0)
+    elseif opt == "g" or opt == "group" then
+        table.insert(eventStrings, arg)
+    elseif (opt == "c") then
+        num_cpus, cpulist = likwid.cpustr_to_cpulist(arg)
+    elseif (opt == "C") then
+        num_cpus, cpulist = likwid.cpustr_to_cpulist(arg)
+        pinning = true
+    elseif opt == "t" or opt == "time" then
+        timeline = arg
+        mfreq = likwid.parse_time(timeline) * 1.E-6
+    elseif opt == "d" or opt == "dump" then
+        dump = true
+    elseif opt == "p" or opt == "plotdump" then
+        plotdump = true
+    elseif opt == "r" or opt == "range" then
+        plotrange = tonumber(arg)
+    elseif opt == "a" or opt == "all" then
+        print_configs = true
+    elseif opt == "host" then
+        host = arg
+    elseif opt == "f" or opt == "force" then
+        force = true
+    elseif opt == "?" then
+        print("Invalid commandline option -"..arg)
+        os.exit(1)
+    end
+end
+
+if print_configs then
+    local num_groups, all_groups = likwid.get_groups()
+    for name, config in pairs(predefined_plots) do
+        for i,g in pairs(all_groups) do
+            if g == config["perfgroup"] then
+                print("Group "..name)
+                print("\tPerfctr group: "..config["perfgroup"])
+                print("\tMatch for metric: "..config["ymetricmatch"])
+                print("\tTitle of plot: "..config["title"])
+                print("\tTitle of x-axis: "..config["xtitle"])
+                print("\tTitle of y-axis: "..config["ytitle"])
+                if config["y2metricmatch"] then
+                    print("\tMatch for second metric: "..config["y2metricmatch"])
+                end
+                if config["y2title"] then
+                    print("\tTitle of y2-axis: "..config["y2title"])
+                elseif config["y2metricmatch"] then
+                    print("\tTitle of y2-axis: "..config["ytitle"])
+                end
+                print("")
+                break
+            end
+        end
+    end
+    os.exit(0)
+end
+
+if not test_gnuplot() then
+    print("GnuPlot not available")
+    os.exit(1)
+end
+
+if num_cpus == 0 then
+    print("ERROR: CPU string must be given")
+    os.exit(1)
+end
+
+if #arg == 0 then
+    print("ERROR: Executable must be given on commandline")
+    os.exit(1)
+end
+
+for i, event_def in pairs(eventStrings) do
+    local eventlist = likwid.stringsplit(event_def,",")
+
+    event_string = nil
+    plotgroup = nil
+    plotgroupconfig = nil
+    plotdefgroup = false
+    for j, preconf in pairs(predefined_plots) do
+        if eventlist[1] == j then
+            for j,g in pairs(allgroups) do
+                if g == preconf["perfgroup"] then
+                    event_string = preconf["perfgroup"]
+                    plotdefgroup = true
+                    plotgroupconfig = preconf
+                    plotgroup = j
+                    break;
+                end
+            end
+            break;
+        end
+    end
+    if #eventlist > 1 then
+        outopts = eventlist[#eventlist]
+        table.remove(eventlist, #eventlist)
+    end
+    if event_string == nil then
+        if plotdefgroup == false then
+            event_string = table.concat(eventlist,",")
+        end
+    end
+
+    local groupdata = nil
+    groupdata = likwid.get_groupdata(event_string)
+    if groupdata == nil then
+        print("Cannot read event string, it's neither a performance group nor a proper event string <event>:<counter>:<options>,...")
+        usage()
+        os.exit(1)
+    end
+    if group_list[i] == nil then
+        group_list[i] = {}
+    end
+    group_list[i]["gdata"] = groupdata
+
+    formulalist = nil
+    local title = nil
+    local ytitle = nil
+    local y2title = nil
+    local y2funcindex = nil
+    local xtitle = nil
+    local output = nil
+    if plotgroup ~= nil then
+        title = plotgroupconfig["title"]
+        ytitle = plotgroupconfig["ytitle"]
+        xtitle = plotgroupconfig["xtitle"]
+        if plotgroupconfig["y2title"] ~= nil then
+            y2title = plotgroupconfig["y2title"]
+        elseif plotgroupconfig["y2metricmatch"] ~= nil then
+            y2title = plotgroupconfig["ytitle"]
+        end
+        for i,mconfig in pairs(groupdata["Metrics"]) do
+            local mmatch = "%a*"..plotgroupconfig["ymetricmatch"]:gsub("%[","%%["):gsub("%]","%%]").."%a*"
+            if mconfig["description"]:match(mmatch) then
+                formulalist = {{name=mconfig["description"], index=i}}
+            end
+            if plotgroupconfig["y2metricmatch"] ~= nil then
+                mmatch = "%a*"..plotgroupconfig["y2metricmatch"]:gsub("%[","%%["):gsub("%]","%%]").."%a*"
+                if mconfig["description"]:match(mmatch) then
+                    table.insert(formulalist, {name=mconfig["description"], index=i})
+                end
+            end
+        end
+    end
+
+    --[[for j,estr in pairs(likwid.stringsplit(outopts, ":")) do
+        if estr:match("^title=([%g%s]+)") then
+            title = estr:match("^title=([%g%s]+)")
+        elseif estr:match("^TITLE=([%g%s]+)") then
+            title = estr:match("^TITLE=([%g%s]+)")
+        elseif estr:match("ytitle=([%g%s]+)") then
+            ytitle = estr:match("ytitle=([%g%s]+)")
+        elseif estr:match("YTITLE=([%g%s]+)")then
+            ytitle = estr:match("YTITLE=([%g%s]+)")
+        elseif estr:match("y2title=(%d+)-([%g%s]+)") then
+            y2funcindex, y2title = estr:match("y2title=(%d+)-([%g%s]+)")
+        elseif estr:match("Y2TITLE=(%d+)-([%g%s]+)") then
+            y2funcindex, y2title = estr:match("Y2TITLE=(%d+)-([%g%s]+)")
+        elseif estr:match("y2title=([%g%s]+)") then
+            y2title = estr:match("y2title=([%g%s]+)")
+        elseif estr:match("Y2TITLE=([%g%s]+)") then
+            y2title = estr:match("Y2TITLE=([%g%s]+)")
+        elseif estr:match("xtitle=([%g%s]+)") then
+            xtitle = estr:match("xtitle=([%g%s]+)")
+        elseif estr:match("XTITLE=([%g%s]+)")then
+            xtitle = estr:match("XTITLE=([%g%s]+)")
+        elseif estr:match("[%g%s]+=[%g]+") then
+            fname, form = estr:match("([%g%s]+)=([%g]+)")
+            if formulalist == nil then
+                formulalist = {}
+            end
+            if groupdata["Metrics"] ~= nil then
+                for i,mconfig in pairs(groupdata["Metrics"]) do
+                    if mconfig["description"]:match(fname) then
+                        table.insert(formulalist, {name=fname, index=i})
+                        break
+                    end
+                end
+            else
+                table.insert(formulalist, {name=fname, formula=form})
+            end
+        end
+    end]]
+
+    group_list[i]["eventstring"] = event_string
+    group_list[i]["counterlist"] = {}
+    for k=1,#groupdata["Events"] do
+        table.insert(group_list[i]["counterlist"], groupdata["Events"][k]["Counter"])
+    end
+    if title then
+        group_list[i]["title"] = title
+    end
+    if ytitle then
+        group_list[i]["ytitle"] = ytitle
+    end
+    if y2title then
+        group_list[i]["y2title"] = y2title
+    end
+    if y2funcindex then
+        group_list[i]["y2funcindex"] = y2funcindex - 1
+    else
+        if formulalist ~= nil then
+            group_list[i]["y2funcindex"] = #formulalist - 1
+        end
+    end
+    if xtitle then
+        group_list[i]["xtitle"] = xtitle
+    end
+    if formulalist ~= nil then
+        group_list[i]["formulas"] = formulalist
+    else
+        group_list[i]["formulas"] = {}
+    end
+end
+
+cmd = ""
+if host ~= nil then
+    cmd = cmd .. "ssh "..host.. " \"/bin/bash -c \\\" "
+end
+cmd = cmd .. " " ..PERFCTR
+if pinning then
+    cmd = cmd .. string.format(" -C %s",table.concat(cpulist,","))
+else
+    cmd = cmd .. string.format(" -c %s",table.concat(cpulist,","))
+end
+if force then
+    cmd = cmd .. " -f"
+end
+cmd = cmd .. string.format(" -t %s", timeline)
+
+for i, group in pairs(group_list) do
+    cmd = cmd .. " -g "..group["eventstring"]
+end
+cmd = cmd .. " ".. table.concat(arg, " ")
+-- since io.popen can only read stdout we swap stdout and stderr
+-- application output is written to stderr, we catch stdout
+cmd = cmd .. " 3>&1 1>&2 2>&3 3>&-"
+if host ~= nil then
+    cmd = cmd .. " \\\" \" "
+end
+perfctr = assert (io.popen (cmd))
+
+
+for i, group in pairs(group_list) do
+    gnucmd = string.format("%s --stream %f --with linespoints --domain --nodataid", FEEDGNUPLOT, mfreq/#group_list)
+    if plotrange > 0 then
+        gnucmd = gnucmd .. string.format(" --xlen %d", plotrange)
+    else
+        gnucmd = gnucmd .. " --xmin 0"
+    end
+    if group["title"] ~= nil then
+        if #group_list > 1 then
+            gnucmd = gnucmd .. string.format(" --title %q", "Group "..i..": "..group["title"])
+        else
+            gnucmd = gnucmd .. string.format(" --title %q", group["title"])
+        end
+    end
+    if group["xtitle"] ~= nil then
+        gnucmd = gnucmd .. string.format(" --xlabel %q", group["xtitle"])
+    else
+        gnucmd = gnucmd .. string.format(" --xlabel %q", "Time")
+    end
+    if group["ytitle"] ~= nil then
+        gnucmd = gnucmd .. string.format(" --ylabel %q", group["ytitle"])
+    end
+    if group["y2title"] ~= nil then
+        gnucmd = gnucmd .. string.format(" --y2 %d --y2label %q", group["y2funcindex"], group["y2title"])
+    end
+    if group["formulas"] then
+        if #cpulist == 1 then
+            for f, fdesc in pairs(group["formulas"]) do
+                gnucmd = gnucmd .. string.format(" --legend %d %q", f-1, fdesc["name"])
+            end
+        else
+            local curveID = 0
+            for c,cpu in pairs(cpulist) do
+                for f, fdesc in pairs(group["formulas"]) do
+                    gnucmd = gnucmd .. string.format(" --legend %d %q", curveID, "C"..cpu..": "..fdesc["name"])
+                    curveID = curveID + 1
+                end
+            end
+        end
+    end
+    gnucmd = gnucmd .. " --set 'key outside bmargin bottom'"
+    if plotdump then
+        gnucmd = gnucmd .. " --dump"
+    else
+        gnucmd = gnucmd .. " 1>/dev/null 2>&1"
+    end
+    group_list[i]["output"] = assert(io.popen(gnucmd,"w"))
+end
+
+
+likwid.catchSignal()
+local mtime = {}
+for i,g in pairs(group_list) do
+    local str = "0 "
+    for k,t in pairs(cpulist) do
+        for j,c in pairs(g["formulas"]) do
+            str = str .."0 "
+        end
+    end
+    mtime[i] = nil
+    g["output"]:write(str.."\n")
+    g["output"]:flush()
+    if dump then
+        print(tostring(i).." ".. str)
+    end
+end
+
+
+olddata = {}
+oldmetric = {}
+local perfctr_exited = false
+local oldtime = 0
+local clock = likwid.getCpuClock()
+while true do
+    local l = perfctr:read("*line")
+    if l == nil or l:match("^%s*$") then
+        break
+    end
+    if l:match("^%d+ %d+ %d+ [%d.]+ %d+") then
+        local data = {}
+        local diff = {}
+        linelist = likwid.stringsplit(l, " ")
+        group = tonumber(linelist[1])
+        nr_events = tonumber(linelist[2])
+        nr_threads = tonumber(linelist[3])
+        time = tonumber(linelist[4])
+        table.remove(linelist, 1)
+        table.remove(linelist, 1)
+        table.remove(linelist, 1)
+        table.remove(linelist, 1)
+
+        for i=1,nr_events do
+            if data[i] == nil then data[i] = {} end
+            for j=1,nr_threads do
+                data[i][j] = tonumber(linelist[1])
+                table.remove(linelist, 1)
+            end
+        end
+
+        str = tostring(time)
+        for f, flist in pairs(group_list[group]["formulas"]) do
+            if flist["index"] ~= nil then
+                for i=1,nr_threads do
+                    str = str .." ".. data[flist["index"]][i]
+                end
+            end
+        end
+        
+        group_list[group]["output"]:write(str.."\n")
+        group_list[group]["output"]:flush()
+        if dump then
+            print(tostring(group).." ".. str)
+        end
+        oldtime = time
+    end
+end
+
+if perfctr_exited == false then
+    while likwid.getSignalState() == 0 do
+        likwid.sleep(1E6)
+    end
+end
+for i, group in pairs(group_list) do
+    group["output"]:write("exit\n")
+    io.close(group["output"])
+end
+io.close(perfctr)
+
+
+
diff --git a/src/applications/likwid-pin.c b/src/applications/likwid-pin.c
deleted file mode 100644
index 3d9e85b..0000000
--- a/src/applications/likwid-pin.c
+++ /dev/null
@@ -1,346 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  likwid-pin.c
- *
- *      Description:  An application to pin a program including threads
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-/* #####   HEADER FILE INCLUDES   ######################################### */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <error.h>
-#include <types.h>
-#include <bstrlib.h>
-#include <cpuid.h>
-#include <affinity.h>
-#include <numa.h>
-#include <memsweep.h>
-#include <strUtil.h>
-
-#ifdef COLOR
-#include <textcolor.h>
-#endif
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-#define HELP_MSG \
-    fprintf(stdout, "likwid-pin --  Version %d.%d \n\n",VERSION,RELEASE); \
-    fprintf(stdout, "\n"); \
-    fprintf(stdout, "Supported Options:\n"); \
-    fprintf(stdout, "-h\t Help message\n"); \
-    fprintf(stdout, "-v\t Version information\n"); \
-    fprintf(stdout, "-i\t Set numa interleave policy with all involved numa nodes\n"); \
-    fprintf(stdout, "-S\t Sweep memory in involved numa nodes\n"); \
-    fprintf(stdout, "-c\t comma separated list of processor ids or expression\n"); \
-    fprintf(stdout, "-s\t bitmask with threads to skip\n"); \
-    fprintf(stdout, "-p\t Print available domains with mapping on physical ids\n"); \
-    fprintf(stdout, "  \t If used together with -c option outputs a physical processor ids.\n"); \
-    fprintf(stdout, "-d\t Delimiter used for using -p to output physical processor list, default is comma.\n\n"); \
-    fprintf(stdout, "-q\t Silent without output\n\n"); \
-    fprintf(stdout, "There are three possibilities to provide a thread to processor list:\n\n"); \
-    fprintf(stdout, "1. Thread list with physical or logical thread numberings and physical cores first.\n"); \
-    fprintf(stdout, "Example usage thread list: likwid-pin -c N:0,4-6 ./myApp\n"); \
-    fprintf(stdout, "You can pin with the following numberings:\n");  \
-    fprintf(stdout, "\t1. Physical numbering of OS.\n");  \
-    fprintf(stdout, "\t2. Logical numbering inside node. e.g. -c N:0-3\n");  \
-    fprintf(stdout, "\t3. Logical numbering inside socket. e.g. -c S0:0-3\n");  \
-    fprintf(stdout, "\t4. Logical numbering inside last level cache group. e.g. -c C0:0-3\n");  \
-    fprintf(stdout, "\t5. Logical numbering inside NUMA domain. e.g. -c M0:0-3\n");  \
-    fprintf(stdout, "\tYou can also mix domains separated by  @, e.g. -c S0:0-3@S1:0-3 \n\n");  \
-    fprintf(stdout, "2. Expressions based thread list generation with compact processor numbering.\n"); \
-    fprintf(stdout, "Example usage expression: likwid-pin -c E:N:8 ./myApp\n"); \
-    fprintf(stdout, "This will generate a compact list of thread to processor mapping for the node domain with eight threads.\n");  \
-    fprintf(stdout, "The following syntax variants are available:\n");  \
-    fprintf(stdout, "\t1. -c E:<thread domain>:<number of threads>\n");  \
-    fprintf(stdout, "\t2. -c E:<thread domain>:<number of threads>:<chunk size>:<stride>\n");  \
-    fprintf(stdout, "\t   For two SMT threads per core on a SMT 4 machine use e.g. -c E:N:122:2:4\n\n");  \
-    fprintf(stdout, "3. Scatter policy among thread domain type.\n"); \
-    fprintf(stdout, "Example usage scatter: likwid-pin -c M:scatter ./myApp\n"); \
-    fprintf(stdout, "This will generate a thread to processor mapping scattered among all memory domains with physical cores first.\n\n");  \
-    fprintf(stdout, "4. Logical pinning.\n"); \
-    fprintf(stdout, "Example usage logical pinning: likwid-pin -c L:0,3,4 ./myApp\n"); \
-    fprintf(stdout, "This will generate a mapping containing the processors with index 0, 3 and 4 in the currently available processor list.\n");  \
-    fprintf(stdout, "If you are running inside a cpuset (taskset, cgroup) the sorted list of allowed processors is taken as processor list.\n");  \
-    fprintf(stdout, "Example usage logical pinning inside cpuset:\n"); \
-    fprintf(stdout, "taskset -c 4,7,2,1,5 likwid-pin -c L:0,2,4 ./myApp\n"); \
-    fprintf(stdout, "This maps the application to the processors 1,4,7.\n\n");  \
-    fprintf(stdout, "If you ommit the -c option likwid will use all processors available on the node\n"); \
-    fprintf(stdout, "with physical cores first. likwid-pin will also set OMP_NUM_THREADS with as many\n"); \
-    fprintf(stdout, "threads as specified in your pin expression if OMP_NUM_THREADS is not present\n"); \
-    fprintf(stdout, "in your environment.\n\n"); \
-    fflush(stdout);
-
-#define VERSION_MSG \
-    fprintf(stdout, "likwid-pin   %d.%d \n\n",VERSION,RELEASE); \
-    fflush(stdout);
-
-/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-    static void
-pinPid(int cpuid, int silent)
-{
-    int status;
-    cpu_set_t cpuset;
-
-    CPU_ZERO(&cpuset);
-    CPU_SET(cpuid, &cpuset);
-
-    status = sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
-
-    if (status == -1)
-    {
-        fprintf(stderr, "sched_setaffinity failed : %s \n",strerror(errno));
-    }
-    else
-    {
-        if(!silent)
-        {
-#ifdef COLOR
-            color_on(BRIGHT, COLOR);
-#endif
-            fprintf(stdout, "[likwid-pin] Main PID -> core %d - OK",  cpuid);
-#ifdef COLOR
-            color_reset();
-#endif
-            fprintf(stdout, "\n");
-            fflush(stdout);
-        }
-    }
-}
-
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-int main (int argc, char** argv)
-{
-    int i;
-    int c;
-    int skipMask = -1;
-    int optInterleaved = 0;
-    int optMemSweep = 0;
-    int optPrintDomains = 0;
-    int optSilent = 0;
-    int hasAffinity = 0;
-    bstring  pinString;
-    bstring  skipString;
-    bstring  argString;
-    int numThreads=0;
-    int threads[MAX_NUM_THREADS];
-    char delimiter = ',';
-    FILE* OUTSTREAM = stdout;
-    threads[0] = 0;
-
-    if (argc ==  1) {
-        HELP_MSG;
-        exit (EXIT_SUCCESS);
-    }
-
-    if (cpuid_init() == EXIT_SUCCESS)
-    {
-        numa_init();
-        affinity_init();
-        hasAffinity = 1;
-    }
-
-    while ((c = getopt (argc, argv, "+c:d:hipqs:Sv")) != -1)
-    {
-        switch (c)
-        {
-            case 'c':
-                CHECK_OPTION_STRING;
-                if (hasAffinity)
-                {
-                    numThreads = bstr_to_cpuset(threads, argString);
-                }
-                else
-                {
-                    numThreads = bstr_to_cpuset_physical((uint32_t*) threads, argString);
-                }
-
-                if(!numThreads)
-                {
-                    ERROR_PLAIN_PRINT(Failed to parse cpu list.);
-                }
-                break;
-            case 'd':
-                delimiter = optarg[0];
-                break;
-            case 'h':
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-            case 'i':
-                optInterleaved = 1;
-                break;
-            case 'p':
-                if (!hasAffinity)
-                {
-                    fprintf(stderr, "Option -p is not supported for unknown processor!\n");
-                    exit(EXIT_SUCCESS);
-                }
-                optPrintDomains = 1;
-                break;
-            case 'q':
-                optSilent = 1;
-                OUTSTREAM = NULL;
-                setenv("LIKWID_SILENT","true", 1);
-                break;
-            case 's':
-                CHECK_OPTION_STRING;
-                skipMask = strtoul((char*) argString->data,NULL,16);
-                break;
-            case 'S':
-                if (!hasAffinity)
-                {
-                    fprintf(stderr, "Option -S is not supported for unknown processor!\n");
-                    exit(EXIT_SUCCESS);
-                }
-                optMemSweep = 1;
-                break;
-            case 'v':
-                VERSION_MSG;
-                exit (EXIT_SUCCESS);
-            default:
-                HELP_MSG;
-                exit(EXIT_FAILURE);
-        }
-    }
-    if (optind == argc && !optPrintDomains)
-    {
-        fprintf(stderr,"Executable must be given on commandline\n");
-        exit(EXIT_FAILURE);
-    }
-
-    if (optPrintDomains && numThreads)
-    {
-        if ((!optSilent) && (OUTSTREAM))
-        {
-            fprintf(OUTSTREAM, "%d",threads[0]);
-
-            for ( i=1; i< numThreads; i++)
-            {
-                fprintf(OUTSTREAM, "%c%d",delimiter,threads[i]);
-            }
-            fprintf(OUTSTREAM, "\n");
-            fflush(OUTSTREAM);
-        }
-        exit (EXIT_SUCCESS);
-    }
-    else if ( optPrintDomains )
-    {
-        affinity_printDomains(OUTSTREAM);
-        exit (EXIT_SUCCESS);
-    }
-
-    if (!numThreads)
-    {
-        argString = bformat("N:0-%u", cpuid_topology.numHWThreads-1);
-        numThreads = bstr_to_cpuset(threads, argString);
-    }
-
-    /* CPU List:
-     * pthread (default): pin main pid + all thread tids
-     *
-     * OpenMP: Pin OMP_NUM_THREADS
-     * intel openmp: pin main pid + all thread tids (skip thread 1)
-     * gcc openmp: pin main pid + all thread tids (one less)
-     */
-
-    if (optInterleaved)
-    {
-        if ((!optSilent) && (OUTSTREAM))
-        {
-            fprintf(OUTSTREAM, "Set mem_policy to interleaved\n");
-            fflush(OUTSTREAM);
-        }
-        numa_setInterleaved(threads, numThreads);
-    }
-
-    if (optMemSweep)
-    {
-        if ((!optSilent) && (OUTSTREAM))
-        {
-            fprintf(OUTSTREAM, "Sweeping memory\n");
-            fflush(OUTSTREAM);
-        }
-        memsweep_threadGroup(OUTSTREAM, threads, numThreads);
-    }
-
-    if ( getenv("OMP_NUM_THREADS") == NULL )
-    {
-        argString = bformat("%d",numThreads);
-        setenv("OMP_NUM_THREADS",(char*) argString->data , 0);
-    }
-
-    if (numThreads > 1)
-    {
-        bstring ldPreload = bfromcstr(getenv("LD_PRELOAD"));
-
-        pinString = bformat("%d",threads[1]);
-
-        for (i=2; i < numThreads;i++)
-        {
-            bformata(pinString,",%d",threads[i]);
-        }
-
-        bformata(pinString,",%d",threads[0]);
-
-        if (skipMask >= 0)
-        {
-            skipString = bformat("%d",skipMask);
-            setenv("LIKWID_SKIP",(char*) bdata(skipString) , 1);
-        }
-
-        setenv("KMP_AFFINITY", "disabled", 1);
-        setenv("LIKWID_PIN",(char*) bdata(pinString) , 1);
-
-
-        if (ldPreload == NULL)
-        {
-            setenv("LD_PRELOAD",TOSTRING(LIBLIKWIDPIN), 1);
-        }
-        else
-        {
-            bconchar(ldPreload, ':');
-            bcatcstr(ldPreload, TOSTRING(LIBLIKWIDPIN));
-            setenv("LD_PRELOAD", bdata(ldPreload), 1);
-        }
-    }
-
-    pinPid(threads[0], optSilent);
-    fflush(stdout);
-
-    argv +=  optind;
-    execvp(argv[0], argv);
-    perror("execvp");
-    fprintf(stderr,"failed to execute %s\n", argv[0]);
-
-    return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-pin.lua b/src/applications/likwid-pin.lua
new file mode 100644
index 0000000..de57652
--- /dev/null
+++ b/src/applications/likwid-pin.lua
@@ -0,0 +1,275 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-pin.lua
+ *
+ *      Description:  An application to pin a program including threads
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+local function version()
+    print(("likwid-pin.lua --  Version %d.%d"):format(likwid.version, likwid.release)) -- one-line version banner
+end
+
+local function examples() -- prints the example section appended to the help text by usage()
+    print("Examples:")
+    print("There are three possibilities to provide a thread to processor list:")
+    print("1. Thread list with physical thread IDs")
+    print("Example: likwid-pin.lua -c 0,4-6 ./myApp")
+    print("Pins the application to cores 0,4,5 and 6")
+    print("2. Thread list with logical thread numberings in physical cores first sorted list.")
+    print("Example usage thread list: likwid-pin.lua -c N:0,4-6 ./myApp")
+    print("You can pin with the following numberings:")
+    print("\t2. Logical numbering inside node.\n\t   e.g. -c N:0,1,2,3 for the first 4 physical cores of the node")
+    print("\t3. Logical numbering inside socket.\n\t   e.g. -c S0:0-1 for the first 2 physical cores of the socket")
+    print("\t4. Logical numbering inside last level cache group.\n\t   e.g. -c C0:0-3  for the first 4 physical cores in the first LLC")
+    print("\t5. Logical numbering inside NUMA domain.\n\t   e.g. -c M0:0-3 for the first 4 physical cores in the first NUMA domain")
+    print("\tYou can also mix domains separated by  @,\n\te.g. -c S0:0-3@S1:0-3 for the 4 first physical cores on both sockets.") -- '@' must be literal: it is the separator named in this very sentence
+    print("3. Expressions based thread list generation with compact processor numbering.")
+    print("Example usage expression: likwid-pin.lua -c E:N:8 ./myApp")
+    print("This will generate a compact list of thread to processor mapping for the node domain")
+    print("with eight threads.")
+    print("The following syntax variants are available:")
+    print("\t1. -c E:<thread domain>:<number of threads>")
+    print("\t2. -c E:<thread domain>:<number of threads>:<chunk size>:<stride>")
+    print("\tFor two SMT threads per core on a SMT 4 machine use e.g. -c E:N:122:2:4")
+    print("4. Scatter policy among thread domain type.")
+    print("Example usage scatter: likwid-pin.lua -c M:scatter ./myApp")
+    print("This will generate a thread to processor mapping scattered among all memory domains")
+    print("with physical cores first.")
+    print("")
+    print("likwid-pin sets OMP_NUM_THREADS with as many threads as specified")
+    print("in your pin expression if OMP_NUM_THREADS is not present in your environment.")
+end
+
+local function usage() -- prints the version banner, the option summary and the examples
+    version()
+    print("An application to pin a program including threads.\n")
+    print("Options:")
+    print("-h, --help\t\t Help message")
+    print("-v, --version\t\t Version information")
+    print("-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)")
+    print("-i\t\t\t Set numa interleave policy with all involved numa nodes")
+    print("-S, --sweep\t\t Sweep memory and LLC of involved NUMA nodes")
+    print("-c <list>\t\t Comma separated processor IDs or expression")
+    print("-s, --skip <hex>\t Bitmask with threads to skip")
+    print("-p\t\t\t Print available domains with mapping on physical IDs")
+    print("\t\t\t If used together with -c option outputs a physical processor IDs.") -- the list is only printed when -c supplied a cpu list
+    print("-d <string>\t\t Delimiter used for using -p to output physical processor list, default is comma.")
+    print("-q, --quiet\t\t Silent without output")
+    print("\n")
+    examples()
+end
+
+delimiter = ','  -- separator for the cpu list printed by -p; overridden by -d
+quiet = 0  -- set to 1 by -q/--quiet and handed to likwid.pinProcess
+sweep_sockets = false  -- set by -S/--sweep
+interleaved_policy = false  -- set by -i
+print_domains = false  -- set by -p
+cpu_list = {}  -- filled from the -c argument
+skip_mask = nil  -- value of -s/--skip, exported later as LIKWID_SKIP
+affinity = nil  -- replaced by likwid.getAffinityInfo() a few lines below
+num_threads = 0  -- entry count parsed from -c; 0 means no list was given
+
+config = likwid.getConfiguration()
+cputopo = likwid.getCpuTopology()
+affinity = likwid.getAffinityInfo()
+
+if (#arg == 0) then -- called without any argument: print the help text and leave
+    usage()
+    os.exit(0)
+end
+
+for opt,arg in likwid.getopt(arg, {"c:", "d:", "h", "i", "p", "q", "s:", "S", "t:", "v", "V:", "verbose:", "help", "version", "skip:","sweep", "quiet"}) do -- "skip:" takes a value like "s:"; inside the loop 'arg' is the option value
+    if opt == "h" or opt == "help" then
+        usage()
+        likwid.putTopology()
+        likwid.putAffinityInfo()
+        likwid.putConfiguration()
+        os.exit(0)
+    elseif opt == "v" or opt == "version" then
+        version()
+        likwid.putTopology()
+        likwid.putAffinityInfo()
+        likwid.putConfiguration()
+        os.exit(0)
+    elseif opt == "V" or opt == "verbose" then
+        verbose = tonumber(arg)
+        likwid.setVerbosity(verbose)
+    elseif (opt == "c") then
+        if (affinity ~= nil) then -- affinity info available: use the general cpu string parser
+            num_threads,cpu_list = likwid.cpustr_to_cpulist(arg)
+        else -- no affinity info: fall back to the physical-ID parser
+            num_threads,cpu_list = likwid.cpustr_to_cpulist_physical(arg)
+        end
+        if (num_threads == 0) then
+            print("Failed to parse cpulist " .. arg)
+            likwid.putTopology()
+            likwid.putAffinityInfo()
+            likwid.putConfiguration()
+            os.exit(1)
+        end
+    elseif (opt == "d") then
+        delimiter = arg
+    elseif opt == "S" or opt == "sweep" then
+        if (affinity == nil) then
+            print("Option -S is not supported for unknown processor!")
+            likwid.putTopology()
+            likwid.putAffinityInfo()
+            likwid.putConfiguration()
+            os.exit(1)
+        end
+        sweep_sockets = true
+    elseif (opt == "i") then
+        interleaved_policy = true
+    elseif (opt == "p") then
+        print_domains = true
+    elseif opt == "s" or opt == "skip" then
+        local s = arg:find("^0x") -- anchored: the mask has to begin with 0x, as the message below states
+        if s == nil then
+            print("Skip mask must be given in hex, hence start with 0x")
+            os.exit(1)
+        end
+        skip_mask = arg
+    elseif opt == "q" or opt == "quiet" then
+        likwid.setenv("LIKWID_SILENT","true")
+        quiet = 1
+    elseif opt == "?" then
+        print("Invalid commandline option -"..arg)
+        likwid.putTopology()
+        likwid.putAffinityInfo()
+        likwid.putConfiguration()
+        os.exit(1)
+    elseif opt == "!" then
+        print("Option requires an argument")
+        likwid.putTopology()
+        likwid.putAffinityInfo()
+        likwid.putConfiguration()
+        os.exit(1)
+    end
+end
+
+
+if print_domains and num_threads > 0 then -- -p together with -c: print the resolved cpu list only
+    outstr = ""
+    for i, cpu in pairs(cpu_list) do
+        outstr = outstr .. delimiter .. cpu
+    end
+    print(outstr:sub(delimiter:len() + 1)) -- drop the leading delimiter, whatever its length
+    likwid.putTopology()
+    likwid.putAffinityInfo()
+    likwid.putConfiguration()
+    os.exit(0)
+elseif print_domains then -- -p alone: list every affinity domain with its processors
+    for k,v in pairs(affinity["domains"]) do
+        print(string.format("Domain %s:", v["tag"]))
+        print("\t" .. table.concat(v["processorList"], ","))
+        print("")
+    end
+    likwid.putTopology()
+    likwid.putAffinityInfo()
+    likwid.putConfiguration()
+    os.exit(0)
+end
+
+if num_threads == 0 then -- no -c given: build the list from the expression N:0-(numHWThreads-1)
+    num_threads, cpu_list = likwid.cpustr_to_cpulist("N:0-"..cputopo["numHWThreads"]-1)
+end
+if (#arg == 0) then -- nothing left on the command line after the options
+    print("Executable must be given on commandline")
+    os.exit(1)
+end
+
+if interleaved_policy then
+    if quiet == 0 then print("Set mem_policy to interleaved") end -- status line is suppressed by -q
+    likwid.setMemInterleaved(num_threads, cpu_list)
+end
+
+if sweep_sockets then
+    if quiet == 0 then print("Sweeping memory") end -- status line is suppressed by -q
+    likwid.memSweep(num_threads, cpu_list)
+end
+
+local omp_threads = os.getenv("OMP_NUM_THREADS")
+if omp_threads == nil then -- only set OMP_NUM_THREADS when the user has not set it
+    likwid.setenv("OMP_NUM_THREADS",tostring(math.tointeger(num_threads)))
+elseif num_threads > (tonumber(omp_threads) or 0) then -- a non-numeric value counts as 0 instead of raising a compare error
+    print(string.format("Environment variable OMP_NUM_THREADS already set to %s but %d cpus required", omp_threads,num_threads))
+end
+
+likwid.setenv("KMP_AFFINITY","disabled")
+
+if os.getenv("CILK_NWORKERS") == nil then -- same rule as for OMP_NUM_THREADS: never override the user's value
+    likwid.setenv("CILK_NWORKERS", tostring(math.tointeger(num_threads)))
+end
+if skip_mask then
+    likwid.setenv("LIKWID_SKIP", skip_mask)
+end
+
+if num_threads > 1 then -- several cpus: export the pin list and preload the pin library
+    local pinString = tostring(math.tointeger(cpu_list[2])) -- list is cpu_list[2..n] followed by cpu_list[1]
+    for i=3,likwid.tablelength(cpu_list) do
+        pinString = pinString .. "," .. tostring(math.tointeger(cpu_list[i]))
+    end
+    pinString = pinString .. "," .. tostring(math.tointeger(cpu_list[1]))
+    likwid.setenv("LIKWID_PIN", pinString)
+
+    local preload = os.getenv("LD_PRELOAD")
+    if preload == nil then
+        likwid.setenv("LD_PRELOAD",likwid.pinlibpath)
+    else -- keep the user's preloads, with the pin library in front
+        likwid.setenv("LD_PRELOAD",likwid.pinlibpath .. ":" .. preload)
+    end
+    local ldpath = os.getenv("LD_LIBRARY_PATH")
+    local libpath = likwid.pinlibpath:match("([/%g]+)/%g+.so") -- directory part of the pin library path
+    if ldpath == nil then
+        likwid.setenv("LD_LIBRARY_PATH", libpath)
+    elseif not ldpath:find(libpath, 1, true) then -- plain search: the directory may contain pattern characters such as '-' or '.'
+        likwid.setenv("LD_LIBRARY_PATH", libpath..":"..ldpath)
+    end
+else -- single cpu: pin this process directly
+    likwid.setenv("LIKWID_PIN", tostring(math.tointeger(cpu_list[1])))
+    likwid.pinProcess(cpu_list[1], quiet)
+end
+
+local exec = table.concat(arg," ",1, likwid.tablelength(arg)-2) -- NOTE(review): the "-2" presumably discounts the non-positive entries of arg (interpreter, script) -- confirm
+local pid = likwid.startProgram(exec, num_threads, cpu_list)
+if (pid == nil) then -- nil pid is treated as a failed launch
+    print("Failed to execute command: ".. exec)
+    likwid.putTopology()
+    likwid.putAffinityInfo()
+    likwid.putConfiguration()
+    os.exit(1)
+end
+
+likwid.waitpid(pid)
+
+likwid.putAffinityInfo()
+likwid.putTopology()
+likwid.putConfiguration()
+os.exit(0)
diff --git a/src/applications/likwid-powermeter.c b/src/applications/likwid-powermeter.c
deleted file mode 100644
index 4daa393..0000000
--- a/src/applications/likwid-powermeter.c
+++ /dev/null
@@ -1,507 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  likwid-powermeter.c
- *
- *      Description:  An application to get information about power 
- *      consumption on architectures implementing the RAPL interface.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <types.h>
-#include <strUtil.h>
-#include <error.h>
-#include <lock.h>
-#include <timer.h>
-#include <cpuid.h>
-#include <numa.h>
-#include <accessClient.h>
-#include <msr.h>
-#include <affinity.h>
-#include <perfmon.h>
-#include <power.h>
-#include <thermal.h>
-#include <bstrlib.h>
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-#define HELP_MSG \
-fprintf(stdout, "\nlikwid-powermeter --  Version  %d.%d \n\n",VERSION,RELEASE); \
-fprintf(stdout, "A tool to print Power and Clocking information on Intel SandyBridge CPUS.\n"); \
-fprintf(stdout, "Options:\n"); \
-fprintf(stdout, "-h\t\t Help message\n"); \
-fprintf(stdout, "-v\t\t Version information\n"); \
-fprintf(stdout, "-M <0|1>\t set how MSR registers are accessed: 0=direct, 1=msrd \n"); \
-fprintf(stdout, "-c <list>\t specify sockets to measure\n"); \
-fprintf(stdout, "-i\t\t print information from MSR_PKG_POWER_INFO register and Turbo Mode\n"); \
-fprintf(stdout, "-s <duration>\t set measure duration in sec. (default 2s) \n"); \
-fprintf(stdout, "-p\t\t print dynamic clocking and CPI values (requires executable)\n\n");   \
-fprintf(stdout, "Usage: likwid-powermeter -s 4 -c 1 \n");  \
-fprintf(stdout, "Alternative as wrapper: likwid-powermeter -c 1 ./a.out\n"); \
-fflush(stdout);
-
-#define VERSION_MSG \
-fprintf(stdout, "likwid-powermeter  %d.%d \n\n",VERSION,RELEASE); \
-fflush(stdout);
-
-
-int main (int argc, char** argv)
-{
-    int socket_fd = -1;
-    int optInfo = 0;
-    int optClock = 0;
-    int optStethoscope = 0;
-    int optSockets = 0;
-    int optTemp = 0;
-    double runtime;
-    int hasDRAM = 0;
-    int hasPP0 = 0;
-    int hasPP1 = 0;
-    int c, i;
-    bstring argString;
-    bstring eventString = bfromcstr("CLOCK");
-    int numSockets=1;
-    int numThreads=0;
-    int threadsSockets[MAX_NUM_NODES*2];
-    int threads[MAX_NUM_THREADS];
-    const AffinityDomain* socketDomains[MAX_NUM_NODES*2];
-    threadsSockets[0] = 0;
-
-    if (argc == 1)
-    {
-        HELP_MSG;
-        exit (EXIT_SUCCESS);
-    }
-
-    while ((c = getopt (argc, argv, "+c:hiM:ps:vt")) != -1)
-    {
-        switch (c)
-        {
-            case 'c':
-                CHECK_OPTION_STRING;
-                numSockets = bstr_to_cpuset_physical((uint32_t*) threadsSockets, argString);
-                bdestroy(argString);
-                optSockets = 1;
-                break;
-
-            case 'h':
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-            case 'i':
-                optInfo = 1;
-                break;
-            case 'M':  /* Set MSR Access mode */
-                CHECK_OPTION_STRING;
-                accessClient_setaccessmode(str2int((char*) argString->data));
-                bdestroy(argString);
-                break;
-            case 'p':
-                optClock = 1;
-                break;
-            case 's':
-                CHECK_OPTION_STRING;
-                optStethoscope = str2int((char*) argString->data);
-                bdestroy(argString);
-                break;
-            case 'v':
-                VERSION_MSG;
-                exit (EXIT_SUCCESS);
-            case 't':
-                optTemp = 1;
-                break;
-            case '?':
-                if (optopt == 's' || optopt == 'M' || optopt == 'c')
-                {
-                    HELP_MSG;
-                }
-                else if (isprint (optopt))
-                {
-                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
-                }
-                else
-                {
-                    fprintf (stderr,
-                            "Unknown option character `\\x%x'.\n",
-                            optopt);
-                }
-                exit( EXIT_FAILURE);
-            default:
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-        }
-    }
-
-    if (!lock_check())
-    {
-        fprintf(stderr,"Access to performance counters is locked.\n");
-        exit(EXIT_FAILURE);
-    }
-    if (optClock && optind == argc)
-    {
-        fprintf(stderr,"Commandline option -p requires an executable.\n");
-        exit(EXIT_FAILURE);
-    }
-    if (optSockets && !optStethoscope && optind == argc)
-    {
-        fprintf(stderr,"Commandline option -c requires an executable if not used in combination with -s.\n");
-        exit(EXIT_FAILURE);
-    }
-    if (optStethoscope == 0 && optind == argc && !optInfo)
-    {
-        fprintf(stderr,"Either -s <seconds> or executable must be given on commandline.\n");
-        exit(EXIT_FAILURE);
-    }
-
-    if (cpuid_init() == EXIT_FAILURE)
-    {
-        fprintf(stderr, "CPU not supported\n");
-        exit(EXIT_FAILURE);
-    }
-    if (numSockets > cpuid_topology.numSockets)
-    {
-        fprintf(stderr, "System has only %d sockets but %d are given on commandline.\n",
-                        cpuid_topology.numSockets, numSockets);
-        exit(EXIT_FAILURE);
-    }
-
-    numa_init();
-    affinity_init();
-
-    for (c = 0; c < numSockets; c++)
-    {
-        if (threadsSockets[c] >= cpuid_topology.numSockets)
-        {
-            fprintf(stderr, "System has no socket %d\n", threadsSockets[c]);
-            exit(EXIT_FAILURE);
-        }
-        bstring socketStr = bformat("S%d",threadsSockets[c]);
-        socketDomains[threadsSockets[c]] = affinity_getDomain(socketStr);
-    }
-
-    accessClient_init(&socket_fd);
-    msr_init(socket_fd);
-    timer_init();
-
-    /* check for supported processors */
-    if ((cpuid_info.model == SANDYBRIDGE_EP) ||
-            (cpuid_info.model == SANDYBRIDGE) ||
-            (cpuid_info.model == IVYBRIDGE) ||
-            (cpuid_info.model == IVYBRIDGE_EP) ||
-            (cpuid_info.model == HASWELL) ||
-            (cpuid_info.model == HASWELL_EX) ||
-            (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
-            (cpuid_info.model == NEHALEM_LYNNFIELD) ||
-            (cpuid_info.model == NEHALEM_WESTMERE) ||
-            (cpuid_info.model == ATOM_SILVERMONT_C) ||
-            (cpuid_info.model == ATOM_SILVERMONT_E) ||
-            (cpuid_info.model == ATOM_SILVERMONT_F1) ||
-            (cpuid_info.model == ATOM_SILVERMONT_F2) ||
-            (cpuid_info.model == ATOM_SILVERMONT_F3))
-    {
-        if (numSockets == 0)
-        {
-            numSockets = numa_info.numberOfNodes;
-        }
-        for(int i=0; i<numSockets; i++)
-        {
-            power_init(socketDomains[threadsSockets[i]]->processorList[0]);
-        }
-    }
-    else
-    {
-        fprintf (stderr, "Query Turbo Mode only supported on Intel Nehalem/Westmere/SandyBridge/IvyBridge/Haswell/Silvermont processors!\n");
-        exit(EXIT_FAILURE);
-    }
-
-    double clock = (double) timer_getCpuClock();
-
-    fprintf(stdout, HLINE);
-    fprintf(stdout, "CPU name:\t%s \n",cpuid_info.name);
-    fprintf(stdout, "CPU clock:\t%3.2f GHz \n",  (float) clock * 1.E-09);
-    fprintf(stdout, HLINE);
-    fflush(stdout);
-
-    if (optInfo)
-    {
-        if (power_info.turbo.numSteps != 0)
-        {
-            fprintf(stdout, "Base clock:\t%.2f MHz \n",  power_info.baseFrequency );
-            fprintf(stdout, "Minimal clock:\t%.2f MHz \n",  power_info.minFrequency );
-            fprintf(stdout, "Turbo Boost Steps:\n");
-            for (int i=0; i < power_info.turbo.numSteps; i++ )
-            {
-                fprintf(stdout, "C%d %.2f MHz \n",i+1,  power_info.turbo.steps[i] );
-            }
-        }
-        fprintf(stdout, HLINE);
-        fflush(stdout);
-    }
-
-    if ((cpuid_info.model == SANDYBRIDGE_EP) ||
-        (cpuid_info.model == IVYBRIDGE_EP) ||
-        (cpuid_info.model == HASWELL_EX) ||
-        (cpuid_info.model == HASWELL))
-    {
-        hasDRAM = 1;
-    }
-    if ((cpuid_info.model == SANDYBRIDGE_EP) ||
-        (cpuid_info.model == SANDYBRIDGE) ||
-        (cpuid_info.model == IVYBRIDGE_EP) ||
-        (cpuid_info.model == IVYBRIDGE) ||
-        (cpuid_info.model == HASWELL) ||
-        (cpuid_info.model == ATOM_SILVERMONT_E) ||
-        (cpuid_info.model == ATOM_SILVERMONT_F1) ||
-        (cpuid_info.model == ATOM_SILVERMONT_F2) ||
-        (cpuid_info.model == ATOM_SILVERMONT_F3))
-    {
-        hasPP0 = 1;
-    }
-    if ((cpuid_info.model == HASWELL) ||
-        (cpuid_info.model == SANDYBRIDGE) ||
-        (cpuid_info.model == IVYBRIDGE))
-    {
-        hasPP1 = 1;
-    }
-    if ((cpuid_info.model != SANDYBRIDGE) &&
-        (cpuid_info.model != SANDYBRIDGE_EP)  &&
-        (cpuid_info.model != IVYBRIDGE)  &&
-        (cpuid_info.model != IVYBRIDGE_EP)  &&
-        (cpuid_info.model != HASWELL) &&
-        (cpuid_info.model != HASWELL_M1) &&
-        (cpuid_info.model != HASWELL_M2) &&
-        (cpuid_info.model != HASWELL_EX) &&
-        (cpuid_info.model != ATOM_SILVERMONT_C) &&
-        (cpuid_info.model != ATOM_SILVERMONT_E) &&
-        (cpuid_info.model != ATOM_SILVERMONT_F1) &&
-        (cpuid_info.model != ATOM_SILVERMONT_F2) &&
-        (cpuid_info.model != ATOM_SILVERMONT_F3))
-    {
-        fprintf (stderr, "RAPL not supported on this processor!\n");
-        exit(EXIT_FAILURE);
-    }
-
-    if (optInfo)
-    {
-        fprintf(stdout, "Thermal Spec Power: %g Watts \n", power_info.tdp );
-        fprintf(stdout, "Minimum  Power: %g Watts \n", power_info.minPower);
-        fprintf(stdout, "Maximum  Power: %g Watts \n", power_info.maxPower);
-        fprintf(stdout, "Maximum  Time Window: %g micro sec \n", power_info.maxTimeWindow);
-        fprintf(stdout, HLINE);
-        fflush(stdout);
-        exit(EXIT_SUCCESS);
-    }
-
-    if (optClock)
-    {
-        affinity_init();
-        argString = bformat("S%u:0-%u", threadsSockets[0],
-                        socketDomains[threadsSockets[0]]->numberOfProcessors-1);
-        for (int i=1; i<numSockets; i++)
-        {
-            bstring tExpr = bformat("@S%u:0-%u", threadsSockets[i],
-                                socketDomains[threadsSockets[i]]->numberOfProcessors-1);
-            bconcat(argString, tExpr);
-        }
-        numThreads = bstr_to_cpuset(threads, argString);
-        bdestroy(argString);
-        perfmon_init(numThreads, threads, stdout);
-        perfmon_setupEventSet(eventString, NULL);
-    }
-
-    {
-        PowerData pDataPkg[MAX_NUM_NODES*2];
-        PowerData pDataDram[MAX_NUM_NODES*2];
-        PowerData pDataPP0[MAX_NUM_NODES*2];
-        PowerData pDataPP1[MAX_NUM_NODES*2];
-        fprintf(stdout, "Measure on sockets: %d", threadsSockets[0]);
-        for (int i=1; i<numSockets; i++)
-        {
-            fprintf(stdout, ", %d", threadsSockets[i]);
-        }
-        fprintf(stdout, "\n");
-        fflush(stdout);
-
-        if (optStethoscope)
-        {
-            if (optClock)
-            {
-                perfmon_startCounters();
-            }
-            else
-            {
-                for (int i=0; i<numSockets; i++)
-                {
-                    int cpuId = socketDomains[threadsSockets[i]]->processorList[0];
-                    if (hasDRAM) power_start(&(pDataDram[i]), cpuId, DRAM);
-                    if (hasPP0) power_start(&(pDataPP0[i]), cpuId, PP0);
-                    if (hasPP1) power_start(&(pDataPP1[i]), cpuId, PP1);
-                    power_start(&(pDataPkg[i]), cpuId, PKG);
-                }
-            }
-            sleep(optStethoscope);
-
-            if (optClock)
-            {
-                perfmon_stopCounters();
-                perfmon_printCounterResults();
-                perfmon_finalize();
-            }
-            else
-            {
-                for (int i=0; i<numSockets; i++)
-                {
-                    int cpuId = socketDomains[threadsSockets[i]]->processorList[0];
-                    power_stop(&(pDataPkg[i]), cpuId, PKG);
-                    if (hasPP1) power_stop(&(pDataPP1[i]), cpuId, PP1);
-                    if (hasPP0) power_stop(&(pDataPP0[i]), cpuId, PP0);
-                    if (hasDRAM) power_stop(&(pDataDram[i]), cpuId, DRAM);
-                }
-            }
-            runtime = (double) optStethoscope;
-        }
-        else
-        {
-            TimerData time;
-            argv +=  optind;
-            bstring exeString = bfromcstr(argv[0]);
-
-            for (int i=1; i<(argc-optind); i++)
-            {
-                bconchar(exeString, ' ');
-                bcatcstr(exeString, argv[i]);
-            }
-            fprintf(stdout, "Executing: %s\n",bdata(exeString));
-            fflush(stdout);
-
-
-            if (optClock)
-            {
-                perfmon_startCounters();
-            }
-            else
-            {
-                for (int i=0; i<numSockets; i++)
-                {
-                    int cpuId = socketDomains[threadsSockets[i]]->processorList[0];
-                    if (hasDRAM) power_start(&(pDataDram[i]), cpuId, DRAM);
-                    if (hasPP0) power_start(&(pDataPP0[i]), cpuId, PP0);
-                    if (hasPP1) power_start(&(pDataPP1[i]), cpuId, PP1);
-                    power_start(&(pDataPkg[i]), cpuId, PKG);
-                }
-
-                timer_start(&time);
-            }
-
-            if (system(bdata(exeString)) == EOF)
-            {
-                fprintf(stderr, "Failed to execute %s!\n", bdata(exeString));
-                exit(EXIT_FAILURE);
-            }
-
-            if (optClock)
-            {
-                perfmon_stopCounters();
-                perfmon_printCounterResults();
-                perfmon_finalize();
-            }
-            else
-            {
-                timer_stop(&time);
-
-                for (int i=0; i<numSockets; i++)
-                {
-                    int cpuId = socketDomains[threadsSockets[i]]->processorList[0];
-                    power_stop(&(pDataPkg[i]), cpuId, PKG);
-                    if (hasDRAM) power_stop(&(pDataDram[i]), cpuId, DRAM);
-                    if (hasPP0) power_stop(&(pDataPP0[i]), cpuId, PP0);
-                    if (hasPP1) power_stop(&(pDataPP1[i]), cpuId, PP1);
-                }
-                runtime = timer_print(&time);
-            }
-        }
-
-        if (!optClock)
-        {
-            fprintf(stdout, "Runtime: %g second \n",runtime);
-            fprintf(stdout, HLINE);
-            for (int i=0; i<numSockets; i++)
-            {
-                fprintf(stdout, "Socket %d (Measured on CPU %d)\n",threadsSockets[i],
-                                    socketDomains[threadsSockets[i]]->processorList[0]);
-                fprintf(stdout, "Domain: PKG \n");
-                fprintf(stdout, "Energy consumed: %g Joules \n", power_printEnergy(&(pDataPkg[i])));
-                fprintf(stdout, "Power consumed: %g Watts \n", power_printEnergy(&(pDataPkg[i])) / runtime );
-                if (hasDRAM)
-                {
-                    fprintf(stdout, "Domain: DRAM \n");
-                    fprintf(stdout, "Energy consumed: %g Joules \n", power_printEnergy(&(pDataDram[i])));
-                    fprintf(stdout, "Power consumed: %g Watts \n", power_printEnergy(&(pDataDram[i])) / runtime );
-                }
-                if (hasPP0)
-                {
-                    fprintf(stdout, "Domain: PP0 \n");
-                    fprintf(stdout, "Energy consumed: %g Joules \n", power_printEnergy(&(pDataPP0[i])));
-                    fprintf(stdout, "Power consumed: %g Watts \n", power_printEnergy(&(pDataPP0[i])) / runtime );
-                }
-                if (hasPP1)
-                {
-                    fprintf(stdout, "Domain: PP1 \n");
-                    fprintf(stdout, "Energy consumed: %g Joules \n", power_printEnergy(&(pDataPP1[i])));
-                    fprintf(stdout, "Power consumed: %g Watts \n", power_printEnergy(&(pDataPP1[i])) / runtime );
-                }
-                fprintf(stdout, "\n");
-            }
-            fflush(stdout);
-        }
-    }
-
-
-    if ( optTemp && cpuid_hasFeature(TM2))
-    {
-        printf("Current core temperatures:\n");
-        for (i = 0; i < numSockets; i++)
-        {
-            printf("Socket %d\n",threadsSockets[i]);
-            for (c = 0; c < socketDomains[threadsSockets[i]]->numberOfProcessors; c++ )
-            {
-                thermal_init(i);
-                printf("Core %d: %u C\n",
-                        socketDomains[threadsSockets[i]]->processorList[c],
-                        thermal_read(socketDomains[threadsSockets[i]]->processorList[c]));
-            }
-        }
-    }
-
-
-    msr_finalize();
-    return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-powermeter.lua b/src/applications/likwid-powermeter.lua
new file mode 100644
index 0000000..3aa742f
--- /dev/null
+++ b/src/applications/likwid-powermeter.lua
@@ -0,0 +1,388 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-powermeter.lua
+ *
+ *      Description:  An application to get information about power 
+ *      consumption on architectures implementing the RAPL interface.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+local function version()  -- print the tool name followed by likwid.version and likwid.release
+    print(string.format("likwid-powermeter --  Version %d.%d",likwid.version,likwid.release))
+end
+
+local function examples()  -- print two sample command lines: a timed measurement and an application wrapper
+    print("Examples:")
+    print("Measure the power consumption for 4 seconds on socket 1")
+    print("likwid-powermeter -s 4 -c 1")
+    print("")
+    print("Use it as wrapper for an application to measure the energy for the whole execution")
+    print("likwid-powermeter -c 1 ./a.out")
+end
+
+local function usage()  -- print version line, option summary and the examples
+    version()
+    print("A tool to print power and clocking information on x86 CPUs.\n")
+    print("Options:")
+    print("-h, --help\t Help message")
+    print("-v, --version\t Version information")
+    print("-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)")
+    print("-M <0|1>\t\t Set how MSR registers are accessed, 0=direct, 1=accessDaemon")
+    print("-c <list>\t\t Specify sockets to measure")
+    print("-i, --info\t Print information from MSR_PKG_POWER_INFO register and Turbo mode")
+    print("-s <duration>\t Set measure duration in us, ms or s. (default 2s)")
+    print("-p\t\t Print dynamic clocking and CPI values, uses likwid-perfctr")
+    print("-t\t\t Print current temperatures of all CPU cores")
+    print("-f\t\t Print current temperatures in Fahrenheit")
+    print("")
+    examples()  -- the option list is always followed by the sample invocations
+end
+
+local config = likwid.getConfiguration();
+
+print_info = false   -- set by -i/--info
+use_perfctr = false  -- set by -p
+stethoscope = false  -- set by -s, or later when no application is given
+fahrenheit = false   -- set by -f
+print_temp = false   -- set by -t and -f
+verbose = 0
+if config["daemonMode"] < 0 then  -- negative configured mode: fall back to access mode 1
+    access_mode = 1
+else
+    access_mode = config["daemonMode"]
+end
+time_interval = 2.E06  -- default duration; presumably microseconds (matches time_orig "2s") - TODO confirm
+time_orig = "2s"       -- textual duration, forwarded to likwid-perfctr via -S
+read_interval = 30.E06 -- sleep period between intermediate power readouts during a measurement
+sockets = {}
+domainList = {"PKG", "PP0", "PP1", "DRAM"}  -- the index in this list is passed to startPower/stopPower
+
+cpuinfo = likwid.getCpuInfo()
+cputopo = likwid.getCpuTopology()
+numatopo = likwid.getNumaInfo()
+affinity = likwid.getAffinityInfo()  -- use the module table like every other likwid call in this script
+
+for opt,arg in likwid.getopt(arg, {"V:", "c:", "h", "i", "M:", "p", "s:", "v", "f", "t", "help", "info", "version", "verbose:"}) do  -- parse the command line
+    if (type(arg) == "string") then
+        local s,e = arg:find("-");
+        if s == 1 then  -- an option argument starting with '-' is most likely the next option
+            print(string.format("Argument %s to option -%s starts with invalid character -.", arg, opt))
+            print("Did you forget an argument to an option?")
+            os.exit(1)
+        end
+    end
+    if opt == "h" or opt == "help" then
+        usage()
+        os.exit(0)
+    elseif opt == "v" or opt == "version" then
+        version()
+        os.exit(0)
+    elseif (opt == "c") then
+        num_sockets, sockets = likwid.sockstr_to_socklist(arg)
+        if num_sockets == 0 then
+            os.exit(1)
+        end
+    elseif (opt == "M") then
+        access_mode = tonumber(arg)
+        if (access_mode == nil) then
+            print("Access mode (-M) must be a number")
+            usage()
+            os.exit(1)
+        elseif (access_mode < 0) or (access_mode > 1) then
+            print(string.format("Access mode (-M) %d not valid.",access_mode))
+            usage()
+            os.exit(1)
+        end
+    elseif opt == "i" or opt == "info" then
+        print_info = true
+    elseif (opt == "p") then
+        use_perfctr = true
+    elseif (opt == "f") then
+        fahrenheit = true
+        print_temp = true
+    elseif (opt == "t") then
+        print_temp = true
+    elseif opt == "V" or opt == "verbose" then
+        verbose = tonumber(arg)
+        if verbose == nil then print("Verbosity level (-V) must be a number") os.exit(1) end  -- nil would break the later 'verbose > 0' test
+        likwid.setVerbosity(verbose)
+    elseif (opt == "s") then
+        time_interval = likwid.parse_time(arg)
+        time_orig = arg  -- keep the textual form for likwid-perfctr -S
+        stethoscope = true
+    elseif opt == "?" then
+        print("Invalid commandline option -"..arg)
+        os.exit(1)
+    elseif opt == "!" then
+        print("Option requires an argument")
+        os.exit(1)
+    end
+end
+
+
+
+cpulist = {}
+before = {}
+after = {}
+if #sockets > 0 then
+    for i,socketId in pairs(sockets) do
+        local affinityID = "S"..tostring(socketId)
+        for j, domain in pairs(affinity["domains"]) do
+            if domain["tag"] == affinityID then
+                table.insert(cpulist,domain["processorList"][1])
+                before[domain["processorList"][1]] = {}
+                after[domain["processorList"][1]] = {}
+                for _, id in pairs(domainList) do
+                    before[domain["processorList"][1]][id] = 0
+                    after[domain["processorList"][1]][id] = 0
+                end
+            end
+        end
+    end
+else
+    for j, domain in pairs(affinity["domains"]) do
+        if domain["tag"]:match("S%d+") then
+            table.insert(cpulist,domain["processorList"][1])
+            table.insert(sockets, domain["tag"]:match("S(%d+)"))
+            before[domain["processorList"][1]] = {}
+            after[domain["processorList"][1]] = {}
+            for _, id in pairs(domainList) do
+                before[domain["processorList"][1]][id] = 0
+                after[domain["processorList"][1]][id] = 0
+            end
+        end
+    end
+end
+
+
+if likwid.setAccessClientMode(access_mode) ~= 0 then
+    os.exit(1)
+end
+
+power = likwid.getPowerInfo()
+if not power then
+    print(string.format("The %s does not support reading power data",cpuinfo["name"]))
+    os.exit(1)
+end
+
+
+if not use_perfctr then
+    print(likwid.hline);
+    print(string.format("CPU name:\t%s",cpuinfo["osname"]))
+    print(string.format("CPU type:\t%s",cpuinfo["name"]))
+    if cpuinfo["clock"] > 0 then
+        print(string.format("CPU clock:\t%3.2f GHz",cpuinfo["clock"] *  1.E-09))
+    else
+        print(string.format("CPU clock:\t%3.2f GHz",likwid.getCpuClock() *  1.E-09))
+    end
+    print(likwid.hline)
+end
+
+if print_info or verbose > 0 then
+    if (power["turbo"]["numSteps"] > 0) then
+        print(string.format("Base clock:\t%.2f MHz", power["baseFrequency"]))
+        print(string.format("Minimal clock:\t%.2f MHz", power["minFrequency"]))
+        print("Turbo Boost Steps:")
+        for i,step in pairs(power["turbo"]["steps"]) do
+            print(string.format("C%d %.2f MHz",i-1,power["turbo"]["steps"][i]))
+        end
+    end
+    print(likwid.hline)
+end
+
+if power["hasRAPL"] == 0 then
+    print("Measuring power is not supported on this machine")
+    os.exit(1)
+end
+
+if (print_info) then
+    for i, dname in pairs(domainList) do
+        local domain = power["domains"][dname]
+        if domain["supportInfo"] then
+            print(string.format("Info for RAPL domain %s:", dname));
+            print(string.format("Thermal Spec Power: %g Watt",domain["tdp"]*1E-6))
+            print(string.format("Minimum Power: %g Watt",domain["minPower"]*1E-6))
+            print(string.format("Maximum Power: %g Watt",domain["maxPower"]*1E-6))
+            print(string.format("Maximum Time Window: %g micro sec",domain["maxTimeWindow"]))
+            print()
+        end
+    end
+    print(likwid.hline)
+end
+
+if (stethoscope) and (time_interval < power["timeUnit"]) then
+    print("Time interval too short, minimum measurement time is "..tostring(power["timeUnit"]).. " us")
+    os.exit(1)
+end
+
+local execString = ""
+if use_perfctr then
+    affinity = likwid.getAffinityInfo()
+    argString = ""
+    for i,socket in pairs(sockets) do
+        argString = argString .. string.format("S%u:0-%u",socket,(cputopo["numCoresPerSocket"]*cputopo["numThreadsPerCore"])-1)
+        if (i < #sockets) then
+            argString = argString .. "@"
+        end
+    end
+    execString = string.format("<INSTALLED_PREFIX>/bin/likwid-perfctr -C %s -f -g CLOCK ",argString)
+end
+
+
+if #arg == 0 then
+    if use_perfctr then
+        execString = execString .. string.format(" -S %s ", time_orig)
+        stethoscope = false
+    else
+        stethoscope = true
+    end
+else
+    if use_perfctr then
+        execString = execString .. table.concat(arg," ",1, likwid.tablelength(arg)-2)
+    else
+        execString = table.concat(arg," ",1, likwid.tablelength(arg)-2)
+    end
+end
+
+if not print_info and not print_temp then  -- measurements are skipped when only info/temperature output was requested
+    if stethoscope or (#arg > 0 and not use_perfctr) then  -- measure here; otherwise likwid-perfctr is executed below
+        for i,socket in pairs(sockets) do
+            cpu = cpulist[i]
+            for idx, dom in pairs(domainList) do
+                if (power["domains"][dom]["supportStatus"]) then before[cpu][dom] = likwid.startPower(cpu, idx) end
+            end
+        end
+
+        time_before = likwid.startClock()
+        if stethoscope then
+            if read_interval < time_interval then
+                -- long measurement: sleep in read_interval slices and refresh the 'after' values after each slice
+                while ((read_interval <= time_interval) and (time_interval > 0)) do
+                    likwid.sleep(read_interval)
+                    for i,socket in pairs(sockets) do
+                        cpu = cpulist[i]
+                        for idx, dom in pairs(domainList) do
+                            if (power["domains"][dom]["supportStatus"]) then after[cpu][dom] = likwid.stopPower(cpu, idx) end
+                        end
+                    end
+                    time_interval = time_interval - read_interval
+                    if time_interval < read_interval then
+                        read_interval = time_interval  -- last slice covers only the remaining time
+                    end
+                end
+            else
+                likwid.sleep(time_interval)
+            end
+        else
+            local pid = likwid.startProgram(execString, 0, {})
+            if not pid then
+                print(string.format("Failed to execute %s!",execString))
+                likwid.finalize()
+                os.exit(1)
+            end
+            while true do
+                if likwid.getSignalState() ~= 0 then
+                    likwid.killProgram()
+                    break
+                end
+                local remain = likwid.sleep(read_interval)
+                for i,socket in pairs(sockets) do
+                    cpu = cpulist[i]
+                    for idx, dom in pairs(domainList) do
+                        if (power["domains"][dom]["supportStatus"]) then after[cpu][dom] = likwid.stopPower(cpu, idx) end
+                    end
+                end
+                if remain > 0 or not likwid.checkProgram(pid) then  -- sleep cut short or program gone: stop polling
+                    io.stdout:flush()
+                    break
+                end
+            end
+        end
+        time_after = likwid.stopClock()
+
+        for i,socket in pairs(sockets) do  -- final readout for all supported domains
+            cpu = cpulist[i]
+            for idx, dom in pairs(domainList) do
+                if (power["domains"][dom]["supportStatus"]) then after[cpu][dom] = likwid.stopPower(cpu, idx) end
+            end
+        end
+        runtime = likwid.getClock(time_before, time_after)
+
+        print(likwid.hline)
+        print(string.format("Runtime: %g s",runtime))
+
+        for i,socket in pairs(sockets) do
+            cpu = cpulist[i]
+            print(string.format("Measure for socket %d on CPU %d", socket,cpu ))
+            for j, dom in pairs(domainList) do
+                if power["domains"][dom]["supportStatus"] then
+                    local energy = likwid.calcPower(before[cpu][dom], after[cpu][dom], 0)
+                    print(string.format("Domain %s:", dom))
+                    print(string.format("Energy consumed: %g Joules",energy))
+                    print(string.format("Power consumed: %g Watt",energy/runtime))
+                end
+            end
+            if i < #sockets then print("") end
+        end
+        print(likwid.hline)
+    else
+        err = os.execute(execString)
+        if err == false or err == nil then  -- Lua 5.2+ os.execute returns nil (not false) when the command fails
+            print(string.format("Failed to execute %s!",execString))
+            likwid.putPowerInfo()
+            likwid.finalize()
+            os.exit(1)
+        end
+    end
+end
+
+if print_temp and (string.find(cpuinfo["features"],"TM2") ~= nil) then
+    print(likwid.hline)
+    print("Current core temperatures:");
+    for i=1,cputopo["numSockets"] do
+        local tag = "S" .. tostring(i-1)
+        for _, domain in pairs(affinity["domains"]) do
+            if domain["tag"] == tag then
+                for j=1,#domain["processorList"] do
+                    local cpuid = domain["processorList"][j]
+                    likwid.initTemp(cpuid);
+                    if (fahrenheit) then
+                        local f = 1.8*tonumber(likwid.readTemp(cpuid))+32
+                        print(string.format("Socket %d Core %d: %.0f F",i-1,cpuid, f));
+                    else
+                        print(string.format("Socket %d Core %d: %.0f C",i-1,cpuid, tonumber(likwid.readTemp(cpuid))));
+                    end
+                end
+            end
+        end
+    end
+    print(likwid.hline)
+end
+
+likwid.putPowerInfo()
+likwid.finalize()
diff --git a/src/applications/likwid-setFrequencies.lua b/src/applications/likwid-setFrequencies.lua
new file mode 100644
index 0000000..7a56921
--- /dev/null
+++ b/src/applications/likwid-setFrequencies.lua
@@ -0,0 +1,396 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-setFrequencies.lua
+ *
+ *      Description:  An application to set the CPU frequency of CPU cores and domains.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+sys_base_path = "/sys/devices/system/cpu"
+set_command = "<INSTALLED_PREFIX>/sbin/likwid-setFreq"
+
+
+function version()  -- print the tool name followed by likwid.version and likwid.release
+    print(string.format("likwid-setFrequencies --  Version %d.%d",likwid.version,likwid.release))
+end
+
+function usage()  -- print version line and option summary
+    version()
+    print("A tool to adjust frequencies and governors on x86 CPUs.\n")
+    print("Options:")
+    print("-h\t Help message")
+    print("-v\t Version information")
+    print("-c dom\t Likwid thread domain which to apply settings (default are all CPUs)")
+    print("\t See likwid-pin -h for details")
+    print("-g gov\t Set governor (" .. table.concat(getAvailGovs(nil), ", ") .. ") (set to ondemand if omitted)")  -- governor list is read from sysfs at call time
+    print("-f freq\t Set fixed frequency, implicitly sets userspace governor")
+    print("-p\t Print current frequencies")
+    print("-l\t List available frequencies")
+    print("-m\t List available governors")
+end
+
+function getCurrentMinFreq(cpuid)
+    -- Return the lowest scaling_min_freq (sysfs value divided by 1E6) of CPU `cpuid`,
+    -- or of all hardware threads when cpuid is nil or negative. CPUs whose sysfs
+    -- file cannot be read are skipped; 10000000 is returned if none was readable.
+    local min = 10000000
+    -- first..last is the inclusive range of CPU ids to scan
+    local first, last = cpuid, cpuid
+    if cpuid == nil or cpuid < 0 then
+        first, last = 0, topo["numHWThreads"]-1
+    end
+    for id=first,last do
+        local fname = sys_base_path .. "/" .. string.format("cpu%d",id) .. "/cpufreq/scaling_min_freq"
+        local fp = io.open(fname)
+        if verbosity == 3 then
+            print("Reading "..fname)
+        end
+        -- io.open returns nil when the file is missing; guard instead of failing on fp:read
+        if fp then
+            local freq = tonumber(fp:read("*l"))
+            fp:close()
+            if freq and freq/1E6 < min then
+                min = freq/1E6
+            end
+        end
+    end
+    return min
+end
+
+function getCurrentMaxFreq(cpuid)
+    -- Return the highest scaling_max_freq (sysfs value divided by 1E6) of CPU `cpuid`,
+    -- or of all hardware threads when cpuid is nil or negative. CPUs whose sysfs
+    -- file cannot be read are skipped; 0 is returned if none was readable.
+    local max = 0
+    -- first..last is the inclusive range of CPU ids to scan
+    local first, last = cpuid, cpuid
+    if cpuid == nil or cpuid < 0 then
+        first, last = 0, topo["numHWThreads"]-1
+    end
+    for id=first,last do
+        local fname = sys_base_path .. "/" .. string.format("cpu%d",id) .. "/cpufreq/scaling_max_freq"
+        local fp = io.open(fname)
+        if verbosity == 3 then
+            print("Reading "..fname)
+        end
+        -- io.open returns nil when the file is missing; guard instead of failing on fp:read
+        if fp then
+            local freq = tonumber(fp:read("*l"))
+            fp:close()
+            if freq and freq/1E6 > max then
+                max = freq/1E6
+            end
+        end
+    end
+    return max
+end
+
+
+function getAvailFreq(cpuid)
+    -- Return (list of available frequencies as strings, turbo frequency as string)
+    -- for `cpuid` (CPU 0 if nil or negative); values are the sysfs numbers divided by 1E6.
+    -- The first sysfs entry is taken as turbo. Returns {}, "0" if the list is unreadable.
+    if cpuid == nil or cpuid < 0 then
+        cpuid = 0
+    end
+    local fname = sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_available_frequencies"
+    local fp = io.open(fname)
+    if verbosity == 3 then
+        print("Reading "..fname)
+    end
+    local line = fp and fp:read("*l")
+    if fp then fp:close() end
+    if not line or not line:match("%d") then return {}, "0" end
+    local tmp = likwid.stringsplit(line:gsub("^%s*(.-)%s*$", "%1"), " ", nil, " ")
+    local avail = {}
+    local turbo = tonumber(tmp[1])/1E6
+    for i=2,#tmp do
+        local freq = tostring(tonumber(tmp[i])/1E6)
+        -- "%." matches a literal dot; a bare "." would match any character
+        if not freq:match("%d+%.%d+") then
+            freq = freq ..".0"
+        end
+        avail[#avail+1] = freq
+    end
+    if verbosity == 1 then
+        print(string.format("The system provides %d scaling frequencies, frequency %s is taken as turbo mode", #avail,turbo))
+    end
+    return avail, tostring(turbo)
+end
+
+function getCurFreq()
+    -- Return two tables indexed by CPU id (starting at 0): current frequency string
+    -- (sysfs value / 1E6) and current governor; unreadable files yield "0.0" / "unknown".
+    local freqs = {}
+    local govs = {}
+    for cpuid=0,topo["numHWThreads"]-1 do
+        local base = sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/"
+        for _, name in ipairs({"scaling_cur_freq", "scaling_governor"}) do
+            local fp = io.open(base .. name)
+            if verbosity == 3 then
+                print("Reading "..base .. name)
+            end
+            local line = fp and fp:read("*l")
+            if fp then fp:close() end
+            if name == "scaling_governor" then
+                govs[cpuid] = line or "unknown"
+            else
+                local freq = tostring((tonumber(line) or 0)/1E6)
+                freqs[cpuid] = freq:match("%d%.%d") and freq or (freq .. ".0")  -- "%." = literal dot
+            end
+        end
+    end
+    return freqs, govs
+end
+
+function getAvailGovs(cpuid)
+    -- Return the governors selectable for `cpuid` (CPU 0 if nil or < 1): the sysfs list
+    -- without "userspace", plus the pseudo governor "turbo".
+    if (cpuid == nil) or (cpuid < 1) then
+        cpuid = 0
+    end
+    local fname = sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_available_governors"
+    local fp = io.open(fname)
+    if verbosity == 3 then
+        print("Reading "..fname)
+    end
+    local avail = {}
+    local line = fp and fp:read("*l")
+    if fp then fp:close() end
+    -- usage() calls this function, so an unreadable file must not abort the help output
+    if line then avail = likwid.stringsplit(line:gsub("^%s*(.-)%s*$", "%1"), "%s+", nil, "%s+") end
+    for i=1,#avail do
+        if avail[i] == "userspace" then table.remove(avail, i) break end
+    end
+    table.insert(avail, "turbo")
+    if verbosity == 1 then print(string.format("The system provides %d scaling governors", #avail)) end
+    return avail
+end
+
+local function testDriver()
+    -- Return true only if cpu0's scaling_driver is "acpi-cpufreq"; an unreadable file counts as unsupported.
+    local fname = sys_base_path .. "/" .. string.format("cpu%d",0) .. "/cpufreq/scaling_driver"
+    local fp = io.open(fname)
+    if verbosity == 3 then
+        print("Reading "..fname)
+    end
+    if not fp then return false end  -- io.open returns nil if the file cannot be opened
+    local line = fp:read("*l")
+    fp:close()
+    return line == "acpi-cpufreq"
+end
+
+verbosity = 0
+governor = nil
+frequency = nil
+domain = nil
+printCurFreq = false
+printAvailFreq = false
+printAvailGovs = false
+
+if #arg == 0 then
+    usage()
+    os.exit(0)
+end
+
+
+for opt,arg in likwid.getopt(arg, {"g:", "c:", "f:", "l", "p", "h", "v", "m", "help","version","freq:"}) do
+    if opt == "h" or opt == "help" then
+        usage()
+        os.exit(0)
+    elseif opt == "v" or opt == "version" then
+        version()
+        os.exit(0)
+    elseif (opt == "c") then
+        domain = arg
+    elseif (opt == "g") then
+        governor = arg
+    elseif opt == "f" or opt == "freq" then
+        frequency = arg
+    elseif (opt == "p") then
+        printCurFreq = true
+    elseif (opt == "l") then
+        printAvailFreq = true
+    elseif (opt == "m") then
+        printAvailGovs = true
+    elseif opt == "?" then
+        print("Invalid commandline option -"..arg)
+        os.exit(1)
+    elseif opt == "!" then
+        print("Option requires an argument")
+        os.exit(1)
+    end
+end
+if not testDriver() then
+    print("The system does not use the acpi-cpufreq driver, other drivers are not usable with likwid-setFrequencies.")
+    os.exit(1)
+end
+
+topo = likwid.getCpuTopology()
+affinity = likwid.getAffinityInfo()
+if not domain or domain == "N" then
+    domain = "N:0-" .. tostring(topo["numHWThreads"]-1)
+end
+if domain:match("[SCM]%d") then
+    for i, dom in pairs(affinity["domains"]) do
+        if dom["tag"]:match(domain) then
+            domain = domain..":0-"..tostring(dom["numberOfProcessors"]-1)
+        end
+    end
+end
+cpulist = {}
+numthreads, cpulist = likwid.cpustr_to_cpulist(domain)
+if verbosity == 3 then
+    print(string.format("Given CPU expression expands to %d CPU cores:", numthreads))
+    local str = tostring(cpulist[1])
+    for i=2, numthreads  do
+        str = str .. "," .. tostring(cpulist[i])
+    end
+    print(str)
+end
+
+
+if printAvailGovs then
+    local govs = getAvailGovs(nil)
+    print("Available governors:")
+    print(table.concat(govs, ", "))
+end
+
+if printAvailFreq then
+    print("Available frequencies:")
+    local out = {}
+    local i = 1;
+    local freqs, turbo = getAvailFreq(nil)
+    if turbo ~= "0" then
+        table.insert(out, turbo)
+    end
+    for i=1,#freqs do
+        table.insert(out, freqs[i])
+    end
+
+    print(table.concat(out, " "))
+end
+
+if printCurFreq then
+    print("Current frequencies:")
+    local freqs = {}
+    local govs = {}
+    freqs, govs = getCurFreq()
+    for i=1,#cpulist do
+        print(string.format("CPU %d: governor %12s frequency %5s GHz",cpulist[i],govs[cpulist[i]], freqs[cpulist[i]]))
+    end
+end
+
+if printAvailGovs or printAvailFreq or printCurFreq then
+    os.exit(0)
+end
+
+if numthreads > 0 and not (frequency or governor) then
+    print("You need to set either a frequency or governor for the selected CPUs on commandline")
+    os.exit(1)
+end
+
+if frequency then
+    for i=1,#cpulist do
+        local freqs, turbo = getAvailFreq(cpulist[i])
+        local valid_freq = false
+        for k,v in pairs(freqs) do
+            if (frequency == v) then
+                valid_freq = true
+                break
+            end
+        end
+        if frequency == turbo then
+            valid_freq = true
+        end
+        if not valid_freq then
+            print(string.format("Frequency %s not available for CPU %d! Please select one of\n%s", frequency, cpulist[i], table.concat(freqs, ", ")))
+            os.exit(1)
+        end
+    
+        local cmd = set_command .. " " .. tostring(cpulist[i]) .. " " .. tostring(tonumber(frequency)*1E6)
+        if governor then
+            cmd = cmd .. " " .. governor
+        end
+        if verbosity == 3 then
+            print("Execute: ".. cmd)
+        end
+        local err = os.execute(cmd)
+        if err == false or err == nil then
+            print("Failed to set frequency for CPU "..tostring(cpulist[i]))
+        end
+    end
+    if governor then
+        governor = nil
+    end
+end
+
+if governor then
+    local govs = getAvailGovs(nil)
+    local freqs, turbo = getAvailFreq(nil)
+    local cur_freqs, cur_govs = getCurFreq()
+    local valid_gov = false
+    for k,v in pairs(govs) do
+        if (governor == v) then
+            valid_gov = true
+            break
+        end
+    end
+    if governor == "turbo" and turbo ~= "0" then
+        valid_gov = true
+        for i=1,#cpulist do
+            cur_freqs[cpulist[i]] = turbo
+        end
+    end
+    if not valid_gov then
+        print(string.format("Governor %s not available! Please select one of\n%s", governor, table.concat(govs, ", ")))
+        os.exit(1)
+    end
+    for i=1,#cpulist do
+        if governor ~= cur_govs[cpulist[i]] then
+            local cmd = set_command .. " " .. tostring(cpulist[i]) .. " "
+            if governor == "turbo" then
+                cmd = cmd .. tostring(tonumber(turbo)*1E6)
+            else
+                cmd = cmd .. tostring(tonumber(cur_freqs[cpulist[i]])*1E6) .. " " .. governor
+            end
+            if verbosity == 3 then
+                print("Execute: ".. cmd)
+            end
+            local err = os.execute(cmd)
+            if err == false or err == nil then
+                print("Failed to set governor for CPU "..tostring(cpulist[i]))
+            end
+        end
+    end
+end
+likwid.putAffinityInfo()
+likwid.putTopology()
+os.exit(0)
diff --git a/src/applications/likwid-topology.c b/src/applications/likwid-topology.c
deleted file mode 100644
index 7ba0e33..0000000
--- a/src/applications/likwid-topology.c
+++ /dev/null
@@ -1,509 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  likwid-topology.c
- *
- *      Description:  A application to determine the thread and cache topology
- *                    on x86 processors.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-/* #####   HEADER FILE INCLUDES   ######################################### */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <types.h>
-#include <error.h>
-#include <cpuid.h>
-#include <timer.h>
-#include <affinity.h>
-#include <numa.h>
-#include <cpuFeatures.h>
-#include <tree.h>
-#include <asciiBoxes.h>
-#include <strUtil.h>
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-#define HELP_MSG \
-    fprintf(OUTSTREAM, "\nlikwid-topology --  Version %d.%d \n\n",VERSION,RELEASE); \
-    fprintf(OUTSTREAM, "A tool to print the thread and cache topology on x86 CPUs.\n"); \
-    fprintf(OUTSTREAM, "Options:\n"); \
-    fprintf(OUTSTREAM, "-h\t Help message\n"); \
-    fprintf(OUTSTREAM, "-v\t Version information\n"); \
-    fprintf(OUTSTREAM, "-c\t list cache information\n"); \
-    fprintf(OUTSTREAM, "-C\t measure processor clock\n"); \
-    fprintf(OUTSTREAM, "-o\t Store output to file, with output conversion according to file suffix\n"); \
-    fprintf(OUTSTREAM, "\t Conversion scripts can be supplied in %s\n",TOSTRING(LIKWIDFILTERPATH)); \
-    fprintf(OUTSTREAM, "-g\t graphical output\n\n"); \
-    fflush(OUTSTREAM);
-
-#define VERSION_MSG \
-    fprintf(OUTSTREAM, "likwid-topology  %d.%d \n\n",VERSION,RELEASE); \
-    fflush(OUTSTREAM);
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-int main (int argc, char** argv)
-{
-    int optGraphical = 0;
-    int optCaches = 0;
-    int optClock = 0;
-    int c;
-    int tmp;
-    TreeNode* socketNode;
-    TreeNode* coreNode;
-    TreeNode* threadNode;
-    BoxContainer* container;
-    bstring  argString;
-    bstring  filterScript = bfromcstr("NO");
-    FILE* OUTSTREAM = stdout;
-
-    while ((c = getopt (argc, argv, "hvcCgo:")) != -1)
-    {
-        switch (c)
-        {
-            case 'h':
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-            case 'v':
-                VERSION_MSG;
-                exit (EXIT_SUCCESS);
-            case 'g':
-                optGraphical = 1;
-                break;
-            case 'c':
-                optCaches = 1;
-                break;
-            case 'C':
-                optClock = 1;
-                break;
-            case 'o':
-                if (! (argString = bSecureInput(200,optarg)))
-                {
-                    fprintf(stderr, "Failed to read argument string!\n");
-                }
-
-                OUTSTREAM = bstr_to_outstream(argString, filterScript);
-
-                if(!OUTSTREAM)
-                {
-                    fprintf(stderr, "Failed to parse out file pattern.\n");
-                }
-                break;
-            case '?':
-                if (isprint (optopt))
-                {
-                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
-                }
-                else
-                {
-                    fprintf (stderr,
-                            "Unknown option character `\\x%x'.\n",
-                            optopt);
-                }
-                return EXIT_FAILURE;
-            default:
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-        }
-    }
-
-    if (cpuid_init() == EXIT_FAILURE)
-    {
-        ERROR_PLAIN_PRINT(Unsupported processor!);
-    }
-    affinity_init();
-    numa_init();
-
-    fprintf(OUTSTREAM, HLINE);
-    fprintf(OUTSTREAM, "CPU type:\t%s\n",cpuid_info.name);
-
-    if (optClock)
-    {
-        timer_init();
-        fprintf(OUTSTREAM, "CPU clock:\t%3.2f GHz\n",  (float) timer_getCpuClock() * 1.E-09);
-    }
-
-    /*----------------------------------------------------------------------
-     *  Thread Topology
-     *----------------------------------------------------------------------*/
-    fprintf(OUTSTREAM, SLINE);
-    fprintf(OUTSTREAM, "Hardware Thread Topology\n");
-    fprintf(OUTSTREAM, SLINE);
-    fprintf(OUTSTREAM, "Sockets:\t%u \n", cpuid_topology.numSockets);
-    fprintf(OUTSTREAM, "Cores per socket:\t%u \n", cpuid_topology.numCoresPerSocket);
-    fprintf(OUTSTREAM, "Threads per core:\t%u \n", cpuid_topology.numThreadsPerCore);
-    fprintf(OUTSTREAM, HLINE);
-    fprintf(OUTSTREAM, "HWThread\tThread\t\tCore\t\tSocket\n");
-
-    for ( uint32_t i=0; i <  cpuid_topology.numHWThreads; i++)
-    {
-        fprintf(OUTSTREAM, "%d\t\t%u\t\t%u\t\t%u\n",i
-                ,cpuid_topology.threadPool[i].threadId
-                ,cpuid_topology.threadPool[i].coreId
-                ,cpuid_topology.threadPool[i].packageId);
-    }
-    fprintf(OUTSTREAM, HLINE);
-
-    socketNode = tree_getChildNode(cpuid_topology.topologyTree);
-    while (socketNode != NULL)
-    {
-        fprintf(OUTSTREAM, "Socket %d: ( ",socketNode->id);
-        coreNode = tree_getChildNode(socketNode);
-
-        while (coreNode != NULL)
-        {
-            threadNode = tree_getChildNode(coreNode);
-
-            while (threadNode != NULL)
-            {
-                fprintf(OUTSTREAM, "%d ",threadNode->id);
-                threadNode = tree_getNextNode(threadNode);
-            }
-            coreNode = tree_getNextNode(coreNode);
-        }
-        socketNode = tree_getNextNode(socketNode);
-        fprintf(OUTSTREAM, ")\n");
-    }
-    fprintf(OUTSTREAM, HLINE"\n");
-    fflush(OUTSTREAM);
-
-    /*----------------------------------------------------------------------
-     *  Cache Topology
-     *----------------------------------------------------------------------*/
-    fprintf(OUTSTREAM, SLINE);
-    fprintf(OUTSTREAM, "Cache Topology\n");
-    fprintf(OUTSTREAM, SLINE);
-
-    for ( uint32_t i=0; i <  cpuid_topology.numCacheLevels; i++)
-    {
-        if (cpuid_topology.cacheLevels[i].type != INSTRUCTIONCACHE)
-        {
-            fprintf(OUTSTREAM, "Level:\t%d\n",cpuid_topology.cacheLevels[i].level);
-            if (cpuid_topology.cacheLevels[i].size < 1048576)
-            {
-                fprintf(OUTSTREAM, "Size:\t%d kB\n",
-                        cpuid_topology.cacheLevels[i].size/1024);
-            }
-            else
-            {
-                fprintf(OUTSTREAM, "Size:\t%d MB\n",
-                        cpuid_topology.cacheLevels[i].size/1048576);
-            }
-
-            if( optCaches)
-            {
-                switch (cpuid_topology.cacheLevels[i].type) {
-                    case DATACACHE:
-                        fprintf(OUTSTREAM, "Type:\tData cache\n");
-                        break;
-
-                    case INSTRUCTIONCACHE:
-                        fprintf(OUTSTREAM, "Type:\tInstruction cache\n");
-                        break;
-
-                    case UNIFIEDCACHE:
-                        fprintf(OUTSTREAM, "Type:\tUnified cache\n");
-                        break;
-                    default:
-                        /* make the compiler happy */
-                        break;
-                }
-                fprintf(OUTSTREAM, "Associativity:\t%d\n",
-                        cpuid_topology.cacheLevels[i].associativity);
-                fprintf(OUTSTREAM, "Number of sets:\t%d\n",
-                        cpuid_topology.cacheLevels[i].sets);
-                fprintf(OUTSTREAM, "Cache line size:\t%d\n",
-                        cpuid_topology.cacheLevels[i].lineSize);
-                if(cpuid_topology.cacheLevels[i].inclusive)
-                {
-                    fprintf(OUTSTREAM, "Non Inclusive cache\n");
-                }
-                else
-                {
-                    fprintf(OUTSTREAM, "Inclusive cache\n");
-                }
-                fprintf(OUTSTREAM, "Shared among %d threads\n",
-                        cpuid_topology.cacheLevels[i].threads);
-            }
-            fprintf(OUTSTREAM, "Cache groups:\t");
-            tmp = cpuid_topology.cacheLevels[i].threads;
-            socketNode = tree_getChildNode(cpuid_topology.topologyTree);
-            fprintf(OUTSTREAM, "( ");
-            while (socketNode != NULL)
-            {
-                coreNode = tree_getChildNode(socketNode);
-
-                while (coreNode != NULL)
-                {
-                    threadNode = tree_getChildNode(coreNode);
-
-                    while (threadNode != NULL)
-                    {
-
-                        if (tmp)
-                        {
-                            fprintf(OUTSTREAM, "%d ",threadNode->id);
-                            tmp--;
-                        }
-                        else
-                        {
-                            fprintf(OUTSTREAM, ") ( %d ",threadNode->id);
-                            tmp = cpuid_topology.cacheLevels[i].threads;
-                            tmp--;
-                        }
-
-                        threadNode = tree_getNextNode(threadNode);
-                    }
-                    coreNode = tree_getNextNode(coreNode);
-                }
-                socketNode = tree_getNextNode(socketNode);
-            }
-            fprintf(OUTSTREAM, ")\n");
-
-            fprintf(OUTSTREAM, HLINE);
-        }
-    }
-
-    fprintf(OUTSTREAM, "\n");
-    fflush(OUTSTREAM);
-
-    /*----------------------------------------------------------------------
-     *  NUMA Topology
-     *----------------------------------------------------------------------*/
-    fprintf(OUTSTREAM, SLINE);
-    fprintf(OUTSTREAM, "NUMA Topology\n");
-    fprintf(OUTSTREAM, SLINE);
-
-    if (numa_init() < 0)
-    {
-        fprintf(OUTSTREAM, "NUMA is not supported on this node!\n");
-    }
-    else
-    {
-        fprintf(OUTSTREAM, "NUMA domains: %d \n",numa_info.numberOfNodes);
-        fprintf(OUTSTREAM, HLINE);
-
-        for ( uint32_t i = 0; i < numa_info.numberOfNodes; i++)
-        {
-            fprintf(OUTSTREAM, "Domain %d:\n", i);
-            fprintf(OUTSTREAM, "Processors: ");
-
-            for ( int j = 0; j < numa_info.nodes[i].numberOfProcessors; j++)
-            {
-                fprintf(OUTSTREAM, " %d",numa_info.nodes[i].processors[j]);
-            }
-            fprintf(OUTSTREAM, "\n");
-
-            fprintf(OUTSTREAM, "Relative distance to nodes: ");
-
-            for ( int j = 0; j < numa_info.nodes[i].numberOfDistances; j++)
-            {
-                fprintf(OUTSTREAM, " %d",numa_info.nodes[i].distances[j]);
-            }
-            fprintf(OUTSTREAM, "\n");
-
-            fprintf(OUTSTREAM, "Memory: %g MB free of total %g MB\n",
-                    numa_info.nodes[i].freeMemory/1024.0, numa_info.nodes[i].totalMemory/1024.0);
-            fprintf(OUTSTREAM, HLINE);
-        }
-    }
-    fprintf(OUTSTREAM, "\n");
-    fflush(OUTSTREAM);
-
-    /*----------------------------------------------------------------------
-     *  Graphical topology
-     *----------------------------------------------------------------------*/
-    if(optGraphical)
-    {
-        int j;
-        bstring  boxLabel = bfromcstr("0");
-
-        fprintf(OUTSTREAM, SLINE);
-        fprintf(OUTSTREAM, "Graphical:\n");
-        fprintf(OUTSTREAM, SLINE);
-
-        /* Allocate without instruction cache */
-        if ( cpuid_info.family == P6_FAMILY || cpuid_info.family == MIC_FAMILY )
-        {
-            container = asciiBoxes_allocateContainer(
-                    cpuid_topology.numCacheLevels,
-                    cpuid_topology.numCoresPerSocket);
-        }
-        else
-        {
-            container = asciiBoxes_allocateContainer(
-                    cpuid_topology.numCacheLevels+1,
-                    cpuid_topology.numCoresPerSocket);
-        }
-
-        socketNode = tree_getChildNode(cpuid_topology.topologyTree);
-        while (socketNode != NULL)
-        {
-            fprintf(OUTSTREAM, "Socket %d:\n",socketNode->id);
-            j=0;
-            coreNode = tree_getChildNode(socketNode);
-
-            /* add threads */
-            while (coreNode != NULL)
-            {
-                threadNode = tree_getChildNode(coreNode);
-                tmp =0;
-
-                while (threadNode != NULL)
-                {
-                    if (tmp > 0)
-                    {
-                        bformata(boxLabel,"  %d", threadNode->id);
-                    }
-                    else
-                    {
-                        boxLabel = bformat("%d",threadNode->id);
-                    }
-                    tmp++;
-                    threadNode = tree_getNextNode(threadNode);
-                }
-                asciiBoxes_addBox(container, 0, j, boxLabel);
-                j++;
-                coreNode = tree_getNextNode(coreNode);
-            }
-
-            /* add caches */
-            {
-                int columnCursor=0;
-                int lineCursor=1;
-                uint32_t sharedCores;
-                int numCachesPerLevel;
-                int cacheWidth;
-
-                for ( uint32_t i=0; i < cpuid_topology.numCacheLevels; i++ )
-                {
-                    sharedCores = cpuid_topology.cacheLevels[i].threads /
-                        cpuid_topology.numThreadsPerCore;
-
-                    if (cpuid_topology.cacheLevels[i].type != INSTRUCTIONCACHE)
-                    {
-                        if ( sharedCores > cpuid_topology.numCoresPerSocket )
-                        {
-                            numCachesPerLevel = 1;
-                        }
-                        else
-                        {
-                            numCachesPerLevel =
-                                cpuid_topology.numCoresPerSocket/sharedCores;
-                        }
-
-                        columnCursor=0;
-                        for ( j=0; j < numCachesPerLevel; j++ )
-                        {
-                            if (cpuid_topology.cacheLevels[i].size < 1048576)
-                            {
-                                boxLabel = bformat("%dkB",
-                                        cpuid_topology.cacheLevels[i].size/1024);
-                            }
-                            else
-                            {
-                                boxLabel = bformat("%dMB",
-                                        cpuid_topology.cacheLevels[i].size/1048576);
-                            }
-
-                            if (sharedCores > 1)
-                            {
-                                if (sharedCores > cpuid_topology.numCoresPerSocket)
-                                {
-                                    cacheWidth = cpuid_topology.numCoresPerSocket-1;
-                                }
-                                else
-                                {
-                                    cacheWidth = sharedCores-1;
-                                }
-                                asciiBoxes_addJoinedBox(
-                                        container,
-                                        lineCursor,
-                                        columnCursor,
-                                        columnCursor+cacheWidth,
-                                        boxLabel);
-
-                                columnCursor += sharedCores;
-                            }
-                            else
-                            {
-                                asciiBoxes_addBox(
-                                        container,
-                                        lineCursor,
-                                        columnCursor,
-                                        boxLabel);
-
-                                columnCursor++;
-                            }
-
-                        }
-                        lineCursor++;
-                    }
-                }
-            }
-
-            asciiBoxes_print(OUTSTREAM, container);
-            socketNode = tree_getNextNode(socketNode);
-        }
-        bdestroy(boxLabel);
-    }
-
-    fflush(OUTSTREAM);
-
-    /* call filterscript if specified */
-    if (!biseqcstr(filterScript,"NO"))
-    {
-        struct bstrList* tokens;
-        tokens = bsplit(filterScript,' ');
-        if (access(bdata(tokens->entry[0]), F_OK))
-        {
-            fprintf(stderr, "Cannot find filter %s!\n", bdata(tokens->entry[0]));
-            bstrListDestroy(tokens);
-            exit(EXIT_FAILURE);
-        }
-        if (access(bdata(tokens->entry[0]), X_OK))
-        {
-            fprintf(stderr, "Cannot execute filter %s!\n", bdata(tokens->entry[0]));
-            bstrListDestroy(tokens);
-            exit(EXIT_FAILURE);
-        }
-        bstrListDestroy(tokens);
-        bcatcstr(filterScript, " topology");
-
-        if (system(bdata(filterScript)) == EOF)
-        {
-            fprintf(stderr, "Failed to execute filter %s!\n", bdata(filterScript));
-            exit(EXIT_FAILURE);
-        }
-    }
-
-    return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-topology.lua b/src/applications/likwid-topology.lua
new file mode 100644
index 0000000..0123f65
--- /dev/null
+++ b/src/applications/likwid-topology.lua
@@ -0,0 +1,394 @@
+#!<INSTALLED_BINPREFIX>/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-topology.lua
+ *
+ *      Description:  An application to determine the thread and cache topology
+ *                    on x86 processors.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+package.path = '<INSTALLED_PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+stdout_print = print
+
+function version() -- Emit the one-line version banner through whatever print() is currently active
+    print(("likwid-topology --  Version %d.%d"):format(likwid.version, likwid.release))
+end
+
+function usage() -- Print the version banner followed by the option summary, one print() call per line
+    version()
+    local help = {"A tool to print the thread and cache topology on x86 CPUs.\n", "Options:",
+        "-h, --help\t\t Help message",
+        "-v, --version\t\t Version information",
+        "-V, --verbose <level>\t Set verbosity",
+        "-c, --caches\t\t List cache information",
+        "-C, --clock\t\t Measure processor clock",
+        "-O\t\t\t CSV output",
+        "-o, --output <file>\t Store output to file. (Optional: Apply text filter)",
+        "-g\t\t\t Graphical output"}
+    for _,text in ipairs(help) do print(text) end
+end
+
+print_caches = false -- option flags and the shared output buffer; all of them are script-wide globals
+print_graphical = false
+measure_clock = false
+outfile = nil
+print_csv = false
+output_csv = {}
+for opt,arg in likwid.getopt(arg, {"h","v","c","C","g","o:","V:","O","help","version","verbose:","clock","caches","output:"}) do
+    if (type(arg) == "string") then
+        local s = arg:find("-", 1, true) -- plain find, because "-" is a magic character in Lua patterns
+        if s == 1 then -- an argument starting with "-" is most likely the next option
+            print(string.format("Argument %s to option -%s starts with invalid character -.", arg, opt))
+            print("Did you forget an argument to an option?")
+            os.exit(1)
+        end
+    end
+    if opt == "h" or opt == "help" then
+        usage()
+        os.exit(0)
+    elseif opt == "v" or opt == "version" then
+        version()
+        os.exit(0)
+    elseif opt == "V" or opt == "verbose" then
+        if (tonumber(arg) or -1) >= 0 and tonumber(arg) <= 3 then -- non-numeric input reaches the message below instead of raising
+            likwid.setVerbosity(tonumber(arg))
+        else
+            print("Verbosity level not valid. Must be between 0 (only errors) and 3 (developer output)")
+        end
+    elseif opt == "c" or opt == "caches" then
+        print_caches = true
+    elseif opt == "C" or opt == "clock" then
+        measure_clock = true
+    elseif opt == "g" then
+        print_graphical = true
+    elseif opt == "O" then
+        print_csv = true
+    elseif opt == "o" or opt == "output" then
+        local suffix = "" -- text after the last "." of the file name, empty if there is no "."
+        if string.match(arg, "%.") then
+            suffix = string.match(arg, ".-[^\\/]-%.?([^%.\\/]*)$")
+        end
+        if suffix ~= "txt" then -- every suffix except "txt" switches to CSV output
+            print_csv = true
+        end
+        outfile = arg:gsub("%%h", likwid.gethostname()) -- "%h" in the file name is replaced by the host name
+        io.output(outfile..".tmp") -- same name as the outfile..".tmp" that is renamed or filtered at the end of the script
+        print = function(...) for k,v in pairs({...}) do io.write(v .. "\n") end end -- from here on print() writes to the temp file
+    elseif opt == "?" then
+        print("Invalid commandline option -"..arg)
+        os.exit(1)
+    elseif opt == "!" then
+        print("Option requires an argument")
+        os.exit(1)
+    end
+end
+
+local config = likwid.getConfiguration() -- the five library queries run in their original order
+local cpuinfo = likwid.getCpuInfo()
+local cputopo = likwid.getCpuTopology()
+local numainfo = likwid.getNumaInfo()
+local affinity = likwid.getAffinityInfo()
+
+
+-- General CPU information. All lines are collected in output_csv first; the
+-- "STRUCT,<name>,<count>" marker carries the number of detail lines that
+-- follow it (three, or four when the clock is reported as well).
+table.insert(output_csv, likwid.hline)
+local lines = measure_clock and 4 or 3
+table.insert(output_csv, "STRUCT,Info,"..tostring(lines))
+for _,field in ipairs({ {"CPU name:\t%s","osname"}, {"CPU type:\t%s","name"}, {"CPU stepping:\t%s","stepping"} }) do
+    table.insert(output_csv, string.format(field[1], cpuinfo[field[2]]))
+end
+if measure_clock then
+    local hertz = cpuinfo["clock"]
+    if hertz == 0 then -- a stored clock of 0 is replaced by the value from likwid.getCpuClock()
+        hertz = likwid.getCpuClock()
+    end
+    table.insert(output_csv, string.format("CPU clock:\t%3.2f GHz", hertz * 1.E-09))
+end
+
+do -- Hardware thread overview: three summary lines followed by one table row per hardware thread
+    local function emit(text) table.insert(output_csv, text) end
+    emit(likwid.sline)
+    emit("STRUCT,Hardware Thread Topology,3")
+    emit("Hardware Thread Topology")
+    emit(likwid.sline)
+    emit(string.format("Sockets:\t\t%u",cputopo["numSockets"]))
+    emit(string.format("Cores per socket:\t%u",cputopo["numCoresPerSocket"]))
+    emit(string.format("Threads per core:\t%u",cputopo["numThreadsPerCore"]))
+    emit(likwid.hline)
+    emit("TABLE,Topology,"..tostring(cputopo["numHWThreads"]))
+    emit("HWThread\tThread\t\tCore\t\tSocket\t\tAvailable")
+
+    for hwt=0,cputopo["numHWThreads"]-1 do
+        local entry = cputopo["threadPool"][hwt]
+        -- Rows of threads flagged inCpuSet carry an extra "*" in the
+        -- "Available" column; all other rows leave that column empty.
+        local rowfmt = "%d\t\t%u\t\t%u\t\t%u"
+        if entry["inCpuSet"] then
+            rowfmt = "%d\t\t%u\t\t%u\t\t%u\t\t*"
+        end
+        emit(string.format(rowfmt, hwt, entry["threadId"], entry["coreId"], entry["packageId"]))
+    end
+    emit(likwid.hline)
+end
+
+table.insert(output_csv, "STRUCT,Sockets,"..tostring(cputopo["numSockets"])) -- one line per socket listing its hardware thread IDs
+for sock=0,cputopo["numSockets"]-1 do
+    local node = cputopo["topologyTree"][sock]
+    local pieces = { string.format("Socket %d:\t\t( ",node["ID"]) }
+    for c=0,cputopo["numCoresPerSocket"]-1 do
+        for t=0,cputopo["numThreadsPerCore"]-1 do pieces[#pieces+1] = tostring(node["Childs"][c]["Childs"][t]).. "," end
+    end
+    csv_str = table.concat(pieces) -- csv_str stays a global; its last character is cut off on the next line
+    table.insert(output_csv, csv_str:sub(1,#csv_str-1).." )")
+end
+
+table.insert(output_csv, likwid.hline)
+
+
+table.insert(output_csv, likwid.sline) -- Cache topology section: one STRUCT per cache level that is not an instruction cache
+table.insert(output_csv, "Cache Topology")
+table.insert(output_csv, likwid.sline)
+
+for level=1,cputopo["numCacheLevels"] do
+    if (cputopo["cacheLevels"][level]["type"] ~= "INSTRUCTIONCACHE") then
+        lines = 3 -- number of detail lines announced in the STRUCT marker: 3 by default, 9 with -c/--caches
+        if print_caches then lines = 9 end
+        table.insert(output_csv, string.format("STRUCT,Cache Topology L%d,%d", cputopo["cacheLevels"][level]["level"],lines))
+        table.insert(output_csv, string.format("Level:\t\t\t%d",cputopo["cacheLevels"][level]["level"]))
+        if (cputopo["cacheLevels"][level]["size"] < 1048576) then -- sizes below 1048576 are printed in kB, larger ones in MB
+            table.insert(output_csv, string.format("Size:\t\t\t%.0f kB",cputopo["cacheLevels"][level]["size"]/1024))
+        else
+            table.insert(output_csv, string.format("Size:\t\t\t%.0f MB",cputopo["cacheLevels"][level]["size"]/1048576))
+        end
+        
+        if (print_caches) then
+            if (cputopo["cacheLevels"][level]["type"] == "DATACACHE") then -- any other type string adds no "Type:" line
+                table.insert(output_csv, "Type:\t\t\tData cache")
+            elseif (cputopo["cacheLevels"][level]["type"] == "UNIFIEDCACHE") then
+                table.insert(output_csv, "Type:\t\t\tUnified cache")
+            end
+
+            table.insert(output_csv, string.format("Associativity:\t\t%d",cputopo["cacheLevels"][level]["associativity"]))
+            table.insert(output_csv, string.format("Number of sets:\t\t%d",cputopo["cacheLevels"][level]["sets"]))
+            table.insert(output_csv, string.format("Cache line size:\t%d",cputopo["cacheLevels"][level]["lineSize"]))
+            
+            if (cputopo["cacheLevels"][level]["inclusive"] == 0) then
+                table.insert(output_csv, "Cache type:\t\tNon Inclusive")
+            else
+                table.insert(output_csv, "Cache type:\t\tInclusive")
+            end
+            table.insert(output_csv, string.format("Shared by threads:\t%d",cputopo["cacheLevels"][level]["threads"]))
+        end
+        local threads = cputopo["cacheLevels"][level]["threads"] -- countdown of thread IDs still to be placed in the current group
+        str = "Cache groups:\t\t( " -- note: str is assigned without local, i.e. it is a global
+        for socket=0,cputopo["numSockets"]-1 do
+            for core=0,cputopo["numCoresPerSocket"]-1 do
+                for cpu=0,cputopo["numThreadsPerCore"]-1 do
+                    if (threads ~= 0) then
+                        str = str .. cputopo["topologyTree"][socket]["Childs"][core]["Childs"][cpu] .. " "
+                        threads = threads - 1
+                    else -- current group is full: close it, open the next one and restart the countdown
+                        str = str .. string.format(") ( %d ",cputopo["topologyTree"][socket]["Childs"][core]["Childs"][cpu])
+                        threads = cputopo["cacheLevels"][level]["threads"]
+                        threads = threads - 1
+                    end
+                end
+            end
+        end
+        str = str .. ")"
+        table.insert(output_csv, str)
+        table.insert(output_csv, likwid.hline)
+    end
+end
+
+
+table.insert(output_csv, likwid.sline) -- NUMA section: either "No NUMA" or one five-line STRUCT per NUMA domain
+table.insert(output_csv, "NUMA Topology")
+table.insert(output_csv, likwid.sline)
+
+if (numainfo["numberOfNodes"] == 0) then
+    table.insert(output_csv, "No NUMA")
+else
+    table.insert(output_csv, string.format("NUMA domains:\t\t%d",numainfo["numberOfNodes"]))
+    table.insert(output_csv, likwid.hline)
+    for node=1,numainfo["numberOfNodes"] do
+        table.insert(output_csv, string.format("STRUCT,NUMA Topology %d,5",numainfo["nodes"][node]["id"]))
+        table.insert(output_csv, string.format("Domain:\t\t\t%d",numainfo["nodes"][node]["id"]))
+        csv_str = "Processors:\t\t( " -- csv_str is a global that is reused for each list line
+        for cpu=1,numainfo["nodes"][node]["numberOfProcessors"] do
+            csv_str = csv_str .. numainfo["nodes"][node]["processors"][cpu] .. ","
+        end
+        table.insert(output_csv, csv_str:sub(1,#csv_str-1).. " )") -- sub() drops the last character (the trailing comma)
+        csv_str = "Distances:\t\t"
+        for cpu=1,numainfo["nodes"][node]["numberOfDistances"] do
+            csv_str = csv_str .. numainfo["nodes"][node]["distances"][cpu][cpu-1] .. "," -- NOTE(review): entry cpu appears to be a table keyed by cpu-1 - confirm against the binding
+        end
+        table.insert(output_csv, csv_str:sub(1,#csv_str-1))
+        table.insert(output_csv, string.format("Free memory:\t\t%g MB",tonumber(numainfo["nodes"][node]["freeMemory"]/1024.0))) -- NOTE(review): presumably kB converted to MB - confirm the unit
+        table.insert(output_csv, string.format("Total memory:\t\t%g MB",tonumber(numainfo["nodes"][node]["totalMemory"]/1024.0)))
+        table.insert(output_csv, likwid.hline)
+    end
+end
+
+
+
+if print_csv then -- CSV mode: turn the tab/parenthesis layout into comma-separated fields and drop decoration lines
+    longest_line = 0
+    local tmpList = {}
+    for i=#output_csv,1,-1 do -- walk backwards so that table.remove does not shift entries still to be visited
+        output_csv[i] = output_csv[i]:gsub("[\t]+",",")
+        output_csv[i] = output_csv[i]:gsub("%( ","")
+        output_csv[i] = output_csv[i]:gsub(" %)[%s]*",",")
+        output_csv[i] = output_csv[i]:gsub(",$","")
+        if  output_csv[i]:sub(1,1) == "*" or
+            output_csv[i]:sub(1,1) == "-" or
+            output_csv[i]:match("^Hardware Thread Topology") or
+            output_csv[i]:match("^Cache Topology") or
+            output_csv[i]:match("^NUMA Topology") then
+            table.remove(output_csv,i)
+        else -- only kept lines are measured; after a removal output_csv[i] is another entry, or nil for the last line
+            longest_line = math.max(longest_line, #likwid.stringsplit(output_csv[i],","))
+        end
+    end
+    for i=1,#output_csv do -- pad shorter lines with commas up to the field count of the longest line
+        tmpList = likwid.stringsplit(output_csv[i],",")
+        if #tmpList < longest_line then
+            output_csv[i] = output_csv[i]..string.rep(",",longest_line-#tmpList)
+        end
+    end
+else -- text mode: commas become spaces and the TABLE/STRUCT marker lines are removed
+    for i=#output_csv,1,-1 do
+        output_csv[i] = output_csv[i]:gsub(","," ")
+        if output_csv[i]:match("^TABLE") or
+           output_csv[i]:match("^STRUCT") then
+            table.remove(output_csv,i)
+        end
+    end
+end
+
+for _,line in ipairs(output_csv) do print(line) end -- ipairs guarantees the lines are emitted in index order
+
+if print_graphical and not print_csv then -- ASCII box drawing per socket; skipped entirely in CSV mode
+    print("\n")
+    print(likwid.sline)
+    print("Graphical Topology")
+    print(likwid.sline)
+    for socket=0,cputopo["numSockets"]-1 do
+        print(string.format("Socket %d:",cputopo["topologyTree"][socket]["ID"]))
+        container = {} -- a fresh box container for every socket (global variable)
+        for core=0,cputopo["numCoresPerSocket"]-1 do -- box line 1: one box per core labelled with its hardware thread IDs
+            local tmpString = ""
+            for thread=0,cputopo["numThreadsPerCore"]-1 do
+                if thread == 0 then
+                    tmpString = tmpString .. tostring(cputopo["topologyTree"][socket]["Childs"][core]["Childs"][thread])
+                else
+                    tmpString = tmpString .. " " .. tostring(cputopo["topologyTree"][socket]["Childs"][core]["Childs"][thread]).. " "
+                end
+            end
+            likwid.addSimpleAsciiBox(container, 1, core+1, tmpString)
+        end
+        
+        local columnCursor = 1
+        local lineCursor = 2 -- cache boxes start on box line 2, one line per non-instruction cache level
+        for cache=1,cputopo["numCacheLevels"] do
+            if cputopo["cacheLevels"][cache]["type"] ~= "INSTRUCTIONCACHE" then
+                local cachesAtCurLevel = 0
+                local sharedCores = cputopo["cacheLevels"][cache]["threads"]/cputopo["numThreadsPerCore"]
+                if sharedCores >= cputopo["numCoresPerSocket"] then
+                    cachesAtCurLevel = 1
+                else
+                    cachesAtCurLevel = cputopo["numCoresPerSocket"]/sharedCores
+                end
+                columnCursor = 1
+                for cachesAtLevel=1,cachesAtCurLevel do
+                    local tmpString = ""
+                    local cacheWidth = 0
+                    if cputopo["cacheLevels"][cache]["size"] < 1048576 then -- "/" yields a float; floor it so %d always receives a whole number
+                        tmpString = string.format("%dkB", math.floor(cputopo["cacheLevels"][cache]["size"]/1024))
+                    else
+                        tmpString = string.format("%dMB", math.floor(cputopo["cacheLevels"][cache]["size"]/1048576))
+                    end
+                    if sharedCores > 1 then
+                        if sharedCores > cputopo["numCoresPerSocket"] then
+                            cacheWidth = sharedCores -- NOTE(review): the removed C tool used numCoresPerSocket-1 here - confirm against addJoinedAsciiBox
+                        else
+                            cacheWidth = sharedCores - 1
+                        end
+                        likwid.addJoinedAsciiBox(container, lineCursor, columnCursor,columnCursor + cacheWidth, tmpString)
+                        columnCursor = columnCursor + cacheWidth -- NOTE(review): the removed C tool advanced by sharedCores - confirm
+                    else
+                        likwid.addSimpleAsciiBox(container, lineCursor, columnCursor, tmpString)
+                        columnCursor = columnCursor + 1
+                    end
+                end
+                lineCursor = lineCursor + 1
+            end
+        end
+        likwid.printAsciiBox(container);
+    end
+end
+
+if outfile then -- Move the temp file to its final name, or hand it to a suffix-specific filter script
+    local suffix = ""
+    if string.match(outfile, "%.") then
+        suffix = string.match(outfile, ".-[^\\/]-%.?([^%.\\/]*)$")
+    end
+    local command = "<INSTALLED_PREFIX>/share/likwid/filter/" .. suffix
+    local tmpfile = outfile..".tmp"
+    io.output():flush() -- buffered output must reach the temp file before it is renamed or read by a filter
+    if suffix == "" then
+        os.rename(tmpfile, outfile)
+    elseif suffix ~= "txt" and suffix ~= "csv" and likwid.access(command,"x") then
+        stdout_print("Cannot find filter script, save output in CSV format to file "..outfile)
+        os.rename(tmpfile, outfile)
+    else
+        if suffix ~= "txt" and suffix ~= "csv" then
+            command = command .." ".. tmpfile .. " topology"
+            local f = io.popen(command) -- no assert, so a failed popen reaches the fallback branch below
+            if f ~= nil then
+                local o = f:read("*a")
+                f:close()
+                if o:len() > 0 then -- any output from the filter script is treated as a failure
+                    stdout_print(string.format("Failed to execute filter script %s.",command))
+                end
+            else
+                stdout_print("Failed to call filter script, save output in CSV format to file "..outfile)
+                os.rename(tmpfile, outfile) -- the temp file is not removed afterwards: after a failed rename it is the only copy
+            end
+        else
+            os.rename(tmpfile, outfile)
+        end
+    end
+end
+
+likwid.putAffinityInfo() -- NOTE(review): the put* calls presumably release the data fetched by the matching get* calls - confirm
+likwid.putNumaInfo()
+likwid.putTopology()
+likwid.putConfiguration()
+os.exit(0)
diff --git a/src/applications/likwid.lua b/src/applications/likwid.lua
new file mode 100644
index 0000000..a6ffee3
--- /dev/null
+++ b/src/applications/likwid.lua
@@ -0,0 +1,1142 @@
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid.lua
+ *
+ *      Description:  Lua LIKWID interface library
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+
+-- Module table that collects the public Lua API of LIKWID.
+local likwid = {}
+-- Search the LIKWID library folder first when loading C modules.
+-- NOTE(review): the <...> tokens in this section look like placeholders that
+-- are substituted when the file is installed -- confirm against the build
+-- system.
+package.cpath = '<INSTALLED_LIBPREFIX>/?.so;' .. package.cpath
+-- Loading the C library makes the likwid_* functions aliased below available.
+require("liblikwid")
+require("math")
+
+-- Folder searched for the performance group files (see get_groups).
+likwid.groupfolder = "<LIKWIDGROUPPATH>"
+
+likwid.version = <VERSION>
+likwid.release = <RELEASE>
+likwid.pinlibpath = "<LIBLIKWIDPIN>"
+-- Separator lines of 80 characters for plain text output.
+likwid.dline = string.rep("=",80)
+likwid.hline =  string.rep("-",80)
+likwid.sline = string.rep("*",80)
+
+
+
+-- Export the global functions provided by the C library (likwid_*) as fields
+-- of the module table. A global that is not defined simply yields a nil field.
+likwid.getConfiguration = likwid_getConfiguration
+likwid.setGroupPath = likwid_setGroupPath
+likwid.putConfiguration = likwid_putConfiguration
+likwid.setAccessClientMode = likwid_setAccessClientMode
+likwid.init = likwid_init
+likwid.addEventSet = likwid_addEventSet
+likwid.setupCounters = likwid_setupCounters
+likwid.startCounters = likwid_startCounters
+likwid.stopCounters = likwid_stopCounters
+likwid.readCounters = likwid_readCounters
+likwid.switchGroup = likwid_switchGroup
+likwid.finalize = likwid_finalize
+likwid.getEventsAndCounters = likwid_getEventsAndCounters
+likwid.getResult = likwid_getResult
+likwid.getLastResult = likwid_getLastResult
+likwid.getMetric = likwid_getMetric
+likwid.getLastMetric = likwid_getLastMetric
+likwid.getNumberOfGroups = likwid_getNumberOfGroups
+likwid.getRuntimeOfGroup = likwid_getRuntimeOfGroup
+likwid.getIdOfActiveGroup = likwid_getIdOfActiveGroup
+likwid.getNumberOfEvents = likwid_getNumberOfEvents
+likwid.getNumberOfThreads = likwid_getNumberOfThreads
+likwid.getNumberOfMetrics = likwid_getNumberOfMetrics
+likwid.getNameOfMetric = likwid_getNameOfMetric
+likwid.getNameOfEvent = likwid_getNameOfEvent
+likwid.getNameOfCounter = likwid_getNameOfCounter
+likwid.getNameOfGroup = likwid_getNameOfGroup
+likwid.getGroups = likwid_getGroups
+likwid.getShortInfoOfGroup = likwid_getShortInfoOfGroup
+likwid.getLongInfoOfGroup = likwid_getLongInfoOfGroup
+likwid.getCpuInfo = likwid_getCpuInfo
+likwid.getCpuTopology = likwid_getCpuTopology
+likwid.putTopology = likwid_putTopology
+likwid.getNumaInfo = likwid_getNumaInfo
+likwid.putNumaInfo = likwid_putNumaInfo
+likwid.setMemInterleaved = likwid_setMemInterleaved
+likwid.getAffinityInfo = likwid_getAffinityInfo
+likwid.putAffinityInfo = likwid_putAffinityInfo
+likwid.getPowerInfo = likwid_getPowerInfo
+likwid.putPowerInfo = likwid_putPowerInfo
+likwid.getOnlineDevices = likwid_getOnlineDevices
+likwid.printSupportedCPUs = likwid_printSupportedCPUs
+likwid.getCpuClock = likwid_getCpuClock
+likwid.getCycleClock = likwid_getCycleClock
+likwid.startClock = likwid_startClock
+likwid.stopClock = likwid_stopClock
+likwid.getClockCycles = likwid_getClockCycles
+likwid.getClock = likwid_getClock
+likwid.sleep = sleep
+likwid.startPower = likwid_startPower
+likwid.stopPower = likwid_stopPower
+likwid.calcPower = likwid_printEnergy
+likwid.getPowerLimit = likwid_powerLimitGet
+likwid.setPowerLimit = likwid_powerLimitSet
+likwid.statePowerLimit = likwid_powerLimitState
+likwid.initTemp = likwid_initTemp
+likwid.readTemp = likwid_readTemp
+likwid.memSweep = likwid_memSweep
+likwid.memSweepDomain = likwid_memSweepDomain
+likwid.pinProcess = likwid_pinProcess
+likwid.setenv = likwid_setenv
+likwid.getpid = likwid_getpid
+likwid.setVerbosity = likwid_setVerbosity
+likwid.access = likwid_access
+likwid.startProgram = likwid_startProgram
+likwid.checkProgram = likwid_checkProgram
+likwid.killProgram = likwid_killProgram
+likwid.catchSignal = likwid_catchSignal
+likwid.getSignalState = likwid_getSignalState
+-- NOTE(review): this alias used to read 'likwid_waitwid', which looks like a
+-- typo of 'likwid_waitpid' -- confirm the name registered by the C library.
+-- Accepting either global keeps the alias working whichever name is defined.
+likwid.waitpid = likwid_waitpid or likwid_waitwid
+likwid.cpustr_to_cpulist = likwid_cpustr_to_cpulist
+likwid.nodestr_to_nodelist = likwid_nodestr_to_nodelist
+likwid.sockstr_to_socklist = likwid_sockstr_to_socklist
+likwid.markerInit = likwid_markerInit
+likwid.markerThreadInit = likwid_markerThreadInit
+likwid.markerClose = likwid_markerClose
+likwid.markerNextGroup = likwid_markerNextGroup
+likwid.registerRegion = likwid_registerRegion
+likwid.startRegion = likwid_startRegion
+likwid.stopRegion = likwid_stopRegion
+likwid.getRegion = likwid_getRegion
+likwid.initCpuFeatures = likwid_cpuFeaturesInit
+likwid.getCpuFeatures = likwid_cpuFeaturesGet
+likwid.enableCpuFeatures = likwid_cpuFeaturesEnable
+likwid.disableCpuFeatures = likwid_cpuFeaturesDisable
+likwid.readMarkerFile = likwid_readMarkerFile
+likwid.destroyMarkerFile = likwid_destroyMarkerFile
+likwid.markerNumRegions = likwid_markerNumRegions
+likwid.markerRegionGroup = likwid_markerRegionGroup
+likwid.markerRegionTag = likwid_markerRegionTag
+likwid.markerRegionEvents = likwid_markerRegionEvents
+likwid.markerRegionCpulist = likwid_markerRegionCpulist
+likwid.markerRegionThreads = likwid_markerRegionThreads
+likwid.markerRegionTime = likwid_markerRegionTime
+likwid.markerRegionCount = likwid_markerRegionCount
+likwid.markerRegionResult = likwid_markerRegionResult
+likwid.markerRegionMetric = likwid_markerRegionMetric
+
+-- Names of the CPU features, indexed from 0.
+-- NOTE(review): the indices presumably match the feature IDs used by the
+-- cpuFeatures* functions of the C library -- confirm.
+likwid.cpuFeatures = { [0]="HW_PREFETCHER", [1]="CL_PREFETCHER", [2]="DCU_PREFETCHER", [3]="IP_PREFETCHER",
+                        [4]="FAST_STRINGS", [5]="THERMAL_CONTROL", [6]="PERF_MON", [7]="FERR_MULTIPLEX",
+                        [8]="BRANCH_TRACE_STORAGE", [9]="XTPR_MESSAGE", [10]="PEBS", [11]="SPEEDSTEP",
+                        [12]="MONITOR", [13]="SPEEDSTEP_LOCK", [14]="CPUID_MAX_VAL", [15]="XD_BIT",
+                        [16]="DYN_ACCEL", [17]="TURBO_MODE", [18]="TM2" }
+
+-- Global (not module-local) shorthand for positive infinity; calculate_metric
+-- compares evaluated formulas against it.
+infinity = math.huge
+
+
+-- Build an iterator over command line options (getopt-like).
+--   args:     list of command line arguments; entries are removed from the
+--             front of this table while they are consumed.
+--   ostrlist: list of accepted option names; a trailing ':' marks an option
+--             that takes an argument. Each name is matched as an anchored
+--             Lua pattern ("^name$").
+-- Every call of the returned closure yields 'option, argument':
+--   * nil when no argument is left or the next one does not start with '-',
+--   * '?', <text> for an option that is not contained in ostrlist,
+--   * '!', <option> (or ':') when a required argument is missing,
+--   * <option>, true for an option without argument,
+--   * <option>, <string> for an option with argument.
+-- A bare "--" (or an argument starting with three dashes) is dropped and the
+-- argument following it is returned as first value.
+local function getopt(args, ostrlist)
+    -- 'place'/'placeend' delimit the option text inside args[1];
+    -- place == 0 means that a new argument has to be examined.
+    local arg, place,placeend = nil, 0, 0;
+    return function ()
+        if place == 0 then -- update scanning pointer
+            place = 1
+            if #args == 0 or args[1]:sub(1, 1) ~= '-' then place = 0; return nil end
+            if #args[1] >= 2 then
+                if args[1]:sub(2, 2) == '-' then
+                    if #args[1] == 2 then -- found "--"
+                        place = 0
+                        table.remove(args, 1)
+                        return args[1], nil
+                    end
+                    place = place + 1
+                end
+                if args[1]:sub(3, 3) == '-' then
+                    place = 0
+                    table.remove(args, 1)
+                    return args[1], nil
+                end
+                place = place + 1
+                placeend = #args[1]
+            end
+        end
+        -- Option text without the leading dash(es), up to the end of args[1].
+        local optopt = args[1]:sub(place, placeend)
+        place = place + 1;
+        local givopt = ""
+        local needarg = false
+        -- Find the first entry of ostrlist matching the option text; the
+        -- trailing ':' of an entry is not part of the pattern.
+        for _, ostr in pairs(ostrlist) do
+            local matchstring = "^"..ostr.."$"
+            placeend = place + #ostr -1
+            if ostr:sub(#ostr,#ostr) == ":" then
+                matchstring = "^"..ostr:sub(1,#ostr-1).."$"
+                needarg = true
+                placeend = place + #ostr -2
+            end
+            if optopt:match(matchstring) then
+                givopt = ostr
+                break
+            end
+            needarg = false
+        end
+        if givopt == "" then -- unknown option
+            if optopt == '-' then return nil end
+            if place > #args[1] then
+                table.remove(args, 1)
+                place = 0;
+            end
+            return '?',  optopt;
+        end
+
+        if not needarg then -- do not need argument
+            arg = true;
+            table.remove(args, 1)
+            place = 0;
+        else -- need an argument
+            if placeend < #args[1] then -- no white space
+                arg = args[1]:sub(placeend,#args[1])
+            else
+                -- The argument is expected in the next entry of args.
+                table.remove(args, 1);
+                if #args == 0 then -- an option requiring argument is the last one
+                    place = 0
+                    if givopt:sub(placeend, placeend) == ':' then return ':' end
+                    return '!', optopt
+                else arg = args[1] end
+            end
+            table.remove(args, 1)
+            place = 0;
+        end
+        return optopt, arg
+    end
+end
+
+
+likwid.getopt = getopt
+
+-- Count all entries of a table, including non-sequence keys.
+-- Returns 0 for nil and for every value that is not a table.
+local function tablelength(T)
+    if type(T) ~= "table" then
+        return 0
+    end
+    local entries = 0
+    for _key in pairs(T) do
+        entries = entries + 1
+    end
+    return entries
+end
+
+likwid.tablelength = tablelength
+
+-- Print the values of a table on a single line as "[v1,v2,...]".
+--   T:    table to print; nil, non-table values and empty tables print "[]".
+--   long: when true, every entry is printed as "[key] = value".
+-- Tables with an entry at index 0 or 1 are printed in index order from that
+-- index up to #T; all other tables are printed in pairs() order.
+local function tableprint(T, long)
+    if type(T) ~= "table" or next(T) == nil then
+        print("[]")
+        return
+    end
+    -- Entries are collected locally (the former accumulator was a global).
+    local parts = {}
+    local function add(key, value)
+        if not long then
+            parts[#parts + 1] = tostring(value)
+        else
+            parts[#parts + 1] = "[" .. tostring(key) .. "] = " .. tostring(value)
+        end
+    end
+    local start_index = 0
+    if T[start_index] == nil then
+        start_index = 1
+    end
+    if T[start_index] ~= nil then
+        for i = start_index, #T do
+            add(i, T[i])
+        end
+    else
+        for k, v in pairs(T) do
+            add(k, v)
+        end
+    end
+    print("[" .. table.concat(parts, ",") .. "]")
+end
+
+likwid.tableprint = tableprint
+
+-- Compute the padding that centers 'str' in a field of 'max_space' characters.
+-- Returns two strings of blanks (leading, trailing); when the free space is
+-- odd the leading part gets the extra blank. 'min_space' is not used.
+local function get_spaces(str, min_space, max_space)
+    local width = str:len()
+    local half = math.ceil((max_space - width) / 2)
+    local rest = max_space - half - width
+    local leading = math.max(half, rest)
+    local trailing = math.min(half, rest)
+    return string.rep(" ", leading), string.rep(" ", trailing)
+end
+
+-- Evaluate a metric formula.
+--   formula:            arithmetic expression containing counter names
+--   counters_to_values: table mapping counter names to measured values
+-- Every counter name found in the formula is replaced by its value, then the
+-- expression is evaluated with load(). Returns the numeric result, 0 when the
+-- result is NaN or infinite, and the string "Nan" (after printing a message)
+-- when characters other than digits, + - * / ( ) . and e/E remain after the
+-- substitution.
+local function calculate_metric(formula, counters_to_values)
+    -- Substitute longer names first, so a name that is contained in a longer
+    -- one cannot damage it.
+    local names = {}
+    for name in pairs(counters_to_values) do
+        names[#names + 1] = name
+    end
+    table.sort(names, function(a, b) return a:len() > b:len() end)
+    for _, name in ipairs(names) do
+        formula = formula:gsub(tostring(name), tostring(counters_to_values[name]))
+    end
+
+    -- Anything left besides numbers and operators is an unresolved name.
+    if formula:find("[^0-9%+%-%*/%(%)%.eE]") then
+        print("Not all formula entries can be substituted with measured values")
+        print("Current formula: "..formula)
+        return "Nan"
+    end
+
+    local value = assert(load("return (" .. formula .. ")")())
+    if value == nil or value ~= value or value == infinity or value == -infinity then
+        return 0
+    end
+    return value
+end
+
+likwid.calculate_metric = calculate_metric
+
+-- Print a table as an ASCII box.
+--   tab: list of columns; every column is a list of fields and the first
+--        field of each column is the header. All columns need the same
+--        number of fields, otherwise only a message is printed.
+-- Fields are centered in their column; a separator line follows the header
+-- and, if there are data lines, the last line. An empty line ends the output.
+local function printtable(tab)
+    local nr_columns = tablelength(tab)
+    if nr_columns == 0 then
+        print("Table has no columns. Empty table?")
+        return
+    end
+    local nr_lines = tablelength(tab[1])
+    local min_lengths = {}
+    local max_lengths = {}
+    for i, col in pairs(tab) do
+        if tablelength(col) ~= nr_lines then
+            print("Not all columns have the same row count, nr_lines"..tostring(nr_lines)..", current "..tablelength(col))
+            return
+        end
+        min_lengths[i] = min_lengths[i] or 10000000
+        max_lengths[i] = max_lengths[i] or 0
+        for _, field in pairs(col) do
+            local width = tostring(field):len()
+            if width > max_lengths[i] then
+                max_lengths[i] = width
+            end
+            if width < min_lengths[i] then
+                min_lengths[i] = width
+            end
+        end
+    end
+
+    -- All helpers are local; the former code leaked 'hline', 'str', 'front'
+    -- and 'back' into the global environment.
+    local hline = ""
+    for i=1,#max_lengths do
+        hline = hline .. "+-"..string.rep("-",max_lengths[i]).."-"
+    end
+    hline = hline .. "+"
+
+    -- Format line j of all columns as "| a | b |".
+    local function format_line(j)
+        local str = "| "
+        for i=1,nr_columns do
+            local text = tostring(tab[i][j])
+            local front, back = get_spaces(text, min_lengths[i], max_lengths[i])
+            str = str .. front .. text .. back
+            if i<nr_columns then
+                str = str .. " | "
+            else
+                str = str .. " |"
+            end
+        end
+        return str
+    end
+
+    print(hline)
+    print(format_line(1))
+    print(hline)
+    for j=2,nr_lines do
+        print(format_line(j))
+    end
+    if nr_lines > 1 then
+        print(hline)
+    end
+    print()
+end
+
+likwid.printtable = printtable
+
+-- Print a table in CSV format, one output line per table line.
+--   tab:        list of columns, every column is a list of fields
+--   linelength: optional number of fields per output line; lines with fewer
+--               columns are filled up with trailing commas. Without it no
+--               padding is added (omitting it used to raise an error).
+local function printcsv(tab, linelength)
+    local nr_columns = tablelength(tab)
+    if nr_columns == 0 then
+        print("Table has no columns. Empty table?")
+        return
+    end
+    local nr_lines = tablelength(tab[1])
+    -- The padding is the same for every line, so build it once.
+    local padding = ""
+    if linelength ~= nil and nr_columns < linelength then
+        padding = string.rep(",", linelength-nr_columns)
+    end
+    for j=1,nr_lines do
+        local fields = {}
+        for i=1,nr_columns do
+            fields[i] = tostring(tab[i][j])
+        end
+        print(table.concat(fields, ",") .. padding)
+    end
+end
+
+likwid.printcsv = printcsv
+
+-- Split a string at a separator.
+--   astr:       string to split; nil and "" yield an empty list
+--   sSeparator: separator, must not be the empty string
+--   nMax:       optional maximal number of splits (>= 1); the remainder of
+--               the string becomes the last field
+--   bRegexp:    when true (any non-false value), sSeparator is a Lua pattern,
+--               otherwise it is searched as plain text
+-- Returns the list of fields.
+local function stringsplit(astr, sSeparator, nMax, bRegexp)
+    assert(sSeparator ~= '')
+    assert(nMax == nil or nMax >= 1)
+    if astr == nil then return {} end
+    local aRecord = {}
+
+    if astr:len() > 0 then
+        local bPlain = not bRegexp
+        nMax = nMax or -1
+
+        -- Both counters must be declared local: "local nField=1 nStart=1"
+        -- declares only nField and assigns a global nStart.
+        local nField, nStart = 1, 1
+        local nFirst,nLast = astr:find(sSeparator, nStart, bPlain)
+        while nFirst and nMax ~= 0 do
+            aRecord[nField] = astr:sub(nStart, nFirst-1)
+            nField = nField+1
+            nStart = nLast+1
+            nFirst,nLast = astr:find(sSeparator, nStart, bPlain)
+            nMax = nMax-1
+        end
+        aRecord[nField] = astr:sub(nStart)
+    end
+
+    return aRecord
+end
+
+likwid.stringsplit = stringsplit
+
+-- List the performance groups available for the current CPU.
+-- The group files (*.txt) are looked up with 'ls' in
+-- <likwid.groupfolder>/<short_name> and, if HOME is set, in
+-- $HOME/.likwid/groups/<short_name>; the group name is the file name without
+-- folder and ".txt".
+-- Returns the number of groups and the list of group names, or 0 and an empty
+-- list when no CPU information is available.
+local function get_groups()
+    local groups = {}
+    local cpuinfo = likwid.getCpuInfo()
+    if cpuinfo == nil then return 0, {} end
+
+    -- Append the names of all group files found below 'folder'.
+    local function collect(folder)
+        local f = io.popen("ls " .. folder .. "/" .. cpuinfo["short_name"] .."/*.txt 2>/dev/null")
+        if f == nil then return end
+        local listing = stringsplit(f:read("*a"),"\n")
+        f:close()
+        for _, path in pairs(listing) do
+            if path ~= "" then
+                -- '^.*()/' captures the position of the last '/'.
+                table.insert(groups, path:sub((path:match'^.*()/')+1, path:len()-4))
+            end
+        end
+    end
+
+    collect(likwid.groupfolder)
+    -- Without HOME there is no user folder to search.
+    local home = os.getenv("HOME")
+    if home ~= nil then
+        collect(home .. "/.likwid/groups")
+    end
+    return #groups,groups
+end
+
+likwid.get_groups = get_groups
+
+-- Create the group description for a custom event set.
+--   eventString: comma-separated list of EVENT:COUNTER[:OPTION...] entries
+--   fix_ctrs:    number of fixed counters; nil is treated as 0
+-- Events for FIXC0, FIXC1 and FIXC2 are appended when the string does not
+-- mention them and enough fixed counters are reported.
+-- Returns a table with the fields "Events" (list of tables with "Event",
+-- "Counter" and optionally "Options"), "EventString" and "GroupString". For a
+-- string without ':' the event list and both strings stay empty.
+local function new_groupdata(eventString, fix_ctrs)
+    local gdata = {}
+    gdata["Events"] = {}
+    gdata["EventString"] = ""
+    gdata["GroupString"] = ""
+    if eventString:find(":", 1, true) == nil then
+        return gdata
+    end
+    fix_ctrs = fix_ctrs or 0
+    if fix_ctrs >= 1 and not eventString:match("FIXC0") then
+        eventString = eventString..",INSTR_RETIRED_ANY:FIXC0"
+    end
+    if fix_ctrs >= 2 and not eventString:match("FIXC1") then
+        eventString = eventString..",CPU_CLK_UNHALTED_CORE:FIXC1"
+    end
+    -- NOTE(review): FIXC2 is only added for exactly three fixed counters
+    -- (unchanged behavior) -- confirm whether '>= 3' is intended.
+    if fix_ctrs == 3 and not eventString:match("FIXC2") then
+        eventString = eventString..",CPU_CLK_UNHALTED_REF:FIXC2"
+    end
+    gdata["EventString"] = eventString
+    gdata["GroupString"] = eventString
+
+    local num_events = 1
+    for _, spec in pairs(likwid.stringsplit(eventString,",")) do
+        -- Kept local; the former code stored this list in a global.
+        local fields = likwid.stringsplit(spec,":")
+        local entry = {}
+        entry["Event"] = fields[1]
+        entry["Counter"] = fields[2]
+        if #fields > 2 then
+            -- Everything after event and counter is an option.
+            table.remove(fields, 2)
+            table.remove(fields, 1)
+            entry["Options"] = fields
+        end
+        gdata["Events"][num_events] = entry
+        num_events = num_events + 1
+    end
+    return gdata
+end
+
+
+-- Read the description of a performance group.
+--   group: name of a performance group or a custom event string
+-- A name that is not returned by get_groups() is handed to new_groupdata()
+-- as custom event string. Otherwise the group file is read from
+-- <likwid.groupfolder>/<short_name>/<group>.txt or, as fallback, from
+-- $HOME/.likwid/groups/<short_name>/<group>.txt.
+-- Returns nil when no CPU information is available, an empty table when the
+-- group file cannot be opened, else a table with the fields "EventString",
+-- "Events", "Metrics", "LongDescription", "GroupString" and, if the file has
+-- a SHORT line, "ShortDescription".
+local function get_groupdata(group)
+    -- All helper variables are local; the former code leaked them as globals.
+    local groupdata = {}
+    local group_exist = 0
+    local cpuinfo = likwid.getCpuInfo()
+    if cpuinfo == nil then return nil end
+
+    local num_groups, groups = get_groups()
+    for i, a in pairs(groups) do
+        if (a == group) then group_exist = 1 end
+    end
+    if (group_exist == 0) then return new_groupdata(group, cpuinfo["perf_num_fixed_ctr"]) end
+
+    local sysfile = likwid.groupfolder .. "/" .. cpuinfo["short_name"] .. "/" .. group .. ".txt"
+    local home = os.getenv("HOME")
+    local f = io.open(sysfile, "r")
+    if f == nil and home ~= nil then
+        f = io.open(home .. "/.likwid/groups/" .. cpuinfo["short_name"] .."/" .. group .. ".txt", "r")
+    end
+    if f == nil then
+        print("Cannot read data for group "..group)
+        print("Tried folders:")
+        print(sysfile)
+        if home ~= nil then
+            print(home .. "/.likwid/groups/" .. cpuinfo["short_name"] .."/*.txt")
+        end
+        return groupdata
+    end
+    local t = f:read("*all")
+    f:close()
+    local parse_eventset = false
+    local parse_metrics = false
+    local parse_long = false
+    groupdata["EventString"] = ""
+    groupdata["Events"] = {}
+    groupdata["Metrics"] = {}
+    groupdata["LongDescription"] = ""
+    groupdata["GroupString"] = group
+    local nr_events = 1
+    local nr_metrics = 1
+    for i, line in pairs(stringsplit(t,"\n")) do
+
+        -- An empty line ends the current section.
+        if (parse_eventset or parse_metrics or parse_long) and line:len() == 0 then
+            parse_eventset = false
+            parse_metrics = false
+            parse_long = false
+        end
+
+        if line:match("^SHORT%a*") ~= nil then
+            local linelist = stringsplit(line, "%s+", nil, "%s+")
+            table.remove(linelist, 1)
+            groupdata["ShortDescription"] = table.concat(linelist, " ")
+        end
+
+        local is_eventset_header = (line:match("^EVENTSET$") ~= nil)
+        local is_metrics_header = (line:match("^METRICS$") ~= nil)
+        local is_long_header = (line:match("^LONG$") ~= nil)
+        if is_eventset_header then parse_eventset = true end
+        if is_metrics_header then parse_metrics = true end
+        if is_long_header then parse_long = true end
+
+        if parse_eventset and not is_eventset_header then
+            -- Line format: COUNTER EVENT [OPTION...]
+            local trimmed = line:gsub("^%s*(.-)%s*$", "%1")
+            local linelist = stringsplit(trimmed, "%s+", nil, "%s+")
+            -- Remember both before the list is reduced to the options below.
+            local counter, event = linelist[1], linelist[2]
+            -- Lines without counter and event are skipped; they used to
+            -- raise an error.
+            if counter ~= nil and event ~= nil then
+                local eventstring = event .. ":" .. counter
+                if #linelist > 2 then
+                    table.remove(linelist,2)
+                    table.remove(linelist,1)
+                    -- table.concat takes the list first, then the separator.
+                    eventstring = eventstring .. ":".. table.concat(linelist, ":")
+                end
+                groupdata["EventString"] = groupdata["EventString"] .. "," .. eventstring
+                groupdata["Events"][nr_events] = {}
+                groupdata["Events"][nr_events]["Event"] = event
+                groupdata["Events"][nr_events]["Counter"] = counter
+                nr_events = nr_events + 1
+            end
+        end
+
+        if parse_metrics and not is_metrics_header then
+            -- Line format: DESCRIPTION... FORMULA (formula is the last field)
+            local trimmed = line:gsub("^%s*(.-)%s*$", "%1")
+            local linelist = stringsplit(trimmed, "%s+", nil, "%s+")
+            local formula = linelist[#linelist]
+            table.remove(linelist)
+            groupdata["Metrics"][nr_metrics] = {}
+            groupdata["Metrics"][nr_metrics]["description"] = table.concat(linelist, " ")
+            groupdata["Metrics"][nr_metrics]["formula"] = formula
+            nr_metrics = nr_metrics + 1
+        end
+
+        if parse_long and not is_long_header then
+            groupdata["LongDescription"] = groupdata["LongDescription"] .. "\n" .. line
+        end
+    end
+    -- Strip the leading separator ("\n" resp. ",") added while collecting.
+    groupdata["LongDescription"] = groupdata["LongDescription"]:sub(2)
+    groupdata["EventString"] = groupdata["EventString"]:sub(2)
+
+    return groupdata
+
+end
+
+likwid.get_groupdata = get_groupdata
+
+
+
+
+-- Convert a time string with unit into microseconds.
+--   timestr: number directly followed by a unit: "s", "ms" or "us"
+-- Returns the duration in microseconds. Prints a message and exits with
+-- code 1 when the unit is missing or the part in front of the unit is not a
+-- number (the latter used to raise an arithmetic error on nil).
+local function parse_time(timestr)
+    local unit_start = nil
+    local factor = nil
+    local ms_start = timestr:find("ms")
+    local us_start = timestr:find("us")
+    if ms_start ~= nil then
+        unit_start, factor = ms_start, 1.E03
+    elseif us_start ~= nil then
+        unit_start, factor = us_start, 1
+    else
+        unit_start, factor = timestr:find("s"), 1.E06
+    end
+
+    local value = nil
+    if unit_start ~= nil then
+        value = tonumber(timestr:sub(1,unit_start-1))
+    end
+    if value == nil then
+        print("Cannot parse time, '" .. timestr .. "' not well formatted, we need a time unit like s, ms, us")
+        os.exit(1)
+    end
+    return value * factor
+end
+
+likwid.parse_time = parse_time
+
+-- Format a number for the output tables:
+--   * 0 gives "0",
+--   * integral values are printed without fraction,
+--   * other values use four decimals when that string is shorter than 12
+--     characters and does not round to zero,
+--   * everything else is printed in exponential notation.
+local function num2str(value)
+    if value == 0 then
+        return "0"
+    end
+    if tostring(value):match("%.0$") or value == math.tointeger(value) then
+        return tostring(math.tointeger(value))
+    end
+    local fixed = string.format("%.4f", value)
+    if fixed:len() < 12 and tonumber(fixed) ~= 0 then
+        return fixed
+    end
+    return string.format("%e", value)
+end
+
+likwid.num2str = num2str
+
+-- Compute minimum, maximum and average of the values of a table.
+-- The maximum starts at 0.0, so it is never negative. For an empty table the
+-- minimum is math.huge and the average is 0/0.
+local function min_max_avg(values)
+    -- Local accumulators; they used to be written to the globals 'min',
+    -- 'max', 'sum' and 'count'.
+    local min = math.huge
+    local max = 0.0
+    local sum = 0.0
+    local count = 0
+    for _, value in pairs(values) do
+        if (value < min) then min = value end
+        if (value > max) then max = value end
+        sum = sum + value
+        count = count + 1
+    end
+    return min, max, sum/count
+end
+
+-- Build the statistics table (Sum, Min, Max, Avg) for a column-wise table.
+--   inputtable: list of columns, every column is a list of fields
+--   skip_cols:  number of leading description columns; they are copied to
+--               the output and not used for the statistics
+--   skip_lines: number of leading header fields per column
+-- Returns a new list of columns: the first input column with " STAT" added
+-- to all fields but the first, the remaining description columns and the
+-- columns "Sum", "Min", "Max" and "Avg" formatted with likwid.num2str.
+-- Returns an empty table when the input has no columns or no lines.
+-- The maximum starts at 0; a field that is not a number resets sum, minimum
+-- and maximum of its line to 0.
+local function tableMinMaxAvgSum(inputtable, skip_cols, skip_lines)
+    local outputtable = {}
+    local nr_columns = #inputtable
+    if nr_columns == 0 then
+        return {}
+    end
+    local nr_lines = #inputtable[1]
+    if nr_lines == 0 then
+        return {}
+    end
+    -- Result columns are local; they used to be created as globals.
+    local minOfLine = {"Min"}
+    local maxOfLine = {"Max"}
+    local sumOfLine = {"Sum"}
+    local avgOfLine = {"Avg"}
+    for i=skip_lines+1,nr_lines do
+        local row = i-skip_lines+1
+        minOfLine[row] = math.huge
+        maxOfLine[row] = 0
+        sumOfLine[row] = 0
+        avgOfLine[row] = 0
+    end
+    for j=skip_cols+1,nr_columns do
+        for i=skip_lines+1, nr_lines do
+            local row = i-skip_lines+1
+            local res = tonumber(inputtable[j][i])
+            if res ~= nil then
+                minOfLine[row] = math.min(res, minOfLine[row])
+                maxOfLine[row] = math.max(res, maxOfLine[row])
+                sumOfLine[row] = sumOfLine[row] + res
+            else
+                minOfLine[row] = 0
+                maxOfLine[row] = 0
+                sumOfLine[row] = 0
+            end
+            avgOfLine[row] = sumOfLine[row]/(nr_columns-skip_cols)
+        end
+    end
+    for i=2,#minOfLine do
+        minOfLine[i] = likwid.num2str(minOfLine[i])
+        maxOfLine[i] = likwid.num2str(maxOfLine[i])
+        sumOfLine[i] = likwid.num2str(sumOfLine[i])
+        avgOfLine[i] = likwid.num2str(avgOfLine[i])
+    end
+
+    -- First column: keep the header, mark all other fields with " STAT".
+    local firstcol = {}
+    table.insert(firstcol, inputtable[1][1])
+    for j=2,#inputtable[1] do
+        table.insert(firstcol, inputtable[1][j].." STAT")
+    end
+    table.insert(outputtable, firstcol)
+    -- Remaining description columns are copied unchanged.
+    for i=2,skip_cols do
+        local copy = {}
+        table.insert(copy, inputtable[i][1])
+        for j=2,#inputtable[i] do
+            table.insert(copy, inputtable[i][j])
+        end
+        table.insert(outputtable, copy)
+    end
+    table.insert(outputtable, sumOfLine)
+    table.insert(outputtable, minOfLine)
+    table.insert(outputtable, maxOfLine)
+    table.insert(outputtable, avgOfLine)
+    return outputtable
+end
+
+likwid.tableToMinMaxAvgSum = tableMinMaxAvgSum
+
+-- Print the measurement results of all groups as ASCII tables or CSV.
+--   results: results[group][event][thread], raw counter values
+--   metrics: metrics[group][metric][thread], derived metric values
+--   cpulist: list of CPU IDs; replaced by the CPU list of the region when
+--            'region' is given
+--   region:  optional region ID for Marker API output, nil otherwise
+--   stats:   when true, the statistics tables are printed even for one CPU
+-- For every group the raw event table is printed, followed by the metric
+-- table if the group defines metrics. With more than one CPU (or stats ==
+-- true) a statistics table follows each of them.
+-- The globals 'use_csv' (CSV instead of ASCII tables) and 'outfile' (print a
+-- CPU header in ASCII mode) are read here but not set in this function.
+local function printOutput(results, metrics, cpulist, region, stats)
+    local maxLineFields = 0
+    local cpuinfo = likwid_getCpuInfo()
+    local clock = likwid.getCpuClock()
+    local regionName = likwid.markerRegionTag(region)
+    local regionThreads = likwid.markerRegionThreads(region)
+    local cur_cpulist = cpulist
+    if region ~= nil then
+        cur_cpulist = likwid.markerRegionCpulist(region)
+    end
+
+    for g, group in pairs(results) do
+        local infotab = {}
+        local firsttab = {}
+        local firsttab_combined = {}
+        local secondtab = {}
+        local secondtab_combined = {}
+        local runtime = likwid.getRuntimeOfGroup(g)
+        local groupName = likwid.getNameOfGroup(g)
+        local nr_metrics = likwid.getNumberOfMetrics(g)
+        -- The statistics tables are built and printed under one condition
+        -- based on the CPU list that is actually shown. Building them used
+        -- to depend on 'cpulist' while printing depended on 'cur_cpulist'.
+        local with_stats = (#cur_cpulist > 1 or stats == true)
+        if region ~= nil then
+            infotab[1] = {"Region Info","RDTSC Runtime [s]","call count"}
+            for c, cpu in pairs(cur_cpulist) do
+                local tmpList = {}
+                table.insert(tmpList, "Core "..tostring(cpu))
+                table.insert(tmpList, string.format("%.6f", likwid.markerRegionTime(region, c)))
+                table.insert(tmpList, tostring(likwid.markerRegionCount(region, c)))
+                table.insert(infotab, tmpList)
+            end
+        end
+        firsttab[1] = {"Event"}
+        firsttab_combined[1] = {"Event"}
+        firsttab[2] = {"Counter"}
+        firsttab_combined[2] = {"Counter"}
+        -- Groups without metrics show the runtime as an additional line.
+        if nr_metrics == 0 then
+            table.insert(firsttab[1],"Runtime (RDTSC) [s]")
+            table.insert(firsttab[2],"TSC")
+        end
+        for e, event in pairs(group) do
+            -- Local names; they used to be written to globals.
+            local eventname = likwid.getNameOfEvent(g, e)
+            local countername = likwid.getNameOfCounter(g, e)
+            table.insert(firsttab[1], eventname)
+            table.insert(firsttab[2], countername)
+            table.insert(firsttab_combined[1], eventname .. " STAT")
+            table.insert(firsttab_combined[2], countername)
+        end
+        for c, cpu in pairs(cur_cpulist) do
+            local tmpList = {"Core "..tostring(cpu)}
+            if nr_metrics == 0 then
+                if region == nil then
+                    table.insert(tmpList, string.format("%e", runtime))
+                else
+                    table.insert(tmpList, string.format("%e", likwid.markerRegionTime(region, c)))
+                end
+            end
+
+            for e, event in pairs(group) do
+                local tmp = tostring(likwid.num2str(event[c]))
+                table.insert(tmpList, tmp)
+            end
+            table.insert(firsttab, tmpList)
+        end
+        if with_stats then
+            firsttab_combined = tableMinMaxAvgSum(firsttab, 2, 1)
+        end
+        if nr_metrics > 0 then
+            secondtab[1] = {"Metric"}
+            secondtab_combined[1] = {"Metric"}
+            for m=1, nr_metrics do
+                table.insert(secondtab[1], likwid.getNameOfMetric(g, m))
+                table.insert(secondtab_combined[1], likwid.getNameOfMetric(g, m).." STAT" )
+            end
+            for c, cpu in pairs(cur_cpulist) do
+                local tmpList = {"Core "..tostring(cpu)}
+                for m=1, nr_metrics do
+                    local tmp = tostring(likwid.num2str(metrics[g][m][c]))
+                    table.insert(tmpList, tmp)
+                end
+                table.insert(secondtab, tmpList)
+            end
+            if with_stats then
+                secondtab_combined = tableMinMaxAvgSum(secondtab, 1, 1)
+            end
+        end
+        -- In CSV mode all lines are padded to the width of the widest table.
+        maxLineFields = math.max(#firsttab, #firsttab_combined,
+                                 #secondtab, #secondtab_combined)
+        if use_csv then
+            print(string.format("STRUCT,Info,3%s",string.rep(",",maxLineFields-3)))
+            print(string.format("CPU name:,%s%s", cpuinfo["osname"],string.rep(",",maxLineFields-2)))
+            print(string.format("CPU type:,%s%s", cpuinfo["name"],string.rep(",",maxLineFields-2)))
+            print(string.format("CPU clock:,%s GHz%s", clock*1.E-09,string.rep(",",maxLineFields-2)))
+            if region == nil then
+                print(string.format("TABLE,Group %d Raw,%s,%d%s",g,groupName,#firsttab[1]-1,string.rep(",",maxLineFields-4)))
+            else
+                print(string.format("TABLE,Region %s,Group %d Raw,%s,%d%s",regionName,g,groupName,#firsttab[1]-1,string.rep(",",maxLineFields-5)))
+            end
+            if #infotab > 0 then
+                likwid.printcsv(infotab, maxLineFields)
+            end
+            likwid.printcsv(firsttab, maxLineFields)
+        else
+            if outfile ~= nil then
+                print(likwid.hline)
+                print(string.format("CPU name:\t%s",cpuinfo["osname"]))
+                print(string.format("CPU type:\t%s",cpuinfo["name"]))
+                print(string.format("CPU clock:\t%3.2f GHz",clock * 1.E-09))
+                print(likwid.hline)
+            end
+            if region == nil then
+                print("Group "..tostring(g)..": "..groupName)
+            else
+                print("Region "..regionName..", Group "..tostring(g)..": "..groupName)
+            end
+            if #infotab > 0 then
+                likwid.printtable(infotab)
+            end
+            likwid.printtable(firsttab)
+        end
+        if with_stats then
+            if use_csv then
+                if region == nil then
+                    print(string.format("TABLE,Group %d Raw Stat,%s,%d%s",g,groupName,#firsttab_combined[1]-1,string.rep(",",maxLineFields-4)))
+                else
+                    print(string.format("TABLE,Region %s,Group %d Raw Stat,%s,%d%s",regionName, g,groupName,#firsttab_combined[1]-1,string.rep(",",maxLineFields-5)))
+                end
+                likwid.printcsv(firsttab_combined, maxLineFields)
+            else
+                likwid.printtable(firsttab_combined)
+            end
+        end
+        if nr_metrics > 0 then
+            if use_csv then
+                if region == nil then
+                    print(string.format("TABLE,Group %d Metric,%s,%d%s",g,groupName,#secondtab[1]-1,string.rep(",",maxLineFields-4)))
+                else
+                    print(string.format("TABLE,Region %s,Group %d Metric,%s,%d%s",regionName,g,groupName,#secondtab[1]-1,string.rep(",",maxLineFields-5)))
+                end
+                likwid.printcsv(secondtab, maxLineFields)
+            else
+                likwid.printtable(secondtab)
+            end
+            if with_stats then
+                if use_csv then
+                    if region == nil then
+                        print(string.format("TABLE,Group %d Metric Stat,%s,%d%s",g,groupName,#secondtab_combined[1]-1,string.rep(",",maxLineFields-4)))
+                    else
+                        print(string.format("TABLE,Region %s,Group %d Metric Stat,%s,%d%s",regionName,g,groupName,#secondtab_combined[1]-1,string.rep(",",maxLineFields-5)))
+                    end
+                    likwid.printcsv(secondtab_combined, maxLineFields)
+                else
+                    likwid.printtable(secondtab_combined)
+                end
+            end
+        end
+    end
+end
+
+likwid.printOutput = printOutput
+
+
+
+-- Gather likwid_getResult(group, event, thread) for every group, every
+-- event of that group and every thread.
+-- Returns a nested table addressed as [group][event][thread]; all three
+-- indices start at 1 and are passed unchanged to the likwid_* calls.
+local function getResults()
+    local groupCount = likwid_getNumberOfGroups()
+    local threadCount = likwid_getNumberOfThreads()
+    local perGroup = {}
+    for gid = 1, groupCount do
+        local perEvent = {}
+        perGroup[gid] = perEvent
+        local eventCount = likwid_getNumberOfEvents(gid)
+        for eid = 1, eventCount do
+            local perThread = {}
+            for tid = 1, threadCount do
+                perThread[tid] = likwid_getResult(gid, eid, tid)
+            end
+            perEvent[eid] = perThread
+        end
+    end
+    return perGroup
+end
+
+likwid.getResults = getResults
+
+-- Same traversal as the full result query, but each cell is filled with
+-- likwid_getLastResult(group, event, thread).
+-- Returns a nested table addressed as [group][event][thread] (1-based).
+local function getLastResults()
+    local numGroups = likwid_getNumberOfGroups()
+    local numThreads = likwid_getNumberOfThreads()
+    local collected = {}
+    for group = 1, numGroups do
+        local events = {}
+        collected[group] = events
+        local numEvents = likwid_getNumberOfEvents(group)
+        for event = 1, numEvents do
+            local threads = {}
+            for thread = 1, numThreads do
+                threads[thread] = likwid_getLastResult(group, event, thread)
+            end
+            events[event] = threads
+        end
+    end
+    return collected
+end
+
+likwid.getLastResults = getLastResults
+
+-- Gather likwid_getMetric(group, metric, thread) for every group, every
+-- metric of that group and every thread.
+-- Returns a nested table addressed as [group][metric][thread] (1-based).
+-- A group for which likwid_getNumberOfMetrics returns 0 yields an empty
+-- table at its index.
+local function getMetrics()
+    local groupCount = likwid_getNumberOfGroups()
+    local threadCount = likwid_getNumberOfThreads()
+    local perGroup = {}
+    for gid = 1, groupCount do
+        local perMetric = {}
+        perGroup[gid] = perMetric
+        local metricCount = likwid_getNumberOfMetrics(gid)
+        for mid = 1, metricCount do
+            local perThread = {}
+            for tid = 1, threadCount do
+                perThread[tid] = likwid_getMetric(gid, mid, tid)
+            end
+            perMetric[mid] = perThread
+        end
+    end
+    return perGroup
+end
+
+likwid.getMetrics = getMetrics
+
+-- Same traversal as the full metric query, but each cell is filled with
+-- likwid_getLastMetric(group, metric, thread).
+-- Returns a nested table addressed as [group][metric][thread] (1-based).
+local function getLastMetrics()
+    local numGroups = likwid_getNumberOfGroups()
+    local numThreads = likwid_getNumberOfThreads()
+    local collected = {}
+    for group = 1, numGroups do
+        local metricRows = {}
+        collected[group] = metricRows
+        local numMetrics = likwid_getNumberOfMetrics(group)
+        for metric = 1, numMetrics do
+            local threads = {}
+            for thread = 1, numThreads do
+                threads[thread] = likwid_getLastMetric(group, metric, thread)
+            end
+            metricRows[metric] = threads
+        end
+    end
+    return collected
+end
+
+likwid.getLastMetrics = getLastMetrics
+
+-- Load a marker file and collect the per-region counter results and metrics.
+--
+-- filename: path handed to likwid.readMarkerFile().
+-- cpulist:  accepted for interface compatibility; not used in this function.
+--
+-- Returns two tables:
+--   results[region][groupID][event][thread]  = likwid.markerRegionResult(region, event, thread)
+--   metrics[region][groupID][metric][thread] = likwid.markerRegionMetric(region, metric, thread)
+-- where groupID is likwid.markerRegionGroup(region), so each region table
+-- holds exactly one group entry.
+local function getMarkerResults(filename, cpulist)
+    -- NOTE(review): cpuinfo and the name lookups below are never read; the
+    -- calls are kept in case they have side effects -- confirm and remove.
+    local cpuinfo = likwid.getCpuInfo()
+    likwid.readMarkerFile(filename)
+    -- Both tables must be local: as plain assignments they would be created
+    -- in the global environment and be visible to (and overwritten by)
+    -- any other code using the names `results` / `metrics`.
+    local results = {}
+    local metrics = {}
+    for i=1, likwid.markerNumRegions() do
+        local regionName = likwid.markerRegionTag(i)
+        local groupID = likwid.markerRegionGroup(i)
+        local regionThreads = likwid.markerRegionThreads(i)
+        results[i] = {}
+        metrics[i] = {}
+        results[i][groupID] = {}
+        metrics[i][groupID] = {}
+        for k=1, likwid.markerRegionEvents(i) do
+            local eventName = likwid.getNameOfEvent(groupID, k)
+            local counterName = likwid.getNameOfCounter(groupID, k)
+            results[i][groupID][k] = {}
+            for j=1, regionThreads do
+                results[i][groupID][k][j] = likwid.markerRegionResult(i,k,j)
+            end
+        end
+        if likwid.getNumberOfMetrics(groupID) > 0 then
+            for k=1, likwid.getNumberOfMetrics(likwid.markerRegionGroup(i)) do
+                local metricName = likwid.getNameOfMetric(groupID, k)
+                metrics[i][groupID][k] = {}
+                for j=1, regionThreads do
+                    metrics[i][groupID][k][j] = likwid.markerRegionMetric(i,k,j)
+                end
+            end
+        end
+    end
+    return results, metrics
+end
+
+likwid.getMarkerResults = getMarkerResults
+
+
+-- Report whether an MSR device file is accessible with the given flags.
+-- "/dev/cpu/0/msr" is probed first via likwid_access(); only if that call
+-- does not return 0 is "/dev/msr0" probed. Returns true as soon as one
+-- probe returns 0, false otherwise.
+local function msr_available(flags)
+    if likwid_access("/dev/cpu/0/msr", flags) == 0 then
+        return true
+    end
+    return likwid_access("/dev/msr0", flags) == 0
+end
+likwid.msr_available = msr_available
+
+
+-- Append a box of width 1 with the given label to line `lineIdx` of
+-- `container`, creating that line's table on first use.
+-- `colIdx` is accepted but not used: boxes are always appended at the end
+-- of the line, so the call order determines the column order.
+local function addSimpleAsciiBox(container,lineIdx, colIdx, label)
+    if container[lineIdx] == nil then
+        container[lineIdx] = {}
+    end
+    local entry = {}
+    entry.width = 1
+    entry.label = label
+    table.insert(container[lineIdx], entry)
+end
+likwid.addSimpleAsciiBox = addSimpleAsciiBox
+
+-- Append a box spanning columns startColIdx..endColIdx (inclusive) to line
+-- `lineIdx` of `container`, creating that line's table on first use.
+-- The stored width is endColIdx-startColIdx+1; the box itself is appended
+-- at the end of the line regardless of startColIdx.
+local function addJoinedAsciiBox(container,lineIdx, startColIdx, endColIdx, label)
+    if container[lineIdx] == nil then
+        container[lineIdx] = {}
+    end
+    local entry = {}
+    entry.width = endColIdx-startColIdx+1
+    entry.label = label
+    table.insert(container[lineIdx], entry)
+end
+likwid.addJoinedAsciiBox = addJoinedAsciiBox
+
+-- Print the boxes stored in `container` as nested ASCII art via print().
+--
+-- container[line][n] is a table with the fields "width" (number of columns
+-- the box spans) and "label" (string), as filled by addSimpleAsciiBox() and
+-- addJoinedAsciiBox(). All boxes share one inner width: the longest label
+-- plus one space on each side. A box spanning w columns is as wide as w
+-- simple boxes including the 3 separator characters between neighbours.
+--
+-- The outer frame is sized for the line that spans the most columns
+-- (sum of the box widths). Lines spanning fewer columns are not padded.
+local function printAsciiBox(container)
+    local boxwidth = 0
+    local numLines = #container
+    local maxNumColumns = 0
+    for i=1,numLines do
+        -- Count the columns a line covers, not the number of boxes in it:
+        -- a joined box stands for several columns and the frame must be
+        -- wide enough for it.
+        local lineColumns = 0
+        for j=1,#container[i] do
+            local box = container[i][j]
+            lineColumns = lineColumns + box["width"]
+            if box["label"]:len() > boxwidth then
+                boxwidth = box["label"]:len()
+            end
+        end
+        if lineColumns > maxNumColumns then
+            maxNumColumns = lineColumns
+        end
+    end
+    boxwidth = boxwidth + 2
+    -- All line buffers are local so that nothing leaks into the global
+    -- environment.
+    local boxline = "+" .. string.rep("-",((maxNumColumns * (boxwidth+2)) + maxNumColumns+1)) .. "+"
+    print(boxline)
+    for i=1,numLines do
+        local innerboxline = "| "
+        local boxlabelline = "| "
+        for j=1,#container[i] do
+            local box = container[i][j]
+            local label = box["label"]
+            -- Inner width of this box; equals boxwidth when width == 1.
+            local inner = box["width"] * boxwidth + (box["width"]-1)*3
+            -- Center the label; an odd amount of padding puts the extra
+            -- space on the left side.
+            local padding = inner - label:len()
+            local right = math.floor(padding/2)
+            local left = math.floor(padding/2 + padding%2)
+            innerboxline = innerboxline .. "+" .. string.rep("-", inner) .. "+ "
+            boxlabelline = boxlabelline .. "|" .. string.rep(" ", left)
+            boxlabelline = boxlabelline .. label
+            boxlabelline = boxlabelline .. string.rep(" ", right) .. "| "
+        end
+        print(innerboxline .. "|")
+        print(boxlabelline .. "|")
+        print(innerboxline .. "|")
+    end
+    print(boxline)
+end
+likwid.printAsciiBox = printAsciiBox
+
+-- Some helpers for output file substitutions
+-- getpid already defined by Lua-C-Interface
+-- Return the output of `hostname -s` with leading and trailing whitespace
+-- stripped. If the command cannot be started (io.popen returns nil), the
+-- placeholder "X" is returned instead of raising an error on a nil handle;
+-- "X" is the placeholder the other substitution helpers in this file use
+-- for unavailable values.
+local function gethostname()
+    local f = io.popen("hostname -s","r")
+    if f == nil then
+        return "X"
+    end
+    -- gsub returns (string, count); the local assignment keeps only the string.
+    local hostname = f:read("*all"):gsub("^%s*(.-)%s*$", "%1")
+    f:close()
+    return hostname
+end
+
+likwid.gethostname = gethostname
+
+-- Return the value of the PBS_JOBID environment variable, or "X" when the
+-- variable is not set.
+local function getjid()
+    return os.getenv("PBS_JOBID") or "X"
+end
+
+likwid.getjid = getjid
+
+-- Return the MPI rank of this process as found in the environment:
+-- PMI_RANK is consulted first, then OMPI_COMM_WORLD_RANK. When neither
+-- variable is set the placeholder "X" is returned. The value is returned
+-- as the string delivered by os.getenv().
+local function getMPIrank()
+    return os.getenv("PMI_RANK") or os.getenv("OMPI_COMM_WORLD_RANK") or "X"
+end
+
+likwid.getMPIrank = getMPIrank
+
+return likwid
diff --git a/src/asciiBoxes.c b/src/asciiBoxes.c
deleted file mode 100644
index a22dab5..0000000
--- a/src/asciiBoxes.c
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  asciiBoxes.c
- *
- *      Description:  Module implementing output of nested ascii art boxes
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-/* #####   HEADER FILE INCLUDES   ######################################### */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
-
-#include <error.h>
-#include <types.h>
-#include <asciiBoxes.h>
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-BoxContainer*
-asciiBoxes_allocateContainer(int numLines, int numColumns)
-{
-    BoxContainer* container;
-
-    container = (BoxContainer*) malloc(sizeof(BoxContainer));
-    container->numLines = numLines;
-    container->numColumns = numColumns;
-
-    container->boxes = (Box**) malloc(numLines * sizeof(Box*));
-
-    for ( int i=0; i < numLines; i++ )
-    {
-        container->boxes[i] = (Box*) malloc(numColumns * sizeof(Box));
-    }
-
-    for(int i=0; i<numLines; i++)
-    {
-        for(int j=0; j<numColumns; j++)
-        {
-            container->boxes[i][j].width = 0;
-            container->boxes[i][j].label = NULL;
-        }
-    }
-
-    return container;
-}
-
-void 
-asciiBoxes_addBox(BoxContainer* container, int line, int column, bstring label)
-{
-    if ( line >= container->numLines )
-    {
-        ERROR_PRINT(line id %d too large,line);
-    }
-    if ( column >= container->numColumns )
-    {
-        ERROR_PRINT(column id %d too large,column);
-    }
-
-    container->boxes[line][column].width = 1;
-    container->boxes[line][column].label = bstrcpy(label);
-}
-
-
-void
-asciiBoxes_addJoinedBox(
-        BoxContainer* container,
-        int line,
-        int startColumn,
-        int endColumn,
-        bstring label)
-{
-    if ( line >= container->numLines )
-    {
-        ERROR_PRINT(line id %d too large,line);
-    }
-
-    if ( endColumn >= container->numColumns )
-    {
-        ERROR_PRINT(column id %d too large,endColumn);
-    }
-
-    container->boxes[line][startColumn].width = (endColumn-startColumn)+1;
-    container->boxes[line][startColumn].label = bstrcpy(label);
-}
-
-void
-asciiBoxes_print(FILE* OUTSTREAM, BoxContainer* container)
-{
-    int width;
-    int boxwidth=0; /* box width is inner width of box */
-
-    /* determine maximum label width */
-    for ( int i=0; i < container->numLines; i++ )
-    {
-        for ( int j=0; j < container->numColumns; j++ )
-        {
-            btrimws(container->boxes[i][j].label);
-            boxwidth = MAX(boxwidth,blength(container->boxes[i][j].label));
-
-            /* if box is joined increase counter */
-            if ( container->boxes[i][j].width > 1 )
-            {
-                j +=  container->boxes[i][j].width;
-            }
-        }
-    }
-    boxwidth += 2;  /* add one space each side */
-
-    /* top line */
-    printf("+");
-
-    for ( int i=0; i < (container->numColumns * (boxwidth+2) +
-                (container->numColumns+1));  /* one space between boxes */
-            i++ )
-    {
-        printf("-");
-    }
-    printf("+\n");
-
-    for ( int i=0; i < container->numLines; i++ )
-    {
-        /* Box top line */
-        printf("| ");
-
-        for ( int j=0; j < container->numColumns; j++ )
-        {
-            printf("+");
-
-            if ( container->boxes[i][j].width == 1 )
-            {
-                for ( int k=0; k < boxwidth; k++ )
-                {
-                    printf("-");
-                }
-            }
-            else 
-            {
-                for ( int k=0; k < (container->boxes[i][j].width * boxwidth +
-                            (container->boxes[i][j].width-1)*3);
-                        k++)
-                {
-                    printf("-");
-                }
-                j += container->boxes[i][j].width-1;
-            }
-            printf("+ ");
-        }
-        printf("|\n");
-        printf("| ");
-
-        /* Box label line */
-        for ( int j=0; j < container->numColumns; j++ )
-        {
-            int offset=0;
-
-            /* center label */
-            if ( container->boxes[i][j].width == 1 )
-            {
-                width = (boxwidth - blength(container->boxes[i][j].label))/2;
-                offset = (boxwidth - blength(container->boxes[i][j].label))%2;
-            }
-            else
-            {
-                width = (container->boxes[i][j].width * boxwidth +
-                        ((container->boxes[i][j].width-1)*3) -
-                        blength(container->boxes[i][j].label))/2;
-
-                offset = (container->boxes[i][j].width * boxwidth +
-                        ((container->boxes[i][j].width-1)*3) -
-                        blength(container->boxes[i][j].label))%2;
-            }
-            printf("|");
-
-            for ( int k=0; k < (width+offset); k++ )
-            {
-                printf(" ");
-            }
-
-            printf("%s",container->boxes[i][j].label->data);
-
-            for ( int k=0; k < width; k++ )
-            {
-                printf(" ");
-            }
-            printf("| ");
-
-            if ( container->boxes[i][j].width != 1 )
-            {
-                j+= container->boxes[i][j].width-1;
-            }
-        }
-        printf("|\n");
-        printf("| ");
-
-        /* Box bottom line */
-        for ( int j=0; j < container->numColumns; j++ )
-        {
-            printf("+");
-
-            if ( container->boxes[i][j].width == 1 )
-            {
-                for ( int k=0; k < boxwidth; k++ )
-                {
-                    printf("-");
-                }
-            }
-            else 
-            {
-                for ( int k=0; k < (container->boxes[i][j].width * boxwidth +
-                            (container->boxes[i][j].width-1)*3);
-                        k++ )
-                {
-                    printf("-");
-                }
-                j+= container->boxes[i][j].width-1;
-            }
-            printf("+ ");
-        }
-        printf("|\n");
-    }
-
-    /* bottom line */
-    printf("+");
-    for ( int i=0; i < (container->numColumns * (boxwidth+2) + 
-                container->numColumns+1); i++ )
-    {
-        printf("-");
-    }
-    printf("+\n");
-    fflush(stdout);
-}
-
diff --git a/src/asciiTable.c b/src/asciiTable.c
deleted file mode 100644
index 29b615a..0000000
--- a/src/asciiTable.c
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  asciiTable.c
- *
- *      Description:  Module implementing output of ascii table.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-
-/* #####   HEADER FILE INCLUDES   ######################################### */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
-
-#include <error.h>
-#include <types.h>
-#include <strUtil.h>
-#include <asciiTable.h>
-
-/* #####   LOCAL VARIABLES   ########################################### */
-
-static FILE* OUTPUT;
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-void
-asciiTable_setOutput(FILE* stream)
-{
-    OUTPUT = stream;
-}
-
-TableContainer*
-asciiTable_allocate(int numRows,int numColumns, bstrList* headerLabels)
-{
-    int i;
-    TableContainer* container;
-    OUTPUT = stdout;
-
-    container = (TableContainer*) malloc(sizeof(TableContainer));
-    container->numRows = numRows;
-    container->numColumns = numColumns;
-    container->currentRow = 0;
-    container->printed = 0;
-
-    if (numColumns != headerLabels->qty)
-    {
-        ERROR_PRINT(Number of columns %d not equal to number of header labels %d,numColumns,headerLabels->qty);
-    }
-
-    container->header = bstrListCreate();
-    bstrListAlloc (container->header, numColumns);
-
-    for(i=0; i<numColumns; i++)
-    {
-        container->header->entry[i] = bstrcpy(headerLabels->entry[i]);
-    }
-
-    container->rows = (bstrList**) malloc( numRows * sizeof(bstrList*));
-
-    for(i=0; i<numRows; i++)
-    {
-        container->rows[i] = bstrListCreate();
-        bstrListAlloc (container->rows[i], numColumns);
-    }
-
-    return container;
-}
-
-void 
-asciiTable_free(TableContainer* container)
-{
-    int i;
-
-    if(container == NULL)
-    {
-        ERROR_PLAIN_PRINT(Cannot free NULL reference);
-    }
-
-    bstrListDestroy(container->header);
-
-    for(i=0; i<container->numRows; i++)
-    {
-        bstrListDestroy(container->rows[i]);
-    }
-
-    free(container->rows);
-}
-
-void
-asciiTable_insertRow(TableContainer* container, int row, bstrList* fields)
-{
-    int i;
-
-    if (container->numColumns != fields->qty)
-    {
-        ERROR_PRINT(Number of colummns %d not equal to number of field labels %d,container->numColumns,fields->qty);
-    }
-
-    if (row >= container->numRows)
-    {
-        ERROR_PRINT(Number of Rows %d smaller than requested row index %d, container->numRows,row);
-    }
-
-    for(i=0; i<container->numColumns; i++)
-    {
-        container->rows[row]->entry[i] = bstrcpy(fields->entry[i]);
-        container->rows[row]->qty++;
-    }
-}
-
-void
-asciiTable_appendRow(TableContainer* container, bstrList* fields)
-{
-    asciiTable_insertRow(container, container->currentRow++, fields);
-}
-
-void
-asciiTable_setCurrentRow(TableContainer* container, int row)
-{
-    container->currentRow = row;
-}
-
-void
-asciiTable_print(TableContainer* container)
-{
-    int i;
-    int j;
-    int* boxwidth;
-
-    boxwidth = (int*) malloc(container->numColumns * sizeof(int));
-
-    for (j=0; j<container->numColumns; j++) boxwidth[j] = 0;
-
-    for (j=0; j<container->numColumns; j++)
-    {
-        boxwidth[j] = MAX(boxwidth[j],blength(container->header->entry[j]));
-    }
-
-    /* determine maximum label width in each column */
-    for (i=0; i<container->numRows; i++)
-    {
-        for (j=0; j<container->numColumns; j++)
-        {
-            //           btrimws(container->rows[i]->entry[j]);
-            boxwidth[j] = MAX(boxwidth[j],blength(container->rows[i]->entry[j]));
-        }
-    }
-
-    if (! container->printed)
-    {
-        /* Increase boxwidth with two spaces */
-        for (j=0; j<container->numColumns; j++) boxwidth[j] +=2;
-    }
-
-    /* print header */
-
-    for (j=0; j<container->numColumns; j++)
-    {
-        fprintf(OUTPUT,"+");
-        for (i=0;i<boxwidth[j];i++)
-        {
-            fprintf(OUTPUT,"-");
-        }
-    }
-    fprintf(OUTPUT,"+\n");
-
-    for (j=0; j<container->numColumns; j++)
-    {
-        fprintf(OUTPUT,"|");
-        bJustifyCenter(container->header->entry[j],boxwidth[j]);
-        fprintf(OUTPUT,"%s",bdata(container->header->entry[j]));
-    }
-    fprintf(OUTPUT,"|\n");
-
-    for (j=0; j<container->numColumns; j++)
-    {
-        fprintf(OUTPUT,"+");
-        for (i=0;i<boxwidth[j];i++)
-        {
-            fprintf(OUTPUT,"-");
-        }
-    }
-    fprintf(OUTPUT,"+\n");
-
-    for (i=0; i<container->numRows; i++)
-    {
-        for (j=0; j<container->numColumns; j++)
-        {
-            fprintf(OUTPUT,"|");
-            bJustifyCenter(container->rows[i]->entry[j],boxwidth[j]);
-            fprintf(OUTPUT,"%s",bdata(container->rows[i]->entry[j]));
-        }
-        fprintf(OUTPUT,"|\n");
-    }
-
-    for (j=0; j<container->numColumns; j++)
-    {
-        fprintf(OUTPUT,"+");
-        for (i=0;i<boxwidth[j];i++)
-        {
-            fprintf(OUTPUT,"-");
-        }
-    }
-    fprintf(OUTPUT,"+\n");
-    container->printed = 1;
-
-    free(boxwidth);
-}
-
-
-
-
diff --git a/src/barrier.c b/src/barrier.c
deleted file mode 100644
index 3a93f92..0000000
--- a/src/barrier.c
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  barrier.c
- *
- *      Description:  Implementation of threaded spin loop barrier
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-/* #####   HEADER FILE INCLUDES   ######################################### */
-#include <stdlib.h>
-#include <stdio.h>
-
-#include <error.h>
-#include <types.h>
-#include <barrier.h>
-
-/* #####   EXPORTED VARIABLES   ########################################### */
-
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-#define CACHELINE_SIZE 64
-
-/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
-
-static BarrierGroup* groups;
-static int currentGroupId = 0;
-static int maxGroupId = 0;
-
-/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-int
-barrier_registerGroup(int numThreads)
-{
-    int ret;
-
-    if (currentGroupId > maxGroupId)
-    {
-        ERROR_PRINT(Group ID %d larger than maxGroupID %d,currentGroupId,maxGroupId);
-    }
-
-    groups[currentGroupId].numberOfThreads = numThreads;
-    ret = posix_memalign(
-            (void**) &groups[currentGroupId].groupBval,
-            CACHELINE_SIZE, 
-            numThreads * 32 * sizeof(int));
-
-    if (ret < 0)
-    {
-        ERROR;
-    }
-
-
-    return currentGroupId++;
-}
-
-void
-barrier_registerThread(BarrierData* barr, int groupId, int threadId)
-{
-    int ret;
-    int i;
-    int j = 1;
-    if (groupId > currentGroupId)
-    {
-        ERROR_PLAIN_PRINT(Group not yet registered);
-    }
-    if (threadId > groups[groupId].numberOfThreads)
-    {
-        ERROR_PRINT(Thread ID %d too large,threadId);
-    }
-
-    barr->numberOfThreads = groups[groupId].numberOfThreads;
-    barr->offset = 0;
-    barr->val = 1;
-    barr->bval =  groups[groupId].groupBval;
-    ret = posix_memalign(
-            (void**) &(barr->index),
-            CACHELINE_SIZE, 
-            barr->numberOfThreads * sizeof(int));
-
-    if (ret < 0)
-    {
-        ERROR;
-    }
-
-
-    barr->index[0] = threadId;
-
-    for (i = 0; i < barr->numberOfThreads; i++)
-    {
-        if (!(i == threadId))
-        {
-            barr->index[j++] = i;
-        }
-    }
-}
-
-
-void
-barrier_init(int numberOfGroups) 
-{
-    maxGroupId = numberOfGroups-1;
-    groups = (BarrierGroup*) malloc(numberOfGroups * sizeof(BarrierGroup));
-}
-
-void
-barrier_synchronize(BarrierData* barr)
-{
-    int i;
-
-    barr->bval[barr->index[0] * 32 +  barr->offset * 16] = barr->val;
-
-    for (i = 1; i < barr->numberOfThreads; i++)
-    {
-        while (barr->bval[barr->index[i] * 32 + barr->offset * 16] != barr->val)
-        {
-            __asm__ ("pause");
-        }
-    } 
-    
-    if (barr->offset)
-    {
-        barr->val = !barr->val;
-    }
-    barr->offset = !barr->offset;
-}
-
-void barrier_destroy(void)
-{
-    free(groups);
-}
diff --git a/src/bench.c b/src/bench.c
deleted file mode 100644
index 3a0b81b..0000000
--- a/src/bench.c
+++ /dev/null
@@ -1,537 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  bench.c
- *
- *      Description:  Benchmarking framework for likwid-bench
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-/* #####   HEADER FILE INCLUDES   ######################################### */
-
-#include <pthread.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <sys/syscall.h>
-#include <string.h>
-#include <sched.h>
-#include <types.h>
-#include <unistd.h>
-
-#include <timer.h>
-#include <threads.h>
-#include <affinity.h>
-#include <barrier.h>
-#include <likwid.h>
-#ifdef PAPI
-#include <papi.h>
-#endif
-
-/* #####   EXPORTED VARIABLES   ########################################### */
-
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-//#define BARRIER pthread_barrier_wait(&threads_barrier)
-#define BARRIER   barrier_synchronize(&barr)
-
-#ifdef PERFMON
-#define START_PERFMON likwid_markerStartRegion("bench");
-#define STOP_PERFMON  likwid_markerStopRegion("bench");
-#define LIKWID_THREAD_INIT  likwid_markerThreadInit();
-#define EXECUTE EXECUTE_LIKWID
-#else
-#ifdef PAPI
-#define START_PERFMON(event_set) PAPI_start(event_set);
-#define STOP_PERFMON(event_set, result) PAPI_stop ( event_set ,result );
-#define LIKWID_THREAD_INIT
-#define EXECUTE EXECUTE_PAPI
-#else
-#define START_PERFMON
-#define STOP_PERFMON
-#define LIKWID_THREAD_INIT
-#define EXECUTE EXECUTE_LIKWID
-#endif
-#endif
-
-#define EXECUTE_LIKWID(func)   \
-    BARRIER; \
-    if (data->threadId == 0) \
-    { \
-        timer_start(&time); \
-    } \
-    START_PERFMON  \
-    for (i=0; i<  data->data.iter; i++) \
-    {   \
-    func; \
-    } \
-    BARRIER; \
-    STOP_PERFMON  \
-    if (data->threadId == 0) \
-    { \
-        timer_stop(&time); \
-        data->cycles = timer_printCycles(&time); \
-    } \
-    BARRIER 
-
-#define EXECUTE_PAPI(func)   \
-    BARRIER; \
-    if (data->threadId == 0) \
-    { \
-        timer_start(&time); \
-    } \
-    START_PERFMON(event_set)  \
-    for (i=0; i<  data->data.iter; i++) \
-    {   \
-    func; \
-    } \
-    BARRIER; \
-    STOP_PERFMON(event_set, &(result[0]))  \
-    if (data->threadId == 0) \
-    { \
-        timer_stop(&time); \
-        data->cycles = timer_printCycles(&time); \
-    } \
-    BARRIER
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-void* runTest(void* arg)
-{
-    int threadId;
-    int offset;
-    size_t size;
-    size_t i;
-    BarrierData barr;
-    ThreadData* data;
-    ThreadUserData* myData;
-    TimerData time;
-    FuncPrototype func;
-    FILE* OUTSTREAM;
-#ifdef PAPI
-    int event_set = PAPI_NULL;
-    char groupname[50];
-    char* group_ptr = &(groupname[0]);
-    long long int result[4] = {0,0,0,0};
-    group_ptr = getenv("PAPI_BENCH");
-    PAPI_create_eventset(&event_set);
-    PAPI_add_event(event_set, PAPI_TOT_CYC);
-    // L3 group
-    if (strncmp(group_ptr,"L3",2) == 0)
-    {
-        PAPI_add_event(event_set, PAPI_L3_TCA);
-    }
-    // L2 group
-    else if (strncmp(group_ptr,"L2",2) == 0)
-    {
-        PAPI_add_event(event_set, PAPI_L2_TCA);
-    }
-    // FLOPS_AVX
-    else if (strncmp(group_ptr,"FLOPS_AVX",9) == 0)
-    {
-        PAPI_add_event(event_set, PAPI_VEC_SP);
-        PAPI_add_event(event_set, PAPI_VEC_DP);
-        PAPI_add_event(event_set, PAPI_FP_INS);
-    }
-    // FLOPS_DP
-    else if (strncmp(group_ptr,"FLOPS_DP",8) == 0)
-    {
-        PAPI_add_event(event_set, PAPI_DP_OPS);
-    }
-    // FLOPS_SP
-    else if (strncmp(group_ptr,"FLOPS_SP",8) == 0)
-    {
-        PAPI_add_event(event_set, PAPI_SP_OPS);
-    }
-#endif
-
-    data = (ThreadData*) arg;
-    myData = &(data->data);
-    func = myData->test->kernel;
-    threadId = data->threadId;
-    OUTSTREAM = data->output;
-    barrier_registerThread(&barr, 0, data->globalThreadId);
-
-    /* Prepare ptrs for thread */
-    size = myData->size / data->numberOfThreads;
-    size -= (size%myData->test->stride);
-    offset = data->threadId * size;
-    myData->size = size;
-
-    switch ( myData->test->type )
-    {
-    	case SINGLE_RAND:
-        case SINGLE:
-            {
-                float* sptr;
-                for (i=0; i <  myData->test->streams; i++)
-                {
-                    sptr = (float*) myData->streams[i];
-                    sptr +=  offset;
-              //      sptr +=  size;
-                    myData->streams[i] = (float*) sptr;
-                }
-            }
-            break;
-        case DOUBLE_RAND:
-        case DOUBLE:
-            {
-                double* dptr;
-                for (i=0; i <  myData->test->streams; i++)
-                {
-                    dptr = (double*) myData->streams[i];
-                    dptr +=  offset;
-             //       dptr +=  size;
-                    myData->streams[i] = (double*) dptr;
-                }
-            }
-            break;
-    }
-
-    /* pint the thread */
-    affinity_pinThread(myData->processors[threadId]);
-
-    sleep(1);
-    LIKWID_THREAD_INIT;
-    BARRIER;
-    if (OUTSTREAM)
-    {
-        fprintf(OUTSTREAM, "Group: %d Thread %d Global Thread %d running on core %d - Vector length %llu Offset %d\n",
-                data->groupId,
-                threadId,
-                data->globalThreadId,
-                affinity_threadGetProcessorId(),
-                LLU_CAST size,
-                offset);
-    }
-    BARRIER;
-
-    /* Up to 10 streams the following registers are used for Array ptr:
-     * Size rdi
-     * in Registers: rsi  rdx  rcx  r8  r9
-     * passed on stack, then: r10  r11  r12  r13  r14  r15
-     * If more than 10 streams are used first 5 streams are in register, above 5 a macro must be used to
-     * load them from stack
-     * */
-
-    switch ( myData->test->streams ) {
-        case STREAM_1:
-            EXECUTE(func(size,myData->streams[0]));
-            break;
-        case STREAM_2:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1]));
-            break;
-        case STREAM_3:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2]));
-            break;
-        case STREAM_4:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3]));
-            break;
-        case STREAM_5:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4]));
-            break;
-        case STREAM_6:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5]));
-            break;
-        case STREAM_7:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6]));
-            break;
-        case STREAM_8:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7]));
-            break;
-        case STREAM_9:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8]));
-            break;
-        case STREAM_10:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9]));
-            break;
-        case STREAM_11:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10]));
-            break;
-        case STREAM_12:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11]));
-            break;
-        case STREAM_13:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12]));
-            break;
-        case STREAM_14:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13]));
-            break;
-        case STREAM_15:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14]));
-            break;
-        case STREAM_16:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15]));
-            break;
-        case STREAM_17:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16]));
-            break;
-        case STREAM_18:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17]));
-            break;
-        case STREAM_19:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18]));
-            break;
-        case STREAM_20:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19]));
-            break;
-        case STREAM_21:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20]));
-            break;
-        case STREAM_22:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21]));
-            break;
-        case STREAM_23:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22]));
-            break;
-        case STREAM_24:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23]));
-            break;
-        case STREAM_25:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24]));
-            break;
-        case STREAM_26:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25]));
-            break;
-        case STREAM_27:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26]));
-            break;
-        case STREAM_28:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27]));
-            break;
-        case STREAM_29:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
-                        myData->streams[28]));
-            break;
-        case STREAM_30:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
-                        myData->streams[28],myData->streams[29]));
-            break;
-        case STREAM_31:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
-                        myData->streams[28],myData->streams[29],myData->streams[30]));
-            break;
-        case STREAM_32:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
-                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31]));
-            break;
-        case STREAM_33:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
-                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
-                        myData->streams[32]));
-            break;
-        case STREAM_34:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
-                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
-                        myData->streams[32],myData->streams[33]));
-            break;
-        case STREAM_35:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
-                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
-                        myData->streams[32],myData->streams[33],myData->streams[34]));
-            break;
-        case STREAM_36:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
-                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
-                        myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35]));
-            break;
-        case STREAM_37:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
-                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
-                        myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35],
-                        myData->streams[36]));
-            break;
-        case STREAM_38:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
-                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
-                        myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35],
-                        myData->streams[36],myData->streams[37]));
-            break;
-        default:
-            break;
-    }
-#ifdef PAPI
-    double papi_result = 0.0;
-    // L2 & L3 group
-    if (strncmp(group_ptr,"L3",2) == 0 ||
-        strncmp(group_ptr,"L2",2) == 0)
-    {
-        papi_result = ((double)result[1]) * 64.0;
-    }
-    // FLOPS_AVX
-    else if (strncmp(group_ptr,"FLOPS",5) == 0)
-    {
-        papi_result = (double) result[1]+ (double) result[2];
-    }
-    if (OUTSTREAM)
-    {
-        fprintf(OUTSTREAM, "Thread %d Result %f\n",threadId, papi_result);
-    }
-#endif
-    pthread_exit(NULL);
-}
-
-
diff --git a/src/bitUtil.c b/src/bitUtil.c
index cdce490..099626c 100644
--- a/src/bitUtil.c
+++ b/src/bitUtil.c
@@ -5,13 +5,13 @@
  *
  *      Description:  Utility routines manipulating bit arrays.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -72,18 +72,18 @@ extractBitField(uint32_t inField, uint32_t width, uint32_t offset)
 uint32_t
 getBitFieldWidth(uint32_t number)
 {
-    uint32_t fieldWidth;
+    uint32_t fieldWidth=0;
 
     number--;
     if (number == 0)
     {
         return 0;
     }
-
+#ifdef __x86_64
     __asm__ volatile ( "bsr %%eax, %%ecx\n\t"
             : "=c" (fieldWidth)
             : "a"(number));
-
+#endif
 
     return fieldWidth+1;  /* bsr returns the position, we want the width */
 }
diff --git a/src/bstrlib.c b/src/bstrlib.c
index 52f5a99..380269c 100644
--- a/src/bstrlib.c
+++ b/src/bstrlib.c
@@ -64,27 +64,27 @@
 /* Compute the snapped size for a given requested size.  By snapping to powers
    of 2 like this, repeated reallocations are avoided. */
 static int snapUpSize (int i) {
-	if (i < 8) {
-		i = 8;
-	} else {
-		unsigned int j;
-		j = (unsigned int) i;
-
-		j |= (j >>  1);
-		j |= (j >>  2);
-		j |= (j >>  4);
-		j |= (j >>  8);		/* Ok, since int >= 16 bits */
+    if (i < 8) {
+        i = 8;
+    } else {
+        unsigned int j;
+        j = (unsigned int) i;
+
+        j |= (j >>  1);
+        j |= (j >>  2);
+        j |= (j >>  4);
+        j |= (j >>  8);        /* Ok, since int >= 16 bits */
 #if (UINT_MAX != 0xffff)
-		j |= (j >> 16);		/* For 32 bit int systems */
+        j |= (j >> 16);        /* For 32 bit int systems */
 #if (UINT_MAX > 0xffffffffUL)
-		j |= (j >> 32);		/* For 64 bit int systems */
+        j |= (j >> 32);        /* For 64 bit int systems */
 #endif
 #endif
-		/* Least power of two greater than i */
-		j++;
-		if ((int) j >= i) i = (int) j;
-	}
-	return i;
+        /* Least power of two greater than i */
+        j++;
+        if ((int) j >= i) i = (int) j;
+    }
+    return i;
 }
 
 /*  int balloc (bstring b, int len)
@@ -92,59 +92,59 @@ static int snapUpSize (int i) {
  *  Increase the size of the memory backing the bstring b to at least len.
  */
 int balloc (bstring b, int olen) {
-	int len;
-	if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen <= 0 || 
-	    b->mlen < b->slen || olen <= 0) {
-		return BSTR_ERR;
-	}
+    int len;
+    if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen <= 0 || 
+        b->mlen < b->slen || olen <= 0) {
+        return BSTR_ERR;
+    }
 
-	if (olen >= b->mlen) {
-		unsigned char * x;
+    if (olen >= b->mlen) {
+        unsigned char * x;
 
-		if ((len = snapUpSize (olen)) <= b->mlen) return BSTR_OK;
+        if ((len = snapUpSize (olen)) <= b->mlen) return BSTR_OK;
 
-		/* Assume probability of a non-moving realloc is 0.125 */
-		if (7 * b->mlen < 8 * b->slen) {
+        /* Assume probability of a non-moving realloc is 0.125 */
+        if (7 * b->mlen < 8 * b->slen) {
 
-			/* If slen is close to mlen in size then use realloc to reduce
-			   the memory defragmentation */
+            /* If slen is close to mlen in size then use realloc to reduce
+               the memory defragmentation */
 
-			reallocStrategy:;
+            reallocStrategy:;
 
-			x = (unsigned char *) bstr__realloc (b->data, (size_t) len);
-			if (x == NULL) {
+            x = (unsigned char *) bstr__realloc (b->data, (size_t) len);
+            if (x == NULL) {
 
-				/* Since we failed, try allocating the tighest possible 
-				   allocation */
+                /* Since we failed, try allocating the tighest possible 
+                   allocation */
 
-				if (NULL == (x = (unsigned char *) bstr__realloc (b->data, (size_t) (len = olen)))) {
-					return BSTR_ERR;
-				}
-			}
-		} else {
+                if (NULL == (x = (unsigned char *) bstr__realloc (b->data, (size_t) (len = olen)))) {
+                    return BSTR_ERR;
+                }
+            }
+        } else {
 
-			/* If slen is not close to mlen then avoid the penalty of copying
-			   the extra bytes that are allocated, but not considered part of
-			   the string */
+            /* If slen is not close to mlen then avoid the penalty of copying
+               the extra bytes that are allocated, but not considered part of
+               the string */
 
-			if (NULL == (x = (unsigned char *) bstr__alloc ((size_t) len))) {
+            if (NULL == (x = (unsigned char *) bstr__alloc ((size_t) len))) {
 
-				/* Perhaps there is no available memory for the two 
-				   allocations to be in memory at once */
+                /* Perhaps there is no available memory for the two 
+                   allocations to be in memory at once */
 
-				goto reallocStrategy;
+                goto reallocStrategy;
 
-			} else {
-				if (b->slen) bstr__memcpy ((char *) x, (char *) b->data, (size_t) b->slen);
-				bstr__free (b->data);
-			}
-		}
-		b->data = x;
-		b->mlen = len;
-		b->data[b->slen] = (unsigned char) '\0';
-	}
+            } else {
+                if (b->slen) bstr__memcpy ((char *) x, (char *) b->data, (size_t) b->slen);
+                bstr__free (b->data);
+            }
+        }
+        b->data = x;
+        b->mlen = len;
+        b->data[b->slen] = (unsigned char) '\0';
+    }
 
-	return BSTR_OK;
+    return BSTR_OK;
 }
 
 /*  int ballocmin (bstring b, int len)
@@ -154,24 +154,24 @@ int balloc (bstring b, int olen) {
  *  performance.
  */
 int ballocmin (bstring b, int len) {
-	unsigned char * s;
+    unsigned char * s;
 
-	if (b == NULL || b->data == NULL || (b->slen+1) < 0 || b->mlen <= 0 || 
-	    b->mlen < b->slen || len <= 0) {
-		return BSTR_ERR;
-	}
+    if (b == NULL || b->data == NULL || (b->slen+1) < 0 || b->mlen <= 0 || 
+        b->mlen < b->slen || len <= 0) {
+        return BSTR_ERR;
+    }
 
-	if (len < b->slen + 1) len = b->slen + 1;
+    if (len < b->slen + 1) len = b->slen + 1;
 
-	if (len != b->mlen) {
-		s = (unsigned char *) bstr__realloc (b->data, (size_t) len);
-		if (NULL == s) return BSTR_ERR;
-		s[b->slen] = (unsigned char) '\0';
-		b->data = s;
-		b->mlen = len;
-	}
+    if (len != b->mlen) {
+        s = (unsigned char *) bstr__realloc (b->data, (size_t) len);
+        if (NULL == s) return BSTR_ERR;
+        s[b->slen] = (unsigned char) '\0';
+        b->data = s;
+        b->mlen = len;
+    }
 
-	return BSTR_OK;
+    return BSTR_OK;
 }
 
 /*  bstring bfromcstr (const char * str)
@@ -184,21 +184,21 @@ bstring b;
 int i;
 size_t j;
 
-	if (str == NULL) return NULL;
-	j = (strlen) (str);
-	i = snapUpSize ((int) (j + (2 - (j != 0))));
-	if (i <= (int) j) return NULL;
+    if (str == NULL) return NULL;
+    j = (strlen) (str);
+    i = snapUpSize ((int) (j + (2 - (j != 0))));
+    if (i <= (int) j) return NULL;
 
-	b = (bstring) bstr__alloc (sizeof (struct tagbstring));
-	if (NULL == b) return NULL;
-	b->slen = (int) j;
-	if (NULL == (b->data = (unsigned char *) bstr__alloc (b->mlen = i))) {
-		bstr__free (b);
-		return NULL;
-	}
+    b = (bstring) bstr__alloc (sizeof (struct tagbstring));
+    if (NULL == b) return NULL;
+    b->slen = (int) j;
+    if (NULL == (b->data = (unsigned char *) bstr__alloc (b->mlen = i))) {
+        bstr__free (b);
+        return NULL;
+    }
 
-	bstr__memcpy (b->data, str, j+1);
-	return b;
+    bstr__memcpy (b->data, str, j+1);
+    return b;
 }
 
 /*  bstring bfromcstralloc (int mlen, const char * str)
@@ -212,23 +212,23 @@ bstring b;
 int i;
 size_t j;
 
-	if (str == NULL) return NULL;
-	j = (strlen) (str);
-	i = snapUpSize ((int) (j + (2 - (j != 0))));
-	if (i <= (int) j) return NULL;
+    if (str == NULL) return NULL;
+    j = (strlen) (str);
+    i = snapUpSize ((int) (j + (2 - (j != 0))));
+    if (i <= (int) j) return NULL;
 
-	b = (bstring) bstr__alloc (sizeof (struct tagbstring));
-	if (b == NULL) return NULL;
-	b->slen = (int) j;
-	if (i < mlen) i = mlen;
+    b = (bstring) bstr__alloc (sizeof (struct tagbstring));
+    if (b == NULL) return NULL;
+    b->slen = (int) j;
+    if (i < mlen) i = mlen;
 
-	if (NULL == (b->data = (unsigned char *) bstr__alloc (b->mlen = i))) {
-		bstr__free (b);
-		return NULL;
-	}
+    if (NULL == (b->data = (unsigned char *) bstr__alloc (b->mlen = i))) {
+        bstr__free (b);
+        return NULL;
+    }
 
-	bstr__memcpy (b->data, str, j+1);
-	return b;
+    bstr__memcpy (b->data, str, j+1);
+    return b;
 }
 
 /*  bstring blk2bstr (const void * blk, int len)
@@ -240,26 +240,26 @@ bstring blk2bstr (const void * blk, int len) {
 bstring b;
 int i;
 
-	if (blk == NULL || len < 0) return NULL;
-	b = (bstring) bstr__alloc (sizeof (struct tagbstring));
-	if (b == NULL) return NULL;
-	b->slen = len;
+    if (blk == NULL || len < 0) return NULL;
+    b = (bstring) bstr__alloc (sizeof (struct tagbstring));
+    if (b == NULL) return NULL;
+    b->slen = len;
 
-	i = len + (2 - (len != 0));
-	i = snapUpSize (i);
+    i = len + (2 - (len != 0));
+    i = snapUpSize (i);
 
-	b->mlen = i;
+    b->mlen = i;
 
-	b->data = (unsigned char *) bstr__alloc ((size_t) b->mlen);
-	if (b->data == NULL) {
-		bstr__free (b);
-		return NULL;
-	}
+    b->data = (unsigned char *) bstr__alloc ((size_t) b->mlen);
+    if (b->data == NULL) {
+        bstr__free (b);
+        return NULL;
+    }
 
-	if (len > 0) bstr__memcpy (b->data, blk, (size_t) len);
-	b->data[len] = (unsigned char) '\0';
+    if (len > 0) bstr__memcpy (b->data, blk, (size_t) len);
+    b->data[len] = (unsigned char) '\0';
 
-	return b;
+    return b;
 }
 
 /*  char * bstr2cstr (const_bstring s, char z)
@@ -273,18 +273,18 @@ char * bstr2cstr (const_bstring b, char z) {
 int i, l;
 char * r;
 
-	if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
-	l = b->slen;
-	r = (char *) bstr__alloc ((size_t) (l + 1));
-	if (r == NULL) return r;
+    if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
+    l = b->slen;
+    r = (char *) bstr__alloc ((size_t) (l + 1));
+    if (r == NULL) return r;
 
-	for (i=0; i < l; i ++) {
-		r[i] = (char) ((b->data[i] == '\0') ? z : (char) (b->data[i]));
-	}
+    for (i=0; i < l; i ++) {
+        r[i] = (char) ((b->data[i] == '\0') ? z : (char) (b->data[i]));
+    }
 
-	r[l] = (unsigned char) '\0';
+    r[l] = (unsigned char) '\0';
 
-	return r;
+    return r;
 }
 
 /*  int bcstrfree (char * s)
@@ -299,11 +299,11 @@ char * r;
  *  redefinitions.
  */
 int bcstrfree (char * s) {
-	if (s) {
-		bstr__free (s);
-		return BSTR_OK;
-	}
-	return BSTR_ERR;
+    if (s) {
+        bstr__free (s);
+        return BSTR_OK;
+    }
+    return BSTR_ERR;
 }
 
 /*  int bconcat (bstring b0, const_bstring b1)
@@ -314,28 +314,28 @@ int bconcat (bstring b0, const_bstring b1) {
 int len, d;
 bstring aux = (bstring) b1;
 
-	if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL) return BSTR_ERR;
+    if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL) return BSTR_ERR;
 
-	d = b0->slen;
-	len = b1->slen;
-	if ((d | (b0->mlen - d) | len | (d + len)) < 0) return BSTR_ERR;
+    d = b0->slen;
+    len = b1->slen;
+    if ((d | (b0->mlen - d) | len | (d + len)) < 0) return BSTR_ERR;
 
-	if (b0->mlen <= d + len + 1) {
-		ptrdiff_t pd = b1->data - b0->data;
-		if (0 <= pd && pd < b0->mlen) {
-			if (NULL == (aux = bstrcpy (b1))) return BSTR_ERR;
-		}
-		if (balloc (b0, d + len + 1) != BSTR_OK) {
-			if (aux != b1) bdestroy (aux);
-			return BSTR_ERR;
-		}
-	}
+    if (b0->mlen <= d + len + 1) {
+        ptrdiff_t pd = b1->data - b0->data;
+        if (0 <= pd && pd < b0->mlen) {
+            if (NULL == (aux = bstrcpy (b1))) return BSTR_ERR;
+        }
+        if (balloc (b0, d + len + 1) != BSTR_OK) {
+            if (aux != b1) bdestroy (aux);
+            return BSTR_ERR;
+        }
+    }
 
-	bBlockCopy (&b0->data[d], &aux->data[0], (size_t) len);
-	b0->data[d + len] = (unsigned char) '\0';
-	b0->slen = d + len;
-	if (aux != b1) bdestroy (aux);
-	return BSTR_OK;
+    bBlockCopy (&b0->data[d], &aux->data[0], (size_t) len);
+    b0->data[d + len] = (unsigned char) '\0';
+    b0->slen = d + len;
+    if (aux != b1) bdestroy (aux);
+    return BSTR_OK;
 }
 
 /*  int bconchar (bstring b, char c)
@@ -345,13 +345,13 @@ bstring aux = (bstring) b1;
 int bconchar (bstring b, char c) {
 int d;
 
-	if (b == NULL) return BSTR_ERR;
-	d = b->slen;
-	if ((d | (b->mlen - d)) < 0 || balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
-	b->data[d] = (unsigned char) c;
-	b->data[d + 1] = (unsigned char) '\0';
-	b->slen++;
-	return BSTR_OK;
+    if (b == NULL) return BSTR_ERR;
+    d = b->slen;
+    if ((d | (b->mlen - d)) < 0 || balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
+    b->data[d] = (unsigned char) c;
+    b->data[d + 1] = (unsigned char) '\0';
+    b->slen++;
+    return BSTR_OK;
 }
 
 /*  int bcatcstr (bstring b, const char * s)
@@ -362,22 +362,22 @@ int bcatcstr (bstring b, const char * s) {
 char * d;
 int i, l;
 
-	if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen < b->slen
-	 || b->mlen <= 0 || s == NULL) return BSTR_ERR;
+    if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen < b->slen
+     || b->mlen <= 0 || s == NULL) return BSTR_ERR;
 
-	/* Optimistically concatenate directly */
-	l = b->mlen - b->slen;
-	d = (char *) &b->data[b->slen];
-	for (i=0; i < l; i++) {
-		if ((*d++ = *s++) == '\0') {
-			b->slen += i;
-			return BSTR_OK;
-		}
-	}
-	b->slen += i;
+    /* Optimistically concatenate directly */
+    l = b->mlen - b->slen;
+    d = (char *) &b->data[b->slen];
+    for (i=0; i < l; i++) {
+        if ((*d++ = *s++) == '\0') {
+            b->slen += i;
+            return BSTR_OK;
+        }
+    }
+    b->slen += i;
 
-	/* Need to explicitely resize and concatenate tail */
-	return bcatblk (b, (const void *) s, (int) strlen (s));
+    /* Need to explicitely resize and concatenate tail */
+    return bcatblk (b, (const void *) s, (int) strlen (s));
 }
 
 /*  int bcatblk (bstring b, const void * s, int len)
@@ -387,16 +387,16 @@ int i, l;
 int bcatblk (bstring b, const void * s, int len) {
 int nl;
 
-	if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen < b->slen
-	 || b->mlen <= 0 || s == NULL || len < 0) return BSTR_ERR;
+    if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen < b->slen
+     || b->mlen <= 0 || s == NULL || len < 0) return BSTR_ERR;
 
-	if (0 > (nl = b->slen + len)) return BSTR_ERR; /* Overflow? */
-	if (b->mlen <= nl && 0 > balloc (b, nl + 1)) return BSTR_ERR;
+    if (0 > (nl = b->slen + len)) return BSTR_ERR; /* Overflow? */
+    if (b->mlen <= nl && 0 > balloc (b, nl + 1)) return BSTR_ERR;
 
-	bBlockCopy (&b->data[b->slen], s, (size_t) len);
-	b->slen = nl;
-	b->data[nl] = (unsigned char) '\0';
-	return BSTR_OK;
+    bBlockCopy (&b->data[b->slen], s, (size_t) len);
+    b->slen = nl;
+    b->data[nl] = (unsigned char) '\0';
+    return BSTR_OK;
 }
 
 /*  bstring bstrcpy (const_bstring b)
@@ -407,36 +407,36 @@ bstring bstrcpy (const_bstring b) {
 bstring b0;
 int i,j;
 
-	/* Attempted to copy an invalid string? */
-	if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
+    /* Attempted to copy an invalid string? */
+    if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
 
-	b0 = (bstring) bstr__alloc (sizeof (struct tagbstring));
-	if (b0 == NULL) {
-		/* Unable to allocate memory for string header */
-		return NULL;
-	}
+    b0 = (bstring) bstr__alloc (sizeof (struct tagbstring));
+    if (b0 == NULL) {
+        /* Unable to allocate memory for string header */
+        return NULL;
+    }
 
-	i = b->slen;
-	j = snapUpSize (i + 1);
+    i = b->slen;
+    j = snapUpSize (i + 1);
 
-	b0->data = (unsigned char *) bstr__alloc (j);
-	if (b0->data == NULL) {
-		j = i + 1;
-		b0->data = (unsigned char *) bstr__alloc (j);
-		if (b0->data == NULL) {
-			/* Unable to allocate memory for string data */
-			bstr__free (b0);
-			return NULL;
-		}
-	}
+    b0->data = (unsigned char *) bstr__alloc (j);
+    if (b0->data == NULL) {
+        j = i + 1;
+        b0->data = (unsigned char *) bstr__alloc (j);
+        if (b0->data == NULL) {
+            /* Unable to allocate memory for string data */
+            bstr__free (b0);
+            return NULL;
+        }
+    }
 
-	b0->mlen = j;
-	b0->slen = i;
+    b0->mlen = j;
+    b0->slen = i;
 
-	if (i) bstr__memcpy ((char *) b0->data, (char *) b->data, i);
-	b0->data[b0->slen] = (unsigned char) '\0';
+    if (i) bstr__memcpy ((char *) b0->data, (char *) b->data, i);
+    b0->data[b0->slen] = (unsigned char) '\0';
 
-	return b0;
+    return b0;
 }
 
 /*  int bassign (bstring a, const_bstring b)
@@ -444,19 +444,19 @@ int i,j;
  *  Overwrite the string a with the contents of string b.
  */
 int bassign (bstring a, const_bstring b) {
-	if (b == NULL || b->data == NULL || b->slen < 0)
-		return BSTR_ERR;
-	if (b->slen != 0) {
-		if (balloc (a, b->slen) != BSTR_OK) return BSTR_ERR;
-		bstr__memmove (a->data, b->data, b->slen);
-	} else {
-		if (a == NULL || a->data == NULL || a->mlen < a->slen || 
-		    a->slen < 0 || a->mlen == 0) 
-			return BSTR_ERR;
-	}
-	a->data[b->slen] = (unsigned char) '\0';
-	a->slen = b->slen;
-	return BSTR_OK;
+    if (b == NULL || b->data == NULL || b->slen < 0)
+        return BSTR_ERR;
+    if (b->slen != 0) {
+        if (balloc (a, b->slen) != BSTR_OK) return BSTR_ERR;
+        bstr__memmove (a->data, b->data, b->slen);
+    } else {
+        if (a == NULL || a->data == NULL || a->mlen < a->slen || 
+            a->slen < 0 || a->mlen == 0) 
+            return BSTR_ERR;
+    }
+    a->data[b->slen] = (unsigned char) '\0';
+    a->slen = b->slen;
+    return BSTR_OK;
 }
 
 /*  int bassignmidstr (bstring a, const_bstring b, int left, int len)
@@ -466,29 +466,29 @@ int bassign (bstring a, const_bstring b) {
  *  len are clamped to the ends of b as with the function bmidstr.
  */
 int bassignmidstr (bstring a, const_bstring b, int left, int len) {
-	if (b == NULL || b->data == NULL || b->slen < 0)
-		return BSTR_ERR;
+    if (b == NULL || b->data == NULL || b->slen < 0)
+        return BSTR_ERR;
 
-	if (left < 0) {
-		len += left;
-		left = 0;
-	}
+    if (left < 0) {
+        len += left;
+        left = 0;
+    }
 
-	if (len > b->slen - left) len = b->slen - left;
+    if (len > b->slen - left) len = b->slen - left;
 
-	if (a == NULL || a->data == NULL || a->mlen < a->slen ||
-	    a->slen < 0 || a->mlen == 0)
-		return BSTR_ERR;
+    if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+        a->slen < 0 || a->mlen == 0)
+        return BSTR_ERR;
 
-	if (len > 0) {
-		if (balloc (a, len) != BSTR_OK) return BSTR_ERR;
-		bstr__memmove (a->data, b->data + left, len);
-		a->slen = len;
-	} else {
-		a->slen = 0;
-	}
-	a->data[a->slen] = (unsigned char) '\0';
-	return BSTR_OK;
+    if (len > 0) {
+        if (balloc (a, len) != BSTR_OK) return BSTR_ERR;
+        bstr__memmove (a->data, b->data + left, len);
+        a->slen = len;
+    } else {
+        a->slen = 0;
+    }
+    a->data[a->slen] = (unsigned char) '\0';
+    return BSTR_OK;
 }
 
 /*  int bassigncstr (bstring a, const char * str)
@@ -500,24 +500,24 @@ int bassignmidstr (bstring a, const_bstring b, int left, int len) {
 int bassigncstr (bstring a, const char * str) {
 int i;
 size_t len;
-	if (a == NULL || a->data == NULL || a->mlen < a->slen ||
-	    a->slen < 0 || a->mlen == 0 || NULL == str) 
-		return BSTR_ERR;
-
-	for (i=0; i < a->mlen; i++) {
-		if ('\0' == (a->data[i] = str[i])) {
-			a->slen = i;
-			return BSTR_OK;
-		}
-	}
-
-	a->slen = i;
-	len = strlen (str + i);
-	if (len > INT_MAX || i + len + 1 > INT_MAX ||
-	    0 > balloc (a, (int) (i + len + 1))) return BSTR_ERR;
-	bBlockCopy (a->data + i, str + i, (size_t) len + 1);
-	a->slen += (int) len;
-	return BSTR_OK;
+    if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+        a->slen < 0 || a->mlen == 0 || NULL == str) 
+        return BSTR_ERR;
+
+    for (i=0; i < a->mlen; i++) {
+        if ('\0' == (a->data[i] = str[i])) {
+            a->slen = i;
+            return BSTR_OK;
+        }
+    }
+
+    a->slen = i;
+    len = strlen (str + i);
+    if (len > INT_MAX || i + len + 1 > INT_MAX ||
+        0 > balloc (a, (int) (i + len + 1))) return BSTR_ERR;
+    bBlockCopy (a->data + i, str + i, (size_t) len + 1);
+    a->slen += (int) len;
+    return BSTR_OK;
 }
 
 /*  int bassignblk (bstring a, const void * s, int len)
@@ -527,14 +527,14 @@ size_t len;
  *  occurs BSTR_ERR is returned and a is not overwritten.
  */
 int bassignblk (bstring a, const void * s, int len) {
-	if (a == NULL || a->data == NULL || a->mlen < a->slen ||
-	    a->slen < 0 || a->mlen == 0 || NULL == s || len + 1 < 1) 
-		return BSTR_ERR;
-	if (len + 1 > a->mlen && 0 > balloc (a, len + 1)) return BSTR_ERR;
-	bBlockCopy (a->data, s, (size_t) len);
-	a->data[len] = (unsigned char) '\0';
-	a->slen = len;
-	return BSTR_OK;
+    if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+        a->slen < 0 || a->mlen == 0 || NULL == s || len + 1 < 1) 
+        return BSTR_ERR;
+    if (len + 1 > a->mlen && 0 > balloc (a, len + 1)) return BSTR_ERR;
+    bBlockCopy (a->data, s, (size_t) len);
+    a->data[len] = (unsigned char) '\0';
+    a->slen = len;
+    return BSTR_OK;
 }
 
 /*  int btrunc (bstring b, int n)
@@ -542,13 +542,13 @@ int bassignblk (bstring a, const void * s, int len) {
  *  Truncate the bstring to at most n characters.
  */
 int btrunc (bstring b, int n) {
-	if (n < 0 || b == NULL || b->data == NULL || b->mlen < b->slen ||
-	    b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
-	if (b->slen > n) {
-		b->slen = n;
-		b->data[n] = (unsigned char) '\0';
-	}
-	return BSTR_OK;
+    if (n < 0 || b == NULL || b->data == NULL || b->mlen < b->slen ||
+        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+    if (b->slen > n) {
+        b->slen = n;
+        b->data[n] = (unsigned char) '\0';
+    }
+    return BSTR_OK;
 }
 
 #define   upcase(c) (toupper ((unsigned char) c))
@@ -561,12 +561,12 @@ int btrunc (bstring b, int n) {
  */
 int btoupper (bstring b) {
 int i, len;
-	if (b == NULL || b->data == NULL || b->mlen < b->slen ||
-	    b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
-	for (i=0, len = b->slen; i < len; i++) {
-		b->data[i] = (unsigned char) upcase (b->data[i]);
-	}
-	return BSTR_OK;
+    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+    for (i=0, len = b->slen; i < len; i++) {
+        b->data[i] = (unsigned char) upcase (b->data[i]);
+    }
+    return BSTR_OK;
 }
 
 /*  int btolower (bstring b)
@@ -575,12 +575,12 @@ int i, len;
  */
 int btolower (bstring b) {
 int i, len;
-	if (b == NULL || b->data == NULL || b->mlen < b->slen ||
-	    b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
-	for (i=0, len = b->slen; i < len; i++) {
-		b->data[i] = (unsigned char) downcase (b->data[i]);
-	}
-	return BSTR_OK;
+    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+    for (i=0, len = b->slen; i < len; i++) {
+        b->data[i] = (unsigned char) downcase (b->data[i]);
+    }
+    return BSTR_OK;
 }
 
 /*  int bstricmp (const_bstring b0, const_bstring b1)
@@ -595,28 +595,28 @@ int i, len;
 int bstricmp (const_bstring b0, const_bstring b1) {
 int i, v, n;
 
-	if (bdata (b0) == NULL || b0->slen < 0 || 
-	    bdata (b1) == NULL || b1->slen < 0) return SHRT_MIN;
-	if ((n = b0->slen) > b1->slen) n = b1->slen;
-	else if (b0->slen == b1->slen && b0->data == b1->data) return BSTR_OK;
-
-	for (i = 0; i < n; i ++) {
-		v  = (char) downcase (b0->data[i])
-		   - (char) downcase (b1->data[i]);
-		if (0 != v) return v;
-	}
-
-	if (b0->slen > n) {
-		v = (char) downcase (b0->data[n]);
-		if (v) return v;
-		return UCHAR_MAX + 1;
-	}
-	if (b1->slen > n) {
-		v = - (char) downcase (b1->data[n]);
-		if (v) return v;
-		return - (int) (UCHAR_MAX + 1);
-	}
-	return BSTR_OK;
+    if (bdata (b0) == NULL || b0->slen < 0 || 
+        bdata (b1) == NULL || b1->slen < 0) return SHRT_MIN;
+    if ((n = b0->slen) > b1->slen) n = b1->slen;
+    else if (b0->slen == b1->slen && b0->data == b1->data) return BSTR_OK;
+
+    for (i = 0; i < n; i ++) {
+        v  = (char) downcase (b0->data[i])
+           - (char) downcase (b1->data[i]);
+        if (0 != v) return v;
+    }
+
+    if (b0->slen > n) {
+        v = (char) downcase (b0->data[n]);
+        if (v) return v;
+        return UCHAR_MAX + 1;
+    }
+    if (b1->slen > n) {
+        v = - (char) downcase (b1->data[n]);
+        if (v) return v;
+        return - (int) (UCHAR_MAX + 1);
+    }
+    return BSTR_OK;
 }
 
 /*  int bstrnicmp (const_bstring b0, const_bstring b1, int n)
@@ -632,31 +632,31 @@ int i, v, n;
 int bstrnicmp (const_bstring b0, const_bstring b1, int n) {
 int i, v, m;
 
-	if (bdata (b0) == NULL || b0->slen < 0 || 
-	    bdata (b1) == NULL || b1->slen < 0 || n < 0) return SHRT_MIN;
-	m = n;
-	if (m > b0->slen) m = b0->slen;
-	if (m > b1->slen) m = b1->slen;
+    if (bdata (b0) == NULL || b0->slen < 0 || 
+        bdata (b1) == NULL || b1->slen < 0 || n < 0) return SHRT_MIN;
+    m = n;
+    if (m > b0->slen) m = b0->slen;
+    if (m > b1->slen) m = b1->slen;
 
-	if (b0->data != b1->data) {
-		for (i = 0; i < m; i ++) {
-			v  = (char) downcase (b0->data[i]);
-			v -= (char) downcase (b1->data[i]);
-			if (v != 0) return b0->data[i] - b1->data[i];
-		}
-	}
+    if (b0->data != b1->data) {
+        for (i = 0; i < m; i ++) {
+            v  = (char) downcase (b0->data[i]);
+            v -= (char) downcase (b1->data[i]);
+            if (v != 0) return b0->data[i] - b1->data[i];
+        }
+    }
 
-	if (n == m || b0->slen == b1->slen) return BSTR_OK;
+    if (n == m || b0->slen == b1->slen) return BSTR_OK;
 
-	if (b0->slen > m) {
-		v = (char) downcase (b0->data[m]);
-		if (v) return v;
-		return UCHAR_MAX + 1;
-	}
+    if (b0->slen > m) {
+        v = (char) downcase (b0->data[m]);
+        if (v) return v;
+        return UCHAR_MAX + 1;
+    }
 
-	v = - (char) downcase (b1->data[m]);
-	if (v) return v;
-	return - (int) (UCHAR_MAX + 1);
+    v = - (char) downcase (b1->data[m]);
+    if (v) return v;
+    return - (int) (UCHAR_MAX + 1);
 }
 
 /*  int biseqcaseless (const_bstring b0, const_bstring b1)
@@ -670,17 +670,17 @@ int i, v, m;
 int biseqcaseless (const_bstring b0, const_bstring b1) {
 int i, n;
 
-	if (bdata (b0) == NULL || b0->slen < 0 || 
-	    bdata (b1) == NULL || b1->slen < 0) return BSTR_ERR;
-	if (b0->slen != b1->slen) return BSTR_OK;
-	if (b0->data == b1->data || b0->slen == 0) return 1;
-	for (i=0, n=b0->slen; i < n; i++) {
-		if (b0->data[i] != b1->data[i]) {
-			unsigned char c = (unsigned char) downcase (b0->data[i]);
-			if (c != (unsigned char) downcase (b1->data[i])) return 0;
-		}
-	}
-	return 1;
+    if (bdata (b0) == NULL || b0->slen < 0 || 
+        bdata (b1) == NULL || b1->slen < 0) return BSTR_ERR;
+    if (b0->slen != b1->slen) return BSTR_OK;
+    if (b0->data == b1->data || b0->slen == 0) return 1;
+    for (i=0, n=b0->slen; i < n; i++) {
+        if (b0->data[i] != b1->data[i]) {
+            unsigned char c = (unsigned char) downcase (b0->data[i]);
+            if (c != (unsigned char) downcase (b1->data[i])) return 0;
+        }
+    }
+    return 1;
 }
 
 /*  int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len)
@@ -695,18 +695,18 @@ int i, n;
 int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len) {
 int i;
 
-	if (bdata (b0) == NULL || b0->slen < 0 || NULL == blk || len < 0)
-		return BSTR_ERR;
-	if (b0->slen < len) return BSTR_OK;
-	if (b0->data == (const unsigned char *) blk || len == 0) return 1;
+    if (bdata (b0) == NULL || b0->slen < 0 || NULL == blk || len < 0)
+        return BSTR_ERR;
+    if (b0->slen < len) return BSTR_OK;
+    if (b0->data == (const unsigned char *) blk || len == 0) return 1;
 
-	for (i = 0; i < len; i ++) {
-		if (b0->data[i] != ((const unsigned char *) blk)[i]) {
-			if (downcase (b0->data[i]) != 
-			    downcase (((const unsigned char *) blk)[i])) return 0;
-		}
-	}
-	return 1;
+    for (i = 0; i < len; i ++) {
+        if (b0->data[i] != ((const unsigned char *) blk)[i]) {
+            if (downcase (b0->data[i]) != 
+                downcase (((const unsigned char *) blk)[i])) return 0;
+        }
+    }
+    return 1;
 }
 
 /*
@@ -717,18 +717,18 @@ int i;
 int bltrimws (bstring b) {
 int i, len;
 
-	if (b == NULL || b->data == NULL || b->mlen < b->slen ||
-	    b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
 
-	for (len = b->slen, i = 0; i < len; i++) {
-		if (!wspace (b->data[i])) {
-			return bdelete (b, 0, i);
-		}
-	}
+    for (len = b->slen, i = 0; i < len; i++) {
+        if (!wspace (b->data[i])) {
+            return bdelete (b, 0, i);
+        }
+    }
 
-	b->data[0] = (unsigned char) '\0';
-	b->slen = 0;
-	return BSTR_OK;
+    b->data[0] = (unsigned char) '\0';
+    b->slen = 0;
+    return BSTR_OK;
 }
 
 /*
@@ -739,20 +739,20 @@ int i, len;
 int brtrimws (bstring b) {
 int i;
 
-	if (b == NULL || b->data == NULL || b->mlen < b->slen ||
-	    b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
 
-	for (i = b->slen - 1; i >= 0; i--) {
-		if (!wspace (b->data[i])) {
-			if (b->mlen > i) b->data[i+1] = (unsigned char) '\0';
-			b->slen = i + 1;
-			return BSTR_OK;
-		}
-	}
+    for (i = b->slen - 1; i >= 0; i--) {
+        if (!wspace (b->data[i])) {
+            if (b->mlen > i) b->data[i+1] = (unsigned char) '\0';
+            b->slen = i + 1;
+            return BSTR_OK;
+        }
+    }
 
-	b->data[0] = (unsigned char) '\0';
-	b->slen = 0;
-	return BSTR_OK;
+    b->data[0] = (unsigned char) '\0';
+    b->slen = 0;
+    return BSTR_OK;
 }
 
 /*
@@ -763,21 +763,21 @@ int i;
 int btrimws (bstring b) {
 int i, j;
 
-	if (b == NULL || b->data == NULL || b->mlen < b->slen ||
-	    b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
 
-	for (i = b->slen - 1; i >= 0; i--) {
-		if (!wspace (b->data[i])) {
-			if (b->mlen > i) b->data[i+1] = (unsigned char) '\0';
-			b->slen = i + 1;
-			for (j = 0; wspace (b->data[j]); j++) {}
-			return bdelete (b, 0, j);
-		}
-	}
+    for (i = b->slen - 1; i >= 0; i--) {
+        if (!wspace (b->data[i])) {
+            if (b->mlen > i) b->data[i+1] = (unsigned char) '\0';
+            b->slen = i + 1;
+            for (j = 0; wspace (b->data[j]); j++) {}
+            return bdelete (b, 0, j);
+        }
+    }
 
-	b->data[0] = (unsigned char) '\0';
-	b->slen = 0;
-	return BSTR_OK;
+    b->data[0] = (unsigned char) '\0';
+    b->slen = 0;
+    return BSTR_OK;
 }
 
 /*  int biseq (const_bstring b0, const_bstring b1)
@@ -788,11 +788,11 @@ int i, j;
  *  O(1).  '\0' termination characters are not treated in any special way.
  */
 int biseq (const_bstring b0, const_bstring b1) {
-	if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
-		b0->slen < 0 || b1->slen < 0) return BSTR_ERR;
-	if (b0->slen != b1->slen) return BSTR_OK;
-	if (b0->data == b1->data || b0->slen == 0) return 1;
-	return !bstr__memcmp (b0->data, b1->data, b0->slen);
+    if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
+        b0->slen < 0 || b1->slen < 0) return BSTR_ERR;
+    if (b0->slen != b1->slen) return BSTR_OK;
+    if (b0->data == b1->data || b0->slen == 0) return 1;
+    return !bstr__memcmp (b0->data, b1->data, b0->slen);
 }
 
 /*  int bisstemeqblk (const_bstring b0, const void * blk, int len)
@@ -806,15 +806,15 @@ int biseq (const_bstring b0, const_bstring b1) {
 int bisstemeqblk (const_bstring b0, const void * blk, int len) {
 int i;
 
-	if (bdata (b0) == NULL || b0->slen < 0 || NULL == blk || len < 0)
-		return BSTR_ERR;
-	if (b0->slen < len) return BSTR_OK;
-	if (b0->data == (const unsigned char *) blk || len == 0) return 1;
+    if (bdata (b0) == NULL || b0->slen < 0 || NULL == blk || len < 0)
+        return BSTR_ERR;
+    if (b0->slen < len) return BSTR_OK;
+    if (b0->data == (const unsigned char *) blk || len == 0) return 1;
 
-	for (i = 0; i < len; i ++) {
-		if (b0->data[i] != ((const unsigned char *) blk)[i]) return BSTR_OK;
-	}
-	return 1;
+    for (i = 0; i < len; i ++) {
+        if (b0->data[i] != ((const unsigned char *) blk)[i]) return BSTR_OK;
+    }
+    return 1;
 }
 
 /*  int biseqcstr (const_bstring b, const char *s)
@@ -830,11 +830,11 @@ int i;
  */
 int biseqcstr (const_bstring b, const char * s) {
 int i;
-	if (b == NULL || s == NULL || b->data == NULL || b->slen < 0) return BSTR_ERR;
-	for (i=0; i < b->slen; i++) {
-		if (s[i] == '\0' || b->data[i] != (unsigned char) s[i]) return BSTR_OK;
-	}
-	return s[i] == '\0';
+    if (b == NULL || s == NULL || b->data == NULL || b->slen < 0) return BSTR_ERR;
+    for (i=0; i < b->slen; i++) {
+        if (s[i] == '\0' || b->data[i] != (unsigned char) s[i]) return BSTR_OK;
+    }
+    return s[i] == '\0';
 }
 
 /*  int biseqcstrcaseless (const_bstring b, const char *s)
@@ -851,14 +851,14 @@ int i;
  */
 int biseqcstrcaseless (const_bstring b, const char * s) {
 int i;
-	if (b == NULL || s == NULL || b->data == NULL || b->slen < 0) return BSTR_ERR;
-	for (i=0; i < b->slen; i++) {
-		if (s[i] == '\0' || 
-		    (b->data[i] != (unsigned char) s[i] && 
-		     downcase (b->data[i]) != (unsigned char) downcase (s[i])))
-			return BSTR_OK;
-	}
-	return s[i] == '\0';
+    if (b == NULL || s == NULL || b->data == NULL || b->slen < 0) return BSTR_ERR;
+    for (i=0; i < b->slen; i++) {
+        if (s[i] == '\0' || 
+            (b->data[i] != (unsigned char) s[i] && 
+             downcase (b->data[i]) != (unsigned char) downcase (s[i])))
+            return BSTR_OK;
+    }
+    return s[i] == '\0';
 }
 
 /*  int bstrcmp (const_bstring b0, const_bstring b1)
@@ -878,21 +878,21 @@ int i;
 int bstrcmp (const_bstring b0, const_bstring b1) {
 int i, v, n;
 
-	if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
-		b0->slen < 0 || b1->slen < 0) return SHRT_MIN;
-	n = b0->slen; if (n > b1->slen) n = b1->slen;
-	if (b0->slen == b1->slen && (b0->data == b1->data || b0->slen == 0))
-		return BSTR_OK;
+    if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
+        b0->slen < 0 || b1->slen < 0) return SHRT_MIN;
+    n = b0->slen; if (n > b1->slen) n = b1->slen;
+    if (b0->slen == b1->slen && (b0->data == b1->data || b0->slen == 0))
+        return BSTR_OK;
 
-	for (i = 0; i < n; i ++) {
-		v = ((char) b0->data[i]) - ((char) b1->data[i]);
-		if (v != 0) return v;
-		if (b0->data[i] == (unsigned char) '\0') return BSTR_OK;
-	}
+    for (i = 0; i < n; i ++) {
+        v = ((char) b0->data[i]) - ((char) b1->data[i]);
+        if (v != 0) return v;
+        if (b0->data[i] == (unsigned char) '\0') return BSTR_OK;
+    }
 
-	if (b0->slen > n) return 1;
-	if (b1->slen > n) return -1;
-	return BSTR_OK;
+    if (b0->slen > n) return 1;
+    if (b1->slen > n) return -1;
+    return BSTR_OK;
 }
 
 /*  int bstrncmp (const_bstring b0, const_bstring b1, int n)
@@ -908,24 +908,24 @@ int i, v, n;
 int bstrncmp (const_bstring b0, const_bstring b1, int n) {
 int i, v, m;
 
-	if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
-		b0->slen < 0 || b1->slen < 0) return SHRT_MIN;
-	m = n;
-	if (m > b0->slen) m = b0->slen;
-	if (m > b1->slen) m = b1->slen;
+    if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
+        b0->slen < 0 || b1->slen < 0) return SHRT_MIN;
+    m = n;
+    if (m > b0->slen) m = b0->slen;
+    if (m > b1->slen) m = b1->slen;
 
-	if (b0->data != b1->data) {
-		for (i = 0; i < m; i ++) {
-			v = ((char) b0->data[i]) - ((char) b1->data[i]);
-			if (v != 0) return v;
-			if (b0->data[i] == (unsigned char) '\0') return BSTR_OK;
-		}
-	}
+    if (b0->data != b1->data) {
+        for (i = 0; i < m; i ++) {
+            v = ((char) b0->data[i]) - ((char) b1->data[i]);
+            if (v != 0) return v;
+            if (b0->data[i] == (unsigned char) '\0') return BSTR_OK;
+        }
+    }
 
-	if (n == m || b0->slen == b1->slen) return BSTR_OK;
+    if (n == m || b0->slen == b1->slen) return BSTR_OK;
 
-	if (b0->slen > m) return 1;
-	return -1;
+    if (b0->slen > m) return 1;
+    return -1;
 }
 
 /*  bstring bmidstr (const_bstring b, int left, int len)
@@ -937,17 +937,17 @@ int i, v, m;
  */
 bstring bmidstr (const_bstring b, int left, int len) {
 
-	if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
+    if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
 
-	if (left < 0) {
-		len += left;
-		left = 0;
-	}
+    if (left < 0) {
+        len += left;
+        left = 0;
+    }
 
-	if (len > b->slen - left) len = b->slen - left;
+    if (len > b->slen - left) len = b->slen - left;
 
-	if (len <= 0) return bfromcstr ("");
-	return blk2bstr (b->data + left, len);
+    if (len <= 0) return bfromcstr ("");
+    return blk2bstr (b->data + left, len);
 }
 
 /*  int bdelete (bstring b, int pos, int len)
@@ -958,27 +958,27 @@ bstring bmidstr (const_bstring b, int left, int len) {
  *  len) is clamped to boundaries of the bstring b.
  */
 int bdelete (bstring b, int pos, int len) {
-	/* Clamp to left side of bstring */
-	if (pos < 0) {
-		len += pos;
-		pos = 0;
-	}
-
-	if (len < 0 || b == NULL || b->data == NULL || b->slen < 0 || 
-	    b->mlen < b->slen || b->mlen <= 0) 
-		return BSTR_ERR;
-	if (len > 0 && pos < b->slen) {
-		if (pos + len >= b->slen) {
-			b->slen = pos;
-		} else {
-			bBlockCopy ((char *) (b->data + pos),
-			            (char *) (b->data + pos + len), 
-			            b->slen - (pos+len));
-			b->slen -= len;
-		}
-		b->data[b->slen] = (unsigned char) '\0';
-	}
-	return BSTR_OK;
+    /* Clamp to left side of bstring */
+    if (pos < 0) {
+        len += pos;
+        pos = 0;
+    }
+
+    if (len < 0 || b == NULL || b->data == NULL || b->slen < 0 || 
+        b->mlen < b->slen || b->mlen <= 0) 
+        return BSTR_ERR;
+    if (len > 0 && pos < b->slen) {
+        if (pos + len >= b->slen) {
+            b->slen = pos;
+        } else {
+            bBlockCopy ((char *) (b->data + pos),
+                        (char *) (b->data + pos + len), 
+                        b->slen - (pos+len));
+            b->slen -= len;
+        }
+        b->data[b->slen] = (unsigned char) '\0';
+    }
+    return BSTR_OK;
 }
 
 /*  int bdestroy (bstring b)
@@ -989,21 +989,21 @@ int bdelete (bstring b, int pos, int len) {
  *  been bdestroyed is undefined.
  */
 int bdestroy (bstring b) {
-	if (b == NULL || b->slen < 0 || b->mlen <= 0 || b->mlen < b->slen ||
-	    b->data == NULL)
-		return BSTR_ERR;
+    if (b == NULL || b->slen < 0 || b->mlen <= 0 || b->mlen < b->slen ||
+        b->data == NULL)
+        return BSTR_ERR;
 
-	bstr__free (b->data);
+    bstr__free (b->data);
 
-	/* In case there is any stale usage, there is one more chance to 
-	   notice this error. */
+    /* In case there is any stale usage, there is one more chance to 
+       notice this error. */
 
-	b->slen = -1;
-	b->mlen = -__LINE__;
-	b->data = NULL;
+    b->slen = -1;
+    b->mlen = -__LINE__;
+    b->data = NULL;
 
-	bstr__free (b);
-	return BSTR_OK;
+    bstr__free (b);
+    return BSTR_OK;
 }
 
 /*  int binstr (const_bstring b1, int pos, const_bstring b2)
@@ -1023,74 +1023,74 @@ register unsigned char * d1;
 register unsigned char c1;
 register int i;
 
-	if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
-	    b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
-	if (b1->slen == pos) return (b2->slen == 0)?pos:BSTR_ERR;
-	if (b1->slen < pos || pos < 0) return BSTR_ERR;
-	if (b2->slen == 0) return pos;
+    if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
+        b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+    if (b1->slen == pos) return (b2->slen == 0)?pos:BSTR_ERR;
+    if (b1->slen < pos || pos < 0) return BSTR_ERR;
+    if (b2->slen == 0) return pos;
 
-	/* No space to find such a string? */
-	if ((lf = b1->slen - b2->slen + 1) <= pos) return BSTR_ERR;
+    /* No space to find such a string? */
+    if ((lf = b1->slen - b2->slen + 1) <= pos) return BSTR_ERR;
 
-	/* An obvious alias case */
-	if (b1->data == b2->data && pos == 0) return 0;
+    /* An obvious alias case */
+    if (b1->data == b2->data && pos == 0) return 0;
 
-	i = pos;
+    i = pos;
 
-	d0 = b2->data;
-	d1 = b1->data;
-	ll = b2->slen;
+    d0 = b2->data;
+    d1 = b1->data;
+    ll = b2->slen;
 
-	/* Peel off the b2->slen == 1 case */
-	c0 = d0[0];
-	if (1 == ll) {
-		for (;i < lf; i++) if (c0 == d1[i]) return i;
-		return BSTR_ERR;
-	}
+    /* Peel off the b2->slen == 1 case */
+    c0 = d0[0];
+    if (1 == ll) {
+        for (;i < lf; i++) if (c0 == d1[i]) return i;
+        return BSTR_ERR;
+    }
 
-	c1 = c0;
-	j = 0;
-	lf = b1->slen - 1;
+    c1 = c0;
+    j = 0;
+    lf = b1->slen - 1;
 
-	ii = -1;
-	if (i < lf) do {
-		/* Unrolled current character test */
-		if (c1 != d1[i]) {
-			if (c1 != d1[1+i]) {
-				i += 2;
-				continue;
-			}
-			i++;
-		}
+    ii = -1;
+    if (i < lf) do {
+        /* Unrolled current character test */
+        if (c1 != d1[i]) {
+            if (c1 != d1[1+i]) {
+                i += 2;
+                continue;
+            }
+            i++;
+        }
 
-		/* Take note if this is the start of a potential match */
-		if (0 == j) ii = i;
+        /* Take note if this is the start of a potential match */
+        if (0 == j) ii = i;
 
-		/* Shift the test character down by one */
-		j++;
-		i++;
+        /* Shift the test character down by one */
+        j++;
+        i++;
 
-		/* If this isn't past the last character continue */
-		if (j < ll) {
-			c1 = d0[j];
-			continue;
-		}
+        /* If this isn't past the last character continue */
+        if (j < ll) {
+            c1 = d0[j];
+            continue;
+        }
 
-		N0:;
+        N0:;
 
-		/* If no characters mismatched, then we matched */
-		if (i == ii+j) return ii;
+        /* If no characters mismatched, then we matched */
+        if (i == ii+j) return ii;
 
-		/* Shift back to the beginning */
-		i -= j;
-		j  = 0;
-		c1 = c0;
-	} while (i < lf);
+        /* Shift back to the beginning */
+        i -= j;
+        j  = 0;
+        c1 = c0;
+    } while (i < lf);
 
-	/* Deal with last case if unrolling caused a misalignment */
-	if (i == lf && ll == j+1 && c1 == d1[i]) goto N0;
+    /* Deal with last case if unrolling caused a misalignment */
+    if (i == lf && ll == j+1 && c1 == d1[i]) goto N0;
 
-	return BSTR_ERR;
+    return BSTR_ERR;
 }
 
 /*  int binstrr (const_bstring b1, int pos, const_bstring b2)
@@ -1106,38 +1106,38 @@ int binstrr (const_bstring b1, int pos, const_bstring b2) {
 int j, i, l;
 unsigned char * d0, * d1;
 
-	if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
-	    b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
-	if (b1->slen == pos && b2->slen == 0) return pos;
-	if (b1->slen < pos || pos < 0) return BSTR_ERR;
-	if (b2->slen == 0) return pos;
+    if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
+        b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+    if (b1->slen == pos && b2->slen == 0) return pos;
+    if (b1->slen < pos || pos < 0) return BSTR_ERR;
+    if (b2->slen == 0) return pos;
 
-	/* Obvious alias case */
-	if (b1->data == b2->data && pos == 0 && b2->slen <= b1->slen) return 0;
+    /* Obvious alias case */
+    if (b1->data == b2->data && pos == 0 && b2->slen <= b1->slen) return 0;
 
-	i = pos;
-	if ((l = b1->slen - b2->slen) < 0) return BSTR_ERR;
+    i = pos;
+    if ((l = b1->slen - b2->slen) < 0) return BSTR_ERR;
 
-	/* If no space to find such a string then snap back */
-	if (l + 1 <= i) i = l;
-	j = 0;
+    /* If no space to find such a string then snap back */
+    if (l + 1 <= i) i = l;
+    j = 0;
 
-	d0 = b2->data;
-	d1 = b1->data;
-	l  = b2->slen;
+    d0 = b2->data;
+    d1 = b1->data;
+    l  = b2->slen;
 
-	for (;;) {
-		if (d0[j] == d1[i + j]) {
-			j ++;
-			if (j >= l) return i;
-		} else {
-			i --;
-			if (i < 0) break;
-			j=0;
-		}
-	}
+    for (;;) {
+        if (d0[j] == d1[i + j]) {
+            j ++;
+            if (j >= l) return i;
+        } else {
+            i --;
+            if (i < 0) break;
+            j=0;
+        }
+    }
 
-	return BSTR_ERR;
+    return BSTR_ERR;
 }
 
 /*  int binstrcaseless (const_bstring b1, int pos, const_bstring b2)
@@ -1153,39 +1153,39 @@ int binstrcaseless (const_bstring b1, int pos, const_bstring b2) {
 int j, i, l, ll;
 unsigned char * d0, * d1;
 
-	if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
-	    b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
-	if (b1->slen == pos) return (b2->slen == 0)?pos:BSTR_ERR;
-	if (b1->slen < pos || pos < 0) return BSTR_ERR;
-	if (b2->slen == 0) return pos;
+    if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
+        b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+    if (b1->slen == pos) return (b2->slen == 0)?pos:BSTR_ERR;
+    if (b1->slen < pos || pos < 0) return BSTR_ERR;
+    if (b2->slen == 0) return pos;
 
-	l = b1->slen - b2->slen + 1;
+    l = b1->slen - b2->slen + 1;
 
-	/* No space to find such a string? */
-	if (l <= pos) return BSTR_ERR;
+    /* No space to find such a string? */
+    if (l <= pos) return BSTR_ERR;
 
-	/* An obvious alias case */
-	if (b1->data == b2->data && pos == 0) return BSTR_OK;
+    /* An obvious alias case */
+    if (b1->data == b2->data && pos == 0) return BSTR_OK;
 
-	i = pos;
-	j = 0;
+    i = pos;
+    j = 0;
 
-	d0 = b2->data;
-	d1 = b1->data;
-	ll = b2->slen;
+    d0 = b2->data;
+    d1 = b1->data;
+    ll = b2->slen;
 
-	for (;;) {
-		if (d0[j] == d1[i + j] || downcase (d0[j]) == downcase (d1[i + j])) {
-			j ++;
-			if (j >= ll) return i;
-		} else {
-			i ++;
-			if (i >= l) break;
-			j=0;
-		}
-	}
+    for (;;) {
+        if (d0[j] == d1[i + j] || downcase (d0[j]) == downcase (d1[i + j])) {
+            j ++;
+            if (j >= ll) return i;
+        } else {
+            i ++;
+            if (i >= l) break;
+            j=0;
+        }
+    }
 
-	return BSTR_ERR;
+    return BSTR_ERR;
 }
 
 /*  int binstrrcaseless (const_bstring b1, int pos, const_bstring b2)
@@ -1201,38 +1201,38 @@ int binstrrcaseless (const_bstring b1, int pos, const_bstring b2) {
 int j, i, l;
 unsigned char * d0, * d1;
 
-	if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
-	    b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
-	if (b1->slen == pos && b2->slen == 0) return pos;
-	if (b1->slen < pos || pos < 0) return BSTR_ERR;
-	if (b2->slen == 0) return pos;
+    if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
+        b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+    if (b1->slen == pos && b2->slen == 0) return pos;
+    if (b1->slen < pos || pos < 0) return BSTR_ERR;
+    if (b2->slen == 0) return pos;
 
-	/* Obvious alias case */
-	if (b1->data == b2->data && pos == 0 && b2->slen <= b1->slen) return BSTR_OK;
+    /* Obvious alias case */
+    if (b1->data == b2->data && pos == 0 && b2->slen <= b1->slen) return BSTR_OK;
 
-	i = pos;
-	if ((l = b1->slen - b2->slen) < 0) return BSTR_ERR;
+    i = pos;
+    if ((l = b1->slen - b2->slen) < 0) return BSTR_ERR;
 
-	/* If no space to find such a string then snap back */
-	if (l + 1 <= i) i = l;
-	j = 0;
+    /* If no space to find such a string then snap back */
+    if (l + 1 <= i) i = l;
+    j = 0;
 
-	d0 = b2->data;
-	d1 = b1->data;
-	l  = b2->slen;
+    d0 = b2->data;
+    d1 = b1->data;
+    l  = b2->slen;
 
-	for (;;) {
-		if (d0[j] == d1[i + j] || downcase (d0[j]) == downcase (d1[i + j])) {
-			j ++;
-			if (j >= l) return i;
-		} else {
-			i --;
-			if (i < 0) break;
-			j=0;
-		}
-	}
+    for (;;) {
+        if (d0[j] == d1[i + j] || downcase (d0[j]) == downcase (d1[i + j])) {
+            j ++;
+            if (j >= l) return i;
+        } else {
+            i --;
+            if (i < 0) break;
+            j=0;
+        }
+    }
 
-	return BSTR_ERR;
+    return BSTR_ERR;
 }
 
 
@@ -1244,10 +1244,10 @@ unsigned char * d0, * d1;
 int bstrchrp (const_bstring b, int c, int pos) {
 unsigned char * p;
 
-	if (b == NULL || b->data == NULL || b->slen <= pos || pos < 0) return BSTR_ERR;
-	p = (unsigned char *) bstr__memchr ((b->data + pos), (unsigned char) c, (b->slen - pos));
-	if (p) return (int) (p - b->data);
-	return BSTR_ERR;
+    if (b == NULL || b->data == NULL || b->slen <= pos || pos < 0) return BSTR_ERR;
+    p = (unsigned char *) bstr__memchr ((b->data + pos), (unsigned char) c, (b->slen - pos));
+    if (p) return (int) (p - b->data);
+    return BSTR_ERR;
 }
 
 /*  int bstrrchrp (const_bstring b, int c, int pos)
@@ -1258,11 +1258,11 @@ unsigned char * p;
 int bstrrchrp (const_bstring b, int c, int pos) {
 int i;
  
-	if (b == NULL || b->data == NULL || b->slen <= pos || pos < 0) return BSTR_ERR;
-	for (i=pos; i >= 0; i--) {
-		if (b->data[i] == (unsigned char) c) return i;
-	}
-	return BSTR_ERR;
+    if (b == NULL || b->data == NULL || b->slen <= pos || pos < 0) return BSTR_ERR;
+    for (i=pos; i >= 0; i--) {
+        if (b->data[i] == (unsigned char) c) return i;
+    }
+    return BSTR_ERR;
 }
 
 #if !defined (BSTRLIB_AGGRESSIVE_MEMORY_FOR_SPEED_TRADEOFF)
@@ -1274,8 +1274,8 @@ int i;
 struct charField { LONG_TYPE content[CFCLEN]; };
 #define testInCharField(cf,c) ((cf)->content[(c) >> LONG_LOG_BITS_QTY] & (((long)1) << ((c) & (LONG_BITS_QTY-1))))
 #define setInCharField(cf,idx) { \
-	unsigned int c = (unsigned int) (idx); \
-	(cf)->content[c >> LONG_LOG_BITS_QTY] |= (LONG_TYPE) (1ul << (c & (LONG_BITS_QTY-1))); \
+    unsigned int c = (unsigned int) (idx); \
+    (cf)->content[c >> LONG_LOG_BITS_QTY] |= (LONG_TYPE) (1ul << (c & (LONG_BITS_QTY-1))); \
 }
 
 #else
@@ -1290,27 +1290,27 @@ struct charField { unsigned char content[CFCLEN]; };
 /* Convert a bstring to charField */
 static int buildCharField (struct charField * cf, const_bstring b) {
 int i;
-	if (b == NULL || b->data == NULL || b->slen <= 0) return BSTR_ERR;
-	memset ((void *) cf->content, 0, sizeof (struct charField));
-	for (i=0; i < b->slen; i++) {
-		setInCharField (cf, b->data[i]);
-	}
-	return BSTR_OK;
+    if (b == NULL || b->data == NULL || b->slen <= 0) return BSTR_ERR;
+    memset ((void *) cf->content, 0, sizeof (struct charField));
+    for (i=0; i < b->slen; i++) {
+        setInCharField (cf, b->data[i]);
+    }
+    return BSTR_OK;
 }
 
 static void invertCharField (struct charField * cf) {
 int i;
-	for (i=0; i < CFCLEN; i++) cf->content[i] = ~cf->content[i];
+    for (i=0; i < CFCLEN; i++) cf->content[i] = ~cf->content[i];
 }
 
 /* Inner engine for binchr */
 static int binchrCF (const unsigned char * data, int len, int pos, const struct charField * cf) {
 int i;
-	for (i=pos; i < len; i++) {
-		unsigned char c = (unsigned char) data[i];
-		if (testInCharField (cf, c)) return i;
-	}
-	return BSTR_ERR;
+    for (i=pos; i < len; i++) {
+        unsigned char c = (unsigned char) data[i];
+        if (testInCharField (cf, c)) return i;
+    }
+    return BSTR_ERR;
 }
 
 /*  int binchr (const_bstring b0, int pos, const_bstring b1);
@@ -1321,21 +1321,21 @@ int i;
  */
 int binchr (const_bstring b0, int pos, const_bstring b1) {
 struct charField chrs;
-	if (pos < 0 || b0 == NULL || b0->data == NULL ||
-	    b0->slen <= pos) return BSTR_ERR;
-	if (1 == b1->slen) return bstrchrp (b0, b1->data[0], pos);
-	if (0 > buildCharField (&chrs, b1)) return BSTR_ERR;
-	return binchrCF (b0->data, b0->slen, pos, &chrs);
+    if (pos < 0 || b0 == NULL || b0->data == NULL ||
+        b0->slen <= pos) return BSTR_ERR;
+    if (1 == b1->slen) return bstrchrp (b0, b1->data[0], pos);
+    if (0 > buildCharField (&chrs, b1)) return BSTR_ERR;
+    return binchrCF (b0->data, b0->slen, pos, &chrs);
 }
 
 /* Inner engine for binchrr */
 static int binchrrCF (const unsigned char * data, int pos, const struct charField * cf) {
 int i;
-	for (i=pos; i >= 0; i--) {
-		unsigned int c = (unsigned int) data[i];
-		if (testInCharField (cf, c)) return i;
-	}
-	return BSTR_ERR;
+    for (i=pos; i >= 0; i--) {
+        unsigned int c = (unsigned int) data[i];
+        if (testInCharField (cf, c)) return i;
+    }
+    return BSTR_ERR;
 }
 
 /*  int binchrr (const_bstring b0, int pos, const_bstring b1);
@@ -1346,12 +1346,12 @@ int i;
  */
 int binchrr (const_bstring b0, int pos, const_bstring b1) {
 struct charField chrs;
-	if (pos < 0 || b0 == NULL || b0->data == NULL || b1 == NULL ||
-	    b0->slen < pos) return BSTR_ERR;
-	if (pos == b0->slen) pos--;
-	if (1 == b1->slen) return bstrrchrp (b0, b1->data[0], pos);
-	if (0 > buildCharField (&chrs, b1)) return BSTR_ERR;
-	return binchrrCF (b0->data, pos, &chrs);
+    if (pos < 0 || b0 == NULL || b0->data == NULL || b1 == NULL ||
+        b0->slen < pos) return BSTR_ERR;
+    if (pos == b0->slen) pos--;
+    if (1 == b1->slen) return bstrrchrp (b0, b1->data[0], pos);
+    if (0 > buildCharField (&chrs, b1)) return BSTR_ERR;
+    return binchrrCF (b0->data, pos, &chrs);
 }
 
 /*  int bninchr (const_bstring b0, int pos, const_bstring b1);
@@ -1362,11 +1362,11 @@ struct charField chrs;
  */
 int bninchr (const_bstring b0, int pos, const_bstring b1) {
 struct charField chrs;
-	if (pos < 0 || b0 == NULL || b0->data == NULL || 
-	    b0->slen <= pos) return BSTR_ERR;
-	if (buildCharField (&chrs, b1) < 0) return BSTR_ERR;
-	invertCharField (&chrs);
-	return binchrCF (b0->data, b0->slen, pos, &chrs);
+    if (pos < 0 || b0 == NULL || b0->data == NULL || 
+        b0->slen <= pos) return BSTR_ERR;
+    if (buildCharField (&chrs, b1) < 0) return BSTR_ERR;
+    invertCharField (&chrs);
+    return binchrCF (b0->data, b0->slen, pos, &chrs);
 }
 
 /*  int bninchrr (const_bstring b0, int pos, const_bstring b1);
@@ -1377,12 +1377,12 @@ struct charField chrs;
  */
 int bninchrr (const_bstring b0, int pos, const_bstring b1) {
 struct charField chrs;
-	if (pos < 0 || b0 == NULL || b0->data == NULL || 
-	    b0->slen < pos) return BSTR_ERR;
-	if (pos == b0->slen) pos--;
-	if (buildCharField (&chrs, b1) < 0) return BSTR_ERR;
-	invertCharField (&chrs);
-	return binchrrCF (b0->data, pos, &chrs);
+    if (pos < 0 || b0 == NULL || b0->data == NULL || 
+        b0->slen < pos) return BSTR_ERR;
+    if (pos == b0->slen) pos--;
+    if (buildCharField (&chrs, b1) < 0) return BSTR_ERR;
+    invertCharField (&chrs);
+    return binchrrCF (b0->data, pos, &chrs);
 }
 
 /*  int bsetstr (bstring b0, int pos, bstring b1, unsigned char fill)
@@ -1397,47 +1397,47 @@ int d, newlen;
 ptrdiff_t pd;
 bstring aux = (bstring) b1;
 
-	if (pos < 0 || b0 == NULL || b0->slen < 0 || NULL == b0->data || 
-	    b0->mlen < b0->slen || b0->mlen <= 0) return BSTR_ERR;
-	if (b1 != NULL && (b1->slen < 0 || b1->data == NULL)) return BSTR_ERR;
+    if (pos < 0 || b0 == NULL || b0->slen < 0 || NULL == b0->data || 
+        b0->mlen < b0->slen || b0->mlen <= 0) return BSTR_ERR;
+    if (b1 != NULL && (b1->slen < 0 || b1->data == NULL)) return BSTR_ERR;
 
-	d = pos;
+    d = pos;
 
-	/* Aliasing case */
-	if (NULL != aux) {
-		if ((pd = (ptrdiff_t) (b1->data - b0->data)) >= 0 && pd < (ptrdiff_t) b0->mlen) {
-			if (NULL == (aux = bstrcpy (b1))) return BSTR_ERR;
-		}
-		d += aux->slen;
-	}
+    /* Aliasing case */
+    if (NULL != aux) {
+        if ((pd = (ptrdiff_t) (b1->data - b0->data)) >= 0 && pd < (ptrdiff_t) b0->mlen) {
+            if (NULL == (aux = bstrcpy (b1))) return BSTR_ERR;
+        }
+        d += aux->slen;
+    }
 
-	/* Increase memory size if necessary */
-	if (balloc (b0, d + 1) != BSTR_OK) {
-		if (aux != b1) bdestroy (aux);
-		return BSTR_ERR;
-	}
+    /* Increase memory size if necessary */
+    if (balloc (b0, d + 1) != BSTR_OK) {
+        if (aux != b1) bdestroy (aux);
+        return BSTR_ERR;
+    }
 
-	newlen = b0->slen;
+    newlen = b0->slen;
 
-	/* Fill in "fill" character as necessary */
-	if (pos > newlen) {
-		bstr__memset (b0->data + b0->slen, (int) fill, (size_t) (pos - b0->slen));
-		newlen = pos;
-	}
+    /* Fill in "fill" character as necessary */
+    if (pos > newlen) {
+        bstr__memset (b0->data + b0->slen, (int) fill, (size_t) (pos - b0->slen));
+        newlen = pos;
+    }
 
-	/* Copy b1 to position pos in b0. */
-	if (aux != NULL) {
-		bBlockCopy ((char *) (b0->data + pos), (char *) aux->data, aux->slen);
-		if (aux != b1) bdestroy (aux);
-	}
+    /* Copy b1 to position pos in b0. */
+    if (aux != NULL) {
+        bBlockCopy ((char *) (b0->data + pos), (char *) aux->data, aux->slen);
+        if (aux != b1) bdestroy (aux);
+    }
 
-	/* Indicate the potentially increased size of b0 */
-	if (d > newlen) newlen = d;
+    /* Indicate the potentially increased size of b0 */
+    if (d > newlen) newlen = d;
 
-	b0->slen = newlen;
-	b0->data[newlen] = (unsigned char) '\0';
+    b0->slen = newlen;
+    b0->data[newlen] = (unsigned char) '\0';
 
-	return BSTR_OK;
+    return BSTR_OK;
 }
 
 /*  int binsert (bstring b1, int pos, bstring b2, unsigned char fill)
@@ -1452,40 +1452,40 @@ int d, l;
 ptrdiff_t pd;
 bstring aux = (bstring) b2;
 
-	if (pos < 0 || b1 == NULL || b2 == NULL || b1->slen < 0 || 
-	    b2->slen < 0 || b1->mlen < b1->slen || b1->mlen <= 0) return BSTR_ERR;
-
-	/* Aliasing case */
-	if ((pd = (ptrdiff_t) (b2->data - b1->data)) >= 0 && pd < (ptrdiff_t) b1->mlen) {
-		if (NULL == (aux = bstrcpy (b2))) return BSTR_ERR;
-	}
-
-	/* Compute the two possible end pointers */
-	d = b1->slen + aux->slen;
-	l = pos + aux->slen;
-	if ((d|l) < 0) return BSTR_ERR;
-
-	if (l > d) {
-		/* Inserting past the end of the string */
-		if (balloc (b1, l + 1) != BSTR_OK) {
-			if (aux != b2) bdestroy (aux);
-			return BSTR_ERR;
-		}
-		bstr__memset (b1->data + b1->slen, (int) fill, (size_t) (pos - b1->slen));
-		b1->slen = l;
-	} else {
-		/* Inserting in the middle of the string */
-		if (balloc (b1, d + 1) != BSTR_OK) {
-			if (aux != b2) bdestroy (aux);
-			return BSTR_ERR;
-		}
-		bBlockCopy (b1->data + l, b1->data + pos, d - l);
-		b1->slen = d;
-	}
-	bBlockCopy (b1->data + pos, aux->data, aux->slen);
-	b1->data[b1->slen] = (unsigned char) '\0';
-	if (aux != b2) bdestroy (aux);
-	return BSTR_OK;
+    if (pos < 0 || b1 == NULL || b2 == NULL || b1->slen < 0 || 
+        b2->slen < 0 || b1->mlen < b1->slen || b1->mlen <= 0) return BSTR_ERR;
+
+    /* Aliasing case */
+    if ((pd = (ptrdiff_t) (b2->data - b1->data)) >= 0 && pd < (ptrdiff_t) b1->mlen) {
+        if (NULL == (aux = bstrcpy (b2))) return BSTR_ERR;
+    }
+
+    /* Compute the two possible end pointers */
+    d = b1->slen + aux->slen;
+    l = pos + aux->slen;
+    if ((d|l) < 0) return BSTR_ERR;
+
+    if (l > d) {
+        /* Inserting past the end of the string */
+        if (balloc (b1, l + 1) != BSTR_OK) {
+            if (aux != b2) bdestroy (aux);
+            return BSTR_ERR;
+        }
+        bstr__memset (b1->data + b1->slen, (int) fill, (size_t) (pos - b1->slen));
+        b1->slen = l;
+    } else {
+        /* Inserting in the middle of the string */
+        if (balloc (b1, d + 1) != BSTR_OK) {
+            if (aux != b2) bdestroy (aux);
+            return BSTR_ERR;
+        }
+        bBlockCopy (b1->data + l, b1->data + pos, d - l);
+        b1->slen = d;
+    }
+    bBlockCopy (b1->data + pos, aux->data, aux->slen);
+    b1->data[b1->slen] = (unsigned char) '\0';
+    if (aux != b2) bdestroy (aux);
+    return BSTR_OK;
 }
 
 /*  int breplace (bstring b1, int pos, int len, bstring b2, 
@@ -1495,44 +1495,44 @@ bstring aux = (bstring) b2;
  *  fill is used is pos > b1->slen.
  */
 int breplace (bstring b1, int pos, int len, const_bstring b2, 
-			  unsigned char fill) {
+              unsigned char fill) {
 int pl, ret;
 ptrdiff_t pd;
 bstring aux = (bstring) b2;
 
-	if (pos < 0 || len < 0 || (pl = pos + len) < 0 || b1 == NULL || 
-	    b2 == NULL || b1->data == NULL || b2->data == NULL || 
-	    b1->slen < 0 || b2->slen < 0 || b1->mlen < b1->slen ||
-	    b1->mlen <= 0) return BSTR_ERR;
-
-	/* Straddles the end? */
-	if (pl >= b1->slen) {
-		if ((ret = bsetstr (b1, pos, b2, fill)) < 0) return ret;
-		if (pos + b2->slen < b1->slen) {
-			b1->slen = pos + b2->slen;
-			b1->data[b1->slen] = (unsigned char) '\0';
-		}
-		return ret;
-	}
-
-	/* Aliasing case */
-	if ((pd = (ptrdiff_t) (b2->data - b1->data)) >= 0 && pd < (ptrdiff_t) b1->slen) {
-		if (NULL == (aux = bstrcpy (b2))) return BSTR_ERR;
-	}
-
-	if (aux->slen > len) {
-		if (balloc (b1, b1->slen + aux->slen - len) != BSTR_OK) {
-			if (aux != b2) bdestroy (aux);
-			return BSTR_ERR;
-		}
-	}
-
-	if (aux->slen != len) bstr__memmove (b1->data + pos + aux->slen, b1->data + pos + len, b1->slen - (pos + len));
-	bstr__memcpy (b1->data + pos, aux->data, aux->slen);
-	b1->slen += aux->slen - len;
-	b1->data[b1->slen] = (unsigned char) '\0';
-	if (aux != b2) bdestroy (aux);
-	return BSTR_OK;
+    if (pos < 0 || len < 0 || (pl = pos + len) < 0 || b1 == NULL || 
+        b2 == NULL || b1->data == NULL || b2->data == NULL || 
+        b1->slen < 0 || b2->slen < 0 || b1->mlen < b1->slen ||
+        b1->mlen <= 0) return BSTR_ERR;
+
+    /* Straddles the end? */
+    if (pl >= b1->slen) {
+        if ((ret = bsetstr (b1, pos, b2, fill)) < 0) return ret;
+        if (pos + b2->slen < b1->slen) {
+            b1->slen = pos + b2->slen;
+            b1->data[b1->slen] = (unsigned char) '\0';
+        }
+        return ret;
+    }
+
+    /* Aliasing case */
+    if ((pd = (ptrdiff_t) (b2->data - b1->data)) >= 0 && pd < (ptrdiff_t) b1->slen) {
+        if (NULL == (aux = bstrcpy (b2))) return BSTR_ERR;
+    }
+
+    if (aux->slen > len) {
+        if (balloc (b1, b1->slen + aux->slen - len) != BSTR_OK) {
+            if (aux != b2) bdestroy (aux);
+            return BSTR_ERR;
+        }
+    }
+
+    if (aux->slen != len) bstr__memmove (b1->data + pos + aux->slen, b1->data + pos + len, b1->slen - (pos + len));
+    bstr__memcpy (b1->data + pos, aux->data, aux->slen);
+    b1->slen += aux->slen - len;
+    b1->data[b1->slen] = (unsigned char) '\0';
+    if (aux != b2) bdestroy (aux);
+    return BSTR_OK;
 }
 
 /*  int bfindreplace (bstring b, const_bstring find, const_bstring repl, 
@@ -1552,123 +1552,123 @@ ptrdiff_t pd;
 bstring auxf = (bstring) find;
 bstring auxr = (bstring) repl;
 
-	if (b == NULL || b->data == NULL || find == NULL ||
-	    find->data == NULL || repl == NULL || repl->data == NULL || 
-	    pos < 0 || find->slen <= 0 || b->mlen < 0 || b->slen > b->mlen || 
-	    b->mlen <= 0 || b->slen < 0 || repl->slen < 0) return BSTR_ERR;
-	if (pos > b->slen - find->slen) return BSTR_OK;
-
-	/* Alias with find string */
-	pd = (ptrdiff_t) (find->data - b->data);
-	if ((ptrdiff_t) (pos - find->slen) < pd && pd < (ptrdiff_t) b->slen) {
-		if (NULL == (auxf = bstrcpy (find))) return BSTR_ERR;
-	}
-
-	/* Alias with repl string */
-	pd = (ptrdiff_t) (repl->data - b->data);
-	if ((ptrdiff_t) (pos - repl->slen) < pd && pd < (ptrdiff_t) b->slen) {
-		if (NULL == (auxr = bstrcpy (repl))) {
-			if (auxf != find) bdestroy (auxf);
-			return BSTR_ERR;
-		}
-	}
-
-	delta = auxf->slen - auxr->slen;
-
-	/* in-place replacement since find and replace strings are of equal 
-	   length */
-	if (delta == 0) {
-		while ((pos = instr (b, pos, auxf)) >= 0) {
-			bstr__memcpy (b->data + pos, auxr->data, auxr->slen);
-			pos += auxf->slen;
-		}
-		if (auxf != find) bdestroy (auxf);
-		if (auxr != repl) bdestroy (auxr);
-		return BSTR_OK;
-	}
-
-	/* shrinking replacement since auxf->slen > auxr->slen */
-	if (delta > 0) {
-		acc = 0;
-
-		while ((i = instr (b, pos, auxf)) >= 0) {
-			if (acc && i > pos)
-				bstr__memmove (b->data + pos - acc, b->data + pos, i - pos);
-			if (auxr->slen)
-				bstr__memcpy (b->data + i - acc, auxr->data, auxr->slen);
-			acc += delta;
-			pos = i + auxf->slen;
-		}
-
-		if (acc) {
-			i = b->slen;
-			if (i > pos)
-				bstr__memmove (b->data + pos - acc, b->data + pos, i - pos);
-			b->slen -= acc;
-			b->data[b->slen] = (unsigned char) '\0';
-		}
-
-		if (auxf != find) bdestroy (auxf);
-		if (auxr != repl) bdestroy (auxr);
-		return BSTR_OK;
-	}
-
-	/* expanding replacement since find->slen < repl->slen.  Its a lot 
-	   more complicated. */
-
-	mlen = 32;
-	d = (int *) static_d; /* Avoid malloc for trivial cases */
-	acc = slen = 0;
-
-	while ((pos = instr (b, pos, auxf)) >= 0) {
-		if (slen + 1 >= mlen) {
-			int sl;
-			int * t;
-			mlen += mlen;
-			sl = sizeof (int *) * mlen;
-			if (static_d == d) d = NULL;
-			if (sl < mlen || NULL == (t = (int *) bstr__realloc (d, sl))) {
-				ret = BSTR_ERR;
-				goto done;
-			}
-			if (NULL == d) bstr__memcpy (t, static_d, sizeof (static_d));
-			d = t;
-		}
-		d[slen] = pos;
-		slen++;
-		acc -= delta;
-		pos += auxf->slen;
-		if (pos < 0 || acc < 0) {
-			ret = BSTR_ERR;
-			goto done;
-		}
-	}
-	d[slen] = b->slen;
-
-	if (BSTR_OK == (ret = balloc (b, b->slen + acc + 1))) {
-		b->slen += acc;
-		for (i = slen-1; i >= 0; i--) {
-			int s, l;
-			s = d[i] + auxf->slen;
-			l = d[i+1] - s;
-			if (l) {
-				bstr__memmove (b->data + s + acc, b->data + s, l);
-			}
-			if (auxr->slen) {
-				bstr__memmove (b->data + s + acc - auxr->slen, 
-				         auxr->data, auxr->slen);
-			}
-			acc += delta;		
-		}
-		b->data[b->slen] = (unsigned char) '\0';
-	}
-
-	done:;
-	if (static_d == d) d = NULL;
-	bstr__free (d);
-	if (auxf != find) bdestroy (auxf);
-	if (auxr != repl) bdestroy (auxr);
-	return ret;
+    if (b == NULL || b->data == NULL || find == NULL ||
+        find->data == NULL || repl == NULL || repl->data == NULL || 
+        pos < 0 || find->slen <= 0 || b->mlen < 0 || b->slen > b->mlen || 
+        b->mlen <= 0 || b->slen < 0 || repl->slen < 0) return BSTR_ERR;
+    if (pos > b->slen - find->slen) return BSTR_OK;
+
+    /* Alias with find string */
+    pd = (ptrdiff_t) (find->data - b->data);
+    if ((ptrdiff_t) (pos - find->slen) < pd && pd < (ptrdiff_t) b->slen) {
+        if (NULL == (auxf = bstrcpy (find))) return BSTR_ERR;
+    }
+
+    /* Alias with repl string */
+    pd = (ptrdiff_t) (repl->data - b->data);
+    if ((ptrdiff_t) (pos - repl->slen) < pd && pd < (ptrdiff_t) b->slen) {
+        if (NULL == (auxr = bstrcpy (repl))) {
+            if (auxf != find) bdestroy (auxf);
+            return BSTR_ERR;
+        }
+    }
+
+    delta = auxf->slen - auxr->slen;
+
+    /* in-place replacement since find and replace strings are of equal 
+       length */
+    if (delta == 0) {
+        while ((pos = instr (b, pos, auxf)) >= 0) {
+            bstr__memcpy (b->data + pos, auxr->data, auxr->slen);
+            pos += auxf->slen;
+        }
+        if (auxf != find) bdestroy (auxf);
+        if (auxr != repl) bdestroy (auxr);
+        return BSTR_OK;
+    }
+
+    /* shrinking replacement since auxf->slen > auxr->slen */
+    if (delta > 0) {
+        acc = 0;
+
+        while ((i = instr (b, pos, auxf)) >= 0) {
+            if (acc && i > pos)
+                bstr__memmove (b->data + pos - acc, b->data + pos, i - pos);
+            if (auxr->slen)
+                bstr__memcpy (b->data + i - acc, auxr->data, auxr->slen);
+            acc += delta;
+            pos = i + auxf->slen;
+        }
+
+        if (acc) {
+            i = b->slen;
+            if (i > pos)
+                bstr__memmove (b->data + pos - acc, b->data + pos, i - pos);
+            b->slen -= acc;
+            b->data[b->slen] = (unsigned char) '\0';
+        }
+
+        if (auxf != find) bdestroy (auxf);
+        if (auxr != repl) bdestroy (auxr);
+        return BSTR_OK;
+    }
+
+    /* expanding replacement since find->slen < repl->slen.  Its a lot 
+       more complicated. */
+
+    mlen = 32;
+    d = (int *) static_d; /* Avoid malloc for trivial cases */
+    acc = slen = 0;
+
+    while ((pos = instr (b, pos, auxf)) >= 0) {
+        if (slen + 1 >= mlen) {
+            int sl;
+            int * t;
+            mlen += mlen;
+            sl = sizeof (int *) * mlen;
+            if (static_d == d) d = NULL;
+            if (sl < mlen || NULL == (t = (int *) bstr__realloc (d, sl))) {
+                ret = BSTR_ERR;
+                goto done;
+            }
+            if (NULL == d) bstr__memcpy (t, static_d, sizeof (static_d));
+            d = t;
+        }
+        d[slen] = pos;
+        slen++;
+        acc -= delta;
+        pos += auxf->slen;
+        if (pos < 0 || acc < 0) {
+            ret = BSTR_ERR;
+            goto done;
+        }
+    }
+    d[slen] = b->slen;
+
+    if (BSTR_OK == (ret = balloc (b, b->slen + acc + 1))) {
+        b->slen += acc;
+        for (i = slen-1; i >= 0; i--) {
+            int s, l;
+            s = d[i] + auxf->slen;
+            l = d[i+1] - s;
+            if (l) {
+                bstr__memmove (b->data + s + acc, b->data + s, l);
+            }
+            if (auxr->slen) {
+                bstr__memmove (b->data + s + acc - auxr->slen, 
+                         auxr->data, auxr->slen);
+            }
+            acc += delta;        
+        }
+        b->data[b->slen] = (unsigned char) '\0';
+    }
+
+    done:;
+    if (static_d == d) d = NULL;
+    bstr__free (d);
+    if (auxf != find) bdestroy (auxf);
+    if (auxr != repl) bdestroy (auxr);
+    return ret;
 }
 
 /*  int bfindreplace (bstring b, const_bstring find, const_bstring repl, 
@@ -1678,7 +1678,7 @@ bstring auxr = (bstring) repl;
  *  given point in a bstring.
  */
 int bfindreplace (bstring b, const_bstring find, const_bstring repl, int pos) {
-	return findreplaceengine (b, find, repl, pos, binstr);
+    return findreplaceengine (b, find, repl, pos, binstr);
 }
 
 /*  int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, 
@@ -1688,7 +1688,7 @@ int bfindreplace (bstring b, const_bstring find, const_bstring repl, int pos) {
  *  string after a given point in a bstring.
  */
 int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, int pos) {
-	return findreplaceengine (b, find, repl, pos, binstrcaseless);
+    return findreplaceengine (b, find, repl, pos, binstrcaseless);
 }
 
 /*  int binsertch (bstring b, int pos, int len, unsigned char fill)
@@ -1701,31 +1701,31 @@ int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, int
 int binsertch (bstring b, int pos, int len, unsigned char fill) {
 int d, l, i;
 
-	if (pos < 0 || b == NULL || b->slen < 0 || b->mlen < b->slen ||
-	    b->mlen <= 0 || len < 0) return BSTR_ERR;
-
-	/* Compute the two possible end pointers */
-	d = b->slen + len;
-	l = pos + len;
-	if ((d|l) < 0) return BSTR_ERR;
-
-	if (l > d) {
-		/* Inserting past the end of the string */
-		if (balloc (b, l + 1) != BSTR_OK) return BSTR_ERR;
-		pos = b->slen;
-		b->slen = l;
-	} else {
-		/* Inserting in the middle of the string */
-		if (balloc (b, d + 1) != BSTR_OK) return BSTR_ERR;
-		for (i = d - 1; i >= l; i--) {
-			b->data[i] = b->data[i - len];
-		}
-		b->slen = d;
-	}
-
-	for (i=pos; i < l; i++) b->data[i] = fill;
-	b->data[b->slen] = (unsigned char) '\0';
-	return BSTR_OK;
+    if (pos < 0 || b == NULL || b->slen < 0 || b->mlen < b->slen ||
+        b->mlen <= 0 || len < 0) return BSTR_ERR;
+
+    /* Compute the two possible end pointers */
+    d = b->slen + len;
+    l = pos + len;
+    if ((d|l) < 0) return BSTR_ERR;
+
+    if (l > d) {
+        /* Inserting past the end of the string */
+        if (balloc (b, l + 1) != BSTR_OK) return BSTR_ERR;
+        pos = b->slen;
+        b->slen = l;
+    } else {
+        /* Inserting in the middle of the string */
+        if (balloc (b, d + 1) != BSTR_OK) return BSTR_ERR;
+        for (i = d - 1; i >= l; i--) {
+            b->data[i] = b->data[i - len];
+        }
+        b->slen = d;
+    }
+
+    for (i=pos; i < l; i++) b->data[i] = fill;
+    b->data[b->slen] = (unsigned char) '\0';
+    return BSTR_OK;
 }
 
 /*  int bpattern (bstring b, int len)
@@ -1738,15 +1738,15 @@ int d, l, i;
 int bpattern (bstring b, int len) {
 int i, d;
 
-	d = blength (b);
-	if (d <= 0 || len < 0 || balloc (b, len + 1) != BSTR_OK) return BSTR_ERR;
-	if (len > 0) {
-		if (d == 1) return bsetstr (b, len, NULL, b->data[0]);
-		for (i = d; i < len; i++) b->data[i] = b->data[i - d];
-	}
-	b->data[len] = (unsigned char) '\0';
-	b->slen = len;
-	return BSTR_OK;
+    d = blength (b);
+    if (d <= 0 || len < 0 || balloc (b, len + 1) != BSTR_OK) return BSTR_ERR;
+    if (len > 0) {
+        if (d == 1) return bsetstr (b, len, NULL, b->data[0]);
+        for (i = d; i < len; i++) b->data[i] = b->data[i - d];
+    }
+    b->data[len] = (unsigned char) '\0';
+    b->slen = len;
+    return BSTR_OK;
 }
 
 #define BS_BUFF_SZ (1024)
@@ -1760,20 +1760,20 @@ int i, d;
 int breada (bstring b, bNread readPtr, void * parm) {
 int i, l, n;
 
-	if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
-	    b->mlen <= 0 || readPtr == NULL) return BSTR_ERR;
+    if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
+        b->mlen <= 0 || readPtr == NULL) return BSTR_ERR;
 
-	i = b->slen;
-	for (n=i+16; ; n += ((n < BS_BUFF_SZ) ? n : BS_BUFF_SZ)) {
-		if (BSTR_OK != balloc (b, n + 1)) return BSTR_ERR;
-		l = (int) readPtr ((void *) (b->data + i), 1, n - i, parm);
-		i += l;
-		b->slen = i;
-		if (i < n) break;
-	}
+    i = b->slen;
+    for (n=i+16; ; n += ((n < BS_BUFF_SZ) ? n : BS_BUFF_SZ)) {
+        if (BSTR_OK != balloc (b, n + 1)) return BSTR_ERR;
+        l = (int) readPtr ((void *) (b->data + i), 1, n - i, parm);
+        i += l;
+        b->slen = i;
+        if (i < n) break;
+    }
 
-	b->data[i] = (unsigned char) '\0';
-	return BSTR_OK;
+    b->data[i] = (unsigned char) '\0';
+    return BSTR_OK;
 }
 
 /*  bstring bread (bNread readPtr, void * parm)
@@ -1785,11 +1785,11 @@ int i, l, n;
 bstring bread (bNread readPtr, void * parm) {
 bstring buff;
 
-	if (0 > breada (buff = bfromcstr (""), readPtr, parm)) {
-		bdestroy (buff);
-		return NULL;
-	}
-	return buff;
+    if (0 > breada (buff = bfromcstr (""), readPtr, parm)) {
+        bdestroy (buff);
+        return NULL;
+    }
+    return buff;
 }
 
 /*  int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator)
@@ -1808,26 +1808,26 @@ bstring buff;
 int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator) {
 int c, d, e;
 
-	if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
-	    b->mlen <= 0 || getcPtr == NULL) return BSTR_ERR;
-	d = 0;
-	e = b->mlen - 2;
+    if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
+        b->mlen <= 0 || getcPtr == NULL) return BSTR_ERR;
+    d = 0;
+    e = b->mlen - 2;
 
-	while ((c = getcPtr (parm)) >= 0) {
-		if (d > e) {
-			b->slen = d;
-			if (balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
-			e = b->mlen - 2;
-		}
-		b->data[d] = (unsigned char) c;
-		d++;
-		if (c == terminator) break;
-	}
+    while ((c = getcPtr (parm)) >= 0) {
+        if (d > e) {
+            b->slen = d;
+            if (balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
+            e = b->mlen - 2;
+        }
+        b->data[d] = (unsigned char) c;
+        d++;
+        if (c == terminator) break;
+    }
 
-	b->data[d] = (unsigned char) '\0';
-	b->slen = d;
+    b->data[d] = (unsigned char) '\0';
+    b->slen = d;
 
-	return d == 0 && c < 0;
+    return d == 0 && c < 0;
 }
 
 /*  int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator)
@@ -1846,26 +1846,26 @@ int c, d, e;
 int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator) {
 int c, d, e;
 
-	if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
-	    b->mlen <= 0 || getcPtr == NULL) return BSTR_ERR;
-	d = b->slen;
-	e = b->mlen - 2;
+    if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
+        b->mlen <= 0 || getcPtr == NULL) return BSTR_ERR;
+    d = b->slen;
+    e = b->mlen - 2;
 
-	while ((c = getcPtr (parm)) >= 0) {
-		if (d > e) {
-			b->slen = d;
-			if (balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
-			e = b->mlen - 2;
-		}
-		b->data[d] = (unsigned char) c;
-		d++;
-		if (c == terminator) break;
-	}
+    while ((c = getcPtr (parm)) >= 0) {
+        if (d > e) {
+            b->slen = d;
+            if (balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
+            e = b->mlen - 2;
+        }
+        b->data[d] = (unsigned char) c;
+        d++;
+        if (c == terminator) break;
+    }
 
-	b->data[d] = (unsigned char) '\0';
-	b->slen = d;
+    b->data[d] = (unsigned char) '\0';
+    b->slen = d;
 
-	return d == 0 && c < 0;
+    return d == 0 && c < 0;
 }
 
 /*  bstring bgets (bNgetc getcPtr, void * parm, char terminator)
@@ -1882,19 +1882,19 @@ int c, d, e;
 bstring bgets (bNgetc getcPtr, void * parm, char terminator) {
 bstring buff;
 
-	if (0 > bgetsa (buff = bfromcstr (""), getcPtr, parm, terminator) || 0 >= buff->slen) {
-		bdestroy (buff);
-		buff = NULL;
-	}
-	return buff;
+    if (0 > bgetsa (buff = bfromcstr (""), getcPtr, parm, terminator) || 0 >= buff->slen) {
+        bdestroy (buff);
+        buff = NULL;
+    }
+    return buff;
 }
 
 struct bStream {
-	bstring buff;		/* Buffer for over-reads */
-	void * parm;		/* The stream handle for core stream */
-	bNread readFnPtr;	/* fread compatible fnptr for core stream */
-	int isEOF;		/* track file's EOF state */
-	int maxBuffSz;
+    bstring buff;        /* Buffer for over-reads */
+    void * parm;        /* The stream handle for core stream */
+    bNread readFnPtr;    /* fread compatible fnptr for core stream */
+    int isEOF;        /* track file's EOF state */
+    int maxBuffSz;
 };
 
 /*  struct bStream * bsopen (bNread readPtr, void * parm)
@@ -1906,15 +1906,15 @@ struct bStream {
 struct bStream * bsopen (bNread readPtr, void * parm) {
 struct bStream * s;
 
-	if (readPtr == NULL) return NULL;
-	s = (struct bStream *) bstr__alloc (sizeof (struct bStream));
-	if (s == NULL) return NULL;
-	s->parm = parm;
-	s->buff = bfromcstr ("");
-	s->readFnPtr = readPtr;
-	s->maxBuffSz = BS_BUFF_SZ;
-	s->isEOF = 0;
-	return s;
+    if (readPtr == NULL) return NULL;
+    s = (struct bStream *) bstr__alloc (sizeof (struct bStream));
+    if (s == NULL) return NULL;
+    s->parm = parm;
+    s->buff = bfromcstr ("");
+    s->readFnPtr = readPtr;
+    s->maxBuffSz = BS_BUFF_SZ;
+    s->isEOF = 0;
+    return s;
 }
 
 /*  int bsbufflength (struct bStream * s, int sz)
@@ -1924,15 +1924,15 @@ struct bStream * s;
  */
 int bsbufflength (struct bStream * s, int sz) {
 int oldSz;
-	if (s == NULL || sz < 0) return BSTR_ERR;
-	oldSz = s->maxBuffSz;
-	if (sz > 0) s->maxBuffSz = sz;
-	return oldSz;
+    if (s == NULL || sz < 0) return BSTR_ERR;
+    oldSz = s->maxBuffSz;
+    if (sz > 0) s->maxBuffSz = sz;
+    return oldSz;
 }
 
 int bseof (const struct bStream * s) {
-	if (s == NULL || s->readFnPtr == NULL) return BSTR_ERR;
-	return s->isEOF && (s->buff->slen == 0);
+    if (s == NULL || s->readFnPtr == NULL) return BSTR_ERR;
+    return s->isEOF && (s->buff->slen == 0);
 }
 
 /*  void * bsclose (struct bStream * s)
@@ -1942,15 +1942,15 @@ int bseof (const struct bStream * s) {
  */
 void * bsclose (struct bStream * s) {
 void * parm;
-	if (s == NULL) return NULL;
-	s->readFnPtr = NULL;
-	if (s->buff) bdestroy (s->buff);
-	s->buff = NULL;
-	parm = s->parm;
-	s->parm = NULL;
-	s->isEOF = 1;
-	bstr__free (s);
-	return parm;
+    if (s == NULL) return NULL;
+    s->readFnPtr = NULL;
+    if (s->buff) bdestroy (s->buff);
+    s->buff = NULL;
+    parm = s->parm;
+    s->parm = NULL;
+    s->isEOF = 1;
+    bstr__free (s);
+    return parm;
 }
 
 /*  int bsreadlna (bstring r, struct bStream * s, char terminator)
@@ -1965,56 +1965,56 @@ int i, l, ret, rlo;
 char * b;
 struct tagbstring x;
 
-	if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0 ||
-	    r->slen < 0 || r->mlen < r->slen) return BSTR_ERR;
-	l = s->buff->slen;
-	if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
-	b = (char *) s->buff->data;
-	x.data = (unsigned char *) b;
-
-	/* First check if the current buffer holds the terminator */
-	b[l] = terminator; /* Set sentinel */
-	for (i=0; b[i] != terminator; i++) ;
-	if (i < l) {
-		x.slen = i + 1;
-		ret = bconcat (r, &x);
-		s->buff->slen = l;
-		if (BSTR_OK == ret) bdelete (s->buff, 0, i + 1);
-		return BSTR_OK;
-	}
-
-	rlo = r->slen;
-
-	/* If not then just concatenate the entire buffer to the output */
-	x.slen = l;
-	if (BSTR_OK != bconcat (r, &x)) return BSTR_ERR;
-
-	/* Perform direct in-place reads into the destination to allow for
-	   the minimum of data-copies */
-	for (;;) {
-		if (BSTR_OK != balloc (r, r->slen + s->maxBuffSz + 1)) return BSTR_ERR;
-		b = (char *) (r->data + r->slen);
-		l = (int) s->readFnPtr (b, 1, s->maxBuffSz, s->parm);
-		if (l <= 0) {
-			r->data[r->slen] = (unsigned char) '\0';
-			s->buff->slen = 0;
-			s->isEOF = 1;
-			/* If nothing was read return with an error message */
-			return BSTR_ERR & -(r->slen == rlo);
-		}
-		b[l] = terminator; /* Set sentinel */
-		for (i=0; b[i] != terminator; i++) ;
-		if (i < l) break;
-		r->slen += l;
-	}
-
-	/* Terminator found, push over-read back to buffer */
-	i++;
-	r->slen += i;
-	s->buff->slen = l - i;
-	bstr__memcpy (s->buff->data, b + i, l - i);
-	r->data[r->slen] = (unsigned char) '\0';
-	return BSTR_OK;
+    if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0 ||
+        r->slen < 0 || r->mlen < r->slen) return BSTR_ERR;
+    l = s->buff->slen;
+    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+    b = (char *) s->buff->data;
+    x.data = (unsigned char *) b;
+
+    /* First check if the current buffer holds the terminator */
+    b[l] = terminator; /* Set sentinel */
+    for (i=0; b[i] != terminator; i++) ;
+    if (i < l) {
+        x.slen = i + 1;
+        ret = bconcat (r, &x);
+        s->buff->slen = l;
+        if (BSTR_OK == ret) bdelete (s->buff, 0, i + 1);
+        return BSTR_OK;
+    }
+
+    rlo = r->slen;
+
+    /* If not then just concatenate the entire buffer to the output */
+    x.slen = l;
+    if (BSTR_OK != bconcat (r, &x)) return BSTR_ERR;
+
+    /* Perform direct in-place reads into the destination to allow for
+       the minimum of data-copies */
+    for (;;) {
+        if (BSTR_OK != balloc (r, r->slen + s->maxBuffSz + 1)) return BSTR_ERR;
+        b = (char *) (r->data + r->slen);
+        l = (int) s->readFnPtr (b, 1, s->maxBuffSz, s->parm);
+        if (l <= 0) {
+            r->data[r->slen] = (unsigned char) '\0';
+            s->buff->slen = 0;
+            s->isEOF = 1;
+            /* If nothing was read return with an error message */
+            return BSTR_ERR & -(r->slen == rlo);
+        }
+        b[l] = terminator; /* Set sentinel */
+        for (i=0; b[i] != terminator; i++) ;
+        if (i < l) break;
+        r->slen += l;
+    }
+
+    /* Terminator found, push over-read back to buffer */
+    i++;
+    r->slen += i;
+    s->buff->slen = l - i;
+    bstr__memcpy (s->buff->data, b + i, l - i);
+    r->data[r->slen] = (unsigned char) '\0';
+    return BSTR_OK;
 }
 
 /*  int bsreadlnsa (bstring r, struct bStream * s, bstring term)
@@ -2030,61 +2030,61 @@ unsigned char * b;
 struct tagbstring x;
 struct charField cf;
 
-	if (s == NULL || s->buff == NULL || r == NULL || term == NULL ||
-	    term->data == NULL || r->mlen <= 0 || r->slen < 0 ||
-	    r->mlen < r->slen) return BSTR_ERR;
-	if (term->slen == 1) return bsreadlna (r, s, term->data[0]);
-	if (term->slen < 1 || buildCharField (&cf, term)) return BSTR_ERR;
-
-	l = s->buff->slen;
-	if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
-	b = (unsigned char *) s->buff->data;
-	x.data = b;
-
-	/* First check if the current buffer holds the terminator */
-	b[l] = term->data[0]; /* Set sentinel */
-	for (i=0; !testInCharField (&cf, b[i]); i++) ;
-	if (i < l) {
-		x.slen = i + 1;
-		ret = bconcat (r, &x);
-		s->buff->slen = l;
-		if (BSTR_OK == ret) bdelete (s->buff, 0, i + 1);
-		return BSTR_OK;
-	}
-
-	rlo = r->slen;
-
-	/* If not then just concatenate the entire buffer to the output */
-	x.slen = l;
-	if (BSTR_OK != bconcat (r, &x)) return BSTR_ERR;
-
-	/* Perform direct in-place reads into the destination to allow for
-	   the minimum of data-copies */
-	for (;;) {
-		if (BSTR_OK != balloc (r, r->slen + s->maxBuffSz + 1)) return BSTR_ERR;
-		b = (unsigned char *) (r->data + r->slen);
-		l = (int) s->readFnPtr (b, 1, s->maxBuffSz, s->parm);
-		if (l <= 0) {
-			r->data[r->slen] = (unsigned char) '\0';
-			s->buff->slen = 0;
-			s->isEOF = 1;
-			/* If nothing was read return with an error message */
-			return BSTR_ERR & -(r->slen == rlo);
-		}
-
-		b[l] = term->data[0]; /* Set sentinel */
-		for (i=0; !testInCharField (&cf, b[i]); i++) ;
-		if (i < l) break;
-		r->slen += l;
-	}
-
-	/* Terminator found, push over-read back to buffer */
-	i++;
-	r->slen += i;
-	s->buff->slen = l - i;
-	bstr__memcpy (s->buff->data, b + i, l - i);
-	r->data[r->slen] = (unsigned char) '\0';
-	return BSTR_OK;
+    if (s == NULL || s->buff == NULL || r == NULL || term == NULL ||
+        term->data == NULL || r->mlen <= 0 || r->slen < 0 ||
+        r->mlen < r->slen) return BSTR_ERR;
+    if (term->slen == 1) return bsreadlna (r, s, term->data[0]);
+    if (term->slen < 1 || buildCharField (&cf, term)) return BSTR_ERR;
+
+    l = s->buff->slen;
+    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+    b = (unsigned char *) s->buff->data;
+    x.data = b;
+
+    /* First check if the current buffer holds the terminator */
+    b[l] = term->data[0]; /* Set sentinel */
+    for (i=0; !testInCharField (&cf, b[i]); i++) ;
+    if (i < l) {
+        x.slen = i + 1;
+        ret = bconcat (r, &x);
+        s->buff->slen = l;
+        if (BSTR_OK == ret) bdelete (s->buff, 0, i + 1);
+        return BSTR_OK;
+    }
+
+    rlo = r->slen;
+
+    /* If not then just concatenate the entire buffer to the output */
+    x.slen = l;
+    if (BSTR_OK != bconcat (r, &x)) return BSTR_ERR;
+
+    /* Perform direct in-place reads into the destination to allow for
+       the minimum of data-copies */
+    for (;;) {
+        if (BSTR_OK != balloc (r, r->slen + s->maxBuffSz + 1)) return BSTR_ERR;
+        b = (unsigned char *) (r->data + r->slen);
+        l = (int) s->readFnPtr (b, 1, s->maxBuffSz, s->parm);
+        if (l <= 0) {
+            r->data[r->slen] = (unsigned char) '\0';
+            s->buff->slen = 0;
+            s->isEOF = 1;
+            /* If nothing was read return with an error message */
+            return BSTR_ERR & -(r->slen == rlo);
+        }
+
+        b[l] = term->data[0]; /* Set sentinel */
+        for (i=0; !testInCharField (&cf, b[i]); i++) ;
+        if (i < l) break;
+        r->slen += l;
+    }
+
+    /* Terminator found, push over-read back to buffer */
+    i++;
+    r->slen += i;
+    s->buff->slen = l - i;
+    bstr__memcpy (s->buff->data, b + i, l - i);
+    r->data[r->slen] = (unsigned char) '\0';
+    return BSTR_OK;
 }
 
 /*  int bsreada (bstring r, struct bStream * s, int n)
@@ -2100,56 +2100,56 @@ int l, ret, orslen;
 char * b;
 struct tagbstring x;
 
-	if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0
-	 || r->slen < 0 || r->mlen < r->slen || n <= 0) return BSTR_ERR;
+    if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0
+     || r->slen < 0 || r->mlen < r->slen || n <= 0) return BSTR_ERR;
 
-	n += r->slen;
-	if (n <= 0) return BSTR_ERR;
+    n += r->slen;
+    if (n <= 0) return BSTR_ERR;
 
-	l = s->buff->slen;
+    l = s->buff->slen;
 
-	orslen = r->slen;
+    orslen = r->slen;
 
-	if (0 == l) {
-		if (s->isEOF) return BSTR_ERR;
-		if (r->mlen > n) {
-			l = (int) s->readFnPtr (r->data + r->slen, 1, n - r->slen, s->parm);
-			if (0 >= l || l > n - r->slen) {
-				s->isEOF = 1;
-				return BSTR_ERR;
-			}
-			r->slen += l;
-			r->data[r->slen] = (unsigned char) '\0';
-			return 0;
-		}
-	}
+    if (0 == l) {
+        if (s->isEOF) return BSTR_ERR;
+        if (r->mlen > n) {
+            l = (int) s->readFnPtr (r->data + r->slen, 1, n - r->slen, s->parm);
+            if (0 >= l || l > n - r->slen) {
+                s->isEOF = 1;
+                return BSTR_ERR;
+            }
+            r->slen += l;
+            r->data[r->slen] = (unsigned char) '\0';
+            return 0;
+        }
+    }
 
-	if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
-	b = (char *) s->buff->data;
-	x.data = (unsigned char *) b;
+    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+    b = (char *) s->buff->data;
+    x.data = (unsigned char *) b;
 
-	do {
-		if (l + r->slen >= n) {
-			x.slen = n - r->slen;
-			ret = bconcat (r, &x);
-			s->buff->slen = l;
-			if (BSTR_OK == ret) bdelete (s->buff, 0, x.slen);
-			return BSTR_ERR & -(r->slen == orslen);
-		}
+    do {
+        if (l + r->slen >= n) {
+            x.slen = n - r->slen;
+            ret = bconcat (r, &x);
+            s->buff->slen = l;
+            if (BSTR_OK == ret) bdelete (s->buff, 0, x.slen);
+            return BSTR_ERR & -(r->slen == orslen);
+        }
 
-		x.slen = l;
-		if (BSTR_OK != bconcat (r, &x)) break;
+        x.slen = l;
+        if (BSTR_OK != bconcat (r, &x)) break;
 
-		l = n - r->slen;
-		if (l > s->maxBuffSz) l = s->maxBuffSz;
+        l = n - r->slen;
+        if (l > s->maxBuffSz) l = s->maxBuffSz;
 
-		l = (int) s->readFnPtr (b, 1, l, s->parm);
+        l = (int) s->readFnPtr (b, 1, l, s->parm);
 
-	} while (l > 0);
-	if (l < 0) l = 0;
-	if (l == 0) s->isEOF = 1;
-	s->buff->slen = l;
-	return BSTR_ERR & -(r->slen == orslen);
+    } while (l > 0);
+    if (l < 0) l = 0;
+    if (l == 0) s->isEOF = 1;
+    s->buff->slen = l;
+    return BSTR_ERR & -(r->slen == orslen);
 }
 
 /*  int bsreadln (bstring r, struct bStream * s, char terminator)
@@ -2160,11 +2160,11 @@ struct tagbstring x;
  *  returned, but will be retained for subsequent read operations.
  */
 int bsreadln (bstring r, struct bStream * s, char terminator) {
-	if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0)
-		return BSTR_ERR;
-	if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
-	r->slen = 0;
-	return bsreadlna (r, s, terminator);
+    if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0)
+        return BSTR_ERR;
+    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+    r->slen = 0;
+    return bsreadlna (r, s, terminator);
 }
 
 /*  int bsreadlns (bstring r, struct bStream * s, bstring term)
@@ -2175,13 +2175,13 @@ int bsreadln (bstring r, struct bStream * s, char terminator) {
  *  are not returned, but will be retained for subsequent read operations.
  */
 int bsreadlns (bstring r, struct bStream * s, const_bstring term) {
-	if (s == NULL || s->buff == NULL || r == NULL || term == NULL 
-	 || term->data == NULL || r->mlen <= 0) return BSTR_ERR;
-	if (term->slen == 1) return bsreadln (r, s, term->data[0]);
-	if (term->slen < 1) return BSTR_ERR;
-	if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
-	r->slen = 0;
-	return bsreadlnsa (r, s, term);
+    if (s == NULL || s->buff == NULL || r == NULL || term == NULL 
+     || term->data == NULL || r->mlen <= 0) return BSTR_ERR;
+    if (term->slen == 1) return bsreadln (r, s, term->data[0]);
+    if (term->slen < 1) return BSTR_ERR;
+    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+    r->slen = 0;
+    return bsreadlnsa (r, s, term);
 }
 
 /*  int bsread (bstring r, struct bStream * s, int n)
@@ -2193,11 +2193,11 @@ int bsreadlns (bstring r, struct bStream * s, const_bstring term) {
  *  additional characters from the core stream beyond virtual stream pointer.
  */
 int bsread (bstring r, struct bStream * s, int n) {
-	if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0
-	 || n <= 0) return BSTR_ERR;
-	if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
-	r->slen = 0;
-	return bsreada (r, s, n);
+    if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0
+     || n <= 0) return BSTR_ERR;
+    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+    r->slen = 0;
+    return bsreada (r, s, n);
 }
 
 /*  int bsunread (struct bStream * s, const_bstring b)
@@ -2207,8 +2207,8 @@ int bsread (bstring r, struct bStream * s, int n) {
  *  stream.
  */
 int bsunread (struct bStream * s, const_bstring b) {
-	if (s == NULL || s->buff == NULL) return BSTR_ERR;
-	return binsert (s->buff, 0, b, (unsigned char) '?');
+    if (s == NULL || s->buff == NULL) return BSTR_ERR;
+    return binsert (s->buff, 0, b, (unsigned char) '?');
 }
 
 /*  int bspeek (bstring r, const struct bStream * s)
@@ -2217,8 +2217,8 @@ int bsunread (struct bStream * s, const_bstring b) {
  *  read prior to reads from the core stream.
  */
 int bspeek (bstring r, const struct bStream * s) {
-	if (s == NULL || s->buff == NULL) return BSTR_ERR;
-	return bassign (r, s->buff);
+    if (s == NULL || s->buff == NULL) return BSTR_ERR;
+    return bassign (r, s->buff);
 }
 
 /*  bstring bjoin (const struct bstrList * bl, const_bstring sep);
@@ -2231,46 +2231,46 @@ bstring bjoin (const struct bstrList * bl, const_bstring sep) {
 bstring b;
 int i, c, v;
 
-	if (bl == NULL || bl->qty < 0) return NULL;
-	if (sep != NULL && (sep->slen < 0 || sep->data == NULL)) return NULL;
-
-	for (i = 0, c = 1; i < bl->qty; i++) {
-		v = bl->entry[i]->slen;
-		if (v < 0) return NULL;	/* Invalid input */
-		c += v;
-		if (c < 0) return NULL;	/* Wrap around ?? */
-	}
-
-	if (sep != NULL) c += (bl->qty - 1) * sep->slen;
-
-	b = (bstring) bstr__alloc (sizeof (struct tagbstring));
-	if (NULL == b) return NULL; /* Out of memory */
-	b->data = (unsigned char *) bstr__alloc (c);
-	if (b->data == NULL) {
-		bstr__free (b);
-		return NULL;
-	}
-
-	b->mlen = c;
-	b->slen = c-1;
-
-	for (i = 0, c = 0; i < bl->qty; i++) {
-		if (i > 0 && sep != NULL) {
-			bstr__memcpy (b->data + c, sep->data, sep->slen);
-			c += sep->slen;
-		}
-		v = bl->entry[i]->slen;
-		bstr__memcpy (b->data + c, bl->entry[i]->data, v);
-		c += v;
-	}
-	b->data[c] = (unsigned char) '\0';
-	return b;
+    if (bl == NULL || bl->qty < 0) return NULL;
+    if (sep != NULL && (sep->slen < 0 || sep->data == NULL)) return NULL;
+
+    for (i = 0, c = 1; i < bl->qty; i++) {
+        v = bl->entry[i]->slen;
+        if (v < 0) return NULL;    /* Invalid input */
+        c += v;
+        if (c < 0) return NULL;    /* Wrap around ?? */
+    }
+
+    if (sep != NULL) c += (bl->qty - 1) * sep->slen;
+
+    b = (bstring) bstr__alloc (sizeof (struct tagbstring));
+    if (NULL == b) return NULL; /* Out of memory */
+    b->data = (unsigned char *) bstr__alloc (c);
+    if (b->data == NULL) {
+        bstr__free (b);
+        return NULL;
+    }
+
+    b->mlen = c;
+    b->slen = c-1;
+
+    for (i = 0, c = 0; i < bl->qty; i++) {
+        if (i > 0 && sep != NULL) {
+            bstr__memcpy (b->data + c, sep->data, sep->slen);
+            c += sep->slen;
+        }
+        v = bl->entry[i]->slen;
+        bstr__memcpy (b->data + c, bl->entry[i]->data, v);
+        c += v;
+    }
+    b->data[c] = (unsigned char) '\0';
+    return b;
 }
 
 #define BSSSC_BUFF_LEN (256)
 
 /*  int bssplitscb (struct bStream * s, const_bstring splitStr, 
- *	int (* cb) (void * parm, int ofs, const_bstring entry), void * parm)
+ *    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm)
  *
  *  Iterate the set of disjoint sequential substrings read from a stream 
  *  divided by any of the characters in splitStr.  An empty splitStr causes 
@@ -2287,56 +2287,56 @@ int i, c, v;
  *  undefined manner.
  */
 int bssplitscb (struct bStream * s, const_bstring splitStr, 
-	int (* cb) (void * parm, int ofs, const_bstring entry), void * parm) {
+    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm) {
 struct charField chrs;
 bstring buff;
 int i, p, ret;
 
-	if (cb == NULL || s == NULL || s->readFnPtr == NULL 
-	 || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
-
-	if (NULL == (buff = bfromcstr (""))) return BSTR_ERR;
-
-	if (splitStr->slen == 0) {
-		while (bsreada (buff, s, BSSSC_BUFF_LEN) >= 0) ;
-		if ((ret = cb (parm, 0, buff)) > 0) 
-			ret = 0;
-	} else {
-		buildCharField (&chrs, splitStr);
-		ret = p = i = 0;
-		for (;;) {
-			if (i >= buff->slen) {
-				bsreada (buff, s, BSSSC_BUFF_LEN);
-				if (i >= buff->slen) {
-					if (0 < (ret = cb (parm, p, buff))) ret = 0;
-					break;
-				}
-			}
-			if (testInCharField (&chrs, buff->data[i])) {
-				struct tagbstring t;
-				unsigned char c;
-
-				blk2tbstr (t, buff->data + i + 1, buff->slen - (i + 1));
-				if ((ret = bsunread (s, &t)) < 0) break;
-				buff->slen = i;
-				c = buff->data[i];
-				buff->data[i] = (unsigned char) '\0';
-				if ((ret = cb (parm, p, buff)) < 0) break;
-				buff->data[i] = c;
-				buff->slen = 0;
-				p += i + 1;
-				i = -1;
-			}
-			i++;
-		}
-	}
-
-	bdestroy (buff);
-	return ret;
+    if (cb == NULL || s == NULL || s->readFnPtr == NULL 
+     || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+
+    if (NULL == (buff = bfromcstr (""))) return BSTR_ERR;
+
+    if (splitStr->slen == 0) {
+        while (bsreada (buff, s, BSSSC_BUFF_LEN) >= 0) ;
+        if ((ret = cb (parm, 0, buff)) > 0) 
+            ret = 0;
+    } else {
+        buildCharField (&chrs, splitStr);
+        ret = p = i = 0;
+        for (;;) {
+            if (i >= buff->slen) {
+                bsreada (buff, s, BSSSC_BUFF_LEN);
+                if (i >= buff->slen) {
+                    if (0 < (ret = cb (parm, p, buff))) ret = 0;
+                    break;
+                }
+            }
+            if (testInCharField (&chrs, buff->data[i])) {
+                struct tagbstring t;
+                unsigned char c;
+
+                blk2tbstr (t, buff->data + i + 1, buff->slen - (i + 1));
+                if ((ret = bsunread (s, &t)) < 0) break;
+                buff->slen = i;
+                c = buff->data[i];
+                buff->data[i] = (unsigned char) '\0';
+                if ((ret = cb (parm, p, buff)) < 0) break;
+                buff->data[i] = c;
+                buff->slen = 0;
+                p += i + 1;
+                i = -1;
+            }
+            i++;
+        }
+    }
+
+    bdestroy (buff);
+    return ret;
 }
 
 /*  int bssplitstrcb (struct bStream * s, const_bstring splitStr, 
- *	int (* cb) (void * parm, int ofs, const_bstring entry), void * parm)
+ *    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm)
  *
  *  Iterate the set of disjoint sequential substrings read from a stream 
  *  divided by the entire substring splitStr.  An empty splitStr causes 
@@ -2353,48 +2353,48 @@ int i, p, ret;
  *  undefined manner.
  */
 int bssplitstrcb (struct bStream * s, const_bstring splitStr, 
-	int (* cb) (void * parm, int ofs, const_bstring entry), void * parm) {
+    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm) {
 bstring buff;
 int i, p, ret;
 
-	if (cb == NULL || s == NULL || s->readFnPtr == NULL 
-	 || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
-
-	if (splitStr->slen == 1) return bssplitscb (s, splitStr, cb, parm);
-
-	if (NULL == (buff = bfromcstr (""))) return BSTR_ERR;
-
-	if (splitStr->slen == 0) {
-		for (i=0; bsreada (buff, s, BSSSC_BUFF_LEN) >= 0; i++) {
-			if ((ret = cb (parm, 0, buff)) < 0) {
-				bdestroy (buff);
-				return ret;
-			}
-			buff->slen = 0;
-		}
-		return BSTR_OK;
-	} else {
-		ret = p = i = 0;
-		for (i=p=0;;) {
-			if ((ret = binstr (buff, 0, splitStr)) >= 0) {
-				struct tagbstring t;
-				blk2tbstr (t, buff->data, ret);
-				i = ret + splitStr->slen;
-				if ((ret = cb (parm, p, &t)) < 0) break;
-				p += i;
-				bdelete (buff, 0, i);
-			} else {
-				bsreada (buff, s, BSSSC_BUFF_LEN);
-				if (bseof (s)) {
-					if ((ret = cb (parm, p, buff)) > 0) ret = 0;
-					break;
-				}
-			}
-		}
-	}
-
-	bdestroy (buff);
-	return ret;
+    if (cb == NULL || s == NULL || s->readFnPtr == NULL 
+     || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+
+    if (splitStr->slen == 1) return bssplitscb (s, splitStr, cb, parm);
+
+    if (NULL == (buff = bfromcstr (""))) return BSTR_ERR;
+
+    if (splitStr->slen == 0) {
+        for (i=0; bsreada (buff, s, BSSSC_BUFF_LEN) >= 0; i++) {
+            if ((ret = cb (parm, 0, buff)) < 0) {
+                bdestroy (buff);
+                return ret;
+            }
+            buff->slen = 0;
+        }
+        return BSTR_OK;
+    } else {
+        ret = p = i = 0;
+        for (i=p=0;;) {
+            if ((ret = binstr (buff, 0, splitStr)) >= 0) {
+                struct tagbstring t;
+                blk2tbstr (t, buff->data, ret);
+                i = ret + splitStr->slen;
+                if ((ret = cb (parm, p, &t)) < 0) break;
+                p += i;
+                bdelete (buff, 0, i);
+            } else {
+                bsreada (buff, s, BSSSC_BUFF_LEN);
+                if (bseof (s)) {
+                    if ((ret = cb (parm, p, buff)) > 0) ret = 0;
+                    break;
+                }
+            }
+        }
+    }
+
+    bdestroy (buff);
+    return ret;
 }
 
 /*  int bstrListCreate (void)
@@ -2403,17 +2403,17 @@ int i, p, ret;
  */
 struct bstrList * bstrListCreate (void) {
 struct bstrList * sl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
-	if (sl) {
-		sl->entry = (bstring *) bstr__alloc (1*sizeof (bstring));
-		if (!sl->entry) {
-			bstr__free (sl);
-			sl = NULL;
-		} else {
-			sl->qty = 0;
-			sl->mlen = 1;
-		}
-	}
-	return sl;
+    if (sl) {
+        sl->entry = (bstring *) bstr__alloc (1*sizeof (bstring));
+        if (!sl->entry) {
+            bstr__free (sl);
+            sl = NULL;
+        } else {
+            sl->qty = 0;
+            sl->mlen = 1;
+        }
+    }
+    return sl;
 }
 
 /*  int bstrListDestroy (struct bstrList * sl)
@@ -2422,19 +2422,19 @@ struct bstrList * sl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList)
  */
 int bstrListDestroy (struct bstrList * sl) {
 int i;
-	if (sl == NULL || sl->qty < 0) return BSTR_ERR;
-	for (i=0; i < sl->qty; i++) {
-		if (sl->entry[i]) {
-			bdestroy (sl->entry[i]);
-			sl->entry[i] = NULL;
-		}
-	}
-	sl->qty  = -1;
-	sl->mlen = -1;
-	bstr__free (sl->entry);
-	sl->entry = NULL;
-	bstr__free (sl);
-	return BSTR_OK;
+    if (sl == NULL || sl->qty < 0) return BSTR_ERR;
+    for (i=0; i < sl->qty; i++) {
+        if (sl->entry[i]) {
+            bdestroy (sl->entry[i]);
+            sl->entry[i] = NULL;
+        }
+    }
+    sl->qty  = -1;
+    sl->mlen = -1;
+    bstr__free (sl->entry);
+    sl->entry = NULL;
+    bstr__free (sl);
+    return BSTR_OK;
 }
 
 /*  int bstrListAlloc (struct bstrList * sl, int msz)
@@ -2446,21 +2446,21 @@ int bstrListAlloc (struct bstrList * sl, int msz) {
 bstring * l;
 int smsz;
 size_t nsz;
-	if (!sl || msz <= 0 || !sl->entry || sl->qty < 0 || sl->mlen <= 0 || sl->qty > sl->mlen) return BSTR_ERR;
-	if (sl->mlen >= msz) return BSTR_OK;
-	smsz = snapUpSize (msz);
-	nsz = ((size_t) smsz) * sizeof (bstring);
-	if (nsz < (size_t) smsz) return BSTR_ERR;
-	l = (bstring *) bstr__realloc (sl->entry, nsz);
-	if (!l) {
-		smsz = msz;
-		nsz = ((size_t) smsz) * sizeof (bstring);
-		l = (bstring *) bstr__realloc (sl->entry, nsz);
-		if (!l) return BSTR_ERR;
-	}
-	sl->mlen = smsz;
-	sl->entry = l;
-	return BSTR_OK;
+    if (!sl || msz <= 0 || !sl->entry || sl->qty < 0 || sl->mlen <= 0 || sl->qty > sl->mlen) return BSTR_ERR;
+    if (sl->mlen >= msz) return BSTR_OK;
+    smsz = snapUpSize (msz);
+    nsz = ((size_t) smsz) * sizeof (bstring);
+    if (nsz < (size_t) smsz) return BSTR_ERR;
+    l = (bstring *) bstr__realloc (sl->entry, nsz);
+    if (!l) {
+        smsz = msz;
+        nsz = ((size_t) smsz) * sizeof (bstring);
+        l = (bstring *) bstr__realloc (sl->entry, nsz);
+        if (!l) return BSTR_ERR;
+    }
+    sl->mlen = smsz;
+    sl->entry = l;
+    return BSTR_OK;
 }
 
 /*  int bstrListAllocMin (struct bstrList * sl, int msz)
@@ -2471,20 +2471,20 @@ size_t nsz;
 int bstrListAllocMin (struct bstrList * sl, int msz) {
 bstring * l;
 size_t nsz;
-	if (!sl || msz <= 0 || !sl->entry || sl->qty < 0 || sl->mlen <= 0 || sl->qty > sl->mlen) return BSTR_ERR;
-	if (msz < sl->qty) msz = sl->qty;
-	if (sl->mlen == msz) return BSTR_OK;
-	nsz = ((size_t) msz) * sizeof (bstring);
-	if (nsz < (size_t) msz) return BSTR_ERR;
-	l = (bstring *) bstr__realloc (sl->entry, nsz);
-	if (!l) return BSTR_ERR;
-	sl->mlen = msz;
-	sl->entry = l;
-	return BSTR_OK;
+    if (!sl || msz <= 0 || !sl->entry || sl->qty < 0 || sl->mlen <= 0 || sl->qty > sl->mlen) return BSTR_ERR;
+    if (msz < sl->qty) msz = sl->qty;
+    if (sl->mlen == msz) return BSTR_OK;
+    nsz = ((size_t) msz) * sizeof (bstring);
+    if (nsz < (size_t) msz) return BSTR_ERR;
+    l = (bstring *) bstr__realloc (sl->entry, nsz);
+    if (!l) return BSTR_ERR;
+    sl->mlen = msz;
+    sl->entry = l;
+    return BSTR_OK;
 }
 
 /*  int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
- *	int (* cb) (void * parm, int ofs, int len), void * parm)
+ *    int (* cb) (void * parm, int ofs, int len), void * parm)
  *
  *  Iterate the set of disjoint sequential substrings over str divided by the
  *  character in splitChar.
@@ -2499,25 +2499,25 @@ size_t nsz;
  *  otherwise bsplitcb will continue in an undefined manner.
  */
 int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
-	int (* cb) (void * parm, int ofs, int len), void * parm) {
+    int (* cb) (void * parm, int ofs, int len), void * parm) {
 int i, p, ret;
 
-	if (cb == NULL || str == NULL || pos < 0 || pos > str->slen) 
-		return BSTR_ERR;
+    if (cb == NULL || str == NULL || pos < 0 || pos > str->slen) 
+        return BSTR_ERR;
 
-	p = pos;
-	do {
-		for (i=p; i < str->slen; i++) {
-			if (str->data[i] == splitChar) break;
-		}
-		if ((ret = cb (parm, p, i - p)) < 0) return ret;
-		p = i + 1;
-	} while (p <= str->slen);
-	return BSTR_OK;
+    p = pos;
+    do {
+        for (i=p; i < str->slen; i++) {
+            if (str->data[i] == splitChar) break;
+        }
+        if ((ret = cb (parm, p, i - p)) < 0) return ret;
+        p = i + 1;
+    } while (p <= str->slen);
+    return BSTR_OK;
 }
 
 /*  int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
- *	int (* cb) (void * parm, int ofs, int len), void * parm)
+ *    int (* cb) (void * parm, int ofs, int len), void * parm)
  *
  *  Iterate the set of disjoint sequential substrings over str divided by any 
  *  of the characters in splitStr.  An empty splitStr causes the whole str to
@@ -2533,35 +2533,35 @@ int i, p, ret;
  *  otherwise bsplitscb will continue in an undefined manner.
  */
 int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
-	int (* cb) (void * parm, int ofs, int len), void * parm) {
+    int (* cb) (void * parm, int ofs, int len), void * parm) {
 struct charField chrs;
 int i, p, ret;
 
-	if (cb == NULL || str == NULL || pos < 0 || pos > str->slen 
-	 || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
-	if (splitStr->slen == 0) {
-		if ((ret = cb (parm, 0, str->slen)) > 0) ret = 0;
-		return ret;
-	}
+    if (cb == NULL || str == NULL || pos < 0 || pos > str->slen 
+     || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+    if (splitStr->slen == 0) {
+        if ((ret = cb (parm, 0, str->slen)) > 0) ret = 0;
+        return ret;
+    }
 
-	if (splitStr->slen == 1) 
-		return bsplitcb (str, splitStr->data[0], pos, cb, parm);
+    if (splitStr->slen == 1) 
+        return bsplitcb (str, splitStr->data[0], pos, cb, parm);
 
-	buildCharField (&chrs, splitStr);
+    buildCharField (&chrs, splitStr);
 
-	p = pos;
-	do {
-		for (i=p; i < str->slen; i++) {
-			if (testInCharField (&chrs, str->data[i])) break;
-		}
-		if ((ret = cb (parm, p, i - p)) < 0) return ret;
-		p = i + 1;
-	} while (p <= str->slen);
-	return BSTR_OK;
+    p = pos;
+    do {
+        for (i=p; i < str->slen; i++) {
+            if (testInCharField (&chrs, str->data[i])) break;
+        }
+        if ((ret = cb (parm, p, i - p)) < 0) return ret;
+        p = i + 1;
+    } while (p <= str->slen);
+    return BSTR_OK;
 }
 
 /*  int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
- *	int (* cb) (void * parm, int ofs, int len), void * parm)
+ *    int (* cb) (void * parm, int ofs, int len), void * parm)
  *
  *  Iterate the set of disjoint sequential substrings over str divided by the 
  *  substring splitStr.  An empty splitStr causes the whole str to be 
@@ -2577,59 +2577,59 @@ int i, p, ret;
  *  otherwise bsplitscb will continue in an undefined manner.
  */
 int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
-	int (* cb) (void * parm, int ofs, int len), void * parm) {
+    int (* cb) (void * parm, int ofs, int len), void * parm) {
 int i, p, ret;
 
-	if (cb == NULL || str == NULL || pos < 0 || pos > str->slen 
-	 || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+    if (cb == NULL || str == NULL || pos < 0 || pos > str->slen 
+     || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
 
-	if (0 == splitStr->slen) {
-		for (i=pos; i < str->slen; i++) {
-			if ((ret = cb (parm, i, 1)) < 0) return ret;
-		}
-		return BSTR_OK;
-	}
+    if (0 == splitStr->slen) {
+        for (i=pos; i < str->slen; i++) {
+            if ((ret = cb (parm, i, 1)) < 0) return ret;
+        }
+        return BSTR_OK;
+    }
 
-	if (splitStr->slen == 1) 
-		return bsplitcb (str, splitStr->data[0], pos, cb, parm);
+    if (splitStr->slen == 1) 
+        return bsplitcb (str, splitStr->data[0], pos, cb, parm);
 
-	for (i=p=pos; i <= str->slen - splitStr->slen; i++) {
-		if (0 == bstr__memcmp (splitStr->data, str->data + i, splitStr->slen)) {
-			if ((ret = cb (parm, p, i - p)) < 0) return ret;
-			i += splitStr->slen;
-			p = i;
-		}
-	}
-	if ((ret = cb (parm, p, str->slen - p)) < 0) return ret;
-	return BSTR_OK;
+    for (i=p=pos; i <= str->slen - splitStr->slen; i++) {
+        if (0 == bstr__memcmp (splitStr->data, str->data + i, splitStr->slen)) {
+            if ((ret = cb (parm, p, i - p)) < 0) return ret;
+            i += splitStr->slen;
+            p = i;
+        }
+    }
+    if ((ret = cb (parm, p, str->slen - p)) < 0) return ret;
+    return BSTR_OK;
 }
 
 struct genBstrList {
-	bstring b;
-	struct bstrList * bl;
+    bstring b;
+    struct bstrList * bl;
 };
 
 static int bscb (void * parm, int ofs, int len) {
 struct genBstrList * g = (struct genBstrList *) parm;
-	if (g->bl->qty >= g->bl->mlen) {
-		int mlen = g->bl->mlen * 2;
-		bstring * tbl;
+    if (g->bl->qty >= g->bl->mlen) {
+        int mlen = g->bl->mlen * 2;
+        bstring * tbl;
 
-		while (g->bl->qty >= mlen) {
-			if (mlen < g->bl->mlen) return BSTR_ERR;
-			mlen += mlen;
-		}
+        while (g->bl->qty >= mlen) {
+            if (mlen < g->bl->mlen) return BSTR_ERR;
+            mlen += mlen;
+        }
 
-		tbl = (bstring *) bstr__realloc (g->bl->entry, sizeof (bstring) * mlen);
-		if (tbl == NULL) return BSTR_ERR;
+        tbl = (bstring *) bstr__realloc (g->bl->entry, sizeof (bstring) * mlen);
+        if (tbl == NULL) return BSTR_ERR;
 
-		g->bl->entry = tbl;
-		g->bl->mlen = mlen;
-	}
+        g->bl->entry = tbl;
+        g->bl->mlen = mlen;
+    }
 
-	g->bl->entry[g->bl->qty] = bmidstr (g->b, ofs, len);
-	g->bl->qty++;
-	return BSTR_OK;
+    g->bl->entry[g->bl->qty] = bmidstr (g->b, ofs, len);
+    g->bl->qty++;
+    return BSTR_OK;
 }
 
 /*  struct bstrList * bsplit (const_bstring str, unsigned char splitChar)
@@ -2640,24 +2640,24 @@ struct genBstrList * g = (struct genBstrList *) parm;
 struct bstrList * bsplit (const_bstring str, unsigned char splitChar) {
 struct genBstrList g;
 
-	if (str == NULL || str->data == NULL || str->slen < 0) return NULL;
+    if (str == NULL || str->data == NULL || str->slen < 0) return NULL;
 
-	g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
-	if (g.bl == NULL) return NULL;
-	g.bl->mlen = 4;
-	g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
-	if (NULL == g.bl->entry) {
-		bstr__free (g.bl);
-		return NULL;
-	}
+    g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+    if (g.bl == NULL) return NULL;
+    g.bl->mlen = 4;
+    g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
+    if (NULL == g.bl->entry) {
+        bstr__free (g.bl);
+        return NULL;
+    }
 
-	g.b = (bstring) str;
-	g.bl->qty = 0;
-	if (bsplitcb (str, splitChar, 0, bscb, &g) < 0) {
-		bstrListDestroy (g.bl);
-		return NULL;
-	}
-	return g.bl;
+    g.b = (bstring) str;
+    g.bl->qty = 0;
+    if (bsplitcb (str, splitChar, 0, bscb, &g) < 0) {
+        bstrListDestroy (g.bl);
+        return NULL;
+    }
+    return g.bl;
 }
 
 /*  struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr)
@@ -2668,24 +2668,24 @@ struct genBstrList g;
 struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr) {
 struct genBstrList g;
 
-	if (str == NULL || str->data == NULL || str->slen < 0) return NULL;
+    if (str == NULL || str->data == NULL || str->slen < 0) return NULL;
 
-	g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
-	if (g.bl == NULL) return NULL;
-	g.bl->mlen = 4;
-	g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
-	if (NULL == g.bl->entry) {
-		bstr__free (g.bl);
-		return NULL;
-	}
+    g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+    if (g.bl == NULL) return NULL;
+    g.bl->mlen = 4;
+    g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
+    if (NULL == g.bl->entry) {
+        bstr__free (g.bl);
+        return NULL;
+    }
 
-	g.b = (bstring) str;
-	g.bl->qty = 0;
-	if (bsplitstrcb (str, splitStr, 0, bscb, &g) < 0) {
-		bstrListDestroy (g.bl);
-		return NULL;
-	}
-	return g.bl;
+    g.b = (bstring) str;
+    g.bl->qty = 0;
+    if (bsplitstrcb (str, splitStr, 0, bscb, &g) < 0) {
+        bstrListDestroy (g.bl);
+        return NULL;
+    }
+    return g.bl;
 }
 
 /*  struct bstrList * bsplits (const_bstring str, bstring splitStr)
@@ -2697,26 +2697,26 @@ struct genBstrList g;
 struct bstrList * bsplits (const_bstring str, const_bstring splitStr) {
 struct genBstrList g;
 
-	if (     str == NULL ||      str->slen < 0 ||      str->data == NULL ||
-	    splitStr == NULL || splitStr->slen < 0 || splitStr->data == NULL)
-		return NULL;
+    if (     str == NULL ||      str->slen < 0 ||      str->data == NULL ||
+        splitStr == NULL || splitStr->slen < 0 || splitStr->data == NULL)
+        return NULL;
 
-	g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
-	if (g.bl == NULL) return NULL;
-	g.bl->mlen = 4;
-	g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
-	if (NULL == g.bl->entry) {
-		bstr__free (g.bl);
-		return NULL;
-	}
-	g.b = (bstring) str;
-	g.bl->qty = 0;
+    g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+    if (g.bl == NULL) return NULL;
+    g.bl->mlen = 4;
+    g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
+    if (NULL == g.bl->entry) {
+        bstr__free (g.bl);
+        return NULL;
+    }
+    g.b = (bstring) str;
+    g.bl->qty = 0;
 
-	if (bsplitscb (str, splitStr, 0, bscb, &g) < 0) {
-		bstrListDestroy (g.bl);
-		return NULL;
-	}
-	return g.bl;
+    if (bsplitscb (str, splitStr, 0, bscb, &g) < 0) {
+        bstrListDestroy (g.bl);
+        return NULL;
+    }
+    return g.bl;
 }
 
 #if defined (__TURBOC__) && !defined (__BORLANDC__)
@@ -2772,40 +2772,40 @@ va_list arglist;
 bstring buff;
 int n, r;
 
-	if (b == NULL || fmt == NULL || b->data == NULL || b->mlen <= 0 
-	 || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
+    if (b == NULL || fmt == NULL || b->data == NULL || b->mlen <= 0 
+     || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
 
-	/* Since the length is not determinable beforehand, a search is
-	   performed using the truncating "vsnprintf" call (to avoid buffer
-	   overflows) on increasing potential sizes for the output result. */
+    /* Since the length is not determinable beforehand, a search is
+       performed using the truncating "vsnprintf" call (to avoid buffer
+       overflows) on increasing potential sizes for the output result. */
 
-	if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
-	if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
-		n = 1;
-		if (NULL == (buff = bfromcstralloc (n + 2, ""))) return BSTR_ERR;
-	}
+    if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
+    if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
+        n = 1;
+        if (NULL == (buff = bfromcstralloc (n + 2, ""))) return BSTR_ERR;
+    }
 
-	for (;;) {
-		va_start (arglist, fmt);
-		exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
-		va_end (arglist);
+    for (;;) {
+        va_start (arglist, fmt);
+        exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
+        va_end (arglist);
 
-		buff->data[n] = (unsigned char) '\0';
-		buff->slen = (int) (strlen) ((char *) buff->data);
+        buff->data[n] = (unsigned char) '\0';
+        buff->slen = (int) (strlen) ((char *) buff->data);
 
-		if (buff->slen < n) break;
+        if (buff->slen < n) break;
 
-		if (r > n) n = r; else n += n;
+        if (r > n) n = r; else n += n;
 
-		if (BSTR_OK != balloc (buff, n + 2)) {
-			bdestroy (buff);
-			return BSTR_ERR;
-		}
-	}
+        if (BSTR_OK != balloc (buff, n + 2)) {
+            bdestroy (buff);
+            return BSTR_ERR;
+        }
+    }
 
-	r = bconcat (b, buff);
-	bdestroy (buff);
-	return r;
+    r = bconcat (b, buff);
+    bdestroy (buff);
+    return r;
 }
 
 /*  int bassignformat (bstring b, const char * fmt, ...)
@@ -2820,40 +2820,40 @@ va_list arglist;
 bstring buff;
 int n, r;
 
-	if (b == NULL || fmt == NULL || b->data == NULL || b->mlen <= 0 
-	 || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
+    if (b == NULL || fmt == NULL || b->data == NULL || b->mlen <= 0 
+     || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
 
-	/* Since the length is not determinable beforehand, a search is
-	   performed using the truncating "vsnprintf" call (to avoid buffer
-	   overflows) on increasing potential sizes for the output result. */
+    /* Since the length is not determinable beforehand, a search is
+       performed using the truncating "vsnprintf" call (to avoid buffer
+       overflows) on increasing potential sizes for the output result. */
 
-	if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
-	if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
-		n = 1;
-		if (NULL == (buff = bfromcstralloc (n + 2, ""))) return BSTR_ERR;
-	}
+    if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
+    if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
+        n = 1;
+        if (NULL == (buff = bfromcstralloc (n + 2, ""))) return BSTR_ERR;
+    }
 
-	for (;;) {
-		va_start (arglist, fmt);
-		exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
-		va_end (arglist);
+    for (;;) {
+        va_start (arglist, fmt);
+        exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
+        va_end (arglist);
 
-		buff->data[n] = (unsigned char) '\0';
-		buff->slen = (int) (strlen) ((char *) buff->data);
+        buff->data[n] = (unsigned char) '\0';
+        buff->slen = (int) (strlen) ((char *) buff->data);
 
-		if (buff->slen < n) break;
+        if (buff->slen < n) break;
 
-		if (r > n) n = r; else n += n;
+        if (r > n) n = r; else n += n;
 
-		if (BSTR_OK != balloc (buff, n + 2)) {
-			bdestroy (buff);
-			return BSTR_ERR;
-		}
-	}
+        if (BSTR_OK != balloc (buff, n + 2)) {
+            bdestroy (buff);
+            return BSTR_ERR;
+        }
+    }
 
-	r = bassign (b, buff);
-	bdestroy (buff);
-	return r;
+    r = bassign (b, buff);
+    bdestroy (buff);
+    return r;
 }
 
 /*  bstring bformat (const char * fmt, ...)
@@ -2868,37 +2868,37 @@ va_list arglist;
 bstring buff;
 int n, r;
 
-	if (fmt == NULL) return NULL;
+    if (fmt == NULL) return NULL;
 
-	/* Since the length is not determinable beforehand, a search is
-	   performed using the truncating "vsnprintf" call (to avoid buffer
-	   overflows) on increasing potential sizes for the output result. */
+    /* Since the length is not determinable beforehand, a search is
+       performed using the truncating "vsnprintf" call (to avoid buffer
+       overflows) on increasing potential sizes for the output result. */
 
-	if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
-	if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
-		n = 1;
-		if (NULL == (buff = bfromcstralloc (n + 2, ""))) return NULL;
-	}
+    if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
+    if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
+        n = 1;
+        if (NULL == (buff = bfromcstralloc (n + 2, ""))) return NULL;
+    }
 
-	for (;;) {
-		va_start (arglist, fmt);
-		exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
-		va_end (arglist);
+    for (;;) {
+        va_start (arglist, fmt);
+        exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
+        va_end (arglist);
 
-		buff->data[n] = (unsigned char) '\0';
-		buff->slen = (int) (strlen) ((char *) buff->data);
+        buff->data[n] = (unsigned char) '\0';
+        buff->slen = (int) (strlen) ((char *) buff->data);
 
-		if (buff->slen < n) break;
+        if (buff->slen < n) break;
 
-		if (r > n) n = r; else n += n;
+        if (r > n) n = r; else n += n;
 
-		if (BSTR_OK != balloc (buff, n + 2)) {
-			bdestroy (buff);
-			return NULL;
-		}
-	}
+        if (BSTR_OK != balloc (buff, n + 2)) {
+            bdestroy (buff);
+            return NULL;
+        }
+    }
 
-	return buff;
+    return buff;
 }
 
 /*  int bvcformata (bstring b, int count, const char * fmt, va_list arglist)
@@ -2924,32 +2924,32 @@ int n, r;
 int bvcformata (bstring b, int count, const char * fmt, va_list arg) {
 int n, r, l;
 
-	if (b == NULL || fmt == NULL || count <= 0 || b->data == NULL
-	 || b->mlen <= 0 || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
+    if (b == NULL || fmt == NULL || count <= 0 || b->data == NULL
+     || b->mlen <= 0 || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
 
-	if (count > (n = b->slen + count) + 2) return BSTR_ERR;
-	if (BSTR_OK != balloc (b, n + 2)) return BSTR_ERR;
+    if (count > (n = b->slen + count) + 2) return BSTR_ERR;
+    if (BSTR_OK != balloc (b, n + 2)) return BSTR_ERR;
 
-	exvsnprintf (r, (char *) b->data + b->slen, count + 2, fmt, arg);
+    exvsnprintf (r, (char *) b->data + b->slen, count + 2, fmt, arg);
 
-	/* Did the operation complete successfully within bounds? */
+    /* Did the operation complete successfully within bounds? */
 
-	if (n >= (l = b->slen + (int) (strlen) ((const char *) b->data + b->slen))) {
-		b->slen = l;
-		return BSTR_OK;
-	}
+    if (n >= (l = b->slen + (int) (strlen) ((const char *) b->data + b->slen))) {
+        b->slen = l;
+        return BSTR_OK;
+    }
 
-	/* Abort, since the buffer was not large enough.  The return value 
-	   tries to help set what the retry length should be. */
+    /* Abort, since the buffer was not large enough.  The return value 
+       tries to help set what the retry length should be. */
 
-	b->data[b->slen] = '\0';
-	if (r > count+1) l = r; else {
-		l = count+count;
-		if (count > l) l = INT_MAX;
-	}
-	n = -l;
-	if (n > BSTR_ERR-1) n = BSTR_ERR-1;
-	return n;
+    b->data[b->slen] = '\0';
+    if (r > count+1) l = r; else {
+        l = count+count;
+        if (count > l) l = INT_MAX;
+    }
+    n = -l;
+    if (n > BSTR_ERR-1) n = BSTR_ERR-1;
+    return n;
 }
 
 #endif
diff --git a/src/calculator.c b/src/calculator.c
new file mode 100644
index 0000000..bd73a4d
--- /dev/null
+++ b/src/calculator.c
@@ -0,0 +1,926 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  calculator.c
+ *
+ *      Description:  Infix calculator
+ *
+ *      Author:   Brandon Mills (bm), mills.brandont at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 Brandon Mills
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+/*
+ * =======================================================================================
+ *
+ *      Some changes done for the integration in LIKWID, see inline comments
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h> // Temporary
+#include <getopt.h>
+#include <calculator_stack.h>
+
+#define bool char
+#define true 1
+#define false 0
+
+#define PI 3.141592653589793
+
+/* Added by Thomas Roehl (Thomas.Roehl at fau.de) to reduce reallocs by allocating a temporary
+ * token for parsing as well as for transforming a number to a string.
+ */
+#define MAXTOKENLENGTH 512
+
+typedef enum /* classes of single characters (see type) and of whole tokens (see tokenType) */
+{
+    addop,      /* '+' or '-' */
+    multop,     /* '*', '/' or '%' */
+    expop,      /* '^' */
+    lparen,     /* '(' */
+    rparen,     /* ')' */
+    digit,      /* '0' .. '9' */
+    value,      /* numeric token: tokenType() maps digit/decimal starts to this */
+    decimal,    /* '.' */
+    space,      /* ' ' */
+    text,       /* ASCII letter */
+    function,   /* word accepted by isFunction() */
+    identifier, /* any other word */
+    argsep,     /* ',' */
+    invalid     /* every other character */
+} Symbol;
+
+struct Preferences
+{
+    struct Display
+    {
+        bool tokens;  /* NOTE(review): presumably toggles printing of the token list - not read in the visible code, confirm */
+        bool postfix; /* NOTE(review): presumably toggles printing of the postfix form - not read in the visible code, confirm */
+    } display;
+    struct Mode
+    {
+        bool degrees; /* non-zero: doFunc() converts trigonometric arguments/results between degrees and radians */
+    } mode;
+} prefs; /* file-scope instance, so all flags start out as 0 */
+
+typedef enum /* error kinds understood by raise() */
+{
+    divZero,      /* printed as "Divide by zero" */
+    overflow,     /* printed as "Overflow" */
+    parenMismatch /* printed as "Mismatched parentheses" */
+} Error;
+
+typedef char* token; /* tokens are handled as NUL-terminated C strings */
+/* Added by Thomas Roehl (Thomas.Roehl at fau.de) to keep track of the
+ * intermediate calculation results to free them in the end
+ */
+token* calcTokens = NULL;
+int nrCalcTokens = 0; /* NOTE(review): presumably the number of entries in calcTokens - confirm at the use sites */
+
+typedef double number; /* every calculation is carried out in double precision */
+
+void raise(Error err)
+{   /* Print the message that belongs to calculator error err to stdout. */
+    const char* msg = "Unknown error"; /* fallback: without it msg was read uninitialised for an unlisted err */
+    switch(err)
+    {
+        case divZero:
+            msg = "Divide by zero";
+            break;
+        case overflow:
+            msg = "Overflow";
+            break;
+        case parenMismatch:
+            msg = "Mismatched parentheses";
+            break;
+    }
+    printf("\tError: %s\n", msg);
+}
+
+inline unsigned int toDigit(char ch)
+{   /* Distance of ch from the character '0'; no check that ch is a digit. */
+    return (unsigned int)(ch - '0');
+}
+
+number buildNumber(token str)
+{
+    /* Convert the leading numeric text of str with strtod(). The end
+     * pointer is not requested, so trailing characters go unnoticed. */
+    return strtod(str, NULL);
+}
+
+token num2Str(number num)
+{
+    /* Precision raised for LIKWID by Thomas Roehl (Thomas.Roehl at fau.de); returns a malloc'ed string, NULL if malloc fails. */
+    token str = (token)malloc((MAXTOKENLENGTH+1)*sizeof(char));
+    if (str) snprintf(str, MAXTOKENLENGTH+1, "%.20f", num); /* whole buffer: the old 39-byte cap cut off large values */
+    return str;
+}
+
+
+
+inline number toRadians(number degrees)
+{   /* Angle conversion degrees -> radians using the PI macro of this file. */
+    return degrees * PI / 180.0;
+}
+
+inline number toDegrees(number radians)
+{   /* Angle conversion radians -> degrees using the PI macro of this file. */
+    return radians * 180.0 / PI;
+}
+
+token doFunc(token input, token function)
+{
+    /* Apply the unary function named by `function` to the number in `input`.
+     * An unrecognised name leaves the value as it is. `function` is freed
+     * here; the result is a new string from num2Str(). */
+    number num = buildNumber(input);
+    number result = num;
+
+    if(strcmp(function, "abs") == 0)
+        result = fabs(num);
+    else if(strcmp(function, "floor") == 0)
+        result = floor(num);
+    else if(strcmp(function, "ceil") == 0)
+        result = ceil(num);
+    else if(strcmp(function, "sin") == 0)
+        result = !prefs.mode.degrees ? sin(num) : sin(toRadians(num));
+    else if(strcmp(function, "cos") == 0)
+        result = !prefs.mode.degrees ? cos(num) : cos(toRadians(num));
+    else if(strcmp(function, "tan") == 0)
+        result = !prefs.mode.degrees ? tan(num) : tan(toRadians(num));
+    else if(strcmp(function, "arcsin") == 0 || strcmp(function, "asin") == 0)
+        result = !prefs.mode.degrees ? asin(num) : toDegrees(asin(num));
+    else if(strcmp(function, "arccos") == 0 || strcmp(function, "acos") == 0)
+        result = !prefs.mode.degrees ? acos(num) : toDegrees(acos(num));
+    else if(strcmp(function, "arctan") == 0 || strcmp(function, "atan") == 0)
+        result = !prefs.mode.degrees ? atan(num) : toDegrees(atan(num));
+    else if(strcmp(function, "sqrt") == 0)
+        result = sqrt(num);
+    else if(strcmp(function, "cbrt") == 0)
+        result = cbrt(num);
+    else if(strcmp(function, "log") == 0)
+        result = log(num);
+    else if(strcmp(function, "exp") == 0)
+        result = exp(num);
+    /* The debug "Free ..." print was dropped: it wrote to stdout on every call. */
+    free(function);
+    return num2Str(result);
+}
+
+int doOp(token loperand, token op, token roperand, token *result)
+{
+    /* Evaluate `loperand <op> roperand`; the operator is the first character
+     * of op ('^', '*', '/', '%', '+' or '-'). *result receives a new string
+     * from num2Str(). Returns 0, or -1 for division/modulo by zero, in which
+     * case (as for an unknown operator) *result is the string for 0.
+     * Added by Thomas Roehl (Thomas.Roehl at fau.de) to return
+     * errors from calculation like divide-by-zero, ... */
+    int err = 0;
+    number lside = buildNumber(loperand);
+    number rside = buildNumber(roperand);
+    /* Defined start value: the error paths assign nothing to ret. */
+    number ret = 0;
+    switch(*op)
+    {
+        case '^':
+            ret = pow(lside, rside);
+            break;
+        case '*':
+            ret = lside * rside;
+            break;
+        case '/':
+            if(rside == 0)
+            {
+                /* Changed by Thomas Roehl */
+                //raise(divZero);
+                err = -1;
+            }
+            else
+            {
+                ret = lside / rside;
+            }
+            break;
+        case '%':
+            if(rside == 0)
+            {
+                /* Changed by Thomas Roehl */
+                //raise(divZero);
+                err = -1;
+            }
+            else
+            {
+                /* Remainder of the truncated quotient. trunc() stays in
+                 * floating point; the former (int) cast was undefined once
+                 * the quotient left the range of int. */
+                ret = trunc(lside / rside);
+                ret = lside - (ret * rside);
+            }
+            break;
+        case '+':
+            ret = lside + rside;
+            break;
+        case '-':
+            ret = lside - rside;
+            break;
+        default: /* unknown operator: ret stays 0 */
+            break;
+    }
+    *result = num2Str(ret);
+    return err;
+}
+
+
+Symbol type(char ch)
+{   /* Map one input character to its Symbol class. */
+    Symbol result;
+    switch(ch)
+    {
+        case '+':
+        case '-':
+            result = addop;
+            break;
+        case '*':
+        case '/':
+        case '%':
+            result = multop;
+            break;
+        case '^':
+            result = expop;
+            break;
+        case '(':
+            result = lparen;
+            break;
+        case ')':
+            result = rparen;
+            break;
+        case '.':
+            result = decimal;
+            break;
+        case ' ': /* only the blank itself; tabs and newlines end up in default */
+            result = space;
+            break;
+        case ',':
+            result = argsep;
+            break;
+        case '0':
+        case '1':
+        case '2':
+        case '3':
+        case '4':
+        case '5':
+        case '6':
+        case '7':
+        case '8':
+        case '9':
+            result = digit;
+            break;
+        case 'A': /* upper- and lower-case ASCII letters are listed one by one */
+        case 'B':
+        case 'C':
+        case 'D':
+        case 'E':
+        case 'F':
+        case 'G':
+        case 'H':
+        case 'I':
+        case 'J':
+        case 'K':
+        case 'L':
+        case 'M':
+        case 'N':
+        case 'O':
+        case 'P':
+        case 'Q':
+        case 'R':
+        case 'S':
+        case 'T':
+        case 'U':
+        case 'V':
+        case 'W':
+        case 'X':
+        case 'Y':
+        case 'Z':
+        case 'a':
+        case 'b':
+        case 'c':
+        case 'd':
+        case 'e':
+        case 'f':
+        case 'g':
+        case 'h':
+        case 'i':
+        case 'j':
+        case 'k':
+        case 'l':
+        case 'm':
+        case 'n':
+        case 'o':
+        case 'p':
+        case 'q':
+        case 'r':
+        case 's':
+        case 't':
+        case 'u':
+        case 'v':
+        case 'w':
+        case 'x':
+        case 'y':
+        case 'z':
+            result = text;
+            break;
+        default: /* everything not listed above, e.g. '_' or '\0' */
+            result = invalid;
+            break;
+    }
+    return result;
+}
+
+bool isFunction(token tk)
+{
+    /* Report whether tk is, character for character, one of the
+     * function names the calculator knows. */
+    static const char* const knownFunctions[] = {
+        "abs", "floor", "ceil",
+        "sin", "cos", "tan", "arcsin", "arccos", "arctan",
+        "asin", "acos", "atan",
+        "sqrt", "cbrt", "log", "exp"
+    };
+    size_t idx;
+
+    for(idx = 0; idx < sizeof(knownFunctions) / sizeof(knownFunctions[0]); idx++)
+    {
+        if(strcmp(tk, knownFunctions[idx]) == 0)
+            return true;
+    }
+    return false;
+}
+
+Symbol tokenType(token tk)
+{
+    /* Classify a whole token, starting from the class of its first character. */
+    const Symbol first = type(*tk);
+
+    if(first == text)
+    {
+        /* Words are either a known function name or an identifier. */
+        return isFunction(tk) ? function : identifier;
+    }
+    if(first == digit || first == decimal)
+    {
+        return value;
+    }
+    if(first == addop && *tk == '-' && strlen(tk) > 1)
+    {
+        /* A minus sign with more characters behind it: classify the rest. */
+        return tokenType(tk+1);
+    }
+    /* Anything else keeps the class of its first character. */
+    return first;
+}
+
+/*
+ * Split the infix expression in str into a newly allocated array of
+ * NUL-terminated token strings (operators, parentheses, numbers, words).
+ * Scanning stops at the first character that type() reports as invalid.
+ *
+ * On return *tokensRef points to the array, or is NULL if no token was
+ * produced or an allocation failed. The return value is the number of
+ * tokens (0 on failure). The caller has to free() every token and the array.
+ * Tokens longer than MAXTOKENLENGTH characters are split, not overflowed.
+ */
+int tokenize(char *str, char *(**tokensRef))
+{
+    char** tokens = NULL;
+    char** tmp = NULL;
+    char* ptr = str;
+    char ch = '\0';
+    int numTokens = 0;
+    /* Added by Thomas Roehl (Thomas.Roehl at fau.de): every token is first
+     * assembled in the scratch buffer tmpToken and copied into an exactly
+     * sized allocation once it is complete. This replaces the per-character
+     * reallocs of the original code.
+     */
+    char* tmpToken = malloc((MAXTOKENLENGTH+1) * sizeof(char));
+    if (!tmpToken)
+    {
+        fprintf(stderr, "Malloc of temporary buffer failed\n");
+        *tokensRef = NULL;
+        return 0;
+    }
+    while((ch = *ptr++) != '\0')
+    {
+        if(type(ch) == invalid) // Stop tokenizing when we encounter an invalid character
+            break;
+
+        token newToken = NULL;
+        /* Added by Thomas Roehl (Thomas.Roehl at fau.de)
+         * Prepare temporary token for next parsing step */
+        memset(tmpToken, '\0', MAXTOKENLENGTH+1);
+        switch(type(ch))
+        {
+            case addop:
+                {
+                    // Check if this is a negative
+                    if(ch == '-'
+                        && (numTokens == 0
+                            || (tokenType(tokens[numTokens-1]) == addop
+                                || tokenType(tokens[numTokens-1]) == multop
+                                || tokenType(tokens[numTokens-1]) == expop
+                                || tokenType(tokens[numTokens-1]) == lparen)))
+                    {
+                        // Assemble an n-character (plus null-terminator) number token
+                        {
+                            int len = 1;
+                            bool hasDecimal = false;
+                            bool hasExponent = false;
+
+                            if(type(ch) == decimal) // Allow numbers to start with decimal
+                            {
+                                hasDecimal = true;
+                                len++;
+                                tmpToken[0] = '0';
+                                tmpToken[1] = '.';
+                            }
+                            else // Numbers that do not start with decimal
+                            {
+                                tmpToken[len-1] = ch;
+                            }
+
+                            // Assemble rest of number
+                            for(; // Don't change len
+                                *ptr // There is a next character and it is not null
+                                && len < MAXTOKENLENGTH // Keep index MAXTOKENLENGTH free for '\0'
+                                && (type(*ptr) == digit // The next character is a digit
+                                     || ((type(*ptr) == decimal // Or the next character is a decimal
+                                         && hasDecimal == 0)) // But we have not added a decimal
+                                     || ((*ptr == 'E' || *ptr == 'e') // Or the next character is an exponent
+                                         && hasExponent == false) // But we have not added an exponent yet
+                                     /* Added by Thomas Roehl (Thomas.Roehl at fau.de) to parse scientific notation
+                                      * with signed exponent correctly
+                                      */
+                                     || ((*ptr == '+' || *ptr == '-') && (tmpToken[len-1] == 'E' || tmpToken[len-1] == 'e'))); // Sign directly after the exponent marker
+                                ++len)
+                            {
+                                if(type(*ptr) == decimal)
+                                    hasDecimal = true;
+                                else if(*ptr == 'E' || *ptr == 'e')
+                                    hasExponent = true;
+                                tmpToken[len] = *ptr++;
+                            }
+
+                            // Append null-terminator
+                            tmpToken[len] = '\0';
+                        }
+                        break;
+                    }
+                    // If it's not part of a number, it's an op - fall through
+                }
+            case multop:
+            case expop:
+            case lparen:
+            case rparen:
+            case argsep:
+                // Assemble a single-character (plus null-terminator) operation token
+                {
+                    tmpToken[0] = ch;
+                    tmpToken[1] = '\0';
+                }
+                break;
+            case digit:
+            case decimal:
+                // Assemble an n-character (plus null-terminator) number token
+                {
+                    int len = 1;
+                    bool hasDecimal = false;
+                    bool hasExponent = false;
+
+                    if(type(ch) == decimal) // Allow numbers to start with decimal
+                    {
+                        hasDecimal = true;
+                        len++;
+                        tmpToken[0] = '0';
+                        tmpToken[1] = '.';
+                    }
+                    else // Numbers that do not start with decimal
+                    {
+                        tmpToken[len-1] = ch;
+                    }
+
+                    // Assemble rest of number
+                    /* Added support for signed exponents in scientific notation 
+                     * by Thomas Roehl (Thomas.Roehl at fau.de) as required for LIKWID */
+                    for(; // Don't change len
+                        *ptr // There is a next character and it is not null
+                        && len < MAXTOKENLENGTH // Keep index MAXTOKENLENGTH free for '\0'
+                        && (type(*ptr) == digit // The next character is a digit
+                             || ((type(*ptr) == decimal // Or the next character is a decimal
+                                 && hasDecimal == false)) // But we have not added a decimal
+                             || ((*ptr == 'E' || *ptr == 'e') // Or the next character is an exponent
+                                 && hasExponent == false) // But we have not added an exponent yet
+                             /* Added by Thomas Roehl (Thomas.Roehl at fau.de) to parse scientific notation
+                              * with signed exponent correctly
+                              */
+                             || ((*ptr == '+' || *ptr == '-') && (tmpToken[len-1] == 'E' || tmpToken[len-1] == 'e'))); // Sign directly after the exponent marker
+                        ++len)
+                    {
+                        if(type(*ptr) == decimal)
+                        {
+                            hasDecimal = true;
+                        }
+                        else if(*ptr == 'E' || *ptr == 'e')
+                        {
+                            hasExponent = true;
+                        }
+                        tmpToken[len] = *ptr++;
+                    }
+
+                    // Append null-terminator
+                    tmpToken[len] = '\0';
+                }
+                break;
+            case text:
+                // Assemble an n-character (plus null-terminator) text token
+                {
+                    int len = 1;
+                    tmpToken[0] = ch;
+                    for(len = 1; *ptr && type(*ptr) == text && len < MAXTOKENLENGTH; ++len)
+                    {
+                        tmpToken[len] = *ptr++;
+                    }
+                    tmpToken[len] = '\0';
+                }
+                break;
+        }
+        // Add to list of tokens
+        if(tmpToken[0] != '\0')
+        {
+            /* Added by Thomas Roehl: copy the temporary token into its own
+             * allocation. numTokens is only incremented on success. */
+            tmp = (char**)realloc(tokens, (numTokens + 1) * sizeof(char*));
+            if (tmp != NULL)
+            {
+                tokens = tmp;
+                newToken = malloc((strlen(tmpToken)+1) * sizeof(char));
+            }
+            if (newToken == NULL)
+            {
+                /* Out of memory: release everything collected so far */
+                while (numTokens > 0)
+                    free(tokens[--numTokens]);
+                free(tokens);
+                *tokensRef = NULL;
+                free(tmpToken);
+                return 0;
+            }
+            strcpy(newToken, tmpToken);
+            tokens[numTokens++] = newToken;
+        }
+    }
+    *tokensRef = tokens; // Send back out
+    /* Added by Thomas Roehl (Thomas.Roehl at fau.de) */
+    free(tmpToken);
+    return numTokens;
+}
+
+/* Report whether operator op groups left-to-right. Additive and
+ * multiplicative operators do; exponentiation and any other token do not. */
+bool leftAssoc(token op)
+{
+    switch(tokenType(op))
+    {
+        case addop:
+        case multop:
+            return true;
+        case expop:
+        default:
+            /* Non-operators used to yield an uninitialised value; now false */
+            return false;
+    }
+}
+
+/* Compare the binding strength of two operator tokens: returns 1 if op1
+ * binds tighter than op2, -1 if it binds looser and 0 if both are equal.
+ * Strength rises from addop over multop to expop. Tokens that are not
+ * operators compare as 0 (the original returned an uninitialised value). */
+int precedence(token op1, token op2)
+{
+    const Symbol t1 = tokenType(op1);
+    const Symbol t2 = tokenType(op2);
+    int ret = 0; // Equal precedence, also the defined fallback
+
+    if(t1 == addop && (t2 == multop || t2 == expop)) // op1 has lower precedence
+        ret = -1;
+    else if(t2 == addop && (t1 == multop || t1 == expop)) // op1 has higher precedence
+        ret = 1;
+    else if(t1 == multop && t2 == expop) // op1 has lower precedence
+        ret = -1;
+    else if(t1 == expop && t2 == multop) // op1 has higher precedence
+        ret = 1;
+
+    return ret;
+}
+
+/*
+ * Feed one postfix token into the evaluation stack s.
+ *   - value:    pushed unchanged
+ *   - function: the topmost entry is popped and the doFunc() result pushed
+ *   - operator: with at least two entries, both are popped, the doOp()
+ *               result is pushed and also recorded in calcTokens; with
+ *               fewer entries the operator token itself is pushed
+ * Tokens of any other class are ignored.
+ * NOTE(review): doFunc() results are not recorded in calcTokens - verify.
+ * Returns the doOp() status for an evaluated operator and 0 otherwise
+ * (return value added by Thomas Roehl to report calculation errors).
+ */
+int evalStackPush(Stack *s, token val)
+{
+    token result;
+    Symbol kind;
+
+    if(prefs.display.postfix)
+        printf("\t%s\n", val);
+
+    kind = tokenType(val);
+    if(kind == value)
+    {
+        stackPush(s, val);
+        return 0;
+    }
+    if(kind == function)
+    {
+        token arg = (token)stackPop(s);
+        result = doFunc(arg, val);
+        stackPush(s, result);
+        return 0;
+    }
+    if(kind == expop || kind == multop || kind == addop)
+    {
+        token rhs, lhs;
+        int status;
+
+        if(stackSize(s) < 2)
+        {
+            // Too few operands: the operator itself goes onto the stack
+            stackPush(s, val);
+            return 0;
+        }
+        // Operands come off the stack in reverse order
+        rhs = (token)stackPop(s);
+        lhs = (token)stackPop(s);
+        status = doOp(lhs, val, rhs, &result);
+        stackPush(s, result);
+        /* Added by Thomas Roehl (Thomas.Roehl at fau.de)
+         * Keeping track of the intermediate results */
+        calcTokens[nrCalcTokens] = result;
+        nrCalcTokens++;
+        return status;
+    }
+    return 0;
+}
+
+/*
+ * Convert the infix token list to postfix order with the shunting-yard
+ * algorithm and evaluate it on the fly: everything that would be appended
+ * to the output queue is handed to evalStackPush(), so after a successful
+ * run output holds the result of the expression.
+ *
+ * tokens/numTokens  the token strings to process
+ * output            initialised evaluation stack
+ *
+ * Returns 0 on success and a non-zero value on error: -1 for mismatched
+ * parentheses or a misplaced argument separator, otherwise the status
+ * reported by evalStackPush(). A pending error is never reset to 0 while
+ * the operator stack is drained at the end.
+ */
+int postfix(token *tokens, int numTokens, Stack *output)
+{
+    Stack operators;
+    int i;
+    int err = 0;
+    stackInit(&operators, 2*numTokens);
+    for(i = 0; i < numTokens; i++)
+    {
+        // From Wikipedia/Shunting-yard_algorithm:
+        switch(tokenType(tokens[i]))
+        {
+            case value:
+                {
+                    // If the token is a number, then add it to the output queue.
+                    err = evalStackPush(output, tokens[i]);
+                }
+                break;
+            case function:
+                {
+                    // If the token is a function token, then push it onto the stack.
+                    stackPush(&operators, tokens[i]);
+                }
+                break;
+            case argsep:
+                {
+                    /*
+                     * If the token is a function argument separator (e.g., a comma):
+                     *     Until the token at the top of the stack is a left
+                     *     paren, pop operators off the stack onto the output
+                     *     queue. If no left paren encountered, either separator
+                     *     was misplaced or parens mismatched.
+                     */
+                    while(stackSize(&operators) > 0
+                        && tokenType((token)stackTop(&operators)) != lparen
+                        && stackSize(&operators) > 1
+                        && err == 0)
+                    {
+                        token t = (token)stackPop(&operators);
+                        err = evalStackPush(output, t);
+                    }
+                    if(stackSize(&operators) > 0
+                        && tokenType((token)stackTop(&operators)) != lparen)
+                    {
+                        err = -1;
+                        /* Changed by Thomas Roehl */
+                        //raise(parenMismatch);
+                    }
+                    (void)stackPop(&operators); // Discard lparen
+                }
+                break;
+            case addop:
+            case multop:
+            case expop:
+                {
+                    /*
+                     * If the token is an operator, op1, then:
+                     *     while there is an operator token, op2, at the top of the stack, and
+                     *             either op1 is left-associative and its precedence is less than or equal to that of op2,
+                     *             or op1 is right-associative and its precedence is less than that of op2,
+                     *         pop op2 off the stack, onto the output queue
+                     *     push op1 onto the stack
+                     */
+                    while(stackSize(&operators) > 0
+                        && (tokenType((char*)stackTop(&operators)) == addop || tokenType((char*)stackTop(&operators)) == multop || tokenType((char*)stackTop(&operators)) == expop)
+                        && ((leftAssoc(tokens[i]) && precedence(tokens[i], (char*)stackTop(&operators)) <= 0)
+                            || (!leftAssoc(tokens[i]) && precedence(tokens[i], (char*)stackTop(&operators)) < 0))
+                        && err == 0)
+                    {
+                        token t = (token)stackPop(&operators);
+                        err = evalStackPush(output, t);
+                    }
+                    stackPush(&operators, tokens[i]);
+                }
+                break;
+            case lparen:
+                {
+                    // If the token is a left paren, then push it onto the stack
+                    stackPush(&operators, tokens[i]);
+                }
+                break;
+            case rparen:
+                {
+                    /*
+                     * If the token is a right paren:
+                     *     Until the token at the top of the stack is a left paren, pop operators off the stack onto the output queue
+                     *     Pop the left paren from the stack, but not onto the output queue
+                     *     If the stack runs out without finding a left paren, then there are mismatched parens
+                     */
+                    while(stackSize(&operators) > 0
+                        && tokenType((token)stackTop(&operators)) != lparen
+                        && stackSize(&operators) > 1
+                        && err == 0)
+                    {
+                        token t = (token)stackPop(&operators);
+                        err = evalStackPush(output, t);
+                    }
+                    if(stackSize(&operators) == 0 // Ran out of stack: no matching left paren
+                        || tokenType((token)stackTop(&operators)) != lparen)
+                    {
+                        err = -1;
+                        /* Changed by Thomas Roehl */
+                        //raise(parenMismatch);
+                    }
+                    (void)stackPop(&operators); // Discard lparen
+                }
+                break;
+        }
+        if (err)
+            break;
+    }
+    /*
+     * When there are no more tokens to read:
+     *     While there are still operator tokens on the stack:
+     *         If the operator token on the top of the stack is a paren, then there are mismatched parens
+     *         Pop the operator onto the output queue
+     */
+    while(stackSize(&operators) > 0)
+    {
+        token t = (token)stackPop(&operators);
+        if(tokenType(t) == lparen)
+        {
+            /* Changed by Thomas Roehl: flag the error instead of raise(parenMismatch) */
+            err = -1;
+        }
+        else if(err == 0)
+        {
+            err = evalStackPush(output, t); // Skipped once an error is pending so it is never reset
+        }
+    }
+    stackFree(&operators);
+    return err;
+}
+
+
+
+/* Added by Thomas Roehl (Thomas.Roehl at fau.de) as interface for LIKWID */
+/*
+ * Evaluate the infix expression in finfix and store its value in *result.
+ *
+ * Returns 0 when evaluation left exactly one value on the stack, which is
+ * then converted with strtod(). Otherwise *result is NAN and the status of
+ * postfix() is returned (this may be 0, e.g. for empty input), or -1 if
+ * the bookkeeping array for intermediate results could not be allocated.
+ */
+int calculate_infix(char* finfix, double *result)
+{
+    int i;
+    int ret = -1;
+    int numTokens;
+    token* tokens = NULL;
+    Stack expr;
+
+    *result = NAN; // Replaced below once a value has been computed
+    nrCalcTokens = 0;
+    numTokens = tokenize(finfix, &tokens);
+    /* Zero-initialised so unused slots are NULL. The +1 keeps the request
+     * non-empty, so a NULL return always means out of memory. The original
+     * code ran memset() on the NULL pointer in that case. */
+    calcTokens = (token*)calloc(2 * numTokens + 1, sizeof(token));
+    if (calcTokens == NULL)
+    {
+        goto freetokens;
+    }
+
+    stackInit(&expr, 2*numTokens);
+    ret = postfix(tokens, numTokens, &expr);
+    if ((stackSize(&expr) == 1) && (ret >= 0))
+    {
+        *result = strtod((char*)stackTop(&expr), NULL);
+        ret = 0;
+    }
+
+    // Release the intermediate results recorded during evaluation
+    for (i=0;i<nrCalcTokens; i++)
+    {
+        free(calcTokens[i]);
+    }
+    free(calcTokens);
+    calcTokens = NULL;
+    nrCalcTokens = 0;
+    stackFree(&expr);
+freetokens:
+    // Release the token strings and the token array itself
+    for (i=0;i<numTokens;i++)
+    {
+        free(tokens[i]);
+    }
+    free(tokens);
+    return ret;
+}
+
+
diff --git a/src/calculator_stack.c b/src/calculator_stack.c
new file mode 100644
index 0000000..e14acee
--- /dev/null
+++ b/src/calculator_stack.c
@@ -0,0 +1,77 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  calculator_stack.c
+ *
+ *      Description:  Stack implementation for infix calculator
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Brandon Mills (bm), mills.brandont at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <calculator_stack.h>
+
+/* Allocate room for size entries; a failed or empty request leaves capacity 0 */
+void stackInit(Stack *s, int size)
+{
+    s->content = (size > 0) ? malloc(size * sizeof(void*)) : NULL;
+    s->size = (s->content != NULL) ? size : 0;
+    s->top = -1;
+}
+
+/* Push val; a full or unallocated stack refuses the push with a message */
+void stackPush(Stack *s, void* val)
+{
+    if (s->content != NULL && s->top + 1 < s->size)
+        s->content[++(s->top)] = val;
+    else
+        fprintf(stderr, "Calculator stack overflow, entry dropped\n");
+}
+
+/* Return the topmost entry without removing it, or NULL for an empty stack */
+void* stackTop(Stack *s)
+{
+    return (s->top >= 0 && s->content != NULL) ? s->content[s->top] : NULL;
+}
+
+/* Remove and return the topmost entry, or NULL for an empty stack */
+void* stackPop(Stack *s)
+{
+    return (s->top >= 0 && s->content != NULL) ? s->content[(s->top)--] : NULL;
+}
+
+int stackSize(Stack *s)
+{
+    return s->top + 1; // top is the index of the last entry, -1 when empty
+}
+
+/* Release the storage and put the stack back into the empty, unallocated state */
+void stackFree(Stack *s)
+{
+    free(s->content);
+    s->content = NULL;
+    s->size = 0;
+    s->top = -1;
+}
+
diff --git a/src/configuration.c b/src/configuration.c
new file mode 100644
index 0000000..f7a9357
--- /dev/null
+++ b/src/configuration.c
@@ -0,0 +1,339 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  configuration.c
+ *
+ *      Description:  Configuration file module.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <errno.h>
+
+
+
+#include <configuration.h>
+
+Configuration config = {NULL,NULL,NULL,NULL,-1,MAX_NUM_THREADS,MAX_NUM_NODES}; /* global configuration; the field order comes from the Configuration type, which is declared outside this file */
+int init_config = 0; /* 1 once config has been filled in, reset to 0 by destroy_configuration() */
+
+static int daemonPath_len = 0; /* not referenced by any function in this file */
+static int groupPath_len = 0; /* size bookkeeping for config.groupPath, written by default_configuration() and config_setGroupPath() */
+
+/*
+ * Fill config with the compile-time defaults.
+ * With ACCESSMODE == 0 only the direct access mode is selected. Otherwise
+ * daemon mode is selected, groupPath is set to GROUPPATH and the access
+ * daemon is located: first through `which likwid-accessD`, then at the
+ * compiled-in ACCESSDAEMON path. The process exits if neither is executable.
+ * Returns 0 on success and -ENOMEM if an allocation fails.
+ */
+static int default_configuration(void)
+{
+    char filename[1024] = { '\0' };
+    char *fptr = NULL;
+    size_t len = 0;
+    ssize_t nread = -1;
+    FILE* fp = NULL;
+
+    if (ACCESSMODE == 0)
+    {
+        config.daemonMode = ACCESSMODE_DIRECT;
+        init_config = 1;
+        return 0;
+    }
+    config.daemonMode = ACCESSMODE_DAEMON;
+
+    groupPath_len = strlen(TOSTRING(GROUPPATH))+10;
+    config.groupPath = malloc(groupPath_len+1);
+    if (config.groupPath == NULL)
+    {
+        groupPath_len = 0;
+        return -ENOMEM;
+    }
+    snprintf(config.groupPath, groupPath_len, "%s", TOSTRING(GROUPPATH));
+
+    fp = popen("which likwid-accessD 2>/dev/null | tr -d '\n'","r");
+    if (fp != NULL)
+    {
+        nread = getline(&fptr, &len, fp);
+        /* A stream opened by popen() has to be closed with pclose(), not fclose() */
+        pclose(fp);
+    }
+    if (nread > 0)
+    {
+        if (!access(fptr, X_OK))
+        {
+            /* getline() already allocated the string, so it is kept as is */
+            config.daemonPath = fptr;
+            init_config = 1;
+            return 0;
+        }
+        fprintf(stderr, "Found access daemon at %s but it is not executable, using compiled in daemon path.\n", fptr);
+    }
+    free(fptr);
+
+    /* Fall back to the daemon path that was compiled in */
+    snprintf(filename, sizeof(filename), "%s", TOSTRING(ACCESSDAEMON));
+    if (access(filename, X_OK))
+    {
+        ERROR_PLAIN_PRINT(Unable to get path to access daemon. Maybe your PATH environment variable does not contain the folder where you installed it or the file was moved away / not copied to that location?);
+        exit(EXIT_FAILURE);
+    }
+    config.daemonPath = (char*)malloc((strlen(filename)+1) * sizeof(char));
+    if (config.daemonPath == NULL)
+    {
+        return -ENOMEM;
+    }
+    strcpy(config.daemonPath, filename);
+    init_config = 1;
+    return 0;
+}
+
+/* Store a heap copy of src in *dst. The previous string is only released
+ * when the caller knows it is a live allocation (release_old != 0).
+ * Returns 0 on success, -ENOMEM if the copy could not be allocated. */
+static int config_replaceString(char **dst, const char *src, int release_old)
+{
+    char *copy = (char*)malloc((strlen(src)+1) * sizeof(char));
+    if (copy == NULL)
+        return -ENOMEM;
+    strcpy(copy, src);
+    if (release_old)
+        free(*dst);
+    *dst = copy;
+    return 0;
+}
+
+/*
+ * Initialise the global configuration; a no-op if already initialised.
+ * The compile-time defaults of default_configuration() are loaded first,
+ * then the keys found in a readable configuration file override them.
+ * (The original test on the file was inverted: a file that had been found
+ * was never parsed.) Returns 0 on success, a negative value on error.
+ */
+int init_configuration(void)
+{
+    FILE* fp;
+    int ret;
+    int ownTopology = 0, ownDaemon = 0, ownGroup = 0;
+    char line[512];
+    char name[128];
+    char value[256];
+    char filename[1024] = "";
+    char preconfigured[1024] = "";
+
+    if (init_config == 1)
+    {
+        return 0;
+    }
+    /* Search order: INSTALL_PREFIX+CFGFILE, CFGFILE, /etc/likwid.cfg */
+    snprintf(preconfigured, sizeof(preconfigured), "%s%s",TOSTRING(INSTALL_PREFIX),TOSTRING(CFGFILE));
+    if (access(preconfigured, R_OK) == 0)
+    {
+        snprintf(filename, sizeof(filename), "%s", preconfigured);
+    }
+    else if (access(TOSTRING(CFGFILE), R_OK) == 0)
+    {
+        snprintf(filename, sizeof(filename), "%s", TOSTRING(CFGFILE));
+    }
+    else if (access("/etc/likwid.cfg", R_OK) == 0)
+    {
+        snprintf(filename, sizeof(filename), "%s", "/etc/likwid.cfg");
+    }
+
+    if ((config.topologyCfgFileName == NULL) && (strlen(filename) == 0))
+    {
+        /* No configuration file: look for a topology file directly */
+        if (!access(TOSTRING(TOPOFILE), R_OK))
+        {
+            snprintf(preconfigured, sizeof(preconfigured), "%s", TOSTRING(TOPOFILE));
+        }
+        else
+        {
+            snprintf(preconfigured, sizeof(preconfigured), "%s%s",TOSTRING(INSTALL_PREFIX),TOSTRING(TOPOFILE));
+            if (access(preconfigured, R_OK))
+            {
+                preconfigured[0] = '\0';
+            }
+        }
+        if (preconfigured[0] != '\0')
+        {
+            ret = config_replaceString(&config.topologyCfgFileName, preconfigured, 0);
+            if (ret < 0)
+                return ret;
+        }
+    }
+
+    /* Defaults first, so keys missing from the file keep a usable value */
+    ret = default_configuration();
+    if ((ret < 0) || (strlen(filename) == 0))
+    {
+        return ret;
+    }
+    /* In daemon mode default_configuration() has just allocated both paths */
+    ownDaemon = ownGroup = (config.daemonMode == ACCESSMODE_DAEMON);
+    DEBUG_PRINT(DEBUGLEV_INFO, Reading configuration from %s, filename);
+    // Copy determined config filename to struct
+    ret = config_replaceString(&config.configFileName, filename, 0);
+    if (ret < 0)
+        return ret;
+    fp = fopen(config.configFileName, "r");
+    if (fp == NULL)
+    {
+        DEBUG_PLAIN_PRINT(DEBUGLEV_INFO, Using compile-time configuration)
+        return 0;
+    }
+    while ((ret == 0) && (fgets(line, sizeof(line), fp) != NULL))
+    {
+        /* The field widths keep sscanf() inside name[128] and value[256] */
+        if ((sscanf(line,"%127s = %255s", name, value) != 2) || (name[0] == '#'))
+        {
+            continue;
+        }
+        if (strcmp(name, "topology_file") == 0)
+        {
+            ret = config_replaceString(&config.topologyCfgFileName, value, ownTopology);
+            ownTopology = 1;
+        }
+        else if (strcmp(name, "daemon_path") == 0)
+        {
+            /* An unreadable path is ignored and the default stays in place */
+            if (access(value, R_OK) == 0)
+            {
+                ret = config_replaceString(&config.daemonPath, value, ownDaemon);
+                ownDaemon = 1;
+            }
+        }
+        else if (strcmp(name, "groupPath") == 0)
+        {
+            struct stat st;
+            if ((stat(value, &st) != 0) || (!S_ISDIR(st.st_mode)))
+            {
+                ERROR_PRINT(Path to group files %s is not a directory, value);
+                exit(EXIT_FAILURE);
+            }
+            ret = config_replaceString(&config.groupPath, value, ownGroup);
+            ownGroup = 1;
+            if (ret == 0)
+                groupPath_len = strlen(value)+1; // Size of the buffer now in place
+        }
+        else if (strcmp(name, "daemon_mode") == 0)
+        {
+            if (strcmp(value, "daemon") == 0)
+                config.daemonMode = ACCESSMODE_DAEMON;
+            else if (strcmp(value, "direct") == 0)
+                config.daemonMode = ACCESSMODE_DIRECT;
+        }
+        else if (strcmp(name, "max_threads") == 0)
+        {
+            config.maxNumThreads = atoi(value);
+        }
+        else if (strcmp(name, "max_nodes") == 0)
+        {
+            config.maxNumNodes = atoi(value);
+        }
+    }
+    fclose(fp);
+    init_config = 1;
+    return ret;
+}
+
+/* Hand out the global configuration.
+ * Returns the address of config once init_config is 1 and NULL before
+ * that, i.e. before initialisation or after destroy_configuration(). */
+Configuration_t get_configuration(void)
+{
+    // Uninitialised configuration is never exposed to callers
+    return (init_config == 1) ? &config : NULL;
+}
+
+/*
+ * Release the strings owned by the global configuration. Every freed
+ * pointer is reset to NULL and the recorded groupPath size is cleared, so
+ * later calls into this module never touch freed memory.
+ * Returns 0 on success, -EFAULT if the configuration is not initialised.
+ */
+int destroy_configuration(void)
+{
+    if (init_config == 0)
+    {
+        return -EFAULT;
+    }
+    free(config.configFileName);
+    config.configFileName = NULL;
+    free(config.groupPath);
+    config.groupPath = NULL;
+    groupPath_len = 0;
+    free(config.topologyCfgFileName);
+    config.topologyCfgFileName = NULL;
+    /* As before, the daemon path is only released outside direct mode */
+    if (config.daemonMode != ACCESSMODE_DIRECT)
+    {
+        free(config.daemonPath);
+        config.daemonPath = NULL;
+    }
+    init_config = 0;
+    return 0;
+}
+
+/*
+ * Replace config.groupPath with a copy of path.
+ * path must name an existing directory. The copy always gets its own
+ * allocation, so the function does not depend on the size of the buffer it
+ * replaces (which is not tracked by every code path that sets groupPath).
+ * Returns 0 on success, -ENOTDIR if path is no directory or cannot be
+ * examined, -ENOMEM if the copy cannot be allocated.
+ */
+int config_setGroupPath(char* path)
+{
+    struct stat st;
+    char* copy;
+    size_t size;
+
+    /* st is only valid when stat() succeeded; the original ignored that */
+    if ((path == NULL) || (stat(path, &st) != 0) || (!S_ISDIR(st.st_mode)))
+    {
+        printf("Given path is no directory\n");
+        return -ENOTDIR;
+    }
+    size = strlen(path)+1;
+    copy = (char*)malloc(size);
+    if (copy == NULL)
+    {
+        printf("Cannot allocate space for new group path\n");
+        return -ENOMEM;
+    }
+    memcpy(copy, path, size);
+    free(config.groupPath);
+    config.groupPath = copy;
+    groupPath_len = (int)size; // Buffer size including the terminator
+    return 0;
+}
diff --git a/src/cpuFeatures.c b/src/cpuFeatures.c
index 4733a82..e3ecfdc 100644
--- a/src/cpuFeatures.c
+++ b/src/cpuFeatures.c
@@ -9,13 +9,13 @@
  *                  Allows to turn on and off the Hardware prefetcher
  *                  available.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -37,17 +37,18 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <stdint.h>
-
+#include <errno.h>
 #include <types.h>
-#include <msr.h>
-#include <cpuid.h>
+#include <access.h>
+#include <topology.h>
 #include <registers.h>
 #include <textcolor.h>
-#include <cpuFeatures.h>
+#include <likwid.h>
 
 /* #####   EXPORTED VARIABLES   ########################################### */
 
-CpuFeatureFlags cpuFeatureFlags;
+static uint64_t cpuFeatureMask[MAX_NUM_THREADS] = {0x0ULL};
+static int features_initialized = 0;
 
 /* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
 
@@ -59,341 +60,501 @@ CpuFeatureFlags cpuFeatureFlags;
 
 #define TEST_FLAG(feature,flag)  \
     if (flags & (1ULL<<(flag)))   \
-    {                    \
-        cpuFeatureFlags.feature = 1; \
-    }                    \
-    else                \
-    {                \
-        cpuFeatureFlags.feature = 0; \
+    { /* register bit set: mark the feature as enabled for 'cpu' */ \
+        cpuFeatureMask[cpu] |= (1ULL<<(feature)); \
+    } \
+    else /* register bit clear: mark the feature as disabled */ \
+    { \
+        cpuFeatureMask[cpu] &= ~(1ULL<<(feature)); \
     }
 
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-void
-cpuFeatures_init(int cpu)
-{
-    uint64_t flags = msr_read(cpu, MSR_IA32_MISC_ENABLE);
-
-    TEST_FLAG(fastStrings,0);
-    TEST_FLAG(thermalControl,3);
-    TEST_FLAG(perfMonitoring,7);
-    TEST_FLAG(branchTraceStorage,11);
-    TEST_FLAG(pebs,12);
-    TEST_FLAG(speedstep,16);
-    TEST_FLAG(monitor,18);
-    TEST_FLAG(cpuidMaxVal,22);
-    TEST_FLAG(xdBit,34);
-
-    if ((cpuid_info.model == NEHALEM) ||
-            (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
-            (cpuid_info.model == NEHALEM_LYNNFIELD) ||
-            (cpuid_info.model == NEHALEM_WESTMERE) ||
-            (cpuid_info.model == NEHALEM_WESTMERE_M) ||
-            (cpuid_info.model == NEHALEM_EX))
-    {
-        /*Nehalem */
-        TEST_FLAG(turboMode,38);
-        TEST_FLAG(hardwarePrefetcher,9);
-        TEST_FLAG(clPrefetcher,19);
-        TEST_FLAG(dcuPrefetcher,37);
-        TEST_FLAG(ipPrefetcher,39);
-    }
-    else if ((cpuid_info.model == CORE2_45) ||
-            (cpuid_info.model == CORE2_65))
-    {
-        /*Core 2*/
-        TEST_FLAG(hardwarePrefetcher,9);
-        TEST_FLAG(ferrMultiplex,10);
-        TEST_FLAG(clPrefetcher,19);
-        TEST_FLAG(speedstepLock,20);
-        TEST_FLAG(dcuPrefetcher,37);
-        TEST_FLAG(dynamicAcceleration,38);
-        TEST_FLAG(ipPrefetcher,39);
+#define TEST_FLAG_INV(feature,flag)  \
+    if (flags & (1ULL<<(flag)))   \
+    { /* inverted mapping: a set register bit clears the feature bit */ \
+        cpuFeatureMask[cpu] &= ~(1ULL<<(feature)); \
+    } \
+    else /* a clear register bit sets the feature bit */ \
+    { \
+        cpuFeatureMask[cpu] |= (1ULL<<(feature)); \
     }
 
-    /*
-    printf("FLAGS: 0x%llX \n",flags);
-    */
-}
+#define IF_FLAG(feature) (cpuFeatureMask[cpu] & (1ULL<<(feature))) /* non-zero if the feature bit is set for the 'cpu' in scope */
 
-void
-cpuFeatures_print(int cpu)
-{
-    uint64_t flags = msr_read(cpu, MSR_IA32_MISC_ENABLE);
 
-    printf(HLINE);
-    printf("Fast-Strings: \t\t\t");
-    if (flags & 1)
-    {
-        PRINT_VALUE(GREEN,enabled);
-    }
-    else
+/* #####   FUNCTIONS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+static void cpuFeatures_update(int cpu)
+{
+    int ret;
+    uint64_t flags = 0x0ULL;
+    ret = HPMread(cpu, MSR_DEV, MSR_IA32_MISC_ENABLE, &flags);
+    if (ret != 0)
     {
-        PRINT_VALUE(RED,disabled);
+        fprintf(stderr, "Cannot read register 0x%X on cpu %d: err %d\n", MSR_IA32_MISC_ENABLE, cpu, ret);
     }
 
-    printf("Automatic Thermal Control: \t");
-    if (flags & (1ULL<<3))
+    /* Refresh the cached feature mask of this cpu from the register value
+     * read above (and, further down, from MSR_PREFETCH_ENABLE on the models
+     * listed there).
+     *
+     * TEST_FLAG(feature, bit):     feature bit in cpuFeatureMask[cpu] is set
+     *                              if the register bit is set, else cleared.
+     * TEST_FLAG_INV(feature, bit): the opposite mapping, used for register
+     *                              bits that the code treats as "disable"
+     *                              bits (a set bit clears the feature).
+     *
+     * NOTE(review): a failing HPMread() is only reported on stderr, decoding
+     * continues afterwards with whatever 'flags' holds at that point: the
+     * initial 0 for the first read, presumably the MISC_ENABLE value for the
+     * second one (assuming HPMread() leaves its output untouched on error).
+     * The mask may therefore not reflect the hardware after a failed read;
+     * confirm whether an early return is wanted.
+     */
+
+    TEST_FLAG(FEAT_FAST_STRINGS,0);
+    TEST_FLAG(FEAT_THERMAL_CONTROL,3);
+    TEST_FLAG(FEAT_PERF_MON,7);
+    TEST_FLAG_INV(FEAT_BRANCH_TRACE_STORAGE,11);
+    TEST_FLAG_INV(FEAT_PEBS,12);
+    TEST_FLAG(FEAT_SPEEDSTEP,16);
+    TEST_FLAG(FEAT_MONITOR,18);
+    TEST_FLAG(FEAT_CPUID_MAX_VAL,22);
+    TEST_FLAG_INV(FEAT_XTPR_MESSAGE, 23);
+    TEST_FLAG_INV(FEAT_XD_BIT,34);
+
+    if ((cpuid_info.model == CORE2_45) ||
+        (cpuid_info.model == CORE2_65))
     {
-        PRINT_VALUE(GREEN,enabled);
+        TEST_FLAG_INV(FEAT_HW_PREFETCHER,9);
+        TEST_FLAG(FEAT_FERR_MULTIPLEX,10);
+        TEST_FLAG(FEAT_TM2,13);
+        TEST_FLAG_INV(FEAT_CL_PREFETCHER,19);
+        TEST_FLAG(FEAT_SPEEDSTEP_LOCK,20);
+        TEST_FLAG_INV(FEAT_DCU_PREFETCHER,37);
+        TEST_FLAG_INV(FEAT_DYN_ACCEL,38);
+        TEST_FLAG_INV(FEAT_IP_PREFETCHER,39);
     }
-    else
+    else if ((cpuid_info.model == NEHALEM) ||
+             (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
+             (cpuid_info.model == NEHALEM_LYNNFIELD) ||
+             (cpuid_info.model == NEHALEM_WESTMERE) ||
+             (cpuid_info.model == NEHALEM_WESTMERE_M) ||
+             (cpuid_info.model == NEHALEM_EX) ||
+             (cpuid_info.model == WESTMERE_EX) ||
+             (cpuid_info.model == ATOM_SILVERMONT_E) ||
+             (cpuid_info.model == ATOM_SILVERMONT_C) ||
+             (cpuid_info.model == ATOM_SILVERMONT_Z1) ||
+             (cpuid_info.model == ATOM_SILVERMONT_Z2) ||
+             (cpuid_info.model == ATOM_SILVERMONT_F) ||
+             (cpuid_info.model == ATOM_SILVERMONT_AIR) ||
+             (cpuid_info.model == SANDYBRIDGE) ||
+             (cpuid_info.model == SANDYBRIDGE_EP) ||
+             (cpuid_info.model == IVYBRIDGE) ||
+             (cpuid_info.model == IVYBRIDGE_EP) ||
+             (cpuid_info.model == HASWELL) ||
+             (cpuid_info.model == HASWELL_M1) ||
+             (cpuid_info.model == HASWELL_M2) ||
+             (cpuid_info.model == HASWELL_EP) ||
+             (cpuid_info.model == BROADWELL) ||
+             (cpuid_info.model == BROADWELL_D) ||
+             (cpuid_info.model == BROADWELL_E) ||
+             (cpuid_info.model == SKYLAKE1) ||
+             (cpuid_info.model == SKYLAKE2))
     {
-        PRINT_VALUE(RED,disabled);
+        TEST_FLAG_INV(FEAT_TURBO_MODE,38);
     }
 
-    printf("Performance monitoring: \t");
-    if (flags & (1ULL<<7))
-    {
-        PRINT_VALUE(GREEN,enabled);
-    }
-    else
+    if ((cpuid_info.model == NEHALEM) ||
+            (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
+            (cpuid_info.model == NEHALEM_LYNNFIELD) ||
+            (cpuid_info.model == NEHALEM_WESTMERE) ||
+            (cpuid_info.model == NEHALEM_WESTMERE_M) ||
+            (cpuid_info.model == NEHALEM_EX) ||
+            (cpuid_info.model == WESTMERE_EX) ||
+            (cpuid_info.model == SANDYBRIDGE) ||
+            (cpuid_info.model == SANDYBRIDGE_EP) ||
+            (cpuid_info.model == IVYBRIDGE) ||
+            (cpuid_info.model == IVYBRIDGE_EP) ||
+            (cpuid_info.model == HASWELL) ||
+            (cpuid_info.model == HASWELL_M1) ||
+            (cpuid_info.model == HASWELL_M2) ||
+            (cpuid_info.model == HASWELL_EP) ||
+            (cpuid_info.model == BROADWELL) ||
+            (cpuid_info.model == BROADWELL_D) ||
+            (cpuid_info.model == BROADWELL_E) ||
+            (cpuid_info.model == SKYLAKE1) ||
+            (cpuid_info.model == SKYLAKE2))
     {
-        PRINT_VALUE(RED,disabled);
+        ret = HPMread(cpu, MSR_DEV, MSR_PREFETCH_ENABLE, &flags);
+        if (ret != 0)
+        {
+            fprintf(stderr, "Cannot read register 0x%X on cpu %d: err %d\n", MSR_PREFETCH_ENABLE, cpu, ret);
+        }
+        TEST_FLAG_INV(FEAT_IP_PREFETCHER,3);
+        TEST_FLAG_INV(FEAT_DCU_PREFETCHER,2);
+        TEST_FLAG_INV(FEAT_CL_PREFETCHER,1);
+        TEST_FLAG_INV(FEAT_HW_PREFETCHER,0);
     }
-    printf("Branch Trace Storage: \t\t");
+}
 
-    if (flags & (1ULL<<11))
-    {
-        PRINT_VALUE(RED,notsupported);
-    }
-    else
-    {
-        PRINT_VALUE(GREEN,supported);
-    }
+static char* cpuFeatureNames[CPUFEATURES_MAX] = { /* printable name for each CpuFeature ID */
+    [FEAT_HW_PREFETCHER] = "Hardware Prefetcher",
+    [FEAT_IP_PREFETCHER] = "IP Prefetcher",
+    [FEAT_DCU_PREFETCHER] = "DCU Prefetcher",
+    [FEAT_CL_PREFETCHER] = "Adjacent Cache Line Prefetcher",
+    [FEAT_FAST_STRINGS] = "Fast-Strings",
+    [FEAT_THERMAL_CONTROL] = "Automatic Thermal Control Circuit",
+    [FEAT_PERF_MON] = "Performance Monitoring",
+    [FEAT_BRANCH_TRACE_STORAGE] = "Branch Trace Storage",
+    [FEAT_PEBS] = "Precise Event Based Sampling (PEBS)",
+    [FEAT_SPEEDSTEP] = "Enhanced Intel SpeedStep Technology",
+    [FEAT_MONITOR] = "MONITOR/MWAIT",
+    [FEAT_CPUID_MAX_VAL] = "Limit CPUID Maxval",
+    [FEAT_XD_BIT] = "Execute Disable Bit",
+    [FEAT_TURBO_MODE] = "Intel Turbo Mode",
+    [FEAT_DYN_ACCEL] = "Intel Dynamic Acceleration",
+    [FEAT_FERR_MULTIPLEX] = "FERR# Multiplexing",
+    [FEAT_XTPR_MESSAGE] = "xTPR Message",
+    [FEAT_TM2] = "Thermal Monitoring 2",
+    [FEAT_SPEEDSTEP_LOCK] = "Enhanced Intel SpeedStep Technology Select Lock",
+};
 
-    printf("PEBS: \t\t\t\t");
-    if (flags & (1ULL<<12))
-    {
-        PRINT_VALUE(RED,notsupported);
-    }
-    else
-    {
-        PRINT_VALUE(GREEN,supported);
-    }
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
-    printf("Intel Enhanced SpeedStep: \t");
-    if (flags & (1ULL<<16))
-    {
-        PRINT_VALUE(GREEN,enabled);
-    }
-    else
+void
+cpuFeatures_init() /* one-time setup: fills cpuFeatureMask for every hardware thread in cpuid_topology */
+{
+    int i; /* index into cpuid_topology.threadPool */
+    if (features_initialized) /* set at the end of this function, so repeated calls do nothing */
     {
-        PRINT_VALUE(RED,disabled);
+        return;
     }
 
-    printf("MONITOR/MWAIT: \t\t\t");
-    if (flags & (1ULL<<18))
+    topology_init();
+    if (!HPMinitialized())
     {
-        PRINT_VALUE(GREEN,supported);
+        HPMinit();
+        /* NOTE(review): any status returned by topology_init()/HPMinit() is discarded here */
     }
-    else
+    for (i = 0; i < cpuid_topology.numHWThreads; i++)
     {
-        PRINT_VALUE(RED,notsupported);
+        HPMaddThread(cpuid_topology.threadPool[i].apicId);
+        cpuFeatures_update(cpuid_topology.threadPool[i].apicId);
     }
 
-    printf("Limit CPUID Maxval: \t\t");
-    if (flags & (1ULL<<22))
-    {
-        PRINT_VALUE(RED,enabled);
-    }
-    else
-    {
-        PRINT_VALUE(GREEN,disabled);
-    }
+    /* NOTE(review): cpuFeatureMask has MAX_NUM_THREADS entries and is indexed by the apicId passed above - confirm apicId < MAX_NUM_THREADS */
+    features_initialized = 1;
+}
 
-    printf("XD Bit Disable: \t\t");
-    if (flags & (1ULL<<34))
-    {
-        PRINT_VALUE(RED,disabled);
-    }
-    else
+void
+cpuFeatures_print(int cpu) /* prints each feature name followed by PRINT_VALUE(GREEN, enabled) or PRINT_VALUE(RED,disabled) for this cpu */
+{
+    int i;
+    uint64_t flags = 0x0ULL; /* NOTE(review): not referenced by the added lines of this function - confirm PRINT_VALUE does not need it */
+    if (!features_initialized) /* nothing is printed before cpuFeatures_init() has run */
     {
-        PRINT_VALUE(GREEN,enabled);
+        return;
     }
-    if ((cpuid_info.model == NEHALEM) ||
-            (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
-            (cpuid_info.model == NEHALEM_LYNNFIELD) ||
-            (cpuid_info.model == NEHALEM_WESTMERE) ||
-            (cpuid_info.model == NEHALEM_WESTMERE_M) ||
-            (cpuid_info.model == NEHALEM_EX) ||
-            (cpuid_info.model == CORE2_45) ||
-            (cpuid_info.model == CORE2_65))
-    {
-        printf("IP Prefetcher: \t\t\t");
-        if (flags & (1ULL<<39))
-        {
-            PRINT_VALUE(RED,disabled);
-        }
-        else
-        {
-            PRINT_VALUE(GREEN,enabled);
-        }
+    cpuFeatures_update(cpu); /* re-read the registers so the cached mask is current before printing */
 
-        printf("Hardware Prefetcher: \t\t");
-        if (flags & (1ULL<<9))
-        {
-            PRINT_VALUE(RED,disabled);
-        }
-        else
-        {
-            PRINT_VALUE(GREEN,enabled);
-        }
-        printf("Adjacent Cache Line Prefetch: \t");
-        if (flags & (1ULL<<19))
-        {
-            PRINT_VALUE(RED,disabled);
-        }
-        else
-        {
-            PRINT_VALUE(GREEN,enabled);
-        }
-
-        printf("DCU Prefetcher: \t\t");
-        if (flags & (1ULL<<37))
-        {
-            PRINT_VALUE(RED,disabled);
-        }
-        else
-        {
-            PRINT_VALUE(GREEN,enabled);
-        }
-    }
-
-    if ((cpuid_info.model == NEHALEM) ||
-            (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
-            (cpuid_info.model == NEHALEM_LYNNFIELD) ||
-            (cpuid_info.model == NEHALEM_WESTMERE) ||
-            (cpuid_info.model == NEHALEM_WESTMERE_M) ||
-            (cpuid_info.model == NEHALEM_EX))
+    printf(HLINE);
+    for (i=0;i<CPUFEATURES_MAX; i++) /* one output line per feature ID */
     {
-        printf("Intel Turbo Mode: \t\t");
-        if (flags & (1ULL<<38))
+        if ((cpuid_info.model != CORE2_45) && /* the four features below are only decoded for Core 2 in cpuFeatures_update() */
+            (cpuid_info.model != CORE2_65) &&
+            ((i == FEAT_FERR_MULTIPLEX) ||
+             (i == FEAT_DYN_ACCEL) ||
+             (i == FEAT_SPEEDSTEP_LOCK) ||
+             (i == FEAT_TM2)))
         {
-            PRINT_VALUE(RED,disabled);
+            continue;
         }
-        else
+        printf("%-48s: ",cpuFeatureNames[i]);
+        if (IF_FLAG(i)) /* bit i of cpuFeatureMask[cpu] */
         {
-            PRINT_VALUE(GREEN,enabled);
-        }
-    }
-    else if ((cpuid_info.model == CORE2_45) ||
-            (cpuid_info.model == CORE2_65))
-    {
-
-        printf("Intel Dynamic Acceleration: \t");
-        if (flags & (1ULL<<38))
-        {
-            PRINT_VALUE(RED,disabled);
+            PRINT_VALUE(GREEN, enabled);
         }
         else
         {
-            PRINT_VALUE(GREEN,enabled);
+            PRINT_VALUE(RED,disabled);
         }
     }
-
     printf(HLINE);
 }
 
-void
-cpuFeatures_enable(int cpu, CpuFeature type)
+int
+cpuFeatures_enable(int cpu, CpuFeature type, int print) /* switch one of the four prefetchers on for cpu; print != 0 also reports the outcome */
 {
+    int ret; /* returned: 0 on success or if already on, the HPMread() error, -EINVAL (feature not switchable), -EFAULT (write failed) */
+    uint64_t flags;
+    uint32_t reg = MSR_IA32_MISC_ENABLE;
+    int newOffsets = 0; /* 1: use bits 0-3 of MSR_PREFETCH_ENABLE, 0: use bits 9/19/37/39 of MSR_IA32_MISC_ENABLE */
+    if (IF_FLAG(type)) /* cached mask already shows the feature as enabled */
+    {
+        return 0;
+    }
     if ((cpuid_info.model == NEHALEM) ||
             (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
             (cpuid_info.model == NEHALEM_LYNNFIELD) ||
             (cpuid_info.model == NEHALEM_WESTMERE) ||
             (cpuid_info.model == NEHALEM_WESTMERE_M) ||
             (cpuid_info.model == NEHALEM_EX) ||
-            (cpuid_info.model == CORE2_45) ||
-            (cpuid_info.model == CORE2_65))
+            (cpuid_info.model == WESTMERE_EX) ||
+            (cpuid_info.model == SANDYBRIDGE) ||
+            (cpuid_info.model == SANDYBRIDGE_EP) ||
+            (cpuid_info.model == IVYBRIDGE) ||
+            (cpuid_info.model == IVYBRIDGE_EP) ||
+            (cpuid_info.model == HASWELL) ||
+            (cpuid_info.model == HASWELL_M1) ||
+            (cpuid_info.model == HASWELL_M2) ||
+            (cpuid_info.model == HASWELL_EP) ||
+            (cpuid_info.model == BROADWELL) ||
+            (cpuid_info.model == BROADWELL_D) ||
+            (cpuid_info.model == BROADWELL_E) ||
+            (cpuid_info.model == SKYLAKE1) ||
+            (cpuid_info.model == SKYLAKE2))
     {
-        uint64_t flags = msr_read(cpu, MSR_IA32_MISC_ENABLE);
-        switch ( type )
-        {
-            case HW_PREFETCHER:
+        reg = MSR_PREFETCH_ENABLE;
+        newOffsets = 1;
+    }
+    ret = HPMread(cpu, MSR_DEV, reg, &flags);
+    if (ret != 0)
+    {
+        fprintf(stderr, "Cannot read register 0x%X for CPU %d to activate feature %s\n", reg, cpu, cpuFeatureNames[type]);
+        return ret;
+    }
+    /* ret is 0 here; the register bits are disable bits, so enabling clears them. The default case sets -EINVAL. */
+    switch ( type )
+    {
+        case FEAT_HW_PREFETCHER:
+            if (print)
                 printf("HW_PREFETCHER:\t");
+            if (newOffsets)
+            {
+                flags &= ~(1ULL<<0);
+            }
+            else
+            {
                 flags &= ~(1ULL<<9);
-                break;
+            }
+            break;
 
-            case CL_PREFETCHER:
+        case FEAT_CL_PREFETCHER:
+            if (print)
                 printf("CL_PREFETCHER:\t");
+            if (newOffsets)
+            {
+                flags &= ~(1ULL<<1);
+            }
+            else
+            {
                 flags &= ~(1ULL<<19);
-                break;
+            }
+            break;
 
-            case DCU_PREFETCHER:
+        case FEAT_DCU_PREFETCHER:
+            if (print)
                 printf("DCU_PREFETCHER:\t");
+            if (newOffsets)
+            {
+                flags &= ~(1ULL<<2);
+            }
+            else
+            {
                 flags &= ~(1ULL<<37);
-                break;
+            }
+            break;
 
-            case IP_PREFETCHER:
+        case FEAT_IP_PREFETCHER:
+            if (print)
                 printf("IP_PREFETCHER:\t");
+            if (newOffsets)
+            {
+                flags &= ~(1ULL<<3);
+            }
+            else
+            {
                 flags &= ~(1ULL<<39);
-                break;
+            }
+            break;
+
+        default:
+            printf("\nERROR: Processor feature '%s' cannot be enabled!\n", cpuFeatureNames[type]);
+            ret = -EINVAL;
+            break;
+    }
+    if (ret != 0)
+    {
+        return ret;
+    }
 
-            default:
-                printf("ERROR: CpuFeature not supported!\n");
-                break;
+    ret = HPMwrite(cpu, MSR_DEV, reg, flags);
+    if (ret == 0)
+    {
+        if (print)
+        {
+            PRINT_VALUE(GREEN,enabled);
         }
-        PRINT_VALUE(GREEN,enabled);
-        printf("\n");
-        msr_write(cpu, MSR_IA32_MISC_ENABLE, flags);
     }
     else
     {
-        printf("ERROR: Architecture does not support the manipulation of prefetchers\n");
+        if (print)
+        {
+            PRINT_VALUE(RED,failed);
+        }
     }
+    cpuFeatures_update(cpu); /* refresh the cached mask from the registers, whether or not the write succeeded */
+    return (ret == 0) ? 0 : -EFAULT; /* report a failed write like cpuFeatures_disable() does */
 }
 
 
-void
-cpuFeatures_disable(int cpu, CpuFeature type)
+int
+cpuFeatures_disable(int cpu, CpuFeature type, int print) /* switch one of the four prefetchers off for cpu; print != 0 also reports the outcome */
 {
+    int ret; /* returned: 0 on success or if already off, the HPMread() error, -EINVAL (feature not switchable), -EFAULT (write failed) */
+    uint64_t flags;
+    uint32_t reg = MSR_IA32_MISC_ENABLE;
+    int newOffsets = 0; /* 1: use bits 0-3 of MSR_PREFETCH_ENABLE, 0: use bits 9/19/37/39 of MSR_IA32_MISC_ENABLE */
+    if (!IF_FLAG(type)) /* cached mask already shows the feature as off; NOTE(review): cpu and type are used here before any range check */
+    {
+        return 0;
+    }
     if ((cpuid_info.model == NEHALEM) ||
             (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
             (cpuid_info.model == NEHALEM_LYNNFIELD) ||
             (cpuid_info.model == NEHALEM_WESTMERE) ||
             (cpuid_info.model == NEHALEM_WESTMERE_M) ||
             (cpuid_info.model == NEHALEM_EX) ||
-            (cpuid_info.model == CORE2_45) ||
-            (cpuid_info.model == CORE2_65))
+            (cpuid_info.model == WESTMERE_EX) ||
+            (cpuid_info.model == SANDYBRIDGE) ||
+            (cpuid_info.model == SANDYBRIDGE_EP) ||
+            (cpuid_info.model == IVYBRIDGE) ||
+            (cpuid_info.model == IVYBRIDGE_EP) ||
+            (cpuid_info.model == HASWELL) ||
+            (cpuid_info.model == HASWELL_M1) ||
+            (cpuid_info.model == HASWELL_M2) ||
+            (cpuid_info.model == HASWELL_EP) ||
+            (cpuid_info.model == BROADWELL) ||
+            (cpuid_info.model == BROADWELL_D) ||
+            (cpuid_info.model == BROADWELL_E) ||
+            (cpuid_info.model == SKYLAKE1) ||
+            (cpuid_info.model == SKYLAKE2))
     {
-        uint64_t flags = msr_read(cpu, MSR_IA32_MISC_ENABLE);
-
-        switch ( type )
-        {
-            case HW_PREFETCHER:
+        reg = MSR_PREFETCH_ENABLE; /* on these models the prefetcher bits are read from and written to this register */
+        newOffsets = 1;
+    }
+    ret = HPMread(cpu, MSR_DEV, reg, &flags);
+    if (ret != 0)
+    {
+        fprintf(stderr, "Reading register 0x%X on CPU %d failed\n", reg, cpu);
+        return ret;
+    }
+    ret = 0;
+    switch ( type )
+    {
+        case FEAT_HW_PREFETCHER:
+            if (print)
                 printf("HW_PREFETCHER:\t");
+            if (newOffsets)
+            {
+                flags |= (1ULL<<0); /* the register bits are disable bits: setting one switches the prefetcher off */
+            }
+            else
+            {
                 flags |= (1ULL<<9);
-                break;
+            }
+            break;
 
-            case CL_PREFETCHER:
+        case FEAT_CL_PREFETCHER:
+            if (print)
                 printf("CL_PREFETCHER:\t");
+            if (newOffsets)
+            {
+                flags |= (1ULL<<1);
+            }
+            else
+            {
                 flags |= (1ULL<<19);
-                break;
+            }
+            break;
 
-            case DCU_PREFETCHER:
+        case FEAT_DCU_PREFETCHER:
+            if (print)
                 printf("DCU_PREFETCHER:\t");
+            if (newOffsets)
+            {
+                flags |= (1ULL<<2);
+            }
+            else
+            {
                 flags |= (1ULL<<37);
-                break;
+            }
+            break;
 
-            case IP_PREFETCHER:
+        case FEAT_IP_PREFETCHER:
+            if (print)
                 printf("IP_PREFETCHER:\t");
+            if (newOffsets)
+            {
+                flags |= (1ULL<<3);
+            }
+            else
+            {
                 flags |= (1ULL<<39);
-                break;
+            }
+            break;
 
-            default:
-                printf("ERROR: CpuFeature not supported!\n");
-                break;
-        }
-        PRINT_VALUE(RED,disabled);
-        printf("\n");
+        default:
+            printf("ERROR: Processor feature '%s' cannot be disabled!\n", cpuFeatureNames[type]);
+            ret = -EINVAL;
+            break;
+    }
+    if (ret != 0)
+    {
+        return ret;
+    }
 
-        msr_write(cpu, MSR_IA32_MISC_ENABLE, flags);
+    ret = HPMwrite(cpu, MSR_DEV, reg, flags);
+    if (ret != 0)
+    {
+        if (print)
+        {
+            PRINT_VALUE(RED,failed);
+        }
+        ret = -EFAULT; /* every HPMwrite() failure is reported to the caller as -EFAULT */
     }
     else
     {
-        printf("ERROR: Architecture does not support the manipulation of prefetchers\n");
+        if (print)
+        {
+            PRINT_VALUE(RED,disabled);
+        }
+        ret = 0;
     }
+    cpuFeatures_update(cpu); /* refresh the cached mask from the registers, whether or not the write succeeded */
+    return ret;
 }
 
+int cpuFeatures_get(int cpu, CpuFeature type)
+{
+    /* Query the cached state of one feature on one cpu.
+     * Returns TRUE if the feature bit is set in cpuFeatureMask[cpu], FALSE
+     * if it is clear, and -EINVAL if cpu is no valid index into
+     * cpuFeatureMask or type is outside [FEAT_HW_PREFETCHER, CPUFEATURES_MAX).
+     * Only the cache is consulted, no register is read here. */
+    if ((cpu < 0) || (cpu >= MAX_NUM_THREADS) ||
+        (type < FEAT_HW_PREFETCHER) || (type >= CPUFEATURES_MAX))
+    {
+        /* reject before IF_FLAG() indexes cpuFeatureMask[cpu] */
+        return -EINVAL;
+    }
+    return (IF_FLAG(type)) ? TRUE : FALSE;
+}
+
+char* cpuFeatures_name(CpuFeature type)
+{
+    /* Display name for a feature ID; NULL if the ID is outside the
+     * [FEAT_HW_PREFETCHER, CPUFEATURES_MAX) range of the name table. */
+    int known = (type >= FEAT_HW_PREFETCHER) && (type < CPUFEATURES_MAX);
+
+    return known ? cpuFeatureNames[type] : NULL;
+}
diff --git a/src/cpuid.c b/src/cpuid.c
deleted file mode 100644
index 6a9ac47..0000000
--- a/src/cpuid.c
+++ /dev/null
@@ -1,1244 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  cpuid.c
- *
- *      Description:  Implementation of cpuid module.
- *                  Provides API to extract cpuid info on x86 processors.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-/* #####   HEADER FILE INCLUDES   ######################################### */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/wait.h>
-#include <unistd.h>
-#include <sched.h>
-#include <time.h>
-#include <math.h>
-
-#include <error.h>
-#include <cpuid.h>
-#include <tree.h>
-#include <bitUtil.h>
-#include <strUtil.h>
-
-/* #####   EXPORTED VARIABLES   ########################################### */
-
-CpuInfo cpuid_info;
-CpuTopology cpuid_topology;
-
-
-/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
-
-static int largest_function = 0;
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-/* this was taken from the linux kernel */
-#define CPUID                              \
-    __asm__ volatile ("cpuid"                             \
-            : "=a" (eax),     \
-            "=b" (ebx),     \
-            "=c" (ecx),     \
-            "=d" (edx)      \
-            : "0" (eax), "2" (ecx))
-
-
-/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
-
-static char* pentium_m_b_str = "Intel Pentium M Banias processor";
-static char* pentium_m_d_str = "Intel Pentium M Dothan processor";
-static char* core_duo_str = "Intel Core Duo processor";
-static char* core_2a_str = "Intel Core 2 65nm processor";
-static char* core_2b_str = "Intel Core 2 45nm processor";
-static char* atom_45_str = "Intel Atom 45nm processor";
-static char* atom_32_str = "Intel Atom 32nm processor";
-static char* atom_22_str = "Intel Atom 22nm processor";
-static char* atom_silvermont_str = "Intel Atom (Silvermont) 22nm processor";
-static char* atom_saltwell_str = "Intel Atom (Saltwell) 32nm processor";
-static char* nehalem_bloom_str = "Intel Core Bloomfield processor";
-static char* nehalem_lynn_str = "Intel Core Lynnfield processor";
-static char* nehalem_west_str = "Intel Core Westmere processor";
-static char* sandybridge_str = "Intel Core SandyBridge processor";
-static char* ivybridge_str = "Intel Core IvyBridge processor";
-static char* ivybridge_ep_str = "Intel Core IvyBridge EP processor";
-static char* sandybridge_ep_str = "Intel Core SandyBridge EP processor";
-static char* haswell_str = "Intel Core Haswell processor";
-static char* haswell_ex_str = "Intel Core Haswell EX processor";
-static char* nehalem_ex_str = "Intel Nehalem EX processor";
-static char* westmere_ex_str = "Intel Westmere EX processor";
-static char* xeon_mp_string = "Intel Xeon MP processor";
-static char* xeon_phi_string = "Intel Xeon Phi Coprocessor";
-static char* barcelona_str = "AMD Barcelona processor";
-static char* shanghai_str = "AMD Shanghai processor";
-static char* istanbul_str = "AMD Istanbul processor";
-static char* magnycours_str = "AMD Magny Cours processor";
-static char* interlagos_str = "AMD Interlagos processor";
-static char* kabini_str = "AMD Family 16 model - Kabini processor";
-static char* opteron_sc_str = "AMD Opteron single core 130nm processor";
-static char* opteron_dc_e_str = "AMD Opteron Dual Core Rev E 90nm processor";
-static char* opteron_dc_f_str = "AMD Opteron Dual Core Rev F 90nm processor";
-static char* athlon64_str = "AMD Athlon64 X2 (AM2) Rev F 90nm processor";
-static char* athlon64_f_str = "AMD Athlon64 (AM2) Rev F 90nm processor";
-static char* athlon64_X2_g_str = "AMD Athlon64 X2 (AM2) Rev G 65nm processor";
-static char* athlon64_g_str = "AMD Athlon64 (AM2) Rev G 65nm processor";
-static char* amd_k8_str = "AMD K8 architecture";
-static char* unknown_intel_str = "Unknown Intel Processor";
-static char* unknown_amd_str = "Unknown AMD Processor";
-
-static volatile int init = 0;
-static uint32_t eax, ebx, ecx, edx;
-
-/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-
-static void initTopology(FILE* file)
-{
-    size_t items;
-    HWThread* hwThreadPool;
-    CacheLevel* cacheLevels;
-    TreeNode* currentNode;
-
-    items = fread((void*) &cpuid_topology, sizeof(CpuTopology), 1, file);
-
-    hwThreadPool = (HWThread*) malloc(cpuid_topology.numHWThreads * sizeof(HWThread));
-    items = fread((void*) hwThreadPool, sizeof(HWThread), cpuid_topology.numHWThreads, file);
-    cpuid_topology.threadPool = hwThreadPool;
-
-    cacheLevels = (CacheLevel*) malloc(cpuid_topology.numCacheLevels * sizeof(CacheLevel));
-    items = fread((void*) cacheLevels, sizeof(CacheLevel), cpuid_topology.numCacheLevels, file);
-    cpuid_topology.cacheLevels = cacheLevels;
-    cpuid_topology.topologyTree = NULL;
-
-    tree_init(&cpuid_topology.topologyTree, 0);
-
-    for (uint32_t i=0; i<  cpuid_topology.numHWThreads; i++)
-    {
-        if (!tree_nodeExists(cpuid_topology.topologyTree,
-                    hwThreadPool[i].packageId))
-        {
-            tree_insertNode(cpuid_topology.topologyTree,
-                    hwThreadPool[i].packageId);
-        }
-        currentNode = tree_getNode(cpuid_topology.topologyTree,
-                hwThreadPool[i].packageId);
-
-        if (!tree_nodeExists(currentNode, hwThreadPool[i].coreId))
-        {
-            tree_insertNode(currentNode, hwThreadPool[i].coreId);
-        }
-        currentNode = tree_getNode(currentNode, hwThreadPool[i].coreId);
-
-        if (!tree_nodeExists(currentNode, i))
-        {
-            tree_insertNode(currentNode, i);
-        }
-    }
-}
-
-static uint32_t amdGetAssociativity(uint32_t flag)
-{
-    uint32_t asso= 0;
-
-    switch ( flag )
-    {
-        case 0x0:
-            asso = 0;
-            break;
-
-        case 0x1:
-            asso = 1;
-            break;
-
-        case 0x2:
-            asso = 2;
-            break;
-
-        case 0x4:
-            asso = 4;
-            break;
-
-        case 0x6:
-            asso = 8;
-            break;
-
-        case 0x8:
-            asso = 16;
-            break;
-
-        case 0xA:
-            asso = 32;
-            break;
-
-        case 0xB:
-            asso = 48;
-            break;
-
-        case 0xC:
-            asso = 64;
-            break;
-
-        case 0xD:
-            asso = 96;
-            break;
-
-        case 0xE:
-            asso = 128;
-            break;
-
-        case 0xF:
-            asso = 0;
-            break;
-
-        default:
-            break;
-    }
-    return asso;
-
-}
-
-static int intelCpuidFunc_4(CacheLevel** cachePool)
-{
-    int i;
-    int level=0;
-    int maxNumLevels=0;
-    uint32_t valid=1;
-    CacheLevel* pool;
-    int threadsPerCpu = 0;
-    int numThreadsPerSocket = cpuid_topology.numCoresPerSocket *
-                              cpuid_topology.numThreadsPerCore;
-
-    while (valid)
-    {
-        eax = 0x04;
-        ecx = level;
-        CPUID;
-        valid = extractBitField(eax,5,0);
-        if (!valid)
-        {
-            break;
-        }
-        level++;
-    }
-
-    maxNumLevels = level;
-    *cachePool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel));
-    pool = *cachePool;
-
-    for (i=0; i < maxNumLevels; i++) 
-    {
-        eax = 0x04;
-        ecx = i;
-        CPUID;
-
-        pool[i].level = extractBitField(eax,3,5);
-        pool[i].type = (CacheType) extractBitField(eax,5,0);
-        pool[i].associativity = extractBitField(ebx,8,22)+1;
-        pool[i].sets = ecx+1;
-        pool[i].lineSize = extractBitField(ebx,12,0)+1;
-        pool[i].size = pool[i].sets *
-            pool[i].associativity *
-            pool[i].lineSize;
-        pool[i].threads = extractBitField(eax,10,14)+1;
-        pool[i].inclusive = edx&0x2;
-
-        /* WORKAROUND cpuid reports wrong number of threads on SMT processor with SMT
-         * turned off */
-        if (i < 3)
-        {
-            if ((cpuid_info.model == NEHALEM_BLOOMFIELD) ||
-                    (cpuid_info.model == NEHALEM_LYNNFIELD) ||
-                    (cpuid_info.model == NEHALEM_WESTMERE) ||
-                    (cpuid_info.model == NEHALEM_WESTMERE_M) ||
-                    (cpuid_info.model == SANDYBRIDGE) ||
-                    (cpuid_info.model == SANDYBRIDGE_EP) ||
-                    (cpuid_info.model == IVYBRIDGE) ||
-                    (cpuid_info.model == IVYBRIDGE_EP) ||
-                    (cpuid_info.model == HASWELL) ||
-                    (cpuid_info.model == HASWELL_EX) ||
-                    (cpuid_info.model == HASWELL_M1) ||
-                    (cpuid_info.model == HASWELL_M2) ||
-                    (cpuid_info.model == WESTMERE_EX) ||
-                    (cpuid_info.model == NEHALEM_EX))
-            {
-                if (cpuid_topology.numThreadsPerCore == 1)
-                {
-                    pool[i].threads = 1;
-                }
-            }
-        }
-
-        /* :WORKAROUND:08/13/2009 08:34:15 AM:jt: For L3 caches the value is sometimes 
-         * too large in here. 
-         * See Documentation: Threads contains maximum number of threads supported
-         * by the cache.
-         * Limit threads per Socket then to the maximum possible value. If the number
-         * of threads supported by the cache does not divide the threads on the socket
-         * without remainder, the threads are adjusted to fit the multiple caches.
-         */
-        if(pool[i].threads > numThreadsPerSocket)
-        {
-            pool[i].threads = numThreadsPerSocket;
-        }
-        else if (((double)numThreadsPerSocket)/((double)pool[i].threads) != 
-                  (double)(numThreadsPerSocket/pool[i].threads))
-        {
-            pool[i].threads = numThreadsPerSocket/
-                (int)ceil(((double)numThreadsPerSocket)/((double)pool[i].threads));
-        }
-        /* For Intel Silvermont this is not enough. It returns 4 threads and 8 cores
-         * for the L2 cache. But according to the data sheet, each 1MB L2 cache slice 
-         * is shared by 2 threads/cores.
-         */
-        else if (pool[i].level == 2 && 
-                ((cpuid_info.model == ATOM_SILVERMONT_C) ||
-                 (cpuid_info.model == ATOM_SILVERMONT_E) ||
-                 (cpuid_info.model == ATOM_SILVERMONT_F1) ||
-                 (cpuid_info.model == ATOM_SILVERMONT_F2) ||
-                 (cpuid_info.model == ATOM_SILVERMONT_F3)))
-        {
-            pool[i].threads = 2;
-        }
-    }
-
-    
-
-    return maxNumLevels;
-}
-
-static int recheck_numHWThreads()
-{
-    int cpucount = 0;
-    char line[1024];
-    FILE* fp = fopen("/proc/cpuinfo","r");
-    if (fp != NULL)
-    {
-        while( fgets(line,1024,fp) )
-        {
-            if (strncmp(line, "processor", 9) == 0)
-            {
-                cpucount++;
-            }
-        }
-    }
-    return cpucount;
-}
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-int cpuid_init (void)
-{
-    int isIntel = 1;
-
-    /* FIXME: Race condition??? */
-    if (init) return EXIT_SUCCESS;
-    init =1;
-
-    eax = 0x00;
-    CPUID;
-
-    largest_function = eax;
-    if (ebx == 0x68747541U)
-    {
-        isIntel = 0;
-    }
-
-    eax = 0x01;
-    CPUID;
-    cpuid_info.family = ((eax>>8)&0xFU) + ((eax>>20)&0xFFU);
-    cpuid_info.model = (((eax>>16)&0xFU)<<4) + ((eax>>4)&0xFU);
-    cpuid_info.stepping =  (eax&0xFU);
-
-    switch ( cpuid_info.family )
-    {
-        case P6_FAMILY:
-            switch ( cpuid_info.model )
-            {
-                case PENTIUM_M_BANIAS:
-                    cpuid_info.name = pentium_m_b_str;
-                    break;
-
-                case PENTIUM_M_DOTHAN:
-                    cpuid_info.name = pentium_m_d_str;
-                    break;
-
-                case CORE_DUO:
-                    cpuid_info.name = core_duo_str;
-                    break;
-
-                case CORE2_65:
-                    cpuid_info.name = core_2a_str;
-                    break;
-
-                case CORE2_45:
-                    cpuid_info.name = core_2b_str;
-                    break;
-
-                case NEHALEM_BLOOMFIELD:
-                    cpuid_info.name = nehalem_bloom_str;
-                    break;
-
-                case NEHALEM_LYNNFIELD:
-                    cpuid_info.name = nehalem_lynn_str;
-                    break;
-
-                case NEHALEM_WESTMERE_M:
-
-                case NEHALEM_WESTMERE:
-                    cpuid_info.name = nehalem_west_str;
-                    break;
-
-                case SANDYBRIDGE:
-                    cpuid_info.name = sandybridge_str;
-                    break;
-
-                case SANDYBRIDGE_EP:
-                    cpuid_info.name = sandybridge_ep_str;
-                    break;
-
-                case IVYBRIDGE:
-                    cpuid_info.name = ivybridge_str;
-                    break;
-
-                case IVYBRIDGE_EP:
-                    cpuid_info.name = ivybridge_ep_str;
-                    break;
-
-                case HASWELL:
-
-                case HASWELL_M1:
-
-                case HASWELL_M2:
-                    cpuid_info.name = haswell_str;
-                    break;
-
-                case HASWELL_EX:
-                    cpuid_info.name = haswell_ex_str;
-                    break;
-
-                case NEHALEM_EX:
-                    cpuid_info.name = nehalem_ex_str;
-                    break;
-
-                case WESTMERE_EX:
-                    cpuid_info.name = westmere_ex_str;
-                    break;
-
-                case XEON_MP:
-                    cpuid_info.name = xeon_mp_string;
-                    break;
-
-                case ATOM_45:
-
-                case ATOM:
-                    cpuid_info.name = atom_45_str;
-                    break;
-
-                case ATOM_32:
-                    cpuid_info.name = atom_32_str;
-                    break;
-
-                case ATOM_22:
-                    cpuid_info.name = atom_22_str;
-                    break;
-
-                case ATOM_SILVERMONT_C:
-                case ATOM_SILVERMONT_E:
-                case ATOM_SILVERMONT_F1:
-                case ATOM_SILVERMONT_F2:
-                case ATOM_SILVERMONT_F3:
-                    cpuid_info.name = atom_silvermont_str;
-                    break;
-
-                default:
-                    cpuid_info.name = unknown_intel_str;
-                    break;
-            }
-            break;
-
-        case MIC_FAMILY:
-            switch ( cpuid_info.model ) 
-            {
-                case XEON_PHI:
-                    cpuid_info.name = xeon_phi_string;
-                    break;
-
-            }
-            break;
-
-        case K8_FAMILY:
-
-            if (isIntel)
-            {
-                ERROR_PLAIN_PRINT(Netburst architecture is not supported);
-            }
-
-            switch ( cpuid_info.model )
-            {
-                case OPTERON_DC_E:
-                    cpuid_info.name = opteron_dc_e_str;
-                    break;
-
-                case OPTERON_DC_F:
-                    cpuid_info.name = opteron_dc_f_str;
-                    break;
-
-                case ATHLON64_X2:
-
-                case ATHLON64_X2_F:
-                    cpuid_info.name = athlon64_str;
-                    break;
-
-                case ATHLON64_F1:
-
-                case ATHLON64_F2:
-                    cpuid_info.name = athlon64_f_str;
-                    break;
-
-                case ATHLON64_X2_G:
-                    cpuid_info.name = athlon64_X2_g_str;
-                    break;
-
-                case ATHLON64_G1:
-
-                case ATHLON64_G2:
-                    cpuid_info.name = athlon64_g_str;
-                    break;
-
-                case OPTERON_SC_1MB:
-                    cpuid_info.name = opteron_sc_str;
-                    break;
-
-                default:
-                    cpuid_info.name = amd_k8_str;
-                    break;
-            }
-
-            break;
-
-        case K10_FAMILY:
-            switch ( cpuid_info.model )
-            {
-                case BARCELONA:
-                    cpuid_info.name = barcelona_str;
-                    break;
-
-                case SHANGHAI:
-                    cpuid_info.name = shanghai_str;
-                    break;
-
-                case ISTANBUL:
-                    cpuid_info.name = istanbul_str;
-                    break;
-
-                case MAGNYCOURS:
-                    cpuid_info.name = magnycours_str;
-                    break;
-
-                default:
-                    cpuid_info.name = unknown_amd_str;
-                    break;
-            }
-            break;
-
-        case K15_FAMILY:
-            cpuid_info.name = interlagos_str;
-            break;
-
-        case K16_FAMILY:
-            cpuid_info.name = kabini_str;
-            break;
-            
-        default:
-            return EXIT_FAILURE;
-            break;
-    }
-
-    cpuid_info.featureFlags = 0;
-    cpuid_info.features = (char*) malloc(200*sizeof(char));
-    cpuid_info.features[0] = 0;
-    if (ecx & (1<<0))
-    {
-        strcat(cpuid_info.features, "SSE3 ");
-        cpuid_info.featureFlags |= (1<<SSE3);
-    }
-    if (ecx & (1<<3))
-    {
-        strcat(cpuid_info.features, "MONITOR ");
-        cpuid_info.featureFlags |= (1<<MONITOR);
-    }
-    if (ecx & (1<<5))
-    {
-        strcat(cpuid_info.features, "VMX ");
-        cpuid_info.featureFlags |= (1<<VMX);
-    }
-    if (ecx & (1<<7))
-    {
-        strcat(cpuid_info.features, "EIST ");
-        cpuid_info.featureFlags |= (1<<EIST);
-    }
-    if (ecx & (1<<8))
-    {
-        strcat(cpuid_info.features, "TM2 ");
-        cpuid_info.featureFlags |= (1<<TM2);
-    }
-    if (ecx & (1<<9))
-    {
-        strcat(cpuid_info.features, "SSSE3 ");
-        cpuid_info.featureFlags |= (1<<SSSE3);
-    }
-    if (ecx & (1<<12))
-    {
-        strcat(cpuid_info.features, "FMA ");
-        cpuid_info.featureFlags |= (1<<FMA);
-    }
-    if (ecx & (1<<19))
-    {
-        strcat(cpuid_info.features, "SSE4.1 ");
-        cpuid_info.featureFlags |= (1<<SSE41);
-    }
-    if (ecx & (1<<20))
-    {
-        strcat(cpuid_info.features, "SSE4.2 ");
-        cpuid_info.featureFlags |= (1<<SSE42);
-    }
-    if (ecx & (1<<25))
-    {
-        strcat(cpuid_info.features, "AES ");
-        cpuid_info.featureFlags |= (1<<AES);
-    }
-    if (ecx & (1<<28))
-    {
-        strcat(cpuid_info.features, "AVX ");
-        cpuid_info.featureFlags |= (1<<AVX);
-    }
-    if (ecx & (1<<30))
-    {
-        strcat(cpuid_info.features, "RDRAND ");
-        cpuid_info.featureFlags |= (1<<RDRAND);
-    }
-    if (edx & (1<<22))
-    {
-        strcat(cpuid_info.features, "ACPI ");
-        cpuid_info.featureFlags |= (1<<ACPI);
-    }
-    if (edx & (1<<23))
-    {
-        strcat(cpuid_info.features, "MMX ");
-        cpuid_info.featureFlags |= (1<<MMX);
-    }
-    if (edx & (1<<25))
-    {
-        strcat(cpuid_info.features, "SSE ");
-        cpuid_info.featureFlags |= (1<<SSE);
-    }
-    if (edx & (1<<26))
-    {
-        strcat(cpuid_info.features, "SSE2 ");
-        cpuid_info.featureFlags |= (1<<SSE2);
-    }
-    if (edx & (1<<29))
-    {
-        strcat(cpuid_info.features, "TM ");
-        cpuid_info.featureFlags |= (1<<TM);
-    }
-
-    eax = 0x80000001;
-    CPUID;
-    if (edx & (1<<27))
-    {
-        strcat(cpuid_info.features, "RDTSCP ");
-        cpuid_info.featureFlags |= (1<<RDTSCP);
-    }
-
-    cpuid_info.perf_version   =  0;
-    if( cpuid_info.family == P6_FAMILY && 0x0A <= largest_function)
-    {
-        eax = 0x0A;
-        CPUID;
-        cpuid_info.perf_version   =  (eax&0xFFU);
-        cpuid_info.perf_num_ctr   =   ((eax>>8)&0xFFU);
-        cpuid_info.perf_width_ctr =  ((eax>>16)&0xFFU);
-        cpuid_info.perf_num_fixed_ctr =  (edx&0xFU);
-
-        eax = 0x06;
-        CPUID;
-        if (eax & (1<<1))
-        {
-            cpuid_info.turbo = 1;
-        }
-        else
-        {
-            cpuid_info.turbo = 0;
-        }
-    }
-
-    FILE *file;
-    char *filepath = TOSTRING(CFGFILE);
-
-    if ((file = fopen(filepath, "rb")) != NULL) 
-    {
-        //printf("Read config from file\n");
-        initTopology(file);
-        fclose(file);
-    }
-    else
-    {
-        cpuid_topology.numHWThreads = sysconf(_SC_NPROCESSORS_CONF);
-        if (recheck_numHWThreads() != cpuid_topology.numHWThreads)
-        {
-            cpuid_topology.numHWThreads = recheck_numHWThreads();
-        }
-        cpu_set_t cpuSet;
-        CPU_ZERO(&cpuSet);
-        sched_getaffinity(0,sizeof(cpu_set_t), &cpuSet);
-        cpuid_initTopology();
-        cpuid_initCacheTopology();
-
-        /* restore affinity mask of process */
-        sched_setaffinity(0, sizeof(cpu_set_t), &cpuSet);
-    }
-
-    return EXIT_SUCCESS;
-}
-
-void cpuid_print (void)
-{
-    printf("\nSupported Intel processors:\n");
-    printf("\t%s\n",core_2a_str);
-    printf("\t%s\n",core_2b_str);
-    printf("\t%s\n",xeon_mp_string);
-    printf("\t%s\n",atom_45_str);
-    printf("\t%s\n",atom_32_str);
-    printf("\t%s\n",atom_22_str);
-    printf("\t%s\n",nehalem_bloom_str);
-    printf("\t%s\n",nehalem_lynn_str);
-    printf("\t%s\n",nehalem_west_str);
-    printf("\t%s (with Uncore support)\n",nehalem_ex_str);
-    printf("\t%s (with Uncore support)\n",westmere_ex_str);
-    printf("\t%s\n",sandybridge_str);
-    printf("\t%s (with Uncore support)\n",sandybridge_ep_str);
-    printf("\t%s\n",ivybridge_str);
-    printf("\t%s (with Uncore support)\n",ivybridge_ep_str);
-    printf("\t%s (with Uncore support)\n",haswell_str);
-    printf("\t%s (no Uncore support)\n",haswell_ex_str);
-    printf("\t%s\n",atom_silvermont_str);
-    printf("\t%s\n",atom_saltwell_str);
-    printf("\t%s\n\n",xeon_phi_string);
-
-    printf("Supported AMD processors:\n");
-    printf("\t%s\n",opteron_sc_str);
-    printf("\t%s\n",opteron_dc_e_str);
-    printf("\t%s\n",opteron_dc_f_str);
-    printf("\t%s\n",barcelona_str);
-    printf("\t%s\n",shanghai_str);
-    printf("\t%s\n",istanbul_str);
-    printf("\t%s\n",magnycours_str);
-    printf("\t%s\n",interlagos_str);
-    printf("\t%s\n\n",kabini_str);
-}
-
-
-
-
-
-#define freeStrings  \
-    bdestroy(filename);  \
-bdestroy(grepString); \
-bdestroy(cpulist)
-
-
-int cpuid_isInCpuset(void)
-{
-    int pos = 0;
-    bstring grepString = bformat("Cpus_allowed_list:");
-    bstring filename = bformat("/proc/%d/status",getpid());
-    FILE* fp = fopen(bdata(filename),"r");
-
-    if (fp == NULL)
-    {
-        bdestroy(filename);
-        bdestroy(grepString);
-        return 0;
-    } 
-    else
-    {
-        bstring  cpulist;
-        uint32_t tmpThreads[MAX_NUM_THREADS];
-        bstring src = bread ((bNread) fread, fp);
-        if ((pos = binstr(src,0,grepString)) != BSTR_ERR)
-        {
-            int end = bstrchrp(src, 10, pos);
-            pos = pos+blength(grepString);
-            cpulist = bmidstr(src,pos, end-pos);
-            btrimws(cpulist);
-
-            if (bstr_to_cpuset_physical(tmpThreads, cpulist) < cpuid_topology.numHWThreads)
-            {
-                freeStrings;
-                return 1;
-            }
-            else
-            {
-                freeStrings;
-                return 0;
-            }
-        }
-        return 0;
-    }
-}
-
-void cpuid_initTopology(void)
-{
-    uint32_t apicId;
-    uint32_t bitField;
-    int level;
-    int prevOffset = 0;
-    int currOffset = 0;
-    cpu_set_t set;
-    HWThread* hwThreadPool;
-    int hasBLeaf = 0;
-    int maxNumLogicalProcs;
-    int maxNumLogicalProcsPerCore;
-    int maxNumCores;
-    TreeNode* currentNode;
-    int width;
-
-    /* check if 0x0B cpuid leaf is supported */
-    if (largest_function >= 0x0B)
-    {
-        eax = 0x0B;
-        ecx = 0;
-        CPUID;
-
-        if (ebx)
-        {
-            hasBLeaf = 1;
-        }
-    }
-
-    hwThreadPool = (HWThread*) malloc(cpuid_topology.numHWThreads * sizeof(HWThread));
-    tree_init(&cpuid_topology.topologyTree, 0);
-
-    if (hasBLeaf)
-    {
-        for (uint32_t i=0; i < cpuid_topology.numHWThreads; i++)
-        {
-
-            CPU_ZERO(&set);
-            CPU_SET(i,&set);
-            sched_setaffinity(0, sizeof(cpu_set_t), &set);
-            eax = 0x0B;
-            ecx = 0;
-            CPUID;
-            apicId = edx;
-            hwThreadPool[i].apicId = apicId;
-
-            for (level=0; level < 3; level++)
-            {
-                eax = 0x0B;
-                ecx = level;
-                CPUID;
-                currOffset = eax&0xFU;
-
-                switch ( level ) {
-                    case 0:  /* SMT thread */
-                        bitField = extractBitField(apicId,
-                                currOffset,
-                                0);
-                        hwThreadPool[i].threadId = bitField;
-                        break;
-
-                    case 1:  /* Core */
-                        bitField = extractBitField(apicId,
-                                currOffset-prevOffset,
-                                prevOffset);
-                        hwThreadPool[i].coreId = bitField;
-                        break;
-
-                    case 2:  /* Package */
-                        bitField = extractBitField(apicId,
-                                32-prevOffset,
-                                prevOffset);
-                        hwThreadPool[i].packageId = bitField;
-                        break;
-
-                }
-                prevOffset = currOffset;
-            }
-        }
-    }
-    else
-    {
-        switch ( cpuid_info.family )
-        {
-
-            case MIC_FAMILY:
-
-            case P6_FAMILY:
-                eax = 0x01;
-                CPUID;
-                maxNumLogicalProcs = extractBitField(ebx,8,16);
-
-                /* Check number of cores per package */
-                eax = 0x04;
-                ecx = 0;
-                CPUID;
-                maxNumCores = extractBitField(eax,6,26)+1;
-
-                maxNumLogicalProcsPerCore = maxNumLogicalProcs/maxNumCores;
-
-                for (uint32_t i=0; i<  cpuid_topology.numHWThreads; i++)
-                {
-                    CPU_ZERO(&set);
-                    CPU_SET(i,&set);
-                    sched_setaffinity(0, sizeof(cpu_set_t), &set);
-
-                    eax = 0x01;
-                    CPUID;
-                    hwThreadPool[i].apicId = extractBitField(ebx,8,24);
-
-                    /* ThreadId is extracted from th apicId using the bit width
-                     * of the number of logical processors
-                     * */
-                    hwThreadPool[i].threadId =
-                        extractBitField(hwThreadPool[i].apicId,
-                                getBitFieldWidth(maxNumLogicalProcsPerCore),0); 
-
-                    /* CoreId is extracted from th apicId using the bitWidth 
-                     * of the number of logical processors as offset and the
-                     * bit width of the number of cores as width
-                     * */
-                    hwThreadPool[i].coreId =
-                        extractBitField(hwThreadPool[i].apicId,
-                                getBitFieldWidth(maxNumCores),
-                                getBitFieldWidth(maxNumLogicalProcsPerCore)); 
-
-                    hwThreadPool[i].packageId =
-                        extractBitField(hwThreadPool[i].apicId,
-                                8-getBitFieldWidth(maxNumLogicalProcs),
-                                getBitFieldWidth(maxNumLogicalProcs)); 
-                }
-                break;
-
-            case K8_FAMILY:
-                /* AMD Bios manual Rev. 2.28 section 3.1
-                 * Legacy method */
-                /*FIXME: This is a bit of a hack */
-
-                maxNumLogicalProcsPerCore = 1;
-                maxNumLogicalProcs = 1;
-
-                eax = 0x80000008;
-                CPUID;
-
-                maxNumCores =  extractBitField(ecx,8,0)+1;
-
-                for (uint32_t i=0; i<  cpuid_topology.numHWThreads; i++)
-                {
-                    CPU_ZERO(&set);
-                    CPU_SET(i,&set);
-                    sched_setaffinity(0, sizeof(cpu_set_t), &set);
-
-                    eax = 0x01;
-                    CPUID;
-                    hwThreadPool[i].apicId = extractBitField(ebx,8,24);
-
-                    /* ThreadId is extracted from th apicId using the bit width
-                     * of the number of logical processors
-                     * */
-                    hwThreadPool[i].threadId =
-                        extractBitField(hwThreadPool[i].apicId,
-                                getBitFieldWidth(maxNumLogicalProcsPerCore),0); 
-
-                    /* CoreId is extracted from th apicId using the bitWidth 
-                     * of the number of logical processors as offset and the
-                     * bit width of the number of cores as width
-                     * */
-                    hwThreadPool[i].coreId =
-                        extractBitField(hwThreadPool[i].apicId,
-                                getBitFieldWidth(maxNumCores),
-                                0); 
-
-                    hwThreadPool[i].packageId =
-                        extractBitField(hwThreadPool[i].apicId,
-                                8-getBitFieldWidth(maxNumCores),
-                                getBitFieldWidth(maxNumCores)); 
-                }
-                break;
-
-            case K16_FAMILY:
-
-            case K15_FAMILY:
-
-            case K10_FAMILY:
-                /* AMD Bios manual Rev. 2.28 section 3.2
-                 * Extended method */
-                eax = 0x80000008;
-                CPUID;
-
-                width =  extractBitField(ecx,4,12);
-
-                if (width == 0)
-                {
-                    width =  extractBitField(ecx,8,0)+1;
-                }
-
-                eax = 0x01;
-                CPUID;
-                maxNumLogicalProcs =  extractBitField(ebx,8,16);
-                maxNumCores = extractBitField(ecx,8,0)+1;
-
-
-                for (uint32_t i=0; i<  cpuid_topology.numHWThreads; i++)
-                {
-                    CPU_ZERO(&set);
-                    CPU_SET(i,&set);
-                    sched_setaffinity(0, sizeof(cpu_set_t), &set);
-
-                    eax = 0x01;
-                    CPUID;
-                    hwThreadPool[i].apicId = extractBitField(ebx,8,24);
-                    /* AMD only knows cores */
-                    hwThreadPool[i].threadId = 0;
-
-                    hwThreadPool[i].coreId =
-                        extractBitField(hwThreadPool[i].apicId,
-                                width, 0); 
-                    hwThreadPool[i].packageId =
-                        extractBitField(hwThreadPool[i].apicId,
-                                (8-width), width); 
-                }
-
-                break;
-        }
-    }
-
-    for (uint32_t i=0; i<  cpuid_topology.numHWThreads; i++)
-    {
-        /* Add node to Topology tree */
-        if (!tree_nodeExists(cpuid_topology.topologyTree,
-                    hwThreadPool[i].packageId))
-        {
-            tree_insertNode(cpuid_topology.topologyTree,
-                    hwThreadPool[i].packageId);
-        }
-        currentNode = tree_getNode(cpuid_topology.topologyTree,
-                hwThreadPool[i].packageId);
-
-        if (!tree_nodeExists(currentNode, hwThreadPool[i].coreId))
-        {
-            tree_insertNode(currentNode, hwThreadPool[i].coreId);
-        }
-        currentNode = tree_getNode(currentNode, hwThreadPool[i].coreId);
-
-        if (!tree_nodeExists(currentNode, i))
-        {
-            /*
-               printf("WARNING: Thread already exists!\n");
-               */
-            tree_insertNode(currentNode, i);
-        }
-
-    }
-
-    cpuid_topology.threadPool = hwThreadPool;
-    cpuid_topology.numSockets = tree_countChildren(cpuid_topology.topologyTree);
-    currentNode = tree_getChildNode(cpuid_topology.topologyTree);
-    cpuid_topology.numCoresPerSocket = tree_countChildren(currentNode);
-    currentNode = tree_getChildNode(currentNode);
-    cpuid_topology.numThreadsPerCore = tree_countChildren(currentNode);
-}
-
-void cpuid_initCacheTopology()
-{
-    int maxNumLevels=0;
-    int id=0;
-    CacheLevel* cachePool = NULL;
-    CacheType type = DATACACHE;
-
-    switch ( cpuid_info.family ) 
-    {
-        case MIC_FAMILY:
-
-        case P6_FAMILY:
-
-            if (largest_function >= 4)
-            {
-                maxNumLevels = intelCpuidFunc_4(&cachePool);
-            }
-            else
-            {
-                //				intelCpuidFunc_2(&cachePool);
-            }
-
-            break;
-
-        case K8_FAMILY:
-            maxNumLevels = 2;
-            cachePool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel));
-
-            eax = 0x80000005;
-            CPUID;
-            cachePool[0].level = 1;
-            cachePool[0].type = DATACACHE;
-            cachePool[0].associativity = extractBitField(ecx,8,16);
-            cachePool[0].lineSize = extractBitField(ecx,8,0);
-            cachePool[0].size =  extractBitField(ecx,8,24) * 1024;
-            if ((cachePool[0].associativity * cachePool[0].lineSize) != 0)
-            {
-                cachePool[0].sets = cachePool[0].size/
-                    (cachePool[0].associativity * cachePool[0].lineSize);
-            }
-            cachePool[0].threads = 1;
-            cachePool[0].inclusive = 1;
-
-            eax = 0x80000006;
-            CPUID;
-            cachePool[1].level = 2;
-            cachePool[1].type = UNIFIEDCACHE;
-            cachePool[1].associativity = 
-                amdGetAssociativity(extractBitField(ecx,4,12));
-            cachePool[1].lineSize = extractBitField(ecx,8,0);
-            cachePool[1].size =  extractBitField(ecx,16,16) * 1024;
-            if ((cachePool[0].associativity * cachePool[0].lineSize) != 0)
-            {
-                cachePool[1].sets = cachePool[1].size/
-                    (cachePool[1].associativity * cachePool[1].lineSize);
-            }
-            cachePool[1].threads = 1;
-            cachePool[1].inclusive = 1;
-
-            break;
-
-
-        case K10_FAMILY:
-            /* FIXME: Adds one level for the instruction cache on Intel
-             * This fixes the level for the cores
-             */
-            maxNumLevels = 3;
-            cachePool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel));
-
-            eax = 0x80000005;
-            CPUID;
-            cachePool[0].level = 1;
-            cachePool[0].type = DATACACHE;
-            cachePool[0].associativity = extractBitField(ecx,8,16);
-            cachePool[0].lineSize = extractBitField(ecx,8,0);
-            cachePool[0].size =  extractBitField(ecx,8,24) * 1024;
-            if ((cachePool[0].associativity * cachePool[0].lineSize) != 0)
-            {
-                cachePool[0].sets = cachePool[0].size/
-                    (cachePool[0].associativity * cachePool[0].lineSize);
-            }
-            cachePool[0].threads = 1;
-            cachePool[0].inclusive = 1;
-
-            eax = 0x80000006;
-            CPUID;
-            cachePool[1].level = 2;
-            cachePool[1].type = UNIFIEDCACHE;
-            cachePool[1].associativity = 
-                amdGetAssociativity(extractBitField(ecx,4,12));
-            cachePool[1].lineSize = extractBitField(ecx,8,0);
-            cachePool[1].size =  extractBitField(ecx,16,16) * 1024;
-            if ((cachePool[0].associativity * cachePool[0].lineSize) != 0)
-            {
-                cachePool[1].sets = cachePool[1].size/
-                    (cachePool[1].associativity * cachePool[1].lineSize);
-            }
-            cachePool[1].threads = 1;
-            cachePool[1].inclusive = 1;
-
-            cachePool[2].level = 3;
-            cachePool[2].type = UNIFIEDCACHE;
-            cachePool[2].associativity =
-                amdGetAssociativity(extractBitField(edx,4,12));
-            cachePool[2].lineSize = extractBitField(edx,8,0);
-            cachePool[2].size =  (extractBitField(edx,14,18)+1) * 524288;
-            if ((cachePool[0].associativity * cachePool[0].lineSize) != 0)
-            {
-                cachePool[2].sets = cachePool[1].size/
-                    (cachePool[1].associativity * cachePool[1].lineSize);
-            }
-
-            if (cpuid_info.model != MAGNYCOURS)
-            {
-                cachePool[2].threads = cpuid_topology.numCoresPerSocket;
-            }
-            else
-            {
-                cachePool[2].threads = cpuid_topology.numCoresPerSocket/2;
-                cachePool[2].size /= 2 ;
-            }
-
-            cachePool[2].inclusive = 1;
-
-            break;
-
-        case K16_FAMILY:
-
-        case K15_FAMILY:
-
-            maxNumLevels = 0;
-            cachePool = (CacheLevel*) malloc(3 * sizeof(CacheLevel));
-
-            while (type)
-            {
-                ecx = id;
-                eax = 0x8000001D;
-                CPUID;
-                type = (CacheType) extractBitField(eax,4,0);
-
-                if ((type == DATACACHE) || (type == UNIFIEDCACHE))
-                {
-                    cachePool[maxNumLevels].level =   extractBitField(eax,3,5);
-                    cachePool[maxNumLevels].type = type;
-                    cachePool[maxNumLevels].associativity = extractBitField(ebx,10,22)+1;
-                    cachePool[maxNumLevels].lineSize = extractBitField(ebx,12,0)+1;
-                    cachePool[maxNumLevels].sets =  extractBitField(ecx,32,0)+1;
-                    cachePool[maxNumLevels].size = cachePool[maxNumLevels].associativity *
-                        cachePool[maxNumLevels].lineSize * cachePool[maxNumLevels].sets;
-                    cachePool[maxNumLevels].threads =  extractBitField(eax,12,14)+1;
-                    cachePool[maxNumLevels].inclusive =  (edx & (0x1<<1));
-                    maxNumLevels++;
-                }
-                id++;
-            }
-            break;
-
-        default:
-            ERROR_PLAIN_PRINT(Processor is not supported);
-            break;
-    }
-
-    cpuid_topology.numCacheLevels = maxNumLevels;
-    cpuid_topology.cacheLevels = cachePool;
-}
-
-
-
diff --git a/src/cpustring.c b/src/cpustring.c
new file mode 100644
index 0000000..7b57ed0
--- /dev/null
+++ b/src/cpustring.c
@@ -0,0 +1,577 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  cpustring.c
+ *
+ *      Description:  Parser for CPU selection strings
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <likwid.h>
+
+
+/* Reorder a CPU list thread-wise: the input is treated as consecutive groups
+ * of numThreadsPerCore entries, and the output holds the first entry of every
+ * group, then the second entries and so on. Entries behind the last complete
+ * group are dropped. Returns the number of entries written, -1 if length <= 0. */
+static int cpulist_sort(int* incpus, int* outcpus, int length)
+{
+    topology_init();
+    CpuTopology_t topo = get_cpuTopology();
+    int written = 0;
+    if (length < 1)
+        return -1;
+    for (int smt = 0; smt < topo->numThreadsPerCore; smt++)
+    {
+        int groups = length / topo->numThreadsPerCore;
+        for (int group = 0; group < groups; group++)
+            outcpus[written++] = incpus[group * topo->numThreadsPerCore + smt];
+    }
+    return written;
+}
+
+/* Append addlist to cpulist: entry k of addlist is stored at index
+ * startidx + k for k = 0 .. addlength-1. The caller has to provide enough
+ * room behind startidx. Returns the number of copied entries, which is 0
+ * for a non-positive addlength. */
+static int cpulist_concat(int* cpulist, int startidx, int* addlist, int addlength)
+{
+    int copied = 0;
+    while (copied < addlength)
+    {
+        cpulist[startidx + copied] = addlist[copied];
+        copied++;
+    }
+    return copied;
+}
+
+/* Check whether cpu is part of the processor list of the affinity domain
+ * with index domainidx (no range check is done on domainidx).
+ * Returns 1 if the CPU is listed and 0 if it is not. */
+static int cpu_in_domain(int domainidx, int cpu)
+{
+    affinity_init();
+    AffinityDomains_t domains = get_affinityDomains();
+    int found = 0;
+    int idx = 0;
+    while (!found && idx < domains->domains[domainidx].numberOfProcessors)
+        found = (domains->domains[domainidx].processorList[idx++] == cpu);
+    return found;
+}
+
+/* Convert a comma-separated list of domain numbers (bcpustr) into integers.
+ * Each entry is appended to 'prefix' and accepted only if an affinity domain
+ * with exactly that tag exists; other entries are reported on stderr and
+ * skipped. Stores at most 'length' values in 'list' and returns their count. */
+static int cpuexpr_to_list(bstring bcpustr, bstring prefix, int* list, int length)
+{
+    topology_init();
+    affinity_init();
+    AffinityDomains_t affinity = get_affinityDomains();
+    struct bstrList* strlist = bsplit(bcpustr, ',');
+    int insert = 0;
+    /* Stop as soon as the output is full; this also covers length <= 0. */
+    for (int i = 0; i < strlist->qty && insert < length; i++)
+    {
+        int known = 0;
+        bstring newstr = bstrcpy(prefix);
+        bconcat(newstr, strlist->entry[i]);
+        for (int j = 0; j < affinity->numberOfAffinityDomains && !known; j++)
+        {
+            known = (bstrcmp(affinity->domains[j].tag, newstr) == 0);
+        }
+        if (known)
+        {
+            list[insert] = atoi(bdata(strlist->entry[i]));
+            insert++;
+        }
+        else
+        {
+            fprintf(stderr,"Domain %s cannot be found\n", bdata(newstr));
+        }
+        /* Release the tag copy on every path, also when the list just filled up. */
+        bdestroy(newstr);
+    }
+    bstrListDestroy(strlist);
+    return insert;
+}
+
+/* Resolve an expression that the caller classified as scatter placement; it
+ * has to contain ':'. All affinity domains whose tag contains the first
+ * character of bcpustr take part: their hardware threads, ordered by
+ * cpulist_sort(), are handed out round-robin, one per domain and round.
+ * Returns the stored count (<= length), 0 if nothing matches, or -ENOMEM. */
+static int cpustr_to_cpulist_scatter(bstring bcpustr, int* cpulist, int length)
+{
+    topology_init();
+    affinity_init();
+    AffinityDomains_t affinity = get_affinityDomains();
+    int insert = 0;
+    int suitidx = 0;
+    int maxprocs = 0;
+    if (length <= 0 || bstrchrp(bcpustr, ':', 0) == BSTR_ERR)
+        return 0;
+    int* suitable = (int*)malloc(affinity->numberOfAffinityDomains*sizeof(int));
+    if (!suitable)
+        return -ENOMEM;
+    for (int i=0; i<affinity->numberOfAffinityDomains; i++)
+    {
+        if (bstrchrp(affinity->domains[i].tag, bdata(bcpustr)[0], 0) != BSTR_ERR)
+        {
+            suitable[suitidx++] = i;
+            if (affinity->domains[i].numberOfProcessors > maxprocs)
+                maxprocs = affinity->domains[i].numberOfProcessors;
+        }
+    }
+    /* The scratch list must hold the largest matching domain. Without a match
+     * (maxprocs == 0) nothing is allocated and suitable[0] is never read. */
+    int* sortedList = (maxprocs > 0 ? (int*) malloc(maxprocs * sizeof(int)) : NULL);
+    if (!sortedList)
+    {
+        free(suitable);
+        return (maxprocs > 0 ? -ENOMEM : 0);
+    }
+    for (int off=0; off<maxprocs; off++)
+    {
+        for(int i=0;i < suitidx; i++)
+        {
+            int avail = cpulist_sort(affinity->domains[suitable[i]].processorList, sortedList, affinity->domains[suitable[i]].numberOfProcessors);
+            if (off >= avail)
+                continue; /* this domain has no entry left for this round */
+            cpulist[insert++] = sortedList[off];
+            if (insert == length)
+                goto scatter_done;
+        }
+    }
+scatter_done:
+    free(sortedList);
+    free(suitable);
+    return insert;
+}
+
+/* Resolve "E:<domain>:<count>" or "E:<domain>:<count>:<chunk>:<stride>".
+ * Walks the processor list of <domain>: <chunk> consecutive entries are taken,
+ * then the start moves on by <stride> (back to 0 once it passes the end), until
+ * <count> entries are selected or cpulist holds 'length' entries. The short
+ * form uses chunk = stride = 1. Returns the number of stored entries or 0. */
+static int cpustr_to_cpulist_expression(bstring bcpustr, int* cpulist, int length)
+{
+    topology_init();
+    affinity_init();
+    AffinityDomains_t affinity = get_affinityDomains();
+    bstring bdomain = NULL;
+    int domainidx = -1;
+    int count = 0;
+    int stride = 1;
+    int chunk = 1;
+    int insert = 0;
+    if (bstrchrp(bcpustr, 'E', 0) != 0)
+    {
+        fprintf(stderr, "Not a valid CPU expression\n");
+        return 0;
+    }
+    struct bstrList* strlist = bsplit(bcpustr, ':');
+    if (strlist->qty == 3 || strlist->qty == 5)
+    {
+        bdomain = bstrcpy(strlist->entry[1]);
+        count = atoi(bdata(strlist->entry[2]));
+    }
+    if (strlist->qty == 5)
+    {
+        chunk = atoi(bdata(strlist->entry[3]));
+        stride = atoi(bdata(strlist->entry[4]));
+    }
+    /* No domain field, or a stride that would move the offset below zero. */
+    if (bdomain == NULL || stride < 0)
+    {
+        fprintf(stderr, "Not a valid CPU expression\n");
+        bdestroy(bdomain);
+        bstrListDestroy(strlist);
+        return 0;
+    }
+    for (int i=0; i<affinity->numberOfAffinityDomains; i++)
+    {
+        if (bstrcmp(bdomain, affinity->domains[i].tag) == 0)
+        {
+            domainidx = i;
+            break;
+        }
+    }
+    if (domainidx < 0)
+    {
+        fprintf(stderr, "Cannot find domain %s\n", bdata(bdomain));
+        bdestroy(bdomain);
+        bstrListDestroy(strlist);
+        return 0;
+    }
+    int numprocs = affinity->domains[domainidx].numberOfProcessors;
+    int limit = (count < length ? count : length);
+    int offset = 0;
+    /* 'limit' caps the result at <count> and at the capacity of cpulist. */
+    for (int i=0; i<count && insert < limit; i++)
+    {
+        for (int j=0; j<chunk && offset+j<numprocs && insert < limit; j++)
+        {
+            cpulist[insert] = affinity->domains[domainidx].processorList[offset + j];
+            insert++;
+        }
+        offset += stride;
+        if (offset >= numprocs)
+            offset = 0;
+    }
+    bdestroy(bdomain);
+    bstrListDestroy(strlist);
+    return insert;
+}
+
+/* Resolve a logical expression "L:<domain>:<indexlist>".
+ *
+ * <indexlist> is a comma-separated list of single indices and ranges "a-b"
+ * (ascending or descending). An index is a position in the processor list of
+ * <domain> after reordering with cpulist_sort(). Indices that do not exist in
+ * that list are reported on stderr and skipped, so nothing is read from
+ * outside of the temporary list. At most 'length' entries are stored.
+ *
+ * Returns the number of entries stored in cpulist, 0 for malformed input or
+ * an unknown domain, and -ENOMEM if the temporary list cannot be allocated. */
+static int cpustr_to_cpulist_logical(bstring bcpustr, int* cpulist, int length)
+{
+    topology_init();
+    affinity_init();
+    AffinityDomains_t affinity = get_affinityDomains();
+    int domainidx = -1;
+    int insert = 0;
+    bstring bdomain;
+    bstring blist;
+    struct bstrList* strlist;
+    if (bstrchrp(bcpustr, 'L', 0) != 0)
+    {
+        fprintf(stderr, "Not a valid CPU expression\n");
+        return 0;
+    }
+
+    strlist = bsplit(bcpustr, ':');
+    if (strlist->qty != 3)
+    {
+        fprintf(stderr, "ERROR: Invalid expression, should look like L:<domain>:<indexlist> or be in a cpuset\n");
+        bstrListDestroy(strlist);
+        return 0;
+    }
+    bdomain = bstrcpy(strlist->entry[1]);
+    blist = bstrcpy(strlist->entry[2]);
+    bstrListDestroy(strlist);
+    for (int i=0; i<affinity->numberOfAffinityDomains; i++)
+    {
+        if (bstrcmp(bdomain, affinity->domains[i].tag) == 0)
+        {
+            domainidx = i;
+            break;
+        }
+    }
+    if (domainidx < 0)
+    {
+        fprintf(stderr, "Cannot find domain %s\n", bdata(bdomain));
+        bdestroy(bdomain);
+        bdestroy(blist);
+        return 0;
+    }
+    int *inlist = malloc(affinity->domains[domainidx].numberOfProcessors * sizeof(int));
+    if (inlist == NULL)
+    {
+        bdestroy(bdomain);
+        bdestroy(blist);
+        return -ENOMEM;
+    }
+    /* Only the first 'avail' entries of inlist are valid afterwards. */
+    int avail = cpulist_sort(affinity->domains[domainidx].processorList, inlist, affinity->domains[domainidx].numberOfProcessors);
+
+    strlist = bsplit(blist, ',');
+    for (int i=0; i< strlist->qty && insert < length; i++)
+    {
+        int first = 0;
+        int last = 0;
+        if (bstrchrp(strlist->entry[i], '-', 0) != BSTR_ERR)
+        {
+            struct bstrList* indexlist = bsplit(strlist->entry[i], '-');
+            first = atoi(bdata(indexlist->entry[0]));
+            last = atoi(bdata(indexlist->entry[1]));
+            bstrListDestroy(indexlist);
+        }
+        else
+        {
+            first = atoi(bdata(strlist->entry[i]));
+            last = first;
+        }
+        /* Walk from first to last in the direction the range was written. */
+        int step = (first <= last ? 1 : -1);
+        for (int j = first; insert < length; j += step)
+        {
+            if (j >= 0 && j < avail)
+            {
+                cpulist[insert] = inlist[j];
+                insert++;
+            }
+            else
+            {
+                fprintf(stderr, "Index %d not available in domain %s\n", j, bdata(bdomain));
+            }
+            /* Stop at the end of the range; an ascending range that ran past
+             * the list cannot contain further valid indices. */
+            if (j == last || (step > 0 && j >= avail))
+                break;
+        }
+    }
+    bdestroy(bdomain);
+    bdestroy(blist);
+    bstrListDestroy(strlist);
+    free(inlist);
+    return insert;
+}
+
+
+
+/* Resolve a list of physical CPU IDs: "<idlist>" or "<domain>:<idlist>".
+ * Without a domain prefix the affinity domain with tag "N" is used.
+ *
+ * <idlist> is a comma-separated list of IDs and ID ranges "a-b" (ascending
+ * or descending, both bounds inclusive). Every ID is checked with
+ * cpu_in_domain(); IDs that are not part of the domain are reported on
+ * stderr and left out. Accepted IDs are stored in the order in which they
+ * are written in the expression, not in the order of the domain's processor
+ * list. Parsing stops as soon as 'length' entries are stored.
+ *
+ * bcpustr: expression to resolve
+ * cpulist: output array for the accepted CPU IDs
+ * length:  number of entries after which parsing stops
+ *
+ * Returns the number of entries stored in cpulist; 0 if the domain is not
+ * known. */
+static int cpustr_to_cpulist_physical(bstring bcpustr, int* cpulist, int length)
+{
+    topology_init();
+    CpuTopology_t cpuid_topology = get_cpuTopology();
+    affinity_init();
+    AffinityDomains_t affinity = get_affinityDomains();
+    struct bstrList* tokens;
+    bstring bdomain;
+    bstring blist;
+    int domainidx = -1;
+    int insert = 0;
+
+    /* Separate the optional domain prefix from the ID list. */
+    if (bstrchrp(bcpustr, ':', 0) == BSTR_ERR)
+    {
+        bdomain = bformat("N");
+        blist = bstrcpy(bcpustr);
+    }
+    else
+    {
+        struct bstrList* parts = bsplit(bcpustr, ':');
+        bdomain = bstrcpy(parts->entry[0]);
+        blist = bstrcpy(parts->entry[1]);
+        bstrListDestroy(parts);
+    }
+
+    /* Find the first domain whose tag equals the requested one. */
+    for (int i = 0; domainidx < 0 && i < affinity->numberOfAffinityDomains; i++)
+    {
+        if (bstrcmp(bdomain, affinity->domains[i].tag) == 0)
+        {
+            domainidx = i;
+        }
+    }
+    if (domainidx < 0)
+    {
+        fprintf(stderr, "Cannot find domain %s\n", bdata(bdomain));
+        bdestroy(bdomain);
+        bdestroy(blist);
+        return 0;
+    }
+
+    /* Handle the comma-separated tokens one after the other. */
+    tokens = bsplit(blist, ',');
+    for (int i = 0; i < tokens->qty; i++)
+    {
+        int first;
+        int last;
+        int step;
+        if (bstrchrp(tokens->entry[i], '-', 0) != BSTR_ERR)
+        {
+            /* Range "a-b": both bounds are inclusive. */
+            struct bstrList* bounds = bsplit(tokens->entry[i], '-');
+            first = atoi(bdata(bounds->entry[0]));
+            last = atoi(bdata(bounds->entry[1]));
+            bstrListDestroy(bounds);
+        }
+        else
+        {
+            /* Single ID, handled as the range "a-a". */
+            first = atoi(bdata(tokens->entry[i]));
+            last = first;
+        }
+        step = (first <= last) ? 1 : -1;
+
+        /* Visit first .. last in the written direction. */
+        for (int cpu = first; ; cpu += step)
+        {
+            if (!cpu_in_domain(domainidx, cpu))
+            {
+                fprintf(stderr, "CPU %d not in domain %s\n", cpu, bdata(affinity->domains[domainidx].tag));
+            }
+            else
+            {
+                cpulist[insert] = cpu;
+                insert++;
+                if (insert == length)
+                {
+                    /* requested number of entries reached */
+                    goto physical_done;
+                }
+            }
+            if (cpu == last)
+            {
+                break;
+            }
+        }
+    }
+
+    /* Common exit: release all temporaries. */
+physical_done:
+    bstrListDestroy(tokens);
+    bdestroy(bdomain);
+    bdestroy(blist);
+    return insert;
+}
+
+/* Resolve a CPU selection string into a list of CPU IDs.
+ *
+ * cpustring consists of one or more expressions separated by '@'. Each
+ * expression is dispatched by its form:
+ *   - contains "scatter":           scatter placement
+ *   - starts with 'E':              expression syntax
+ *   - starts with 'L':              logical syntax
+ *   - starts with N, S, C or M and
+ *     contains ':':                 logical syntax inside that domain ("L:" is
+ *                                   prepended)
+ *   - anything else:                physical CPU IDs
+ * If fewer hardware threads are active than exist (cpuset), the last two
+ * forms are always resolved logically, plain lists inside domain "N".
+ *
+ * The results are appended to cpulist in the order of the expressions. Every
+ * expression may only use the room that is still left in cpulist, so no more
+ * than 'length' entries are written in total.
+ *
+ * Returns the number of entries stored in cpulist or -ENOMEM if the
+ * temporary list cannot be allocated. */
+int cpustr_to_cpulist(char* cpustring, int* cpulist, int length)
+{
+    int insert = 0;
+    int ret = 0;
+    bstring bcpustr = bfromcstr(cpustring);
+    struct bstrList* strlist;
+    bstring scattercheck = bformat("scatter");
+    topology_init();
+    CpuTopology_t cpuid_topology = get_cpuTopology();
+    strlist = bsplit(bcpustr, '@');
+
+    int* tmpList = (int*)malloc(length * sizeof(int));
+    if (tmpList == NULL)
+    {
+        bstrListDestroy(strlist);
+        bdestroy(scattercheck);
+        bdestroy(bcpustr);
+        return -ENOMEM;
+    }
+    /* Remaining expressions are ignored once cpulist is full. */
+    for (int i=0; i< strlist->qty && insert < length; i++)
+    {
+        bstring entry = strlist->entry[i];
+        int room = length - insert;
+        int incpuset = (cpuid_topology->activeHWThreads < cpuid_topology->numHWThreads);
+        int indomain = ((bstrchrp(entry, 'N', 0) == 0) ||
+                        (bstrchrp(entry, 'S', 0) == 0) ||
+                        (bstrchrp(entry, 'C', 0) == 0) ||
+                        (bstrchrp(entry, 'M', 0) == 0)) &&
+                       (bstrchrp(entry, ':', 0) != BSTR_ERR);
+        if (binstr(entry, 0, scattercheck) != BSTR_ERR)
+        {
+            ret = cpustr_to_cpulist_scatter(entry, tmpList, room);
+        }
+        else if (bstrchrp(entry, 'E', 0) == 0)
+        {
+            ret = cpustr_to_cpulist_expression(entry, tmpList, room);
+        }
+        else if (bstrchrp(entry, 'L', 0) == 0)
+        {
+            ret = cpustr_to_cpulist_logical(entry, tmpList, room);
+        }
+        else if (incpuset || indomain)
+        {
+            if (incpuset)
+            {
+                fprintf(stdout, "INFO: You are running LIKWID in a cpuset with %d CPUs, only logical numbering allowed\n", cpuid_topology->activeHWThreads);
+            }
+            bstring newstr = bfromcstr(indomain ? "L:" : "L:N:");
+            bconcat(newstr, entry);
+            ret = cpustr_to_cpulist_logical(newstr, tmpList, room);
+            bdestroy(newstr);
+        }
+        else
+        {
+            ret = cpustr_to_cpulist_physical(entry, tmpList, room);
+        }
+        /* Helpers report failures as values <= 0, which append nothing. */
+        insert += cpulist_concat(cpulist, insert, tmpList, ret);
+    }
+    free(tmpList);
+    bdestroy(bcpustr);
+    bdestroy(scattercheck);
+    bstrListDestroy(strlist);
+    return insert;
+}
+
+/* Parse a comma-separated list of domain numbers checked against "M<n>" tags. */
+int nodestr_to_nodelist(char* nodestr, int* nodes, int length)
+{
+    bstring tagprefix = bformat("M");
+    bstring input = bfromcstr(nodestr);
+    const int stored = cpuexpr_to_list(input, tagprefix, nodes, length);
+    bdestroy(tagprefix);
+    bdestroy(input);
+    return stored;
+}
+
+/* Parse a comma-separated list of domain numbers checked against "S<n>" tags. */
+int sockstr_to_socklist(char* sockstr, int* sockets, int length)
+{
+    bstring tagprefix = bformat("S");
+    bstring input = bfromcstr(sockstr);
+    const int stored = cpuexpr_to_list(input, tagprefix, sockets, length);
+    bdestroy(tagprefix);
+    bdestroy(input);
+    return stored;
+}
diff --git a/src/daemon.c b/src/daemon.c
deleted file mode 100644
index de5bfa5..0000000
--- a/src/daemon.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  daemon.c
- *
- *      Description:  C Module implementing a daemon time loop
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <signal.h>
-#include <sys/time.h>
-#include <time.h>
-
-#include <timer.h>
-#include <perfmon.h>
-#include <daemon.h>
-
-static volatile int daemon_run = 0;
-static bstring eventString;
-static TimerData timeData;
-static pid_t daemonpid = 0;
-
-
-void
-daemon_start(bstring str, struct timespec interval)
-{
-    daemonpid = fork();
-    if (daemonpid == 0)
-    {
-        eventString = bstrcpy(str);
-        signal(SIGINT, daemon_interrupt);
-        signal(SIGUSR1, daemon_interrupt);
-        daemon_run = 1;
-        perfmon_setupEventSet(eventString, NULL);
-        perfmon_startCounters();
-        timer_start(&timeData);
-
-        while (1)
-        {
-            if (daemon_run)
-            {
-                timer_stop(&timeData);
-                perfmon_readCounters();
-                perfmon_logCounterResults( timer_print(&timeData) );
-                timer_start(&timeData);
-            }
-            else
-            {
-                break;
-            }
-            nanosleep( &interval, NULL);
-        }
-        signal(SIGINT, SIG_DFL);
-        signal(SIGUSR1, SIG_DFL);
-        exit(EXIT_SUCCESS);
-    }
-}
-
-void
-daemon_stop(int sig)
-{
-    if (daemonpid > 0)
-    {
-        printf("PARENT: KILL daemon with signal %d\n", sig);
-        kill(daemonpid, sig);
-        //perfmon_stopCounters();
-    }
-}
-
-void
-daemon_interrupt(int sig)
-{
-    if (sig == SIGUSR1)
-    {
-        if (daemon_run)
-        {
-            perfmon_stopCounters();
-            daemon_run = 0;
-            printf("DAEMON: STOP on %d\n",sig);
-            exit(EXIT_SUCCESS);
-        }
-        else
-        {
-            perfmon_setupEventSet(eventString, NULL);
-            perfmon_startCounters();
-            daemon_run = 1;
-            printf("DAEMON: START with events %s\n",bdata(eventString));
-        }
-    } else
-    {
-        printf("DAEMON: EXIT on %d\n", sig);
-        daemon_run = 0;
-        exit(EXIT_SUCCESS);
-    }
-}
-
-
diff --git a/src/ghash.c b/src/ghash.c
index 87e0ed0..e385a7b 100644
--- a/src/ghash.c
+++ b/src/ghash.c
@@ -1,19 +1,20 @@
-/*
- * =======================================================================================
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
+/* GLIB - Library of useful routines for C programming
+ * Copyright (C) 1995-1997  Peter Mattis, Spencer Kimball and Josh MacDonald
  *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * =======================================================================================
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
  */
 
 /*
@@ -42,30 +43,30 @@
 #define HASH_IS_TOMBSTONE(h_) ((h_) == TOMBSTONE_HASH_VALUE)
 #define HASH_IS_REAL(h_) ((h_) >= 2)
 
-#ifndef	FALSE
-#define	FALSE	(0)
+#ifndef    FALSE
+#define    FALSE    (0)
 #endif
 
-#ifndef	TRUE
-#define	TRUE	(!FALSE)
+#ifndef    TRUE
+#define    TRUE    (!FALSE)
 #endif
 
-#undef	MAX
+#undef    MAX
 #define MAX(a, b)  (((a) > (b)) ? (a) : (b))
 
-#undef	MIN
+#undef    MIN
 #define MIN(a, b)  (((a) < (b)) ? (a) : (b))
 
-#undef	ABS
-#define ABS(a)	   (((a) < 0) ? -(a) : (a))
+#undef    ABS
+#define ABS(a)       (((a) < 0) ? -(a) : (a))
 #define G_LIKELY(expr) (expr)
 #define G_UNLIKELY(expr) (expr)
 
 #define _G_NEW(struct_type, n_structs, func) \
         ((struct_type *) g_##func##_n ((n_structs), sizeof (struct_type)))
 
-#define g_new(struct_type, n_structs)			_G_NEW (struct_type, n_structs, malloc)
-#define g_new0(struct_type, n_structs)			_G_NEW (struct_type, n_structs, malloc0)
+#define g_new(struct_type, n_structs)            _G_NEW (struct_type, n_structs, malloc)
+#define g_new0(struct_type, n_structs)            _G_NEW (struct_type, n_structs, malloc0)
 
 struct _GHashTable
 {
@@ -470,7 +471,10 @@ GHashTable *
 g_hash_table_new (GHashFunc  hash_func,
                   GEqualFunc key_equal_func)
 {
-  return g_hash_table_new_full (hash_func, key_equal_func, NULL, NULL);
+  /* Thomas Roehl added g_free as destructor of hash table keys. This reduces
+   * memory leaks since we know that all key strings are duplicated.
+   */
+  return g_hash_table_new_full (hash_func, key_equal_func, g_free, NULL);
 }
 
 
diff --git a/src/hashTable.c b/src/hashTable.c
index bf6c3d8..46c0c66 100644
--- a/src/hashTable.c
+++ b/src/hashTable.c
@@ -6,13 +6,13 @@
  *      Description: Hashtable implementation based on SGLIB.
  *                   Used for Marker API result handling.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -60,6 +60,20 @@ void hashTable_init()
     }
 }
 
+void hashTable_initThread(int coreID)
+{
+    ThreadList* resPtr = NULL;
+    if (threadList[coreID] == NULL) /* only the first call for a core allocates */
+        resPtr = (ThreadList*) malloc(sizeof(ThreadList));
+    /* NULL here: already initialized or out of memory; the slot stays as is */
+    if (resPtr != NULL)
+    {
+        resPtr->tid =  pthread_self();
+        resPtr->coreId  = coreID;
+        resPtr->hashTable = g_hash_table_new(g_str_hash, g_str_equal);
+        threadList[coreID] = resPtr;
+    }
+}
 
 int hashTable_get(bstring label, LikwidThreadResults** resEntry)
 {
@@ -86,7 +100,7 @@ int hashTable_get(bstring label, LikwidThreadResults** resEntry)
         (*resEntry)->label = bstrcpy (label);
         (*resEntry)->time = 0.0;
         (*resEntry)->count = 0;
-        for (int i=0; i< NUM_PMC; i++) 
+        for (int i=0; i< NUM_PMC; i++)
         {
             (*resEntry)->PMcounters[i] = 0.0;
             (*resEntry)->StartPMcounters[i] = 0.0;
@@ -109,7 +123,6 @@ void hashTable_finalize(int* numThreads, int* numRegions, LikwidResults** result
     GHashTable* regionLookup;
 
     regionLookup = g_hash_table_new(g_str_hash, g_str_equal);
-
     /* determine number of active threads */
     for (int i=0; i<MAX_NUM_THREADS; i++)
     {
@@ -128,22 +141,57 @@ void hashTable_finalize(int* numThreads, int* numRegions, LikwidResults** result
 
     /* allocate data structures */
     (*results) = (LikwidResults*) malloc(numberOfRegions * sizeof(LikwidResults));
-
-    for ( uint32_t i=0; i < numberOfRegions; i++ )
+    if (!(*results))
     {
-        (*results)[i].time = (double*) malloc(numberOfThreads * sizeof(double));
-        (*results)[i].count = (uint32_t*) malloc(numberOfThreads * sizeof(uint32_t));
-        (*results)[i].counters = (double**) malloc(numberOfThreads * sizeof(double*));
-
-        for ( uint32_t j=0; j < numberOfThreads; j++ )
+        fprintf(stderr, "Failed to allocate %lu bytes for the results\n", numberOfRegions * sizeof(LikwidResults));
+    }
+    else
+    {
+        for ( uint32_t i=0; i < numberOfRegions; i++ )
         {
-            (*results)[i].time[j] = 0.0;
-            (*results)[i].count[j] = 0;
-            (*results)[i].counters[j] = (double*) malloc(NUM_PMC * sizeof(double));
+            (*results)[i].time = (double*) malloc(numberOfThreads * sizeof(double));
+            if (!(*results)[i].time)
+            {
+                fprintf(stderr, "Failed to allocate %lu bytes for the time storage\n", numberOfThreads * sizeof(double));
+                break;
+            }
+            (*results)[i].count = (uint32_t*) malloc(numberOfThreads * sizeof(uint32_t));
+            if (!(*results)[i].count)
+            {
+                fprintf(stderr, "Failed to allocate %lu bytes for the count storage\n", numberOfThreads * sizeof(uint32_t));
+                break;
+            }
+            (*results)[i].cpulist = (int*) malloc(numberOfThreads * sizeof(int));
+            if (!(*results)[i].count)
+            {
+                fprintf(stderr, "Failed to allocate %lu bytes for the cpulist storage\n", numberOfThreads * sizeof(int));
+                break;
+            }
+            (*results)[i].counters = (double**) malloc(numberOfThreads * sizeof(double*));
+            if (!(*results)[i].counters)
+            {
+                fprintf(stderr, "Failed to allocate %lu bytes for the counter result storage\n", numberOfThreads * sizeof(double*));
+                break;
+            }
 
-            for ( uint32_t k=0; k < NUM_PMC; k++ )
+            for ( uint32_t j=0; j < numberOfThreads; j++ )
             {
-                (*results)[i].counters[j][k] = 0.0;
+                (*results)[i].time[j] = 0.0;
+                (*results)[i].count[j] = 0;
+                (*results)[i].cpulist[j] = -1;
+                (*results)[i].counters[j] = (double*) malloc(NUM_PMC * sizeof(double));
+                if (!(*results)[i].counters)
+                {
+                    fprintf(stderr, "Failed to allocate %lu bytes for the counter result storage for thread %d\n", NUM_PMC * sizeof(double), j);
+                    break;
+                }
+                else
+                {
+                    for ( uint32_t k=0; k < NUM_PMC; k++ )
+                    {
+                        (*results)[i].counters[j][k] = 0.0;
+                    }
+                }
             }
         }
     }
@@ -174,6 +222,7 @@ void hashTable_finalize(int* numThreads, int* numRegions, LikwidResults** result
                 if ( regionId == NULL )
                 {
                     (*results)[currentRegion].tag = bstrcpy (threadResult->label);
+                    (*results)[currentRegion].groupID = threadResult->groupID;
                     regionIds[currentRegion] = currentRegion;
                     regionId = regionIds + currentRegion;
                     g_hash_table_insert(regionLookup, g_strdup(key), (regionIds+currentRegion));
@@ -182,17 +231,24 @@ void hashTable_finalize(int* numThreads, int* numRegions, LikwidResults** result
 
                 (*results)[*regionId].count[threadId] = threadResult->count;
                 (*results)[*regionId].time[threadId] = threadResult->time;
+                (*results)[*regionId].cpulist[threadId] = threadResult->cpuID;
 
                 for ( int j=0; j < NUM_PMC; j++ )
                 {
                     (*results)[*regionId].counters[threadId][j] = threadResult->PMcounters[j];
                 }
+                bdestroy(threadResult->label);
+                free(threadResult);
             }
 
             threadId++;
+            g_hash_table_destroy(resPtr->hashTable);
+            free(resPtr);
+            threadList[core] = NULL;
         }
     }
-
+    g_hash_table_destroy(regionLookup);
+    regionLookup = NULL;
     (*numThreads) = numberOfThreads;
     (*numRegions) = numberOfRegions;
 }
diff --git a/src/includes/access.h b/src/includes/access.h
new file mode 100644
index 0000000..b81beb8
--- /dev/null
+++ b/src/includes/access.h
@@ -0,0 +1,44 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  access.h
+ *
+ *      Description:  Header File HPM access Module
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef ACCESS_H
+#define ACCESS_H
+
+#include <types.h> /* PciDeviceIndex and fixed-width ints, as access_x86.h does */
+
+void HPMmode(int mode);
+int HPMinit(void);
+int HPMinitialized(void);
+int HPMaddThread(int cpu_id);
+void HPMfinalize(void); /* (void) makes this a real prototype, like HPMinit/HPMinitialized */
+int HPMread(int cpu_id, PciDeviceIndex dev, uint32_t reg, uint64_t* data);
+int HPMwrite(int cpu_id, PciDeviceIndex dev, uint32_t reg, uint64_t data);
+int HPMcheck(PciDeviceIndex dev, int cpu_id);
+#endif /* ACCESS_H */
diff --git a/src/includes/accessClient.h b/src/includes/accessClient.h
deleted file mode 100644
index 0058182..0000000
--- a/src/includes/accessClient.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  accessClient.h
- *
- *      Description:  Header File accessClient Module. 
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef ACCESSCLIENT_H
-#define ACCESSCLIENT_H
-
-#include <types.h>
-
-extern int accessClient_mode;
-
-/* This needs to be called BEFORE msr_init and
- * sets how the module tries to access the MSR registers. */
-extern void accessClient_setaccessmode(int mode);
-
-/* This needs to be called BEFORE msr_init and
- * sets the priority the module reports to the daemon.
- * This is a noop in any msr access mode except sysmsrd. */
-extern void accessClient_setlowaccesspriority(void);
-
-/* Initializes the MSR module, trying to open either the MSR files or
- * the connection to the msr daemon. */
-extern void accessClient_init(int* socket_fd);
-extern void accessClient_initThread(int* socket_fd);
-extern void accessClient_finalize(int socket_fd);
-extern uint64_t accessClient_read(int socket_fd, int cpu, int device, uint32_t reg);
-extern void accessClient_write(int socket_fd, int cpu, int device, uint32_t reg, uint64_t data);
-
-#endif /* ACCESSCLIENT_H */
diff --git a/src/includes/accessClient_types.h b/src/includes/accessClient_types.h
deleted file mode 100644
index a0c7a84..0000000
--- a/src/includes/accessClient_types.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  accessClient_types.h
- *
- *      Description:  Types file for accessClient module.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef ACCESSCLIENT_TYPES_H
-#define ACCESSCLIENT_TYPES_H
-
-#include <stdint.h>
-
-/* This naming with AccessType and AccessMode is admittedly a bit confusing */
-typedef enum {
-    DAEMON_AM_DIRECT = 0,
-    DAEMON_AM_ACCESS_D
-} AccessMode;
-
-typedef enum {
-    DAEMON_READ = 0,
-    DAEMON_WRITE,
-    DAEMON_EXIT
-} AccessType;
-
-typedef enum {
-    DAEMON_AD_PCI_R3QPI_LINK_0 = 0,
-    DAEMON_AD_PCI_R3QPI_LINK_1,
-    DAEMON_AD_PCI_R2PCIE,
-    DAEMON_AD_PCI_IMC_CH_0,
-    DAEMON_AD_PCI_IMC_CH_1,
-    DAEMON_AD_PCI_IMC_CH_2,
-    DAEMON_AD_PCI_IMC_CH_3,
-    DAEMON_AD_PCI_HA,
-    DAEMON_AD_PCI_QPI_PORT_0,
-    DAEMON_AD_PCI_QPI_PORT_1,
-    DAEMON_AD_PCI_QPI_MASK_PORT_0,
-    DAEMON_AD_PCI_QPI_MASK_PORT_1,
-    DAEMON_AD_PCI_QPI_MISC_PORT_0,
-    DAEMON_AD_PCI_QPI_MISC_PORT_1,
-    DAEMON_AD_MSR
-} AccessDevice;
-
-typedef enum {
-    ERR_NOERROR = 0,  /* no error */
-    ERR_UNKNOWN,      /* unknown command */
-    ERR_RESTREG,      /* attempt to access restricted MSR */
-    ERR_OPENFAIL,     /* failure to open msr files */
-    ERR_RWFAIL,       /* failure to read/write msr */
-    ERR_DAEMONBUSY,   /* daemon already has another client */
-    ERR_LOCKED,       /* access to HPM is locked */
-    ERR_UNSUPPORTED,   /* unsupported processor */
-    ERR_NODEV	/* No such device */
-} AccessErrorType;
-
-typedef struct {
-    uint32_t cpu;
-    uint32_t reg;
-    uint64_t data;
-    AccessDevice device;
-    AccessType type;
-    AccessErrorType errorcode; /* Only in replies - 0 if no error. */
-} AccessDataRecord;
-
-#endif /*ACCESSCLIENT_TYPES_H*/
diff --git a/src/includes/access_client.h b/src/includes/access_client.h
new file mode 100644
index 0000000..46f1dbb
--- /dev/null
+++ b/src/includes/access_client.h
@@ -0,0 +1,11 @@
+#ifndef LIKWID_ACCESS_CLIENT_H
+#define LIKWID_ACCESS_CLIENT_H
+
+#include <types.h> /* PciDeviceIndex and fixed-width ints, as access_x86.h does */
+
+int access_client_init(int cpu_id);
+int access_client_read(PciDeviceIndex dev, const int cpu_id, uint32_t reg, uint64_t *data);
+int access_client_write(PciDeviceIndex dev, const int cpu_id, uint32_t reg, uint64_t data);
+void access_client_finalize(int cpu_id);
+int access_client_check(PciDeviceIndex dev, int cpu_id);
+#endif /* LIKWID_ACCESS_CLIENT_H */
diff --git a/src/includes/access_client_types.h b/src/includes/access_client_types.h
new file mode 100644
index 0000000..214aae8
--- /dev/null
+++ b/src/includes/access_client_types.h
@@ -0,0 +1,65 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  access_client_types.h
+ *
+ *      Description:  Types file for access_client access module.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef ACCESSCLIENT_TYPES_H
+#define ACCESSCLIENT_TYPES_H
+
+#include <stdint.h>
+#include <pci_types.h>
+
+typedef enum {
+    DAEMON_READ = 0,
+    DAEMON_WRITE,
+    DAEMON_CHECK,
+    DAEMON_EXIT
+} AccessType;
+
+typedef enum {
+    ERR_NOERROR = 0,  /* no error */
+    ERR_UNKNOWN,      /* unknown command */
+    ERR_RESTREG,      /* attempt to access restricted MSR */
+    ERR_OPENFAIL,     /* failure to open msr files */
+    ERR_RWFAIL,       /* failure to read/write msr */
+    ERR_DAEMONBUSY,   /* daemon already has another client */
+    ERR_NODEV         /* No such device */
+} AccessErrorType;
+
+typedef struct {
+    uint32_t cpu;
+    uint32_t reg;
+    uint64_t data;
+    PciDeviceIndex device;
+    AccessType type;
+    AccessErrorType errorcode; /* Only in replies - 0 if no error. */
+} AccessDataRecord;
+
+extern int accessClient_mode;
+
+#endif /*ACCESSCLIENT_TYPES_H*/
diff --git a/src/includes/access_x86.h b/src/includes/access_x86.h
new file mode 100644
index 0000000..1628bee
--- /dev/null
+++ b/src/includes/access_x86.h
@@ -0,0 +1,13 @@
+#ifndef LIKWID_ACCESS_X86_H
+#define LIKWID_ACCESS_X86_H
+
+#include <types.h>
+
+int access_x86_init(int cpu_id);
+int access_x86_read(PciDeviceIndex dev, const int cpu_id, uint32_t reg, uint64_t *data);
+int access_x86_write(PciDeviceIndex dev, const int cpu_id, uint32_t reg, uint64_t data);
+void access_x86_finalize(int cpu_id);
+int access_x86_check(PciDeviceIndex dev, int cpu_id);
+
+
+#endif
diff --git a/src/includes/access_x86_msr.h b/src/includes/access_x86_msr.h
new file mode 100644
index 0000000..a00c45b
--- /dev/null
+++ b/src/includes/access_x86_msr.h
@@ -0,0 +1,12 @@
+#ifndef LIKWID_ACCESS_X86_MSR_H
+#define LIKWID_ACCESS_X86_MSR_H
+
+#include <types.h>
+
+int access_x86_msr_init(const int cpu_id);
+void access_x86_msr_finalize(const int cpu_id);
+int access_x86_msr_read(const int cpu, uint32_t reg, uint64_t *data);
+int access_x86_msr_write(const int cpu, uint32_t reg, uint64_t data);
+int access_x86_msr_check(PciDeviceIndex dev, int cpu_id);
+
+#endif
diff --git a/src/includes/access_x86_pci.h b/src/includes/access_x86_pci.h
new file mode 100644
index 0000000..e932e57
--- /dev/null
+++ b/src/includes/access_x86_pci.h
@@ -0,0 +1,12 @@
+#ifndef LIKWID_ACCESS_X86_PCI_H
+#define LIKWID_ACCESS_X86_PCI_H
+
+#include <types.h>
+
+int access_x86_pci_init(const int socket);
+void access_x86_pci_finalize(const int socket);
+int access_x86_pci_read(PciDeviceIndex dev, const int socket, uint32_t reg, uint64_t *data);
+int access_x86_pci_write(PciDeviceIndex dev, const int socket, uint32_t reg, uint64_t data);
+int access_x86_pci_check(PciDeviceIndex dev, int socket);
+
+#endif
diff --git a/src/includes/affinity.h b/src/includes/affinity.h
index f347e64..6f2215c 100644
--- a/src/includes/affinity.h
+++ b/src/includes/affinity.h
@@ -5,13 +5,14 @@
  *
  *      Description:  Header File affinity Module
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -32,17 +33,18 @@
 #define AFFINITY_H
 
 #include <types.h>
+#include <likwid.h>
+
+int socket_lock[MAX_NUM_NODES]; /* NOTE(review): no `extern`, so this is a tentative definition in every .c file that includes this header; if more than one does, linking fails under -fno-common (GCC >= 10 default) - confirm and consider `extern` here plus one definition in a .c file */
+int tile_lock[MAX_NUM_THREADS]; /* NOTE(review): also declared without `extern`; same tentative-definition concern as socket_lock */
+extern AffinityDomains affinityDomains;
 
 extern int affinity_core2node_lookup[MAX_NUM_THREADS];
 
-extern void affinity_init();
-extern void affinity_finalize();
-extern int  affinity_processGetProcessorId();
-extern int  affinity_threadGetProcessorId();
-extern void  affinity_pinProcess(int processorId);
-extern void  affinity_pinThread(int processorId);
+extern int affinity_processGetProcessorId();
+extern int affinity_threadGetProcessorId();
 extern const AffinityDomain* affinity_getDomain(bstring domain);
-extern void affinity_printDomains(FILE* OUTSTREAM);
+
 
 #endif /*AFFINITY_H*/
 
diff --git a/src/includes/affinity_types.h b/src/includes/affinity_types.h
deleted file mode 100644
index 2b08bfe..0000000
--- a/src/includes/affinity_types.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  affinity_types.h
- *
- *      Description:  Type Definitions for affinity Module
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef AFFINITY_TYPES_H
-#define AFFINITY_TYPES_H
-
-typedef struct {
-    bstring tag;
-    uint32_t numberOfProcessors;
-    uint32_t numberOfCores;
-    int* processorList;
-} AffinityDomain;
-
-
-#endif /*AFFINITY_TYPES_H*/
diff --git a/src/includes/allocator.h b/src/includes/allocator.h
deleted file mode 100644
index a21555c..0000000
--- a/src/includes/allocator.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  allocator.h
- *
- *      Description:  Header File allocator Module. 
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  none
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef ALLOCATOR_H
-#define ALLOCATOR_H
-
-#include <types.h>
-#include <bstrlib.h>
-
-extern void allocator_init(int numVectors);
-extern void allocator_finalize();
-extern void allocator_allocateVector(FILE* OUTSTREAM,
-                                     void** ptr,
-                                     int alignment,
-                                     uint64_t size,
-                                     int offset,
-                                     DataType type,
-                                     bstring domain);
-
-#endif /*ALLOCATOR_H*/
-
diff --git a/src/includes/asciiBoxes.h b/src/includes/asciiBoxes.h
deleted file mode 100644
index dd37a05..0000000
--- a/src/includes/asciiBoxes.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  asciiBoxes.h
- *
- *      Description:  Module to draw nested ascii art boxes.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef ASCIIBOXES_H
-#define ASCIIBOXES_H
-
-#include <types.h>
-#include <bstrlib.h>
-
-extern BoxContainer* asciiBoxes_allocateContainer(int numLines,int numColumns);
-extern void asciiBoxes_addBox(BoxContainer* container, int line, int column, bstring label);
-extern void asciiBoxes_addJoinedBox(BoxContainer* container, int line, int startColumn, int endColumn, bstring label);
-extern void asciiBoxes_print(FILE* OUTSTREAM, BoxContainer* container);
-
-#endif /*ASCIIBOXES_H*/
diff --git a/src/includes/asciiBoxes_types.h b/src/includes/asciiBoxes_types.h
deleted file mode 100644
index f09c4b3..0000000
--- a/src/includes/asciiBoxes_types.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  asciiBoxes_types.h
- *
- *      Description:  Types file for asciiBoxes module.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef ASCIIBOXES_TYPES_H
-#define ASCIIBOXES_TYPES_H
-
-#include  <bstrlib.h>
-
-typedef struct box {
-    int width;
-    bstring label;
-} Box;
-
-typedef struct boxContainer {
-    int numLines;
-    int numColumns;
-    Box** boxes;
-} BoxContainer;
-
-#endif /*ASCIIBOXES_TYPES_H*/
diff --git a/src/includes/asciiTable.h b/src/includes/asciiTable.h
deleted file mode 100644
index 6096c4a..0000000
--- a/src/includes/asciiTable.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  asciiTable.h
- *
- *      Description:  Module to create and print a ascii table
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef ASCIITABLE_H
-#define ASCIITABLE_H
-
-#include <types.h>
-#include <bstrlib.h>
-
-extern TableContainer* asciiTable_allocate(int numRows,int numColumns, bstrList* headerLabels);
-extern void asciiTable_free(TableContainer* container);
-extern void asciiTable_insertRow(TableContainer* container, int row,  bstrList* fields);
-extern void asciiTable_appendRow(TableContainer* container, bstrList* fields);
-extern void asciiTable_setCurrentRow(TableContainer* container, int row);
-extern void asciiTable_print(TableContainer* container);
-extern void asciiTable_setOutput(FILE* stream);
-
-#endif /*ASCIITABLE_H*/
diff --git a/src/includes/asciiTable_types.h b/src/includes/asciiTable_types.h
deleted file mode 100644
index 986a8a2..0000000
--- a/src/includes/asciiTable_types.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  asciiTable_types.h
- *
- *      Description:  Types file for asciiTable module.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef ASCIITABLE_TYPES_H
-#define ASCIITABLE_TYPES_H
-
-#include  <bstrlib.h>
-
-typedef struct bstrList bstrList; 
-
-typedef struct {
-    int numRows;
-    int numColumns;
-    int currentRow;
-    int printed;
-    bstrList*  header;
-    bstrList** rows;
-} TableContainer;
-
-
-#endif /*ASCIITABLE_TYPES_H*/
diff --git a/src/includes/barrier.h b/src/includes/barrier.h
deleted file mode 100644
index 5f4142d..0000000
--- a/src/includes/barrier.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  barrier.h
- *
- *      Description:  Header File barrier Module
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef BARRIER_H
-#define BARRIER_H
-
-#include <types.h>
-
-/**
- * @brief  Initialize the barrier module
- * @param  numberOfThreads The total number of threads in the barrier
- */
-extern void barrier_init(int numberOfGroups);
-
-/**
- * @brief  Destroy data structures of the barrier module
- */
-extern void barrier_destroy(void);
-
-/**
- * @brief  Register a thread for a barrier
- * @param  threadId The id of the thread to register
- */
-extern int barrier_registerGroup(int numThreads);
-extern void barrier_registerThread(BarrierData* barr, int groupsId, int threadId);
-
-/**
- * @brief  Synchronize threads
- * @param  threadId The id of the calling thread
- * @param  numberOfThreads Total number of threads in the barrier
- */
-extern void  barrier_synchronize(BarrierData* barr);
-
-
-#endif /*BARRIER_H*/
diff --git a/src/includes/barrier_types.h b/src/includes/barrier_types.h
deleted file mode 100644
index d0abb55..0000000
--- a/src/includes/barrier_types.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  barrier_types.h
- *
- *      Description:  Type Definitions for barrier Module
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef BARRIER_TYPES_H
-#define BARRIER_TYPES_H
-
-#include <stdint.h>
-
-typedef struct {
-    int        numberOfThreads;
-    int        offset;
-    int        val;
-    int*       index;
-    volatile int*  bval;
-} BarrierData;
-
-typedef struct {
-    int*       groupBval;
-    int        numberOfThreads;
-} BarrierGroup;
-
-#endif /*BARRIER_TYPES_H*/
diff --git a/src/includes/bitUtil.h b/src/includes/bitUtil.h
index c876eea..e10ad65 100644
--- a/src/includes/bitUtil.h
+++ b/src/includes/bitUtil.h
@@ -6,13 +6,13 @@
  *      Description:  Header File bitUtil Module. 
  *                    Helper routines for dealing with bit manipulations
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/bstrlib.h b/src/includes/bstrlib.h
index abdbef3..a1160b6 100644
--- a/src/includes/bstrlib.h
+++ b/src/includes/bstrlib.h
@@ -113,11 +113,11 @@ extern struct bstrList * bsplits (const_bstring str, const_bstring splitStr);
 extern struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr);
 extern bstring bjoin (const struct bstrList * bl, const_bstring sep);
 extern int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
-	int (* cb) (void * parm, int ofs, int len), void * parm);
+    int (* cb) (void * parm, int ofs, int len), void * parm);
 extern int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
-	int (* cb) (void * parm, int ofs, int len), void * parm);
+    int (* cb) (void * parm, int ofs, int len), void * parm);
 extern int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
-	int (* cb) (void * parm, int ofs, int len), void * parm);
+    int (* cb) (void * parm, int ofs, int len), void * parm);
 
 /* Miscellaneous functions */
 extern int bpattern (bstring b, int len);
@@ -137,21 +137,21 @@ extern int bvcformata (bstring b, int count, const char * fmt, va_list arglist);
 bstring bstrtmp_b = (b); \
 const char * bstrtmp_fmt = (fmt); \
 int bstrtmp_r = BSTR_ERR, bstrtmp_sz = 16; \
-	for (;;) { \
-		va_list bstrtmp_arglist; \
-		va_start (bstrtmp_arglist, lastarg); \
-		bstrtmp_r = bvcformata (bstrtmp_b, bstrtmp_sz, bstrtmp_fmt, bstrtmp_arglist); \
-		va_end (bstrtmp_arglist); \
-		if (bstrtmp_r >= 0) { /* Everything went ok */ \
-			bstrtmp_r = BSTR_OK; \
-			break; \
-		} else if (-bstrtmp_r <= bstrtmp_sz) { /* A real error? */ \
-			bstrtmp_r = BSTR_ERR; \
-			break; \
-		} \
-		bstrtmp_sz = -bstrtmp_r; /* Doubled or target size */ \
-	} \
-	ret = bstrtmp_r; \
+    for (;;) { \
+        va_list bstrtmp_arglist; \
+        va_start (bstrtmp_arglist, lastarg); \
+        bstrtmp_r = bvcformata (bstrtmp_b, bstrtmp_sz, bstrtmp_fmt, bstrtmp_arglist); \
+        va_end (bstrtmp_arglist); \
+        if (bstrtmp_r >= 0) { /* Everything went ok */ \
+            bstrtmp_r = BSTR_OK; \
+            break; \
+        } else if (-bstrtmp_r <= bstrtmp_sz) { /* A real error? */ \
+            bstrtmp_r = BSTR_ERR; \
+            break; \
+        } \
+        bstrtmp_sz = -bstrtmp_r; /* Doubled or target size */ \
+    } \
+    ret = bstrtmp_r; \
 }
 
 #endif
@@ -179,15 +179,15 @@ extern int bsreada (bstring b, struct bStream * s, int n);
 extern int bsunread (struct bStream * s, const_bstring b);
 extern int bspeek (bstring r, const struct bStream * s);
 extern int bssplitscb (struct bStream * s, const_bstring splitStr, 
-	int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
+    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
 extern int bssplitstrcb (struct bStream * s, const_bstring splitStr, 
-	int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
+    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
 extern int bseof (const struct bStream * s);
 
 struct tagbstring {
-	int mlen;
-	int slen;
-	unsigned char * data;
+    int mlen;
+    int slen;
+    unsigned char * data;
 };
 
 /* Accessor macros */
diff --git a/src/includes/calculator.h b/src/includes/calculator.h
new file mode 100644
index 0000000..67ca564
--- /dev/null
+++ b/src/includes/calculator.h
@@ -0,0 +1,38 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  calculator.h
+ *
+ *      Description:  Header file for infix calculator
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef CALCULATOR_H
+#define CALCULATOR_H
+
+
+int calculate_infix(char* finfix, double *result);
+
+#endif
diff --git a/src/includes/calculator_stack.h b/src/includes/calculator_stack.h
new file mode 100644
index 0000000..670f317
--- /dev/null
+++ b/src/includes/calculator_stack.h
@@ -0,0 +1,48 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  calculator_stack.h
+ *
+ *      Description:  Stack implementation for infix calculator
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Brandon Mills (bm), mills.brandont at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) Brandon Mills
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef CALCULATOR_STACK_H
+#define CALCULATOR_STACK_H
+
+typedef struct
+{
+    void **content;
+    int size;
+    int top;
+} Stack;
+
+void stackInit(Stack *s, int size);
+void stackPush(Stack *s, void* val);
+void* stackTop(Stack *s);
+void* stackPop(Stack *s);
+int stackSize(Stack *s);
+void stackFree(Stack *s);
+
+#endif /* CALCULATOR_STACK_H */
diff --git a/src/includes/configuration.h b/src/includes/configuration.h
new file mode 100644
index 0000000..a6a3334
--- /dev/null
+++ b/src/includes/configuration.h
@@ -0,0 +1,46 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  configuration.h
+ *
+ *      Description:  Header File of Module configuration.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef CONFIGURATION_H
+#define CONFIGURATION_H
+
+#include <types.h>
+#include <likwid.h>
+#include <error.h>
+
+
+extern Configuration config;
+extern int init_config;
+
+
+
+
+
+#endif
diff --git a/src/includes/cpuFeatures.h b/src/includes/cpuFeatures.h
index 9274e40..af4d7c2 100644
--- a/src/includes/cpuFeatures.h
+++ b/src/includes/cpuFeatures.h
@@ -5,13 +5,13 @@
  *
  *      Description:  Header File of Module cpuFeatures.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/cpuFeatures_types.h b/src/includes/cpuFeatures_types.h
index 3e7ec5d..87ed2a2 100644
--- a/src/includes/cpuFeatures_types.h
+++ b/src/includes/cpuFeatures_types.h
@@ -5,13 +5,13 @@
  *
  *      Description:  Types file for CpuFeature module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -38,23 +38,23 @@ typedef enum {
     IP_PREFETCHER} CpuFeature;
 
 typedef struct {
-    unsigned int fastStrings:1;
-    unsigned int thermalControl:1;
-    unsigned int perfMonitoring:1;
-    unsigned int hardwarePrefetcher:1;
-    unsigned int ferrMultiplex:1;
-    unsigned int branchTraceStorage:1;
-    unsigned int pebs:1;
-    unsigned int speedstep:1;
-    unsigned int monitor:1;
-    unsigned int clPrefetcher:1;
-    unsigned int speedstepLock:1;
-    unsigned int cpuidMaxVal:1;
-    unsigned int xdBit:1;
-    unsigned int dcuPrefetcher:1;
-    unsigned int dynamicAcceleration:1;
-    unsigned int turboMode:1;
-    unsigned int ipPrefetcher:1;
+	unsigned int fastStrings:1;
+	unsigned int thermalControl:1;
+	unsigned int perfMonitoring:1;
+	unsigned int hardwarePrefetcher:1;
+	unsigned int ferrMultiplex:1;
+	unsigned int branchTraceStorage:1;
+	unsigned int pebs:1;
+	unsigned int speedstep:1;
+	unsigned int monitor:1;
+	unsigned int clPrefetcher:1;
+	unsigned int speedstepLock:1;
+	unsigned int cpuidMaxVal:1;
+	unsigned int xdBit:1;
+	unsigned int dcuPrefetcher:1;
+	unsigned int dynamicAcceleration:1;
+	unsigned int turboMode:1;
+	unsigned int ipPrefetcher:1;
     } CpuFeatureFlags;
 
 
diff --git a/src/includes/cpuid.h b/src/includes/cpuid.h
index 80c426a..7970ced 100644
--- a/src/includes/cpuid.h
+++ b/src/includes/cpuid.h
@@ -1,19 +1,17 @@
 /*
  * =======================================================================================
  *
- *      Filename:  cpuid.h
+ *      Filename:  cpuid.h
  *
- *      Description:  Header File cpuid Module. 
- *                    Reads out cpuid information and initilaizes a global 
- *                    data structure cpuid_info.
+ *      Description:  Common macro definition for CPUID instruction
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -30,101 +28,32 @@
  * =======================================================================================
  */
 
-#ifndef CPUID_H
-#define CPUID_H
-
-#include <types.h>
-
-/* Intel P6 */
-#define PENTIUM_M_BANIAS     0x09U
-#define PENTIUM_M_DOTHAN     0x0DU
-#define CORE_DUO             0x0EU
-#define CORE2_65             0x0FU
-#define CORE2_45             0x17U
-#define ATOM                 0x1CU
-#define ATOM_45              0x26U
-#define ATOM_32              0x36U
-#define ATOM_22              0x27U
-#define ATOM_SILVERMONT_E    0x37U
-#define ATOM_SILVERMONT_C    0x4DU
-#define ATOM_SILVERMONT_F1   0x4AU
-#define ATOM_SILVERMONT_F2   0x5AU
-#define ATOM_SILVERMONT_F3   0x5DU
-#define NEHALEM              0x1AU
-#define NEHALEM_BLOOMFIELD   0x1AU
-#define NEHALEM_LYNNFIELD    0x1EU
-#define NEHALEM_LYNNFIELD_M  0x1FU
-#define NEHALEM_WESTMERE     0x2CU
-#define NEHALEM_WESTMERE_M   0x25U
-#define SANDYBRIDGE          0x2AU
-#define SANDYBRIDGE_EP       0x2DU
-#define HASWELL              0x3CU
-#define HASWELL_EX           0x3FU
-#define HASWELL_M1           0x45U
-#define HASWELL_M2           0x46U
-#define IVYBRIDGE            0x3AU
-#define IVYBRIDGE_EP         0x3EU
-#define NEHALEM_EX           0x2EU
-#define WESTMERE_EX          0x2FU
-#define XEON_MP              0x1DU
-
-/* Intel MIC */
-#define XEON_PHI           0x01U
-
-/* AMD K10 */
-#define BARCELONA      0x02U
-#define SHANGHAI       0x04U
-#define ISTANBUL       0x08U
-#define MAGNYCOURS     0x09U
-
-/* AMD K8 */
-#define OPTERON_SC_1MB  0x05U
-#define OPTERON_DC_E    0x21U
-#define OPTERON_DC_F    0x41U
-#define ATHLON64_X2     0x43U
-#define ATHLON64_X2_F   0x4BU
-#define ATHLON64_F1     0x4FU
-#define ATHLON64_F2     0x5FU
-#define ATHLON64_X2_G   0x6BU
-#define ATHLON64_G1     0x6FU
-#define ATHLON64_G2     0x7FU
-
-
-#define  P6_FAMILY        0x6U
-#define  MIC_FAMILY       0xBU
-#define  NETBURST_FAMILY  0xFFU
-#define  K15_FAMILY       0x15U
-#define  K16_FAMILY       0x16U
-#define  K10_FAMILY       0x10U
-#define  K8_FAMILY        0xFU
-
-/** Structure holding cpuid information
- *
- */
-extern CpuInfo cpuid_info;
-extern CpuTopology cpuid_topology;
-
-/** Init routine to intialize global structure.
- *
- *  Determines: 
- *  - cpu family
- *  - cpu model
- *  - cpu stepping
- *  - cpu clock
- *  - Instruction Set Extension Flags
- *  - Performance counter features (Intel P6 only)
- *
- */
-extern int cpuid_init (void);
-extern void cpuid_print (void);
-extern void cpuid_initTopology (void);
-extern void cpuid_initCacheTopology (void);
-extern int  cpuid_isInCpuset(void);
-
-static inline int cpuid_hasFeature(FeatureBit bit)
-{
-    return (cpuid_info.featureFlags & (1<<bit));
-}
-
-
-#endif /*CPUID_H*/
+#ifndef LIKWID_CPUID_H
+#define LIKWID_CPUID_H
+
+/* This was taken from the linux kernel
+ * Kernel version 3.19
+ * File: arch/x86/boot/cpuflags.c
+*/
+
+
+#if defined(__i386__) && defined(__PIC__)
+# define EBX_REG "=r"
+#else
+# define EBX_REG "=b"
+#endif
+
+#ifndef __clang__
+#define CPUID(eax,ebx,ecx,edx)                            \
+    __asm__ volatile(".ifnc %%ebx,%3 ; movl  %%ebx,%3 ; .endif  \n\t" \
+                     "cpuid                                     \n\t" \
+                     ".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif  \n\t" \
+                     : "=a" (eax), "=c" (ecx), "=d" (edx), EBX_REG (ebx) \
+                     : "a" (eax), "c" (ecx) \
+                     )
+#else
+#define CPUID(eax,ebx,ecx,edx)  /* clang variant; no trailing ';' (caller supplies it, as with the non-clang variant) */ \
+    __asm__ volatile("cpuid" : "=a" (eax), "=c" (ecx), "=d" (edx), EBX_REG (ebx) : "a" (eax), "c" (ecx) )
+#endif
+
+#endif
diff --git a/src/includes/cpuid_types.h b/src/includes/cpuid_types.h
deleted file mode 100644
index cccc22d..0000000
--- a/src/includes/cpuid_types.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  cpuid_types.h
- *
- *      Description:  Types file for cpuid module.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef CPUID_TYPES_H
-#define CPUID_TYPES_H
-
-typedef enum {
-    NOCACHE=0,
-    DATACACHE,
-    INSTRUCTIONCACHE,
-    UNIFIEDCACHE,
-    ITLB,
-    DTLB} CacheType;
-
-typedef enum {
-    NODE=0,
-    SOCKET,
-    CORE,
-    THREAD} NodeLevel;
-
-typedef enum {
-    SSE3=0,
-    VSX,
-    MMX,
-    SSE,
-    SSE2,
-    MONITOR,
-    ACPI,
-    RDTSCP,
-    VMX,
-    EIST,
-    TM,
-    TM2,
-    AES,
-    RDRAND,
-    SSSE3,
-    SSE41,
-    SSE42,
-    AVX,
-    FMA} FeatureBit;
-
-typedef struct {
-    uint32_t family;
-    uint32_t model;
-    uint32_t stepping;
-    uint64_t clock;
-    int      turbo;
-    char*  name;
-    char*  features;
-    uint32_t featureFlags;
-    uint32_t perf_version;
-    uint32_t perf_num_ctr;
-    uint32_t perf_width_ctr;
-    uint32_t perf_num_fixed_ctr;
-    int supportUncore;
-} CpuInfo;
-
-typedef struct {
-    uint32_t threadId;
-    uint32_t coreId;
-    uint32_t packageId;
-    uint32_t apicId;
-} HWThread;
-
-typedef struct {
-    int level;
-    CacheType type;
-    int associativity;
-    int sets;
-    int lineSize;
-    int size;
-    int threads;
-    int inclusive;
-} CacheLevel;
-
-typedef struct {
-    uint32_t numHWThreads;
-    uint32_t numSockets;
-    uint32_t numCoresPerSocket;
-    uint32_t numThreadsPerCore;
-    uint32_t numCacheLevels;
-    HWThread* threadPool;
-    CacheLevel*  cacheLevels;
-    TreeNode* topologyTree;
-} CpuTopology;
-
-
-#endif /*CPUID_TYPES_H*/
diff --git a/src/includes/daemon.h b/src/includes/daemon.h
deleted file mode 100644
index 3272636..0000000
--- a/src/includes/daemon.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  daemon.h
- *
- *      Description:  Header File daemon Module. 
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef DAEMON_H
-#define DAEMON_H
-
-#include <types.h>
-#include <time.h>
-
-extern void daemon_init();
-extern void daemon_start(bstring str, struct timespec interval);
-extern void daemon_stop(int sig);
-extern void daemon_interrupt(int sig);
-
-#endif /* DAEMON_H */
diff --git a/src/includes/error.h b/src/includes/error.h
index 3c1526f..faabb2e 100644
--- a/src/includes/error.h
+++ b/src/includes/error.h
@@ -5,13 +5,14 @@
  *
  *      Description:  Central error handling macros
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -31,60 +32,81 @@
 #ifndef ERROR_H
 #define ERROR_H
 
-#include <errno.h>
-#include <string.h>
-#include <msr.h>
+
+#include <likwid.h>
+
+
 
 #define str(x) #x
 
-#define FINALIZE  msr_finalize()
 
 #define ERRNO_PRINT fprintf(stderr, "ERROR - [%s:%d] %s\n", __FILE__, __LINE__, strerror(errno))
 
 #define ERROR  \
     ERRNO_PRINT; \
-    FINALIZE; \
     exit(EXIT_FAILURE)
 
 #define ERROR_PLAIN_PRINT(msg) \
-   fprintf(stderr,  "ERROR - [%s:%d] " str(msg) "\n", __FILE__, __LINE__);  \
-   FINALIZE; \
-   exit(EXIT_FAILURE)
+   fprintf(stderr,  "ERROR - [%s:%s:%d] " str(msg) "\n", __FILE__, __func__,__LINE__);
 
 
 #define ERROR_PRINT(fmt, ...) \
-   fprintf(stderr,  "ERROR - [%s:%d] " str(fmt) "\n", __FILE__, __LINE__, __VA_ARGS__);  \
-   FINALIZE; \
-   exit(EXIT_FAILURE)
+   fprintf(stderr,  "ERROR - [%s:%s:%d] %s.\n" str(fmt) "\n", __FILE__,  __func__,__LINE__, strerror(errno), __VA_ARGS__);
 
 #define CHECK_ERROR(func, msg)  \
     if ((func) < 0) { \
         fprintf(stderr, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno));  \
     }
 
+#define CHECK_AND_RETURN_ERROR(func, msg)  /* if (func) < 0: report msg on stderr and return the errno value from the calling function */ \
+    if ((func) < 0) { \
+        const int check_errno_ = errno; /* snapshot first: fprintf() is allowed to modify errno */ \
+        fprintf(stderr, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(check_errno_));  \
+        return check_errno_; }
+
 #define EXIT_IF_ERROR(func, msg)  \
     if ((func) < 0) {  \
         fprintf(stderr,"ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); \
-        FINALIZE; \
         exit(EXIT_FAILURE); \
     }
 
-#ifndef DEBUGLEV
-#define DEBUGLEV 0
-#endif
+
 
 #define VERBOSEPRINTREG(cpuid,reg,flags,msg) \
-    if (perfmon_verbose) {  \
+    if (perfmon_verbosity >= DEBUGLEV_DETAIL) \
+    { \
         printf("DEBUG - [%s:%d] "  str(msg) " [%d] Register 0x%llX , Flags: 0x%llX \n",  \
-                __FILE__, __LINE__,  (cpuid), LLU_CAST (reg), LLU_CAST (flags)); \
+                __func__, __LINE__,  (cpuid), LLU_CAST (reg), LLU_CAST (flags)); \
         fflush(stdout);  \
-    } 
+    }
+    
+#define VERBOSEPRINTPCIREG(cpuid,dev,reg,flags,msg) \
+    if (perfmon_verbosity >= DEBUGLEV_DETAIL) \
+    { \
+        printf("DEBUG - [%s:%d] "  str(msg) " [%d] Device %d Register 0x%llX , Flags: 0x%llX \n",  \
+                __func__, __LINE__,  (cpuid), dev, LLU_CAST (reg), LLU_CAST (flags)); \
+        fflush(stdout);  \
+    }
 
 
 #define DEBUG_PRINT(lev, fmt, ...) \
-    if (DEBUGLEV > lev) { \
-        printf(fmt, __VA_ARGS__); \
+    if ((lev >= 0) && (lev <= perfmon_verbosity)) { \
+        fprintf(stdout, "DEBUG - [%s:%d] " str(fmt) "\n", __func__, __LINE__,__VA_ARGS__); \
+        fflush(stdout); \
+    }
+
+#define DEBUG_PLAIN_PRINT(lev, msg) \
+    if ((lev >= 0) && (lev <= perfmon_verbosity)) { \
+        fprintf(stdout, "DEBUG - [%s:%d] " str(msg) "\n",__func__, __LINE__);  \
         fflush(stdout); \
     }
 
+
+#define CHECK_MSR_WRITE_ERROR(func) CHECK_AND_RETURN_ERROR(func, MSR write operation failed);
+#define CHECK_MSR_READ_ERROR(func) CHECK_AND_RETURN_ERROR(func, MSR read operation failed);
+#define CHECK_PCI_WRITE_ERROR(func) CHECK_AND_RETURN_ERROR(func, PCI write operation failed);
+#define CHECK_PCI_READ_ERROR(func) CHECK_AND_RETURN_ERROR(func, PCI read operation failed);
+#define CHECK_POWER_READ_ERROR(func) CHECK_AND_RETURN_ERROR(func, Power register read operation failed);
+#define CHECK_TEMP_READ_ERROR(func) CHECK_AND_RETURN_ERROR(func, Temperature register read operation failed);
+
 #endif /*ERROR_H*/
diff --git a/src/includes/ghash.h b/src/includes/ghash.h
index f33e9fb..75a17fd 100644
--- a/src/includes/ghash.h
+++ b/src/includes/ghash.h
@@ -1,20 +1,20 @@
-/*
- * =======================================================================================
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
+/* GLIB - Library of useful routines for C programming
+ * Copyright (C) 1995-1997  Peter Mattis, Spencer Kimball and Josh MacDonald
  *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * =======================================================================================
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
  */
 
 /*
@@ -59,13 +59,13 @@ typedef struct _GHashTableIter GHashTableIter;
 
 struct _GHashTableIter
 {
-    /*< private >*/
-    gpointer      dummy1;
-    gpointer      dummy2;
-    gpointer      dummy3;
-    int           dummy4;
-    gboolean      dummy5;
-    gpointer      dummy6;
+  /*< private >*/
+  gpointer      dummy1;
+  gpointer      dummy2;
+  gpointer      dummy3;
+  int           dummy4;
+  gboolean      dummy5;
+  gpointer      dummy6;
 };
 
 char* g_strdup (const char *str);
diff --git a/src/includes/hashTable.h b/src/includes/hashTable.h
index 078fff9..4da4cbf 100644
--- a/src/includes/hashTable.h
+++ b/src/includes/hashTable.h
@@ -3,17 +3,17 @@
  *
  *      Filename:  hashTable.h
  *
- *      Description:  Header File hashtable Module. 
- *                    Wrapper for HAshTable data structure holding thread
+ *      Description:  Header File hashtable Module.
+ *                    Wrapper for HashTable data structure holding thread
  *                    specific region information.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -37,6 +37,7 @@
 #include <types.h>
 
 extern void hashTable_init();
+void hashTable_initThread(int coreID);
 extern int hashTable_get(bstring regionTag, LikwidThreadResults** result);
 extern void hashTable_finalize(int* numberOfThreads, int* numberOfRegions, LikwidResults** results);
 
diff --git a/src/includes/libperfctr_types.h b/src/includes/libperfctr_types.h
index 99a38dc..6e375b6 100644
--- a/src/includes/libperfctr_types.h
+++ b/src/includes/libperfctr_types.h
@@ -5,13 +5,13 @@
  *
  *      Description:  Types file for libperfctr module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -37,15 +37,22 @@ typedef struct LikwidThreadResults{
     bstring  label;
     double time;
     TimerData startTime;
+    int groupID;
+    int cpuID;
     uint32_t count;
     double StartPMcounters[NUM_PMC];
+    int StartOverflows[NUM_PMC];
     double PMcounters[NUM_PMC];
 } LikwidThreadResults;
 
 typedef struct {
     bstring  tag;
+    int groupID;
+    int threadCount;
+    int eventCount;
     double*  time;
     uint32_t*  count;
+    int* cpulist;
     double** counters;
 } LikwidResults;
 
diff --git a/src/includes/likwid.h b/src/includes/likwid.h
index dd4cdfd..d900a0d 100644
--- a/src/includes/likwid.h
+++ b/src/includes/likwid.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  likwid.h
  *
- *      Description:  Header File of likwid marker API
+ *      Description:  Header File of likwid API
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Authors:  Thomas Roehl (tr), thomas.roehl at googlemail.com
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -31,35 +32,1387 @@
 #ifndef LIKWID_H
 #define LIKWID_H
 
+#include <stdint.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bstrlib.h>
+
+#define DEBUGLEV_ONLY_ERROR 0
+#define DEBUGLEV_INFO 1
+#define DEBUGLEV_DETAIL 2
+#define DEBUGLEV_DEVELOP 3
+
+extern int perfmon_verbosity;
+
+/** \addtogroup MarkerAPI Marker API module
+*  @{
+*/
+/*!
+\def LIKWID_MARKER_INIT
+Shortcut for likwid_markerInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_THREADINIT
+Shortcut for likwid_markerThreadInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_REGISTER(regionTag)
+Shortcut for likwid_markerRegisterRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_START(regionTag)
+Shortcut for likwid_markerStartRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_STOP(regionTag)
+Shortcut for likwid_markerStopRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
+Shortcut for likwid_markerGetRegion() for \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_SWITCH
+Shortcut for likwid_markerNextGroup() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_CLOSE
+Shortcut for likwid_markerClose() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/** @}*/
+
 #ifdef LIKWID_PERFMON
 #define LIKWID_MARKER_INIT likwid_markerInit()
 #define LIKWID_MARKER_THREADINIT likwid_markerThreadInit()
-#define LIKWID_MARKER_START(reg) likwid_markerStartRegion(reg)
-#define LIKWID_MARKER_STOP(reg) likwid_markerStopRegion(reg)
+#define LIKWID_MARKER_SWITCH likwid_markerNextGroup()
+#define LIKWID_MARKER_REGISTER(regionTag) likwid_markerRegisterRegion(regionTag)
+#define LIKWID_MARKER_START(regionTag) likwid_markerStartRegion(regionTag)
+#define LIKWID_MARKER_STOP(regionTag) likwid_markerStopRegion(regionTag)
 #define LIKWID_MARKER_CLOSE likwid_markerClose()
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count) likwid_markerGetRegion(regionTag, nevents, events, time, count)
 #else
 #define LIKWID_MARKER_INIT
 #define LIKWID_MARKER_THREADINIT
-#define LIKWID_MARKER_START(reg)
-#define LIKWID_MARKER_STOP(reg)
+#define LIKWID_MARKER_SWITCH
+#define LIKWID_MARKER_REGISTER(regionTag)
+#define LIKWID_MARKER_START(regionTag)
+#define LIKWID_MARKER_STOP(regionTag)
 #define LIKWID_MARKER_CLOSE
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
 #endif
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-/* marker API routines */
-extern void likwid_markerInit(void);
-extern void likwid_markerThreadInit(void);
-extern void likwid_markerClose(void);
-extern void likwid_markerStartRegion(const char* regionTag);
-extern void likwid_markerStopRegion(const char* regionTag);
 
+
+/*
+################################################################################
+# Marker API related functions
+################################################################################
+*/
+/** \addtogroup MarkerAPI Marker API module
+*  @{
+*/
+/*! \brief Initialize LIKWID's marker API
+
+Must be called in serial region of the application to set up basic data structures
+of LIKWID.
+Reads environment variables:
+- LIKWID_MODE (access mode)
+- LIKWID_MASK (event bitmask)
+- LIKWID_EVENTS (event string)
+- LIKWID_THREADS (cpu list separated by ,)
+- LIKWID_GROUPS (amount of groups)
+*/
+extern void likwid_markerInit(void) __attribute__ ((visibility ("default") ));
+/*! \brief Initialize LIKWID's marker API for the current thread
+
+Must be called in parallel region of the application to set up basic data structures
+of LIKWID. Before you can call likwid_markerThreadInit() you have to call likwid_markerInit().
+
+*/
+extern void likwid_markerThreadInit(void) __attribute__ ((visibility ("default") ));
+/*! \brief Select next group to measure
+
+Must be called in parallel region of the application to switch group on every CPU.
+*/
+extern void likwid_markerNextGroup(void) __attribute__ ((visibility ("default") ));
+/*! \brief Close LIKWID's marker API
+
+Must be called in serial region of the application. It gathers all data of regions and
+writes them out to a file (filepath in env variable LIKWID_FILEPATH).
+*/
+extern void likwid_markerClose(void) __attribute__ ((visibility ("default") ));
+/*! \brief Register a measurement region
+
+Initializes the hashTable entry in order to reduce execution time of likwid_markerStartRegion()
+ at param regionTag [in] Initialize data using this string
+ at return Error code
+*/
+extern int likwid_markerRegisterRegion(const char* regionTag) __attribute__ ((visibility ("default") ));
+/*! \brief Start a measurement region
+
+Reads the values of all configured counters and saves the results under the name given
+in regionTag.
+ at param regionTag [in] Store data using this string
+ at return Error code of start operation
+*/
+extern int likwid_markerStartRegion(const char* regionTag) __attribute__ ((visibility ("default") ));
+/*! \brief Stop a measurement region
+
+Reads the values of all configured counters and saves the results under the name given
+in regionTag. The measurement data of the stopped region gets summed up in global region counters.
+ at param regionTag [in] Store data using this string
+ at return Error code of stop operation
+*/
+extern int likwid_markerStopRegion(const char* regionTag) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get accumulated data of a code region
+
+Get the accumulated data of the current thread for the given regionTag.
+ at param regionTag [in] Print data using this string
+ at param nr_events [in,out] Length of events array
+ at param events [out] Events array for the intermediate results
+ at param time [out] Accumulated measurement time
+ at param count [out] Call count of the code region
+*/
+extern void likwid_markerGetRegion(const char* regionTag, int* nr_events, double* events, double *time, int *count) __attribute__ ((visibility ("default") ));
 /* utility routines */
-extern int  likwid_getProcessorId();
-extern int  likwid_pinProcess(int processorId);
-extern int  likwid_pinThread(int processorId);
+/*! \brief Get CPU ID of the current process/thread
+
+Returns the ID of the CPU the current process or thread is running on.
+ at return current CPU ID
+*/
+extern int  likwid_getProcessorId() __attribute__ ((visibility ("default") ));
+/*! \brief Pin the current process to given CPU
+
+Pin the current process to the given CPU ID. The process cannot be scheduled to
+another CPU after pinning but the pinning can be changed anytime with this function.
+ at param [in] processorId CPU ID to pin the current process to
+ at return error code (1 for success, 0 for error)
+*/
+extern int  likwid_pinProcess(int processorId) __attribute__ ((visibility ("default") ));
+/*! \brief Pin the current thread to given CPU
+
+Pin the current thread to the given CPU ID. The thread cannot be scheduled to
+another CPU after pinning but the pinning can be changed anytime with this function
+ at param [in] processorId CPU ID to pin the current thread to
+ at return error code (1 for success, 0 for error)
+*/
+extern int  likwid_pinThread(int processorId) __attribute__ ((visibility ("default") ));
+/** @}*/
+
+/*
+################################################################################
+# Access client related functions
+################################################################################
+*/
+/** \addtogroup Access Access module
+ *  @{
+ */
+
+/*! \brief Enum for the access modes
+
+LIKWID supports multiple access modes to the MSR and PCI performance monitoring
+registers. For direct access the user must have enough privileges to access the
+MSR and PCI devices. The daemon mode forwards the operations to a daemon with
+higher privileges.
+*/
+typedef enum {
+    ACCESSMODE_DIRECT = 0, /*!< \brief Access performance monitoring registers directly */
+    ACCESSMODE_DAEMON = 1 /*!< \brief Use the access daemon to access the registers */
+} AccessMode;
+
+/*! \brief Set access mode
+
+Sets the mode in which the MSR and PCI registers are accessed: 0 for direct access (probably requires root privileges) and 1 for access through the access daemon. It must be called before HPMinit()
+@param [in] mode (0=direct, 1=daemon)
+*/
+extern void HPMmode(int mode) __attribute__ ((visibility ("default") ));
+/*! \brief Initialize access module
+
+Initialize the module internals to either the MSR/PCI files or the access daemon
+@return error code (0 for success)
+*/
+extern int HPMinit() __attribute__ ((visibility ("default") ));
+/*! \brief Add CPU to access module
+
+Add the given CPU to the access module. This opens the communication to either the MSR/PCI files or the access daemon.
+@param [in] cpu_id CPU that should be enabled for measurements
+@return error code (0 for success, -ENODEV if access cannot be initialized)
+*/
+extern int HPMaddThread(int cpu_id) __attribute__ ((visibility ("default") ));
+/*! \brief Close connections
+
+Close the connections to the MSR/PCI files or the access daemon
+*/
+extern void HPMfinalize() __attribute__ ((visibility ("default") ));
+/** @}*/
+
+/*
+################################################################################
+# Config file related functions
+################################################################################
+*/
+/** \addtogroup Config Config file module
+*  @{
+*/
+/*! \brief Structure holding values of the configuration file
+
+LIKWID supports the definition of runtime values in a configuration file. The
+most important configurations in most cases are the path to the access daemon and
+the corresponding access mode. In order to avoid reading in the system topology
+at each start, a path to a topology file can be set. The other values are mostly
+used internally.
+*/
+typedef struct {
+    char* configFileName; /*!< \brief Path to the configuration file */
+    char* topologyCfgFileName; /*!< \brief Path to the topology file */
+    char* daemonPath; /*!< \brief Path of the access daemon */
+    char* groupPath; /*!< \brief Path of default performance group directory */
+    AccessMode daemonMode; /*!< \brief Access mode to the MSR and PCI registers */
+    int maxNumThreads; /*!< \brief Maximum number of HW threads */
+    int maxNumNodes; /*!< \brief Maximum number of NUMA nodes */
+} Configuration;
+
+/** \brief Pointer for exporting the Configuration data structure */
+typedef Configuration* Configuration_t;
+/*! \brief Read the config file of LIKWID, if it exists
+
+Search for LIKWID config file and read the values in
+Currently the paths /usr/local/etc/likwid.cfg, /etc/likwid.cfg and the path
+defined in config.mk are checked.
+ at return error code (0 for success, -EFAULT if no file can be found)
+*/
+extern int init_configuration(void) __attribute__ ((visibility ("default") ));
+/*! \brief Destroy the config structure
+
+Destroys the current config structure and frees all allocated memory for path names
+ at return error code (0 for success, -EFAULT if config structure not initialized)
+*/
+extern int destroy_configuration(void) __attribute__ ((visibility ("default") ));
+
+
+/*! \brief Retrieve the config structure
+
+Get the initialized configuration
+\sa Configuration_t
+ at return Configuration_t (pointer to internal Configuration structure)
+*/
+extern Configuration_t get_configuration(void) __attribute__ ((visibility ("default") ));
+
+/*! \brief Set group path in the config structure
+
+Set group path in the config structure. The path must be a directory.
+@param [in] path
+@return error code (0 for success, -ENOMEM if reallocation failed, -ENOTDIR if no directory)
+*/
+extern int config_setGroupPath(char* path) __attribute__ ((visibility ("default") ));
+
+/** @}*/
+/*
+################################################################################
+# CPU topology related functions
+################################################################################
+*/
+/** \addtogroup CPUTopology CPU information module
+*  @{
+*/
+/*! \brief Structure with general CPU information
+
+General information covers CPU family, model, name and current clock and vendor
+specific information like the version of Intel's performance monitoring facility.
+*/
+typedef struct {
+    uint32_t family; /*!< \brief CPU family ID*/
+    uint32_t model; /*!< \brief CPU model ID */
+    uint32_t stepping; /*!< \brief Stepping (version) of the CPU */
+    uint64_t clock; /*!< \brief Current clock frequency of the executing CPU*/
+    int      turbo; /*!< \brief Flag if CPU has a turbo mode */
+    char*  osname; /*!< \brief Name of the CPU reported by OS */
+    char*  name; /*!< \brief Name of the CPU as identified by LIKWID */
+    char*  short_name; /*!< \brief Short name of the CPU*/
+    char*  features; /*!< \brief String with all features supported by the CPU*/
+    int         isIntel; /*!< \brief Flag if it is an Intel CPU*/
+    int     supportUncore; /*!< \brief Flag if system has Uncore performance monitors */
+    uint32_t featureFlags; /*!< \brief Mask of all features supported by the CPU*/
+    uint32_t perf_version; /*!< \brief Version of Intel's performance monitoring facility */
+    uint32_t perf_num_ctr; /*!< \brief Number of general purpose core-local performance monitoring counters */
+    uint32_t perf_width_ctr; /*!< \brief Bit width of fixed and general purpose counters */
+    uint32_t perf_num_fixed_ctr; /*!< \brief Number of fixed purpose core-local performance monitoring counters */
+} CpuInfo;
+
+/*! \brief Structure with IDs of a HW thread
+
+For each HW thread this structure stores the ID of the thread inside a CPU, the
+CPU core ID of the HW thread and the CPU socket ID.
+\extends CpuTopology
+*/
+typedef struct {
+    uint32_t threadId; /*!< \brief ID of HW thread inside the CPU core */
+    uint32_t coreId; /*!< \brief ID of CPU core that executes the HW thread */
+    uint32_t packageId; /*!< \brief ID of CPU socket containing the HW thread */
+    uint32_t apicId; /*!< \brief ID of HW thread retrieved through the Advanced Programmable Interrupt Controller */
+    uint32_t inCpuSet; /*!< \brief Presumably a flag telling whether the HW thread is inside the CPU set of the process (TODO confirm against the topology code) */
+} HWThread;
+
+/*! \brief Enum of possible caches
+
+CPU caches can have different tasks and hold different kinds of data. This enum lists all cache types used in the supported CPUs
+\extends CacheLevel
+*/
+typedef enum {
+    NOCACHE=0, /*!< \brief No cache used as undef value */
+    DATACACHE, /*!< \brief Cache holding data cache lines */
+    INSTRUCTIONCACHE, /*!< \brief Cache holding instruction cache lines */
+    UNIFIEDCACHE, /*!< \brief Cache holding both instruction and data cache lines */
+    ITLB, /*!< \brief Translation Lookaside Buffer cache for instruction pages */
+    DTLB /*!< \brief Translation Lookaside Buffer cache for data pages */
+} CacheType;
+
+/*! \brief Structure describing a cache level
+
+CPUs are connected to a cache hierarchy with different amount of caches at each level. The CacheLevel structure holds general information about the cache.
+\extends CpuTopology
+*/
+typedef struct {
+    uint32_t level; /*!< \brief Level of the cache in the hierarchy */
+    CacheType type; /*!< \brief Type of the cache */
+    uint32_t associativity; /*!< \brief Amount of cache lines held by each set */
+    uint32_t sets; /*!< \brief Amount of sets */
+    uint32_t lineSize; /*!< \brief Size in bytes of one cache line */
+    uint32_t size; /*!< \brief Size in bytes of the cache */
+    uint32_t threads; /*!< \brief Number of HW threads connected to the cache */
+    uint32_t inclusive; /*!< \brief Flag if cache is inclusive (also holds cache lines available in caches nearer to the CPU) or exclusive */
+} CacheLevel;
+
+/*! \brief Structure describing the topology of the HW threads in the system
+
+This structure describes the topology at HW thread level like the amount of HW threads, how they are distributed over the CPU sockets/packages and how the caching hierarchy is assembled.
+*/
+typedef struct {
+    uint32_t numHWThreads; /*!< \brief Amount of HW threads in the system and length of \a threadPool */
+    uint32_t activeHWThreads; /*!< \brief Amount of active HW threads in the system (presumably those usable by the current process; TODO confirm) */
+    uint32_t numSockets; /*!< \brief Amount of CPU sockets/packages in the system */
+    uint32_t numCoresPerSocket; /*!< \brief Amount of physical cores in one CPU socket/package */
+    uint32_t numThreadsPerCore; /*!< \brief Amount of HW threads in one physical CPU core */
+    uint32_t numCacheLevels; /*!< \brief Amount of caches for each HW thread and length of \a cacheLevels */
+    HWThread* threadPool; /*!< \brief List of all HW thread descriptions */
+    CacheLevel*  cacheLevels; /*!< \brief List of all caches in the hierarchy */
+    struct treeNode* topologyTree; /*!< \brief Anchor for a tree structure describing the system topology */
+} CpuTopology;
+
+/*! \brief Variable holding the global cpu information structure */
+extern CpuInfo cpuid_info;
+/*! \brief Variable holding the global cpu topology structure */
+extern CpuTopology cpuid_topology;
+
+/** \brief Pointer for exporting the CpuInfo data structure */
+typedef CpuInfo* CpuInfo_t;
+/** \brief Pointer for exporting the CpuTopology data structure */
+typedef CpuTopology* CpuTopology_t;
+/*! \brief Initialize topology information
+
+CpuInfo_t and CpuTopology_t are initialized by either HWLOC, CPUID/ProcFS or topology file if present. The topology file name can be configured in the configuration file. Furthermore, the paths /etc/likwid_topo.cfg and <PREFIX>/etc/likwid_topo.cfg are checked.
+\sa CpuInfo_t and CpuTopology_t
+ at return always 0
+*/
+extern int topology_init(void) __attribute__ ((visibility ("default") ));
+/*! \brief Retrieve CPU topology of the current machine
+
+\sa CpuTopology_t
+ at return CpuTopology_t (pointer to internal cpuid_topology structure)
+*/
+extern CpuTopology_t get_cpuTopology(void) __attribute__ ((visibility ("default") ));
+/*! \brief Retrieve CPU information of the current machine
+
+Get the previously initialized CPU info structure containing number of CPUs/Threads
+\sa CpuInfo_t
+ at return CpuInfo_t (pointer to internal cpuid_info structure)
+*/
+extern CpuInfo_t get_cpuInfo(void) __attribute__ ((visibility ("default") ));
+/*! \brief Destroy topology structures CpuInfo_t and CpuTopology_t.
+
+Retrieved pointers to the structures are not valid anymore after this function call
+\sa CpuInfo_t and CpuTopology_t
+*/
+extern void topology_finalize(void) __attribute__ ((visibility ("default") ));
+/*! \brief Print all supported architectures
+*/
+extern void print_supportedCPUs(void) __attribute__ ((visibility ("default") ));
+/** @}*/
+/*
+################################################################################
+# NUMA related functions
+################################################################################
+*/
+/** \addtogroup NumaTopology NUMA memory topology module
+ *  @{
+ */
+/*! \brief CPUs in NUMA node and general information about a NUMA domain
+
+The NumaNode structure describes the topology and holds general information of a
+NUMA node. The structure is filled by calling numa_init() by either the HWLOC
+library or by evaluating the /proc filesystem.
+\extends NumaTopology
+*/
+typedef struct {
+    uint32_t id; /*!< \brief ID of the NUMA node */
+    uint64_t totalMemory; /*!< \brief Amount of memory in the NUMA node */
+    uint64_t freeMemory; /*!< \brief Amount of free memory in the NUMA node */
+    uint32_t numberOfProcessors; /*!< \brief Number of processors covered by the NUMA node and length of \a processors */
+    uint32_t*  processors; /*!< \brief List of HW threads in the NUMA node */
+    uint32_t numberOfDistances; /*!< \brief Amount of distances to the other NUMA nodes in the system and self  */
+    uint32_t*  distances; /*!< \brief List of distances to the other NUMA nodes and self */
+} NumaNode;
+
+
+/*! \brief  The NumaTopology structure describes all NUMA nodes in the current system.
+*/
+typedef struct {
+    uint32_t numberOfNodes; /*!< \brief Number of NUMA nodes in the system and length of \a nodes  */
+    NumaNode* nodes; /*!< \brief List of NUMA nodes */
+} NumaTopology;
+
+/*! \brief Variable holding the global NUMA information structure */
+extern NumaTopology numa_info;
+
+/** \brief Pointer for exporting the NumaTopology data structure */
+typedef NumaTopology* NumaTopology_t;
+
+/*! \brief Initialize NUMA information
+
+Initialize NUMA information NumaTopology_t using either HWLOC or CPUID/ProcFS. If
+a topology config file is present it is read at topology_init() and fills \a NumaTopology_t
+\sa NumaTopology_t
+ at return error code (0 for success, -1 if initialization failed)
+*/
+extern int numa_init(void) __attribute__ ((visibility ("default") ));
+/*! \brief Retrieve NUMA information of the current machine
+
+Get the previously initialized NUMA info structure
+\sa NumaTopology_t
+ at return NumaTopology_t (pointer to internal numa_info structure)
+*/
+extern NumaTopology_t get_numaTopology(void) __attribute__ ((visibility ("default") ));
+/*! \brief Set memory allocation policy to interleaved
+
+Set the memory allocation policy to interleaved for given list of CPUs
+ at param [in] processorList List of processors
+ at param [in] numberOfProcessors Length of processor list
+*/
+extern void numa_setInterleaved(int* processorList, int numberOfProcessors) __attribute__ ((visibility ("default") ));
+/*! \brief Allocate memory from a specific NUMA node
+ at param [in,out] ptr Start pointer of memory
+ at param [in] size Size for the allocation
+ at param [in] domainId ID of NUMA node for the allocation
+*/
+extern void numa_membind(void* ptr, size_t size, int domainId) __attribute__ ((visibility ("default") ));
+/*! \brief Destroy NUMA information structure
+
+Destroys the NUMA information structure NumaTopology_t. Retrieved pointers
+to the structures are not valid anymore after this function call
+\sa NumaTopology_t
+*/
+extern void numa_finalize(void) __attribute__ ((visibility ("default") ));
+/*! \brief Retrieve the number of NUMA nodes
+
+Returns the number of NUMA nodes of the current machine. Can also be read out of
+NumaTopology_t
+\sa NumaTopology_t
+ at return Number of NUMA nodes
+*/
+extern int likwid_getNumberOfNodes(void) __attribute__ ((visibility ("default") ));
+/** @}*/
+/*
+################################################################################
+# Affinity domains related functions
+################################################################################
+*/
+/** \addtogroup AffinityDomains Thread affinity module
+ *  @{
+ */
+
+/*! \brief The AffinityDomain data structure describes a single domain in the current system
+
+The AffinityDomain data structure describes a single domain in the current system. Example domains are NUMA nodes, CPU sockets/packages or LLC (Last Level Cache) cache domains.
+\extends AffinityDomains
+*/
+typedef struct {
+    bstring tag; /*!< \brief Bstring with the ID for the affinity domain. Currently possible values: N (node), SX (socket/package X), CX (LLC cache domain X) and MX (memory domain X) */
+    uint32_t numberOfProcessors; /*!< \brief Number of HW threads in the domain and length of \a processorList */
+    uint32_t numberOfCores; /*!< \brief Number of CPU cores in the domain */
+    int*  processorList; /*!< \brief List of HW thread IDs in the domain */
+} AffinityDomain;
+
+/*! \brief The AffinityDomains data structure holds different count variables describing the
+various system layers
+
+Affinity domains are for example the amount of NUMA domains, CPU sockets/packages or LLC
+(Last Level Cache) cache domains of the current machine. Moreover a list of
+\a domains holds the processor lists for each domain that are used for
+scheduling processes to domain specific HW threads. Some amounts are duplicates
+or derivation of values in \a CpuInfo, \a CpuTopology and \a NumaTopology.
+*/
+typedef struct {
+    uint32_t numberOfSocketDomains; /*!< \brief Number of CPU sockets/packages in the system */
+    uint32_t numberOfNumaDomains; /*!< \brief Number of NUMA nodes in the system */
+    uint32_t numberOfProcessorsPerSocket; /*!< \brief Number of HW threads per socket/package in the system */
+    uint32_t numberOfCacheDomains; /*!< \brief Number of LLC caches in the system */
+    uint32_t numberOfCoresPerCache; /*!< \brief Number of CPU cores per LLC cache in the system */
+    uint32_t numberOfProcessorsPerCache; /*!< \brief Number of HW threads per LLC cache in the system */
+    uint32_t numberOfAffinityDomains; /*!< \brief Number of affinity domains in the current system and length of \a domains array */
+    AffinityDomain* domains; /*!< \brief List of all domains in the system */
+} AffinityDomains;
+
+/** \brief Pointer for exporting the AffinityDomains data structure */
+typedef AffinityDomains* AffinityDomains_t;
+
+/*! \brief Initialize affinity information
+
+Initialize affinity information AffinityDomains_t using the data of the structures
+\a CpuInfo_t, CpuTopology_t and NumaTopology_t
+\sa AffinityDomains_t
+*/
+extern void affinity_init() __attribute__ ((visibility ("default") ));
+/*! \brief Retrieve affinity structure
+
+Get the previously initialized affinity info structure
+\sa AffinityDomains_t
+ at return AffinityDomains_t (pointer to internal affinityDomains structure)
+*/
+extern AffinityDomains_t get_affinityDomains(void) __attribute__ ((visibility ("default") ));
+/*! \brief Pin process to a CPU
+
+Pin process to a CPU. Duplicate of likwid_pinProcess()
+ at param [in] processorId CPU ID for pinning
+*/
+extern void affinity_pinProcess(int processorId) __attribute__ ((visibility ("default") ));
+/*! \brief Pin processes to a CPU
+
+Pin processes to a CPU. Creates a cpuset with the given processor IDs
+ at param [in] cpu_count Number of processors in processorIds
+ at param [in] processorIds Array of processor IDs
+*/
+extern void affinity_pinProcesses(int cpu_count, int* processorIds) __attribute__ ((visibility ("default") ));
+/*! \brief Pin thread to a CPU
+
+Pin thread to a CPU. Duplicate of likwid_pinThread()
+ at param [in] processorId CPU ID for pinning
+*/
+extern void affinity_pinThread(int processorId) __attribute__ ((visibility ("default") ));
+/*! \brief Return the CPU ID where the current process runs.
+
+ at return CPU ID
+*/
+extern int affinity_processGetProcessorId() __attribute__ ((visibility ("default") ));
+/*! \brief Return the CPU ID where the current thread runs.
+
+ at return CPU ID
+*/
+extern int affinity_threadGetProcessorId() __attribute__ ((visibility ("default") ));
+/*! \brief Destroy affinity information structure
+
+Destroys the affinity information structure AffinityDomains_t. Retrieved pointers
+to the structures are not valid anymore after this function call
+\sa AffinityDomains_t
+*/
+extern void affinity_finalize() __attribute__ ((visibility ("default") ));
+/** @}*/
+
+/*
+################################################################################
+# CPU string parsing related functions
+################################################################################
+*/
+/** \addtogroup CPUParse CPU string parser module
+ *  @{
+ */
+
+/*! \brief Read CPU selection string and resolve to available CPU numbers
+
+Reads the CPU selection string and fills the given list with the CPU numbers
+defined in the selection string. This function is an interface function for the
+different selection modes: scatter, expression, logical and physical.
+ at param [in] cpustring Selection string
+ at param [in,out] cpulist List of CPUs
+ at param [in] length Length of cpulist
+ at return error code (>0 on success for the returned list length, -ERRORCODE on failure)
+*/
+extern int cpustr_to_cpulist(char* cpustring, int* cpulist, int length)  __attribute__ ((visibility ("default") ));
+/*! \brief Read NUMA node selection string and resolve to available NUMA node numbers
+
+Reads the NUMA node selection string and fills the given list with the NUMA node numbers
+defined in the selection string.
+ at param [in] nodestr Selection string
+ at param [out] nodes List of available NUMA nodes
+ at param [in] length Length of NUMA node list
+ at return error code (>0 on success for the returned list length, -ERRORCODE on failure)
+*/
+extern int nodestr_to_nodelist(char* nodestr, int* nodes, int length)  __attribute__ ((visibility ("default") ));
+/*! \brief Read CPU socket selection string and resolve to available CPU socket numbers
+
+Reads the CPU socket selection string and fills the given list with the CPU socket numbers
+defined in the selection string.
+ at param [in] sockstr Selection string
+ at param [out] sockets List of available CPU sockets
+ at param [in] length Length of CPU socket list
+ at return error code (>0 on success for the returned list length, -ERRORCODE on failure)
+*/
+extern int sockstr_to_socklist(char* sockstr, int* sockets, int length)  __attribute__ ((visibility ("default") ));
+
+/** @}*/
+
+/*
+################################################################################
+# Performance monitoring related functions
+################################################################################
+*/
+/** \addtogroup PerfMon Performance monitoring module
+ *  @{
+ */
+/*! \brief Get all groups
+
+Checks the configured performance group path for the current architecture and
+returns all found group names
+ at return Amount of found performance groups
+*/
+extern int perfmon_getGroups(char*** groups, char*** shortinfos, char*** longinfos) __attribute__ ((visibility ("default") ));
+
+/*! \brief Free all group information
+
+ at param [in] nrgroups Number of groups
+ at param [in] groups List of group names
+ at param [in] shortinfos List of short information string about group
+ at param [in] longinfos List of long information string about group
+*/
+extern void perfmon_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos) __attribute__ ((visibility ("default") ));
+
+/*! \brief Initialize performance monitoring facility
+
+Initialize the performance monitoring feature by creating basic data structures.
+The access mode must already be set when calling perfmon_init()
+ at param [in] nrThreads Amount of threads
+ at param [in] threadsToCpu List of CPUs
+ at return error code (0 on success, -ERRORCODE on failure)
+*/
+extern int perfmon_init(int nrThreads, int threadsToCpu[]) __attribute__ ((visibility ("default") ));
+
+/*! \brief Initialize performance monitoring maps
+
+Initialize the performance monitoring maps for counters, events and Uncore boxes
+for the current architecture. topology_init() and numa_init() must be called before calling
+perfmon_init_maps()
+\sa RegisterMap list, PerfmonEvent list and BoxMap list
+*/
+extern void perfmon_init_maps(void) __attribute__ ((visibility ("default") ));
+/*! \brief Add an event string to LIKWID
+
+An event string looks like Eventname:Countername(:Option1:Option2:...),...
+The eventname, countername and options are checked if they are available.
+ at param [in] eventCString Event string
+ at return Returns the ID of the new eventSet
+*/
+extern int perfmon_addEventSet(char* eventCString) __attribute__ ((visibility ("default") ));
+/*! \brief Setup all performance monitoring counters of an eventSet
+
+An event string looks like Eventname:Countername(:Option1:Option2:...),...
+The eventname, countername and options are checked if they are available.
+@param [in] groupId (returned from perfmon_addEventSet())
+@return error code (-ENOENT if groupId is invalid and -1 if the counters of one CPU cannot be set up)
+*/
+extern int perfmon_setupCounters(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Start performance monitoring counters
+
+Start the counters that have been previously set up by perfmon_setupCounters().
+The counter registers are zeroed before enabling the counters.
+@return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_startCounters(void) __attribute__ ((visibility ("default") ));
+/*! \brief Stop performance monitoring counters
+
+Stop the counters that have been previously started by perfmon_startCounters().
+All config registers get zeroed before reading the counter register.
+@return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_stopCounters(void) __attribute__ ((visibility ("default") ));
+/*! \brief Read the performance monitoring counters on all CPUs
+
+Read the counters that have been previously started by perfmon_startCounters().
+The counters are stopped directly to avoid interference of LIKWID with the measured
+code. Before returning, the counters are started again.
+@return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_readCounters(void) __attribute__ ((visibility ("default") ));
+/*! \brief Read the performance monitoring counters on one CPU
+
+Read the counters that have been previously started by perfmon_startCounters().
+The counters are stopped directly to avoid interference of LIKWID with the measured
+code. Before returning, the counters are started again. Only one CPU is read.
+@param [in] cpu_id CPU ID of the CPU that should be read
+@return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_readCountersCpu(int cpu_id) __attribute__ ((visibility ("default") ));
+/*! \brief Read the performance monitoring counters of all threads in a group
+
+Read the counters that have been previously started by perfmon_startCounters().
+The counters are stopped directly to avoid interference of LIKWID with the measured
+code. Before returning, the counters are started again.
+@param [in] groupId Read the counters for all threads taking part in group
+@return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_readGroupCounters(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Read the performance monitoring counters of one thread in a group
+
+Read the counters that have been previously started by perfmon_startCounters().
+The counters are stopped directly to avoid interference of LIKWID with the measured
+code. Before returning, the counters are started again. Only one thread's CPU is read.
+@param [in] groupId Read the counters for one thread taking part in group
+@return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_readGroupThreadCounters(int groupId, int threadId) __attribute__ ((visibility ("default") ));
+/*! \brief Switch the active eventSet to a new one
+
+Stops the currently running counters, switches the eventSet by setting up the
+counters and starts the counters.
+@param [in] new_group ID of group that should be switched to.
+@return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_switchActiveGroup(int new_group) __attribute__ ((visibility ("default") ));
+/*! \brief Close the performance monitoring facility of LIKWID
+
+Deallocates all internal data that is used during performance monitoring. Also
+the counter values are not accessible after this function.
+*/
+extern void perfmon_finalize(void) __attribute__ ((visibility ("default") ));
+/*! \brief Get the results of the specified group, counter and thread
+
+Get the result of all measurement cycles. The function takes care of overflows
+that occurred and of counter values that need to be calculated with multipliers.
+@param [in] groupId ID of the group that should be read
+@param [in] eventId ID of the event that should be read
+@param [in] threadId ID of the thread/cpu that should be read
+@return The counter result
+*/
+extern double perfmon_getResult(int groupId, int eventId, int threadId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the last results of the specified group, counter and thread
+
+Get the result of the last measurement cycle. The function takes care of overflows
+that occurred and of counter values that need to be calculated with multipliers.
+@param [in] groupId ID of the group that should be read
+@param [in] eventId ID of the event that should be read
+@param [in] threadId ID of the thread/cpu that should be read
+@return The counter result
+*/
+extern double perfmon_getLastResult(int groupId, int eventId, int threadId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the metric result of the specified group, metric and thread
+
+Get the metric result of all measurement cycles. It reads all raw results for the given groupId and threadId.
+@param [in] groupId ID of the group that should be read
+@param [in] metricId ID of the metric that should be calculated
+@param [in] threadId ID of the thread/cpu that should be read
+@return The metric result
+*/
+extern double perfmon_getMetric(int groupId, int metricId, int threadId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the last metric result of the specified group, metric and thread
+
+Get the metric result of the last measurement cycle. It reads all raw results for the given groupId and threadId.
+@param [in] groupId ID of the group that should be read
+@param [in] metricId ID of the metric that should be calculated
+@param [in] threadId ID of the thread/cpu that should be read
+@return The metric result
+*/
+extern double perfmon_getLastMetric(int groupId, int metricId, int threadId) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the number of configured event groups
+
+@return Number of groups
+*/
+extern int perfmon_getNumberOfGroups(void) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of configured events in group
+
+@param [in] groupId ID of group
+@return Number of events
+*/
+extern int perfmon_getNumberOfEvents(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the accumulated measurement time of a group
+
+@param [in] groupId ID of group
+@return Time in seconds the event group was measured
+*/
+extern double perfmon_getTimeOfGroup(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the ID of the currently set up event group
+
+@return ID of the active group
+*/
+extern int perfmon_getIdOfActiveGroup(void) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of threads specified at perfmon_init()
+
+@return Number of threads
+*/
+extern int perfmon_getNumberOfThreads(void) __attribute__ ((visibility ("default") ));
+
+
+/*! \brief Set verbosity of LIKWID library
+
+*/
+extern void perfmon_setVerbosity(int verbose) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the event name of the specified group and event
+
+Get the event name as defined in the performance group file
+@param [in] groupId ID of the group that should be read
+@param [in] eventId ID of the event that should be returned
+@return The event name or NULL in case of failure
+*/
+extern char* perfmon_getEventName(int groupId, int eventId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the counter name of the specified group and event
+
+Get the counter name as defined in the performance group file
+@param [in] groupId ID of the group that should be read
+@param [in] eventId ID of the event of which the counter should be returned
+@return The counter name or NULL in case of failure
+*/
+extern char* perfmon_getCounterName(int groupId, int eventId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the name of a group
+
+Get the name of a group. Either it is the name of the performance group or "Custom"
+@param [in] groupId ID of the group that should be read
+@return The group name or NULL in case of failure
+*/
+extern char* perfmon_getGroupName(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the metric name of the specified group and metric
+
+Get the metric name as defined in the performance group file
+@param [in] groupId ID of the group that should be read
+@param [in] metricId ID of the metric that should be calculated
+@return The metric name or NULL in case of failure
+*/
+extern char* perfmon_getMetricName(int groupId, int metricId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the short informational string of the specified group
+
+Returns the short information string as defined by performance groups or "Custom"
+in case of custom event sets
+@param [in] groupId ID of the group that should be read
+@return The short information or NULL in case of failure
+*/
+extern char* perfmon_getGroupInfoShort(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the long descriptive string of the specified group
+
+Returns the long descriptive string as defined by performance groups or NULL
+in case of custom event sets
+@param [in] groupId ID of the group that should be read
+@return The long description or NULL in case of failure
+*/
+extern char* perfmon_getGroupInfoLong(int groupId) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the number of configured metrics for group
+
+@param [in] groupId ID of group
+@return Number of metrics
+*/
+extern int perfmon_getNumberOfMetrics(int groupId) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the last measurement time of a group
+
+@param [in] groupId ID of group
+@return Time in seconds the event group was measured the last time
+*/
+extern double perfmon_getLastTimeOfGroup(int groupId) __attribute__ ((visibility ("default") ));
+
+/*! \brief Read the output file of the Marker API
+@param [in] filename Filename with Marker API results
+@return 0 or negative error number
+*/
+extern int perfmon_readMarkerFile(const char* filename) __attribute__ ((visibility ("default") ));
+/*! \brief Free space for read in Marker API file
+*/
+extern void perfmon_destroyMarkerResults() __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of regions listed in Marker API result file
+
+@return Number of regions
+*/
+extern int perfmon_getNumberOfRegions() __attribute__ ((visibility ("default") ));
+/*! \brief Get the groupID of a region
+
+@param [in] region ID of region
+@return Group ID of region
+*/
+extern int perfmon_getGroupOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the tag of a region
+@param [in] region ID of region
+@return tag of region
+*/
+extern char* perfmon_getTagOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of events of a region
+@param [in] region ID of region
+@return Number of events of region
+*/
+extern int perfmon_getEventsOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of metrics of a region
+@param [in] region ID of region
+@return Number of metrics of region
+*/
+extern int perfmon_getMetricsOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of threads of a region
+@param [in] region ID of region
+@return Number of threads of region
+*/
+extern int perfmon_getThreadsOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the cpulist of a region
+@param [in] region ID of region
+@param [in] count Length of cpulist array
+@param [in] cpulist cpulist array
+@return Number of threads of region or count, whichever is lower
+*/
+extern int perfmon_getCpulistOfRegion(int region, int count, int* cpulist)  __attribute__ ((visibility ("default") ));
+/*! \brief Get the accumulated measurement time of a region for a thread
+@param [in] region ID of region
+@param [in] thread ID of thread
+@return Measurement time of a region for a thread
+*/
+extern double perfmon_getTimeOfRegion(int region, int thread) __attribute__ ((visibility ("default") ));
+/*! \brief Get the call count of a region for a thread
+@param [in] region ID of region
+@param [in] thread ID of thread
+@return Call count of a region for a thread
+*/
+extern int perfmon_getCountOfRegion(int region, int thread) __attribute__ ((visibility ("default") ));
+/*! \brief Get the event result of a region for an event and thread
+@param [in] region ID of region
+@param [in] event ID of event
+@param [in] thread ID of thread
+@return Result of a region for an event and thread
+*/
+extern double perfmon_getResultOfRegionThread(int region, int event, int thread) __attribute__ ((visibility ("default") ));
+/*! \brief Get the metric result of a region for a metric and thread
+@param [in] region ID of region
+@param [in] metricId ID of metric
+@param [in] threadId ID of thread
+@return Metric result of a region for a thread
+*/
+extern double perfmon_getMetricOfRegionThread(int region, int metricId, int threadId) __attribute__ ((visibility ("default") ));
+
+/** @}*/
+
+/*
+################################################################################
+# Time measurements related functions
+################################################################################
+*/
+
+/** \addtogroup TimerMon Time measurement module
+ *  @{
+ */
+
+/*! \brief Union holding a cycle count, accessible as one 64 bit value or as two 32 bit fields
+\extends TimerData
+*/
+typedef union
+{
+    uint64_t int64; /*!< \brief Cycle count in 64 bit */
+    struct {uint32_t lo, hi;} int32; /*!< \brief Cycle count stored in two 32 bit fields */
+} TscCounter;
+
+/*! \brief Struct defining the start and stop time of a time interval
+*/
+typedef struct {
+    TscCounter start; /*!< \brief Cycles at start */
+    TscCounter stop; /*!< \brief Cycles at stop */
+} TimerData;
+
+/*! \brief Initialize timer by retrieving baseline frequency and cpu clock
+*/
+extern void timer_init( void ) __attribute__ ((visibility ("default") ));
+/*! \brief Return the measured interval in seconds
+
+@param [in] time Structure holding the cycle count at start and stop
+@return Time in seconds
+*/
+extern double timer_print( TimerData* time) __attribute__ ((visibility ("default") ));
+/*! \brief Return the measured interval in cycles
+
+@param [in] time Structure holding the cycle count at start and stop
+@return Time in cycles
+*/
+extern uint64_t timer_printCycles( TimerData* time) __attribute__ ((visibility ("default") ));
+/*! \brief Reset values in TimerData
+
+@param [in] time Structure holding the cycle count at start and stop
+*/
+extern void timer_reset( TimerData* time ) __attribute__ ((visibility ("default") ));
+/*! \brief Return the CPU clock determined at timer_init
+
+@return CPU clock
+*/
+extern uint64_t timer_getCpuClock( void ) __attribute__ ((visibility ("default") ));
+/*! \brief Return the current CPU clock read from sysfs
+
+@return CPU clock
+*/
+extern uint64_t timer_getCpuClockCurrent( int cpu_id ) __attribute__ ((visibility ("default") ));
+/*! \brief Return the cycle clock determined at timer_init
+
+@return cycle clock
+*/
+extern uint64_t timer_getCycleClock( void ) __attribute__ ((visibility ("default") ));
+/*! \brief Return the baseline CPU clock determined at timer_init
+
+@return Baseline CPU clock
+*/
+extern uint64_t timer_getBaseline( void ) __attribute__ ((visibility ("default") ));
+/*! \brief Start time measurement
+
+@param [in,out] time Structure holding the cycle count at start
+*/
+extern void timer_start( TimerData* time ) __attribute__ ((visibility ("default") ));
+/*! \brief Stop time measurement
+
+@param [in,out] time Structure holding the cycle count at stop
+*/
+extern void timer_stop ( TimerData* time) __attribute__ ((visibility ("default") ));
+/*! \brief Sleep for specified usecs
+
+@param [in] usec Amount of usecs to sleep
+*/
+extern int timer_sleep(unsigned long usec) __attribute__ ((visibility ("default") ));
+
+/*! \brief Finalize timer module
+
+*/
+extern void timer_finalize(void) __attribute__ ((visibility ("default") ));
+
+/** @}*/
+
+/*
+################################################################################
+# Power measurements related functions
+################################################################################
+*/
+/** \addtogroup PowerMon Power and Energy monitoring module
+ *  @{
+ */
+
+/*!
+\def NUM_POWER_DOMAINS
+Amount of currently supported RAPL domains
+*/
+#define NUM_POWER_DOMAINS 4
+/*! \brief List of all RAPL domain names
+*/
+extern const char* power_names[NUM_POWER_DOMAINS] __attribute__ ((visibility ("default") ));
+
+/*!
+\def POWER_DOMAIN_SUPPORT_STATUS
+Flag to check in PowerDomain's supportFlag if the status msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_STATUS (1ULL<<0)
+/*!
+\def POWER_DOMAIN_SUPPORT_LIMIT
+Flag to check in PowerDomain's supportFlag if the limit msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_LIMIT (1ULL<<1)
+/*!
+\def POWER_DOMAIN_SUPPORT_POLICY
+Flag to check in PowerDomain's supportFlag if the policy msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_POLICY (1ULL<<2)
+/*!
+\def POWER_DOMAIN_SUPPORT_PERF
+Flag to check in PowerDomain's supportFlag if the perf msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_PERF (1ULL<<3)
+/*!
+\def POWER_DOMAIN_SUPPORT_INFO
+Flag to check in PowerDomain's supportFlag if the info msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_INFO (1ULL<<4)
+
+
+/*! \brief Information structure of CPU's turbo mode
+\extends PowerInfo
+*/
+typedef struct {
+    int numSteps; /*!< \brief Amount of turbo mode steps/frequencies */
+    double* steps; /*!< \brief List of turbo mode steps */
+} TurboBoost;
+
+/*! \brief Enum for all supported RAPL domains
+\extends PowerDomain
+*/
+typedef enum {
+    PKG = 0, /*!< \brief PKG domain, mostly one CPU socket/package */
+    PP0 = 1, /*!< \brief PP0 domain, not clearly defined by Intel */
+    PP1 = 2, /*!< \brief PP1 domain, not clearly defined by Intel */
+    DRAM = 3 /*!< \brief DRAM domain, the memory modules */
+} PowerType;
+
+/*! \brief Structure describing a RAPL power domain
+\extends PowerInfo
+*/
+typedef struct {
+    PowerType type; /*!< \brief Identifier which RAPL domain is managed by this struct */
+    uint32_t supportFlags; /*!< \brief Bitmask which features are supported by the power domain */
+    double energyUnit; /*!< \brief Multiplier for energy measurements */
+    double tdp; /*!< \brief Thermal Design Power (maximum amount of heat generated by the CPU) */
+    double minPower; /*!< \brief Minimal power consumption of the CPU */
+    double maxPower; /*!< \brief Maximal power consumption of the CPU */
+    double maxTimeWindow; /*!< \brief Maximal time window for power measurements (NOTE(review): was documented as "Minimal power measurement interval", confirm against power module) */
+} PowerDomain;
+
+/*! \brief Information structure of CPU's power measurement facility
+*/
+typedef struct {
+    double baseFrequency; /*!< \brief Base frequency of the CPU */
+    double minFrequency; /*!< \brief Minimal frequency of the CPU */
+    TurboBoost turbo; /*!< \brief Turbo boost information */
+    int hasRAPL; /*!< \brief RAPL support flag */
+    double powerUnit; /*!< \brief Multiplier for power measurements */
+    double timeUnit; /*!< \brief Multiplier for time information */
+    PowerDomain domains[NUM_POWER_DOMAINS]; /*!< \brief List of power domains */
+} PowerInfo;
+
+/*! \brief Power measurement data for start/stop measurements
+*/
+typedef struct {
+    int domain; /*!< \brief RAPL domain identifier */
+    uint32_t before; /*!< \brief Counter state at start */
+    uint32_t after; /*!< \brief Counter state at stop */
+} PowerData;
+
+/*! \brief Variable holding the global power information structure */
+extern PowerInfo power_info;
+
+/** \brief Pointer for exporting the PowerInfo data structure */
+typedef PowerInfo* PowerInfo_t;
+/** \brief Pointer for exporting the PowerData data structure */
+typedef PowerData* PowerData_t;
+
+/*! \brief Initialize energy measurements on specific CPU
+
+Additionally, it reads basic information about the energy measurements like
+minimal measurement time.
+@param [in] cpuId Initialize energy facility for this CPU
+@return error code
+*/
+extern int power_init(int cpuId) __attribute__ ((visibility ("default") ));
+/*! \brief Get a pointer to the energy facility information
+
+@return PowerInfo_t pointer
+\sa PowerInfo_t
+*/
+extern PowerInfo_t get_powerInfo(void) __attribute__ ((visibility ("default") ));
+/*! \brief Read the current power value
+
+@param [in] cpuId Read energy facility for this CPU
+@param [in] reg Energy register
+@param [out] data Energy data
+*/
+extern int power_read(int cpuId, uint64_t reg, uint32_t *data) __attribute__ ((visibility ("default") ));
+/*! \brief Read the current energy value using a specific communication socket
+
+@param [in] socket_fd Communication socket for the read operation
+@param [in] cpuId Read energy facility for this CPU
+@param [in] reg Energy register
+@param [out] data Energy data
+*/
+extern int power_tread(int socket_fd, int cpuId, uint64_t reg, uint32_t *data) __attribute__ ((visibility ("default") ));
+/*! \brief Start energy measurements
+
+@param [in,out] data Data structure holding start and stop values for energy measurements
+@param [in] cpuId Start energy facility for this CPU
+@param [in] type Which type should be measured
+@return error code
+*/
+extern int power_start(PowerData_t data, int cpuId, PowerType type) __attribute__ ((visibility ("default") ));
+/*! \brief Stop energy measurements
+
+@param [in,out] data Data structure holding start and stop values for energy measurements
+@param [in] cpuId Stop energy facility for this CPU
+@param [in] type Which type should be measured
+@return error code
+*/
+extern int power_stop(PowerData_t data, int cpuId, PowerType type) __attribute__ ((visibility ("default") ));
+/*! \brief Print energy measurements gathered by power_start() and power_stop()
+
+@param [in] data Data structure holding start and stop values for energy measurements
+@return Consumed energy in Joules
+*/
+extern double power_printEnergy(PowerData* data) __attribute__ ((visibility ("default") ));
+/*! \brief Get energy Unit
+
+@param [in] domain RAPL domain ID
+@return Energy unit of the given RAPL domain
+*/
+extern double power_getEnergyUnit(int domain) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the values of the limit register of a domain
+NOT IMPLEMENTED
+
+@param [in] cpuId CPU ID
+@param [in] domain RAPL domain ID
+@param [out] power Energy limit
+@param [out] time Time limit
+@return error code
+*/
+int power_limitGet(int cpuId, PowerType domain, double* power, double* time) __attribute__ ((visibility ("default") ));
+
+/*! \brief Set the values of the limit register of a domain
+NOT IMPLEMENTED
+
+@param [in] cpuId CPU ID
+@param [in] domain RAPL domain ID
+@param [in] power Energy limit
+@param [in] time Time limit
+@param [in] doClamping Activate clamping (going below OS-requested power level)
+@return error code
+*/
+int power_limitSet(int cpuId, PowerType domain, double power, double time, int doClamping) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the state of an energy limit, activated or deactivated
+NOT IMPLEMENTED
+
+@param [in] cpuId CPU ID
+@param [in] domain RAPL domain ID
+@return state, 1 for active, 0 for inactive
+*/
+int power_limitState(int cpuId, PowerType domain) __attribute__ ((visibility ("default") ));
+
+/*! \brief Free space of power_unit
+*/
+extern void power_finalize(void) __attribute__ ((visibility ("default") ));
+/** @}*/
+
+/*
+################################################################################
+# Thermal measurements related functions
+################################################################################
+*/
+/** \addtogroup ThermalMon Thermal monitoring module
+ *  @{
+ */
+/*! \brief Initialize thermal measurements on specific CPU
+
+@param [in] cpuId Initialize thermal facility for this CPU
+*/
+extern void thermal_init(int cpuId) __attribute__ ((visibility ("default") ));
+/*! \brief Read the current thermal value
+
+@param [in] cpuId Read thermal facility for this CPU
+@param [out] data Thermal data
+*/
+extern int thermal_read(int cpuId, uint32_t *data) __attribute__ ((visibility ("default") ));
+/*! \brief Read the current thermal value using a specific communication socket
+
+@param [in] socket_fd Communication socket for the read operation
+@param [in] cpuId Read thermal facility for this CPU
+@param [out] data Thermal data
+*/
+extern int thermal_tread(int socket_fd, int cpuId, uint32_t *data) __attribute__ ((visibility ("default") ));
+/** @}*/
+
+
+/*
+################################################################################
+# Memory sweeping related functions
+################################################################################
+*/
+/** \addtogroup MemSweep Memory sweeping module
+ *  @{
+ */
+/*! \brief Sweeping the memory of a NUMA node
+
+Sweeps (zeros) the memory of NUMA node with ID \a domainId
+@param [in] domainId NUMA node ID
+*/
+extern void memsweep_domain(int domainId) __attribute__ ((visibility ("default") ));
+/*! \brief Sweeping the memory of all NUMA nodes covered by CPU list
+
+Sweeps (zeros) the memory of all NUMA nodes containing the CPUs in \a processorList
+@param [in] processorList List of CPU IDs
+@param [in] numberOfProcessors Number of CPUs in list
+*/
+extern void memsweep_threadGroup(int* processorList, int numberOfProcessors) __attribute__ ((visibility ("default") ));
+/** @}*/
+
+/*
+################################################################################
+# CPU feature related functions
+################################################################################
+*/
+/** \addtogroup CpuFeatures Retrieval and manipulation of processor features
+ *  @{
+ */
+
+typedef enum {
+    FEAT_HW_PREFETCHER=0, /*!< \brief Hardware prefetcher */
+    FEAT_CL_PREFETCHER, /*!< \brief Adjacent cache line prefetcher */
+    FEAT_DCU_PREFETCHER, /*!< \brief DCU L1 data cache prefetcher */
+    FEAT_IP_PREFETCHER, /*!< \brief IP L1 data cache prefetcher */
+    FEAT_FAST_STRINGS, /*!< \brief Fast-strings feature */
+    FEAT_THERMAL_CONTROL, /*!< \brief Automatic Thermal Control Circuit */
+    FEAT_PERF_MON, /*!< \brief Hardware performance monitoring */
+    FEAT_FERR_MULTIPLEX, /*!< \brief FERR# Multiplexing, must be 1 for XAPIC interrupt model */
+    FEAT_BRANCH_TRACE_STORAGE, /*!< \brief Branch Trace Storage */
+    FEAT_XTPR_MESSAGE, /*!< \brief xTPR Message to set processor priority */
+    FEAT_PEBS, /*!< \brief Precise Event Based Sampling (PEBS) */
+    FEAT_SPEEDSTEP, /*!< \brief Enhanced Intel SpeedStep Technology to reduce energy consumption*/
+    FEAT_MONITOR, /*!< \brief MONITOR/MWAIT feature to monitor write-back stores*/
+    FEAT_SPEEDSTEP_LOCK, /*!< \brief Enhanced Intel SpeedStep Technology Select Lock */
+    FEAT_CPUID_MAX_VAL, /*!< \brief Limit CPUID Maxval */
+    FEAT_XD_BIT, /*!< \brief Execute Disable Bit */
+    FEAT_DYN_ACCEL, /*!< \brief Intel Dynamic Acceleration */
+    FEAT_TURBO_MODE, /*!< \brief Intel Turbo Mode */
+    FEAT_TM2, /*!< \brief Thermal Monitoring 2 */
+    CPUFEATURES_MAX /*!< \brief End marker, equals the number of features listed above */
+} CpuFeature;
+
+/*! \brief Initialize the internal feature variables for all CPUs
+
+Initialize the internal feature variables for all CPUs
+*/
+extern void cpuFeatures_init() __attribute__ ((visibility ("default") ));
+/*! \brief Print state of all CPU features for a given CPU
+
+Print state of all CPU features for a given CPU
+@param [in] cpu CPU ID
+*/
+extern void cpuFeatures_print(int cpu) __attribute__ ((visibility ("default") ));
+/*! \brief Get state of a CPU feature for a given CPU
+
+Get state of a CPU feature for a given CPU
+@param [in] cpu CPU ID
+@param [in] type CPU feature
+@return State of CPU feature (1=enabled, 0=disabled)
+*/
+extern int cpuFeatures_get(int cpu, CpuFeature type)  __attribute__ ((visibility ("default") ));
+/*! \brief Get the name of a CPU feature
+
+Get the name of a CPU feature
+@param [in] type CPU feature
+@return Name of the CPU feature or NULL if feature is not available
+*/
+extern char* cpuFeatures_name(CpuFeature type)  __attribute__ ((visibility ("default") ));
+/*! \brief Enable a CPU feature for a specific CPU
+
+Enable a CPU feature for a specific CPU. Only the state of the prefetchers can be changed, all other features return -EINVAL
+@param [in] cpu CPU ID
+@param [in] type CPU feature
+@return Status of operation (0=success, all others are errors, either by MSR access or invalid feature)
+*/
+extern int cpuFeatures_enable(int cpu, CpuFeature type, int print) __attribute__ ((visibility ("default") ));
+/*! \brief Disable a CPU feature for a specific CPU
+
+Disable a CPU feature for a specific CPU. Only the state of the prefetchers can be changed, all other features return -EINVAL
+@param [in] cpu CPU ID
+@param [in] type CPU feature
+@return Status of operation (0=success, all others are errors, either by MSR access or invalid feature)
+*/
+extern int cpuFeatures_disable(int cpu, CpuFeature type, int print) __attribute__ ((visibility ("default") ));
+/** @}*/
 
 #ifdef __cplusplus
 }
diff --git a/src/includes/lock.h b/src/includes/lock.h
index 87d1593..93f3d9b 100644
--- a/src/includes/lock.h
+++ b/src/includes/lock.h
@@ -5,13 +5,13 @@
  *
  *      Description:  Header File Locking primitive Module
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/memsweep.h b/src/includes/memsweep.h
index e29d4d8..de7a7b0 100644
--- a/src/includes/memsweep.h
+++ b/src/includes/memsweep.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  memsweep.h
  *
- *      Description:  Header File memsweep Module. 
+ *      Description:  Header File memsweep module for internal use. External functions are
+ *                    defined in likwid.h
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -34,9 +35,7 @@
 #include <types.h>
 
 extern void memsweep_setMemoryFraction(uint64_t fraction);
-extern void memsweep_node(FILE* OUTSTREAM);
-extern void memsweep_domain(FILE* OUTSTREAM, int domainId);
-extern void memsweep_threadGroup(FILE* OUTSTREAM, int* processorList, int numberOfProcessors);
+extern void memsweep_node(void);
 
 #endif /* MEMSWEEP_H */
 
diff --git a/src/includes/msr.h b/src/includes/msr.h
deleted file mode 100644
index 45f8069..0000000
--- a/src/includes/msr.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  msr.h
- *
- *      Description:  Header File msr Module. 
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef MSR_H
-#define MSR_H
-
-#include <types.h>
-
-/* Initializes the MSR module, trying to open either the MSR files or
- * the connection to the msr daemon. */
-extern void msr_init(int socket_fd);
-extern void msr_finalize(void);
-extern uint64_t msr_read(int cpu, uint32_t reg);
-extern void msr_write(int cpu, uint32_t reg, uint64_t data);
-
-/* variants for thread safe execution with a per thread socket */
-extern uint64_t msr_tread(int socket_fd, int cpu, uint32_t reg);
-extern void msr_twrite(int socket_fd, int cpu, uint32_t reg, uint64_t data);
-
-#endif /* MSR_H */
diff --git a/src/includes/multiplex.h b/src/includes/multiplex.h
deleted file mode 100644
index c34cac8..0000000
--- a/src/includes/multiplex.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  multiplex.h
- *
- *      Description:  Header File multiplex Module. 
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef MULTIPLEX_H
-#define MULTIPLEX_H
-
-#include <types.h>
-
-extern void multiplex_init(MultiplexCollections* set);
-extern void multiplex_start();
-extern void multiplex_stop();
-
-#endif /* MULTIPLEX_H */
diff --git a/src/includes/multiplex_types.h b/src/includes/multiplex_types.h
deleted file mode 100644
index 8578a8f..0000000
--- a/src/includes/multiplex_types.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  multiplex_types.h
- *
- *      Description:  Types file for multiplex  module.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef MULTIPLEX_TYPES_H
-#define MULTIPLEX_TYPES_H
-
-typedef struct {
-    PerfmonEventSet* collections;
-    int numberOfCollections;
-    double time;
-} MultiplexCollections;
-
-
-
-#endif /* MULTIPLEX_TYPES_H */
diff --git a/src/includes/numa.h b/src/includes/numa.h
index 3a2d0f1..3ca582f 100644
--- a/src/includes/numa.h
+++ b/src/includes/numa.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  numa.h
  *
- *      Description:  Header File numa Module. 
+ *      Description:  Header File NUMA module for internal use. External functions are
+ *                    defined in likwid.h
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -28,18 +29,30 @@
  * =======================================================================================
  */
 
-#ifndef NUMA_H
-#define NUMA_H
+#ifndef LIKWID_NUMA
+#define LIKWID_NUMA
+
+#include <stdlib.h>
+#include <stdio.h>
 
 #include <types.h>
+#include <likwid.h>
+#include <numa_hwloc.h>
+#include <numa_proc.h>
+
+
+
+
+extern int str2int(const char* str);
+
+struct numa_functions {
+    int (*numa_init) (void);
+    void (*numa_setInterleaved) (int*, int);
+    void (*numa_membind) (void*, size_t, int);
+};
+
+
 
-/** Structure holding numa information
- *
- */
-extern NumaTopology numa_info;
 
-extern int numa_init (void);
-extern void numa_setInterleaved(int* processorList, int numberOfProcessors);
-extern void numa_membind(void* ptr, size_t size, int domainId);
 
-#endif /*NUMA_H*/
+#endif
diff --git a/src/includes/numa_hwloc.h b/src/includes/numa_hwloc.h
new file mode 100644
index 0000000..cf74238
--- /dev/null
+++ b/src/includes/numa_hwloc.h
@@ -0,0 +1,40 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  numa_hwloc.h
+ *
+ *      Description:  Header File hwloc NUMA backend
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+
+#ifndef LIKWID_NUMA_HWLOC
+#define LIKWID_NUMA_HWLOC
+
+extern int hwloc_numa_init(void);
+extern void hwloc_numa_membind(void* ptr, size_t size, int domainId);
+extern void hwloc_numa_setInterleaved(int* processorList, int numberOfProcessors);
+
+
+#endif
diff --git a/src/includes/numa_proc.h b/src/includes/numa_proc.h
new file mode 100644
index 0000000..71af378
--- /dev/null
+++ b/src/includes/numa_proc.h
@@ -0,0 +1,39 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  numa_proc.h
+ *
+ *      Description:  Header File procfs/sysfs NUMA backend
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef LIKWID_NUMA_PROC
+#define LIKWID_NUMA_PROC
+
+extern int proc_numa_init(void);
+extern void proc_numa_membind(void* ptr, size_t size, int domainId);
+extern void proc_numa_setInterleaved(int* processorList, int numberOfProcessors);
+
+
+#endif
diff --git a/src/includes/numa_types.h b/src/includes/numa_types.h
deleted file mode 100644
index bd4afda..0000000
--- a/src/includes/numa_types.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  numa_types.h
- *
- *      Description:  Types file for numa module.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef NUMA_TYPES_H
-#define NUMA_TYPES_H
-
-
-typedef struct {
-    int id;
-    uint64_t totalMemory;
-    uint64_t freeMemory;
-    int numberOfProcessors;
-    uint32_t* processors;
-    uint32_t* processorsCompact;
-    int numberOfDistances;
-    uint32_t* distances;
-} NumaNode;
-
-typedef struct {
-    uint32_t numberOfNodes;
-    NumaNode* nodes;
-} NumaTopology;
-
-
-#endif /*NUMA_TYPES_H*/
diff --git a/src/includes/pci.h b/src/includes/pci.h
deleted file mode 100644
index 1672f1c..0000000
--- a/src/includes/pci.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  pci.h
- *
- *      Description:  Header File pci Module. 
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef PCI_H
-#define PCI_H
-
-#include <types.h>
-
-
-/* PCI config memory space access is addressed
- * BUS - DEVICE - FUNCTION
- * Listing for Uncore devices DEVICE.FUNCTION
- */
-
-extern void pci_init();
-extern void pci_finalize();
-extern uint32_t pci_read(int cpu, PciDeviceIndex index, uint32_t reg);
-extern void pci_write(int cpu, PciDeviceIndex index, uint32_t reg, uint32_t data);
-extern uint32_t pci_tread(int socket_fd, int cpu, PciDeviceIndex index, uint32_t reg);
-extern void pci_twrite(int socket_fd, int cpu, PciDeviceIndex index, uint32_t reg, uint32_t data);
-
-#endif /* PCI_H */
diff --git a/src/includes/pci_hwloc.h b/src/includes/pci_hwloc.h
new file mode 100644
index 0000000..fd7db29
--- /dev/null
+++ b/src/includes/pci_hwloc.h
@@ -0,0 +1,37 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  pci_hwloc.h
+ *
+ *      Description:  Header File hwloc based PCI lookup backend
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef PCI_HWLOC_H
+#define PCI_HWLOC_H
+
+extern int hwloc_pci_init(uint16_t testDevice, char** socket_bus, int* nrSockets);
+
+
+#endif
diff --git a/src/includes/pci_proc.h b/src/includes/pci_proc.h
new file mode 100644
index 0000000..062daa9
--- /dev/null
+++ b/src/includes/pci_proc.h
@@ -0,0 +1,37 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  pci_proc.h
+ *
+ *      Description:  Header File procfs based PCI lookup backend
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef PCI_PROC_H
+#define PCI_PROC_H
+
+extern int proc_pci_init(uint16_t testDevice, char** socket_bus, int* nrSockets);
+
+
+#endif
diff --git a/src/includes/pci_types.h b/src/includes/pci_types.h
index cfb9657..7e8495b 100644
--- a/src/includes/pci_types.h
+++ b/src/includes/pci_types.h
@@ -5,13 +5,14 @@
  *
  *      Description:  Types file for pci module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -34,23 +35,69 @@
 
 #include <stdint.h>
 
+
+
+typedef enum {
+    NODEVTYPE = 0,
+    R3QPI,
+    R2PCIE,
+    IMC,
+    HA,
+    QPI,
+    IRP,
+    MAX_NUM_PCI_TYPES
+} PciDeviceType;
+
 typedef enum {
-    PCI_R3QPI_DEVICE_LINK_0 = 0,
+    MSR_DEV = 0,
+    PCI_R3QPI_DEVICE_LINK_0,
     PCI_R3QPI_DEVICE_LINK_1,
+    PCI_R3QPI_DEVICE_LINK_2,
     PCI_R2PCIE_DEVICE,
-    PCI_IMC_DEVICE_CH_0,
-    PCI_IMC_DEVICE_CH_1,
-    PCI_IMC_DEVICE_CH_2,
-    PCI_IMC_DEVICE_CH_3,
-    PCI_HA_DEVICE,
+    PCI_IMC_DEVICE_0_CH_0,
+    PCI_IMC_DEVICE_0_CH_1,
+    PCI_IMC_DEVICE_0_CH_2,
+    PCI_IMC_DEVICE_0_CH_3,
+    PCI_HA_DEVICE_0,
+    PCI_HA_DEVICE_1,
     PCI_QPI_DEVICE_PORT_0,
     PCI_QPI_DEVICE_PORT_1,
+    PCI_QPI_DEVICE_PORT_2,
     PCI_QPI_MASK_DEVICE_PORT_0,
     PCI_QPI_MASK_DEVICE_PORT_1,
+    PCI_QPI_MASK_DEVICE_PORT_2,
     PCI_QPI_MISC_DEVICE_PORT_0,
     PCI_QPI_MISC_DEVICE_PORT_1,
-    MAX_NUM_DEVICES
+    PCI_QPI_MISC_DEVICE_PORT_2,
+    PCI_IMC_DEVICE_1_CH_0,
+    PCI_IMC_DEVICE_1_CH_1,
+    PCI_IMC_DEVICE_1_CH_2,
+    PCI_IMC_DEVICE_1_CH_3,
+    PCI_IRP_DEVICE,
+    MAX_NUM_PCI_DEVICES
 } PciDeviceIndex;
 
+typedef struct {
+    PciDeviceType type;
+    char *path;
+    char *name;
+    char *likwid_name;
+    uint32_t devid;
+    int  online;
+} PciDevice;
+
+typedef struct {
+    char* name;
+    char* desc;
+} PciType;
+
 
+static PciType pci_types[MAX_NUM_PCI_TYPES] = {
+    [R3QPI] = {"R3QPI", "R3QPI is the interface between the Intel QPI Link Layer and the Ring."},
+    [R2PCIE] = {"R2PCIE", "R2PCIe represents the interface between the Ring and IIO traffic to/from PCIe."},
+    [IMC] = {"IMC", "The integrated Memory Controller provides the interface to DRAM and communicates to the rest of the uncore through the Home Agent."},
+    [HA] = {"HA", "The HA is responsible for the protocol side of memory interactions."},
+    [QPI] = {"QPI", "The Intel QPI Link Layer is responsible for packetizing requests from the caching agent on the way out to the system interface."},
+    [IRP] = {"IRP", "IRP is responsible for maintaining coherency for IIO traffic e.g. crosssocket P2P."}
+};
 #endif /*PCI_TYPES_H*/
diff --git a/src/includes/perfgroup.h b/src/includes/perfgroup.h
new file mode 100644
index 0000000..c4f25ec
--- /dev/null
+++ b/src/includes/perfgroup.h
@@ -0,0 +1,94 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfgroup.h
+ *
+ *      Description:  Header File of performance group and event set handler
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef PERFGROUP_H
+#define PERFGROUP_H
+
+
+ /*! \brief The GroupInfo data structure describes a performance group
+
+Groups can either be read in from a file or be a group with a custom event set. For
+performance groups, all values are commonly set. For groups with a custom event set,
+the fields groupname and shortinfo are set to 'Custom', longinfo is NULL and in
+general the nmetrics value is 0.
+*/
+typedef struct {
+    char* groupname; /*!< \brief Name of the group: performance group name or 'Custom' */
+    char* shortinfo; /*!< \brief Short info string for the group or 'Custom' */
+    int nevents; /*!< \brief Number of event/counter combinations */
+    char** events; /*!< \brief List of events */
+    char** counters; /*!< \brief List of counter registers */
+    int nmetrics; /*!< \brief Number of metrics */
+    char** metricnames; /*!< \brief Metric names */
+    char** metricformulas; /*!< \brief Metric formulas */
+    char* longinfo; /*!< \brief Descriptive text about the group or empty */
+} GroupInfo;
+
+typedef struct {
+    int counters; /*!< \brief Number of entries in the list */
+    char** cnames; /*!< \brief List of counter names */
+    double* cvalues; /*!< \brief List of counter values */
+} CounterList;
+
+typedef enum {
+    GROUP_NONE = 0,
+    GROUP_SHORT,
+    GROUP_EVENTSET,
+    GROUP_METRICS,
+    GROUP_LONG
+} GroupFileSections;
+
+static char* groupFileSectionNames[5] = {
+    "NONE",
+    "SHORT",
+    "EVENTSET",
+    "METRICS",
+    "LONG"
+};
+
+extern int get_groups(char* grouppath, char* architecture, char*** groupnames, char*** groupshort, char*** grouplong);
+extern void return_groups(int groups, char** groupnames, char** groupshort, char** grouplong);
+extern int read_group(char* grouppath, char* architecture, char* groupname, GroupInfo* ginfo);
+extern int custom_group(char* eventStr, GroupInfo* ginfo);
+extern char* get_eventStr(GroupInfo* ginfo);
+void put_eventStr(char* eventset);
+extern char* get_shortInfo(GroupInfo* ginfo);
+void put_shortInfo(char* sinfo);
+extern char* get_longInfo(GroupInfo* ginfo);
+void put_longInfo(char* linfo);
+extern void return_group(GroupInfo* ginfo);
+
+extern void init_clist(CounterList* clist);
+extern int add_to_clist(CounterList* clist, char* counter, double result);
+extern void destroy_clist(CounterList* clist);
+
+extern int calc_metric(char* formula, CounterList* clist, double *result);
+
+#endif
diff --git a/src/includes/perfmon.h b/src/includes/perfmon.h
index 6e9d9f9..37058c1 100644
--- a/src/includes/perfmon.h
+++ b/src/includes/perfmon.h
@@ -7,13 +7,14 @@
  *                    Configures and reads out performance counters
  *                    on x86 based architectures. Supports multi threading.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -33,76 +34,27 @@
 #ifndef PERFMON_H
 #define PERFMON_H
 
-#include <bstrlib.h>
-#include <types.h>
-
-extern int perfmon_verbose;
-
-extern void (*perfmon_startCountersThread) (int thread_id);
-extern void (*perfmon_stopCountersThread) (int thread_id);
-extern int  (*perfmon_getIndex) (bstring reg, PerfmonCounterIndex* index);
-extern void (*perfmon_setupCounterThread) (int thread_id, PerfmonEvent* event , PerfmonCounterIndex index);
-
-extern void perfmon_initEventSet(StrUtilEventSet* eventSetConfig, PerfmonEventSet* set);
-extern void perfmon_setCSVMode(int v);
-extern void perfmon_printAvailableGroups(void);
-extern void perfmon_printGroupHelp(bstring group);
-extern void perfmon_init(int numThreads, int threads[],FILE* outstream);
-extern void perfmon_finalize(void);
-extern void perfmon_setupEventSet(bstring eventString, BitMask* mask);
-extern double perfmon_getEventResult(int thread, int index);
-extern int perfmon_setupEventSetC(char* eventCString, const char*** eventnames);
-
-
-/*
-The following structure and set of functions provide an efficient and easy interface to
-access counters from different groups and switch between them.
-
-TODO: The internals need some cleanup, but the interface should remain rather stable.
-
-Usage:
-setup = perfmon_prepareEventSetup("VIEW"), etc..
-Whenever you want to use one of the prepared setups call:
-perfmon_setupCountersForEventSet(setup)
 
-then you can startCounters, stopCounters and then
-perfmon_getEventCounterValues() and/or
-perfmon_getDerivedCounterValues()
- */
-typedef struct {
-    const char* groupName;
-    int numberOfEvents;
-    const char** eventNames;
-    int numberOfDerivedCounters;
-    const char** derivedNames;    
-
-    // Internal structures DO NOT ACCESS THEM, they need cleanup.
-    StrUtilEventSet* eventSetConfig;
-    PerfmonEventSet* perfmon_set;
-    PerfmonGroup groupSet;
-    int groupIndex;
-} EventSetup;
+#include <types.h>
+#include <likwid.h>
 
+#define FREEZE_FLAG_ONLYFREEZE 0x0ULL
+#define FREEZE_FLAG_CLEAR_CTR (1ULL<<1)
+#define FREEZE_FLAG_CLEAR_CTL (1ULL<<0)
 
-extern EventSetup perfmon_prepareEventSetup(char* eventGroupString);
-extern void perfmon_setupCountersForEventSet(EventSetup * setup);
+extern uint64_t currentConfig[MAX_NUM_THREADS][NUM_PMC];
 
-// obtain values for all cores, average, min and max for the cores.
-extern void perfmon_getEventCounterValues(uint64_t* avg_values, uint64_t* max, uint64_t* min);
-extern void perfmon_getDerivedCounterValues(float* avg_values, float* max, float* min);
-/////////////////////////
+extern int (*perfmon_startCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+extern int (*perfmon_stopCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+extern int (*perfmon_setupCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+extern int (*perfmon_readCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+extern int (*perfmon_finalizeCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+extern int (*initThreadArch) (int cpu_id);
 
-extern void perfmon_setupCounters(void);
-extern void perfmon_startCounters(void);
-extern void perfmon_stopCounters(void);
-extern void perfmon_readCounters(void);
-extern double perfmon_getResult(int threadId, char* counterString);
-extern void perfmon_printMarkerResults(bstring filepath);
-extern void perfmon_logCounterResults(double time);
-extern void perfmon_printCounterResults(void);
 
+/* Internal helpers */
+extern int getCounterTypeOffset(int index);
+extern uint64_t perfmon_getMaxCounterValue(RegisterType type);
 
-extern void perfmon_printCounters(void);
-extern void perfmon_printEvents(void);
 
 #endif /*PERFMON_H*/
diff --git a/src/includes/perfmon_atom.h b/src/includes/perfmon_atom.h
index 201cea6..73cc9f9 100644
--- a/src/includes/perfmon_atom.h
+++ b/src/includes/perfmon_atom.h
@@ -5,13 +5,13 @@
  *
  *      Description:  Header file of perfmon module for Atom
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,8 +29,7 @@
  */
 
 #include <perfmon_atom_events.h>
-#include <perfmon_atom_groups.h>
+#include <error.h>
 
-static int perfmon_numGroupsAtom = NUM_GROUPS_ATOM;
 static int perfmon_numArchEventsAtom = NUM_ARCH_EVENTS_ATOM;
 
diff --git a/src/includes/perfmon_atom_events.txt b/src/includes/perfmon_atom_events.txt
index 4ca18e4..cb4e2fc 100644
--- a/src/includes/perfmon_atom_events.txt
+++ b/src/includes/perfmon_atom_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_atom_events.txt
-# 
+#
 #      Description:  Event list for Intel Atom
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/perfmon_broadwell.h b/src/includes/perfmon_broadwell.h
new file mode 100644
index 0000000..8e5fc2a
--- /dev/null
+++ b/src/includes/perfmon_broadwell.h
@@ -0,0 +1,1793 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfmon_broadwell.h
+ *
+ *      Description:  Header File of perfmon module for Intel Broadwell.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <perfmon_broadwell_events.h>
+#include <perfmon_broadwell_counters.h>
+#include <perfmon_broadwelld_events.h>
+#include <perfmon_broadwelld_counters.h>
+#include <perfmon_broadwellEP_events.h>
+#include <perfmon_broadwellEP_counters.h>
+#include <error.h>
+#include <affinity.h>
+#include <limits.h>
+#include <topology.h>
+#include <access.h>
+
+
+static int perfmon_numCountersBroadwell = NUM_COUNTERS_BROADWELL;  /* Broadwell table sizes; the NUM_* macros are presumably defined by the headers included above */
+static int perfmon_numCoreCountersBroadwell = NUM_COUNTERS_CORE_BROADWELL;
+static int perfmon_numArchEventsBroadwell = NUM_ARCH_EVENTS_BROADWELL;
+
+static int perfmon_numCountersBroadwellD = NUM_COUNTERS_BROADWELLD;  /* the same three sizes for the BroadwellD tables */
+static int perfmon_numCoreCountersBroadwellD = NUM_COUNTERS_CORE_BROADWELLD;
+static int perfmon_numArchEventsBroadwellD = NUM_ARCH_EVENTS_BROADWELLD;
+
+static int perfmon_numCountersBroadwellEP = NUM_COUNTERS_BROADWELLEP;  /* the same three sizes for the BroadwellEP tables */
+static int perfmon_numCoreCountersBroadwellEP = NUM_COUNTERS_CORE_BROADWELLEP;
+static int perfmon_numArchEventsBroadwellEP = NUM_ARCH_EVENTS_BROADWELLEP;
+
+int bdw_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event);  /* forward declarations: both CBOX variants are defined further down */
+int bdwep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event);
+int (*broadwell_cbox_setup)(int, RegisterIndex, PerfmonEvent *);  /* set to one of the two functions above by perfmon_init_broadwell() */
+
+int perfmon_init_broadwell(int cpu_id)
+{
+    /* Per-CPU init: request the tile and socket locks, clear MSR_PEBS_ENABLE, pick the CBOX setup routine. */
+    lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+
+    /* A failed write is handled inside the CHECK macro, so the selection below must stay after it. */
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+
+    /* Models BROADWELL_E and BROADWELL_D use the EP CBOX setup routine, every other model the plain one. */
+    broadwell_cbox_setup = ((cpuid_info.model == BROADWELL_E) || (cpuid_info.model == BROADWELL_D))
+                               ? bdwep_cbox_setup : bdw_cbox_setup;
+
+    return 0;
+}
+
+uint32_t bdw_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{   /* Returns the flag bits for fixed counter `index`, placed in that counter's 4-bit field (shift index*4). Writes no register; cpu_id is unused. */
+    int j;
+    uint32_t flags = (1ULL<<(1+(index*4)));  /* bit 1 of the field is always set (presumably user-mode counting -- confirm) */
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_COUNT_KERNEL:
+                flags |= (1ULL<<(index*4));      /* bit 0 of the field */
+                break;
+            case EVENT_OPTION_ANYTHREAD:
+                flags |= (1ULL<<(2+(index*4)));  /* bit 2 of the field */
+                break;  /* explicit break: this case used to fall through into default */
+            default:
+                break;
+        }
+    }
+    return flags;
+}
+
+int bdw_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{   /* Builds the config value of core counter `index` from `event` and writes it on cpu_id; event IDs 0xB7/0xBB also write MSR_OFFCORE_RESP0/1. Returns 0 on the paths visible here. */
+    int j;
+    uint64_t flags = 0x0ULL;
+    uint64_t offcore_flags = 0x0ULL;  /* value for MSR_OFFCORE_RESP0/1; only written for event IDs 0xB7/0xBB */
+
+    flags = (1ULL<<22)|(1ULL<<16);  /* always set; presumably the enable and user-mode bits -- confirm against the Intel SDM */
+    /* Unit mask goes to bits 8 and up, the event ID below it (standard 8 bit event mask: [7:0]) */
+    flags |= (event->umask<<8) + event->eventId;
+
+    /* Merge custom cfgBits/cmask into bits 16 and up; skipped when cfgBits is 0 */
+    if ((event->cfgBits != 0) &&
+        (event->eventId != 0xB7) &&  /* for 0xB7/0xBB cfgBits and cmask are used as offcore bit positions further down */
+        (event->eventId != 0xBB))
+    {
+        flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+    }
+
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_ANYTHREAD:
+                    flags |= (1ULL<<21);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;  /* 8-bit threshold at bits 24-31 */
+                    break;
+                case EVENT_OPTION_IN_TRANS:
+                    flags |= (1ULL<<32);
+                    break;
+                case EVENT_OPTION_IN_TRANS_ABORT:
+                    flags |= (1ULL<<33);
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    offcore_flags |= (event->options[j].value & 0x8FFFULL);  /* low part of the offcore value; NOTE(review): mask 0x8FFF skips bits 12-14 -- confirm */
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    offcore_flags |= (event->options[j].value<<16);  /* shifted to bit 16 and up, not masked */
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+
+    if (event->eventId == 0xB7)
+    {
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))  /* 0xFF in either field keeps the option-derived value */
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);  /* overwrites (does not OR) what MATCH0/MATCH1 set */
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
+    }
+    else if (event->eventId == 0xBB)
+    {
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))  /* same rule as for 0xB7, but targeting MSR_OFFCORE_RESP1 */
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, offcore_flags));
+    }
+    if (flags != currentConfig[cpu_id][index])  /* skip the register write when the cached value already matches */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister , flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int bdw_ubox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    /* Build the control value of one UBOX counter and write it when it differs from the cached one. */
+    int opt;
+    uint64_t config = (1ULL<<22)|(1ULL<<20);
+
+    /* Only the CPU recorded in its socket lock programs the box; any other CPU returns 0 at once. */
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+
+    config |= (event->umask<<8) + event->eventId;
+    /* Fold the supported event options into the control value; unknown options are ignored. */
+    for (opt = 0; opt < event->numberOfOptions; opt++)
+    {
+        switch (event->options[opt].type)
+        {
+            case EVENT_OPTION_EDGE:
+                config |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                config |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                config |= (event->options[opt].value & 0x1FULL) << 24;
+                break;
+            default:
+                break;
+        }
+    }
+    /* currentConfig holds the last value written, so an unchanged setup skips the register write. */
+    if (config != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, config, SETUP_UBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, config));
+        currentConfig[cpu_id][index] = config;
+    }
+    return 0;
+}
+
+int bdw_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    /* Program one CBOX counter config register; perfmon_init_broadwell() selects this variant for models other than BROADWELL_E/BROADWELL_D. */
+    int j;
+    uint64_t flags = 0x0ULL;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)  /* only the CPU recorded in the socket lock programs the box */
+    {
+        return 0;
+    }
+    flags = (1ULL<<22)|(1ULL<<20);
+    flags |= (event->umask<<8) + event->eventId;
+    for(j = 0; j < event->numberOfOptions; j++)  /* runs zero times without options, so no separate guard is needed */
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_EDGE:
+                flags |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                flags |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                flags |= (event->options[j].value & 0x1FULL) << 24;  /* 5-bit threshold at bits 24-28 */
+                break;
+            default:  /* other option types are ignored, as in the sibling setup functions */
+                break;
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])  /* skip the register write when the cached value already matches */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_CBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int bdwep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{   /* EP-variant CBOX setup: writes the box's two filter registers, then the counter's config register. Returns 0 on the paths visible here. */
+    int j;
+    uint64_t flags = 0x0ULL;
+    uint64_t filter_flags0 = 0x0ULL;
+    uint64_t filter_flags1 = 0x0ULL;
+    uint32_t filter0 = box_map[counter_map[index].type].filterRegister1;
+    uint32_t filter1 = box_map[counter_map[index].type].filterRegister2;
+    int set_state_all = 0;  /* set for event 0x34, cleared again when a STATE option is supplied */
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)  /* only the CPU recorded in the socket lock programs the box */
+    {
+        return 0;
+    }
+
+    flags = (1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->eventId == 0x34)
+    {
+        set_state_all = 1;
+    }
+    if ((event->eventId == 0x13 || event->eventId == 0x11) && (event->umask & 0x2ULL))
+    {
+        /* Warning only, setup continues. The newline keeps it from running into later stderr output. */
+        fprintf(stderr, "IRQ_REJECTED should not be Ored with the other umasks.\n");
+    }
+
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;
+                    break;
+                case EVENT_OPTION_OPCODE:
+                    filter_flags1 |= (0x3<<27);  /* bits 27-28 are set together with every opcode filter */
+                    filter_flags1 |= (extractBitField(event->options[j].value,5,0) << 20);
+                    break;
+                case EVENT_OPTION_NID:
+                    filter_flags1 |= (extractBitField(event->options[j].value,16,0));
+                    break;
+                case EVENT_OPTION_STATE:
+                    filter_flags0 |= (extractBitField(event->options[j].value,6,0) << 17);
+                    set_state_all = 0;  /* an explicit state filter replaces the default applied below */
+                    break;
+                case EVENT_OPTION_TID:
+                    filter_flags0 |= (extractBitField(event->options[j].value,6,0));
+                    flags |= (1ULL<<19);  /* the TID option also sets bit 19 in the counter config */
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    filter_flags1 |= (extractBitField(event->options[j].value,2,0) << 30);
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+
+    if (filter_flags0 != 0x0ULL)  /* both filter registers are always written; only non-zero values are logged */
+    {
+        VERBOSEPRINTREG(cpu_id, filter0, filter_flags0, SETUP_CBOX_FILTER0);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter0, filter_flags0));
+    }
+    else
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter0, 0x0ULL));
+    }
+    if (filter_flags1 != 0x0ULL)
+    {
+        VERBOSEPRINTREG(cpu_id, filter1, filter_flags1, SETUP_CBOX_FILTER1);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter1, filter_flags1));
+    }
+    else
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter1, 0x0ULL));
+    }
+
+    if (set_state_all)  /* event 0x34 without a STATE option: OR 0x1F into filter 0 at bits 17-21 (read-modify-write) */
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, filter0, &filter_flags0));
+        filter_flags0 |= (0x1F << 17);
+        VERBOSEPRINTREG(cpu_id, filter0, filter_flags0, SETUP_CBOX_DEF_FILTER_STATE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter0, filter_flags0));
+    }
+    if (flags != currentConfig[cpu_id][index])  /* skip the config write when the cached value already matches */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_CBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int bdw_wbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{   /* Program one WBOX counter: its config register plus the box's filter register (filterRegister1). Returns 0 on the paths visible here. */
+    int j;
+    uint64_t flags = 0x0ULL;
+    uint64_t filter = box_map[counter_map[index].type].filterRegister1;
+    int clean_filter = 1;  /* stays 1 unless an OCCUPANCY_FILTER option writes the filter register */
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)  /* only the CPU recorded in the socket lock programs the box */
+    {
+        return 0;
+    }
+
+    flags = (1ULL<<22)|(1ULL<<20);
+    flags |= event->eventId;
+    if ((event->umask > 0x00) && (event->umask <= 0x3))
+    {
+        flags |= (event->umask << 14);  /* umask 1..3 goes to bits 14-15, the same field the OCCUPANCY option sets */
+    }
+    else if (event->umask == 0xFF)
+    {
+        flags = (1ULL<<21);  /* NOTE(review): plain assignment drops bits 22/20 and the event ID set above -- confirm this is intended */
+    }
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0x1FULL) << 24;
+                    break;
+                case EVENT_OPTION_OCCUPANCY:
+                    flags |= ((event->options[j].value & 0x3ULL)<<14);
+                    break;
+                case EVENT_OPTION_OCCUPANCY_FILTER:
+                    clean_filter = 0;  /* the filter register is written right here, so it must not be cleared afterwards */
+                    VERBOSEPRINTREG(cpu_id, filter, (event->options[j].value & 0xFFFFFFFFULL), SETUP_WBOX_FILTER);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter, (event->options[j].value & 0xFFFFFFFFULL)));
+                    break;
+                case EVENT_OPTION_OCCUPANCY_EDGE:
+                    flags |= (1ULL<<31);
+                    break;
+                case EVENT_OPTION_OCCUPANCY_INVERT:
+                    flags |= (1ULL<<30);
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+
+    if (clean_filter)
+    {
+        /* Log the value actually written. `j` is past the last option (or unset) here, so options[j] must not be read. */
+        VERBOSEPRINTREG(cpu_id, filter, 0x0ULL, CLEAN_WBOX_FILTER);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter, 0x0ULL));
+    }
+    if (flags != currentConfig[cpu_id][index])  /* skip the config write when the cached value already matches */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_WBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int bdw_bbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{   /* Program one BBOX counter on its PCI device, including the opcode-match and address-match registers. Returns 0, or -ENODEV when HPMcheck() rejects the device. */
+    int j;
+    uint64_t flags = 0x0ULL;
+    uint64_t filter = 0x0ULL;
+    int opcode_flag = 0;  /* set once an OPCODE option has written the opcode-match register */
+    int match_flag = 0;   /* set once a MATCH0 option has written both address-match registers */
+    PciDeviceIndex dev = counter_map[index].device;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)  /* only the CPU recorded in the socket lock programs the box */
+    {
+        return 0;
+    }
+    if (!HPMcheck(dev, cpu_id))
+    {
+        return -ENODEV;
+    }
+
+    flags = (1ULL<<22)|(1ULL<<20);
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;
+                    break;
+                case EVENT_OPTION_OPCODE:
+                    VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH,
+                                        (event->options[j].value & 0x3FULL), SETUP_BBOX_OPCODE);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH,
+                                        (event->options[j].value & 0x3FULL)));
+                    opcode_flag = 1;
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    filter = ((event->options[j].value & 0xFFFFFFC0ULL));  /* low word of the option value, bits 0-5 cleared */
+                    VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, filter, SETUP_ADDR0_FILTER);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, filter));  /* PCI check macro, as for every other write to `dev` */
+                    filter = (((event->options[j].value>>32) & 0x3FFFULL));  /* bits 32-45 of the option value */
+                    VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, filter, SETUP_ADDR1_FILTER);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, filter));
+                    match_flag = 1;
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    if (!opcode_flag)  /* no OPCODE option: zero the opcode-match register */
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH, 0x0ULL, CLEAR_BBOX_OPCODE);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH, 0x0ULL));
+    }
+    if (!match_flag)  /* no MATCH0 option: zero both address-match registers */
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, 0x0ULL, CLEAR_BBOX_MATCH0);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, 0x0ULL));
+        VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, 0x0ULL, CLEAR_BBOX_MATCH1);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, 0x0ULL));
+    }
+    if (flags != currentConfig[cpu_id][index])  /* skip the config write when the cached value already matches */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_BBOX);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int bdw_mbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    /* Configure one MBOX counter through its PCI device: build the control value, write it if it changed. */
+    int opt;
+    uint64_t config = (1ULL<<20)|(1ULL<<22);
+    PciDeviceIndex dev = counter_map[index].device;
+
+    /* Only the CPU recorded in its socket lock touches the box; a rejected device yields -ENODEV. */
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!HPMcheck(dev, cpu_id))
+    {
+        return -ENODEV;
+    }
+
+    config |= (event->umask<<8) + event->eventId;
+    /* Apply the options this box understands; the loop simply does not run without options. */
+    for (opt = 0; opt < event->numberOfOptions; opt++)
+    {
+        switch (event->options[opt].type)
+        {
+            case EVENT_OPTION_EDGE:
+                config |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                config |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                config |= (event->options[opt].value & 0xFFULL) << 24;
+                break;
+            default:
+                break;
+        }
+    }
+    /* currentConfig holds the last value written, so an unchanged setup skips the PCI write. */
+    if (config != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, config, SETUP_MBOX);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, config));
+        currentConfig[cpu_id][index] = config;
+    }
+    return 0;
+}
+
+int bdw_mboxfix_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    /* Configure the fixed MBOX counter: no event ID or unit mask is encoded, */
+    /* only bits 20 and 22 plus an optional invert bit. */
+    int opt;
+    uint64_t config = (1ULL<<20)|(1ULL<<22);
+    PciDeviceIndex dev = counter_map[index].device;
+
+    /* Only the CPU recorded in its socket lock touches the box. */
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    /* A device rejected by HPMcheck() is reported as -ENODEV. */
+    if (!HPMcheck(dev, cpu_id))
+    {
+        return -ENODEV;
+    }
+
+    /* INVERT is the only option honoured here; every other option type is ignored. */
+    for (opt = 0; opt < event->numberOfOptions; opt++)
+    {
+        if (event->options[opt].type == EVENT_OPTION_INVERT)
+        {
+            config |= (1ULL<<23);
+        }
+    }
+
+    /* currentConfig holds the last value written, so an unchanged setup skips the PCI write. */
+    if (config != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, config, SETUP_MBOX);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, config));
+        currentConfig[cpu_id][index] = config;
+    }
+
+    return 0;
+}
+
+int bdw_ibox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    /* Configure one IBOX counter through its PCI device: build the control value, write it if it changed. */
+    int opt;
+    uint64_t config = (1ULL<<20)|(1ULL<<22);
+    PciDeviceIndex dev = counter_map[index].device;
+
+    /* Only the CPU recorded in its socket lock touches the box; a rejected device yields -ENODEV. */
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!HPMcheck(dev, cpu_id))
+    {
+        return -ENODEV;
+    }
+
+    config |= (event->umask<<8) + event->eventId;
+    /* Edge, invert and an 8-bit threshold are supported; every other option type is ignored. */
+    for (opt = 0; opt < event->numberOfOptions; opt++)
+    {
+        switch (event->options[opt].type)
+        {
+            case EVENT_OPTION_EDGE:
+                config |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                config |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                config |= (event->options[opt].value & 0xFFULL) << 24;
+                break;
+            default:
+                break;
+        }
+    }
+    /* currentConfig holds the last value written, so an unchanged setup skips the PCI write. */
+    if (config != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, config, SETUP_IBOX);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, config));
+        currentConfig[cpu_id][index] = config;
+    }
+    return 0;
+}
+
+int bdw_pbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    /* Configure one PBOX counter; the control register is written twice (without, then with bit 22). */
+    int opt;
+    uint64_t staged = (1ULL<<20);   /* control value for the first write, enable bit clear */
+    uint64_t enabled;               /* the same value with the enable bit (22) set */
+    PciDeviceIndex dev = counter_map[index].device;
+
+    /* Only the CPU recorded in its socket lock touches the box; a rejected device yields -ENODEV. */
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!HPMcheck(dev, cpu_id))
+    {
+        return -ENODEV;
+    }
+
+    staged |= (event->umask<<8) + event->eventId;
+    for (opt = 0; opt < event->numberOfOptions; opt++)
+    {
+        switch (event->options[opt].type)
+        {
+            case EVENT_OPTION_EDGE:
+                staged |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                staged |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                staged |= (event->options[opt].value & 0xFFULL) << 24;
+                break;
+            default:
+                break;
+        }
+    }
+    enabled = staged | (1ULL<<22);
+
+    /* Intel notes the registers must be written twice to hold, once without enable and again with enable */
+    if (enabled != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, staged, SETUP_PBOX);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, staged));
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, enabled, SETUP_PBOX_TWICE);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, enabled));
+        currentConfig[cpu_id][index] = enabled;
+    }
+    return 0;
+}
+
+int bdw_rbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{   /* Program one RBOX counter on its PCI device; the control register is written twice (see below). Returns 0, or -ENODEV when HPMcheck() rejects the device. */
+    int j;
+    uint64_t flags = 0x0ULL;
+    PciDeviceIndex dev = counter_map[index].device;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)  /* only the CPU recorded in the socket lock programs the box */
+    {
+        return 0;
+    }
+    if (!HPMcheck(counter_map[index].device, cpu_id))
+    {
+        return -ENODEV;
+    }
+
+    flags = (1ULL<<20);  /* enable bit 22 is deliberately left out until the second write */
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    if ((flags|(1ULL<<22)) != currentConfig[cpu_id][index])  /* the cache stores the enabled value, so compare with bit 22 set */
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_RBOX);  /* label was SETUP_PBOX (copy-paste from bdw_pbox_setup) */
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+        /* Intel notes the registers must be written twice to hold, once without enable and again with enable */
+        flags |= (1ULL<<22);
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_RBOX_TWICE);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int bdw_sbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{   /* Program one SBOX counter; the control register is written twice, first without and then with bit 22. Returns 0, or -ENODEV when HPMcheck() rejects the device. */
+    int j;
+    uint64_t flags = 0x0ULL;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)  /* only the CPU recorded in the socket lock programs the box */
+    {
+        return 0;
+    }
+    if (!HPMcheck(counter_map[index].device, cpu_id))
+    {
+        return -ENODEV;
+    }
+
+    flags |= (event->umask<<8) + event->eventId;  /* unlike the other boxes, no control bits are preset before this */
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_TID:
+                    flags |= (1ULL<<19);  /* only the bit is set; the option's value is not used here */
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0xFFULL)<<24);
+                    break;
+
+                default:
+                    break;
+            }
+        }
+    }
+    if ((flags|(1ULL<<22)) != currentConfig[cpu_id][index])  /* the cache stores the enabled value, so compare with bit 22 set */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_SBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, counter_map[index].device, counter_map[index].configRegister, flags));  /* first write: enable bit clear */
+        flags |= (1ULL<<22);
+        /* Due to an issue found with the Intel® Xeon® Processor E5 and E7 v4 Product Families
+         * hardware, it will be necessary to write each control register twice in a row in order for
+         * the Event Select field to take hold. It is recommended that SW perform the first write
+         * with the enable bit set to 0 followed by a write of the same control register value but
+         * with the enable bit set to 1.*/
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_SBOX_TWICE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, counter_map[index].device, counter_map[index].configRegister, flags));  /* second write: same value with the enable bit set */
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int bdw_qbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event, PciDeviceIndex filterdev)
+{
+    int j;
+    uint64_t flags = 0x0ULL;
+    uint64_t filterreg;
+    uint64_t filterval = 0x0ULL;
+    PciDeviceIndex dev = counter_map[index].device;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!HPMcheck(counter_map[index].device, cpu_id))
+    {
+        return -ENODEV;
+    }
+
+    flags = (1ULL<<20)|(1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->cfgBits == 0x01)
+    {
+        flags |= (1ULL<<21);
+    }
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    if (HPMcheck(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_RX_MATCH_0;
+                        filterval = event->options[j].value & 0x8003FFF8ULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_QBOX_RX_MATCH0);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    if (HPMcheck(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_RX_MATCH_1;
+                        filterval = event->options[j].value & 0x000F000FULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_QBOX_RX_MATCH1);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MATCH2:
+                    if (HPMcheck(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_TX_MATCH_0;
+                        filterval = event->options[j].value & 0x8003FFF8ULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_QBOX_TX_MATCH0);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MATCH3:
+                    if (HPMcheck(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_TX_MATCH_1;
+                        filterval = event->options[j].value & 0x000F000FULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_QBOX_TX_MATCH1);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MASK0:
+                    if (HPMcheck(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_RX_MASK_0;
+                        filterval = event->options[j].value & 0x8003FFF8ULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_QBOX_RX_MASK0);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MASK1:
+                    if (HPMcheck(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_RX_MASK_1;
+                        filterval = event->options[j].value & 0x000F000FULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_QBOX_RX_MASK1);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MASK2:
+                    if (HPMcheck(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_TX_MASK_0;
+                        filterval = event->options[j].value & 0x8003FFF8ULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_QBOX_TX_MASK0);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MASK3:
+                    if (HPMcheck(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_TX_MASK_0;
+                        filterval = event->options[j].value & 0x000F000FULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_QBOX_TX_MASK1);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_QBOX);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+/*
+ * Freeze the uncore counters.
+ * Expands in place and relies on haveLock, eventSet and cpu_id being visible
+ * at the expansion site. The write of bit 31 to MSR_UNC_V3_U_PMON_GLOBAL_CTL
+ * only happens when haveLock is set and regTypeMask has a bit set above the
+ * lowest four bits.
+ */
+#define BDW_FREEZE_UNCORE \
+    if (haveLock && (eventSet->regTypeMask & ~(0xFULL))) \
+    { \
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST (1ULL<<31), FREEZE_UNCORE); \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, (1ULL<<31))); \
+    }
+
+/*
+ * Unfreeze the uncore counters.
+ * Expands in place and relies on haveLock, eventSet and cpu_id being visible
+ * at the expansion site. The write of bit 29 to MSR_UNC_V3_U_PMON_GLOBAL_CTL
+ * only happens when haveLock is set and regTypeMask has a bit set above the
+ * lowest four bits.
+ */
+#define BDW_UNFREEZE_UNCORE \
+    if (haveLock && (eventSet->regTypeMask & ~(0xFULL))) \
+    { \
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST (1ULL<<29), UNFREEZE_UNCORE); \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, (1ULL<<29))); \
+    }
+
+/*
+ * Zero the uncore counter registers of the event set, then unfreeze the
+ * uncore.
+ * Expands in place and relies on haveLock, eventSet and cpu_id being visible
+ * at the expansion site; it declares its own loop variable i and the locals
+ * index, type and dev inside the guarded block.
+ * Events whose counter type is below UNCORE or equal to WBOX0FIX are skipped.
+ * For the others, and only if HPMcheck() accepts the device, 0 is written to
+ * counterRegister and, when it is non-zero, to counterRegister2. Finally bit
+ * 29 is written to MSR_UNC_V3_U_PMON_GLOBAL_CTL.
+ */
+#define BDW_UNFREEZE_UNCORE_AND_RESET_CTR \
+    if (haveLock && (eventSet->regTypeMask & ~(0xFULL))) \
+    { \
+        for (int i=0;i < eventSet->numberOfEvents;i++) \
+        { \
+            RegisterIndex index = eventSet->events[i].index; \
+            RegisterType type = counter_map[index].type; \
+            if ((type < UNCORE) || (type == WBOX0FIX)) \
+            { \
+                continue; \
+            } \
+            PciDeviceIndex dev = counter_map[index].device; \
+            if (HPMcheck(dev, cpu_id)) { \
+                VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR_MANUAL); \
+                CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL)); \
+                if (counter_map[index].counterRegister2 != 0x0) \
+                { \
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL, CLEAR_CTR_MANUAL); \
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL)); \
+                } \
+            } \
+        } \
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST (1ULL<<29), UNFREEZE_UNCORE); \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, (1ULL<<29))); \
+    }
+
+/*
+ * Configure all events of eventSet for the CPU that thread_id maps to.
+ *
+ * The uncore is frozen first (BDW_FREEZE_UNCORE), the core counters are
+ * disabled when PMC or FIXED events are part of the set, and then every event
+ * whose type is enabled in regTypeMask is handed to the setup routine of its
+ * register type. The bits returned for FIXED events are collected and written
+ * to MSR_PERF_FIXED_CTR_CTRL in a single access at the end.
+ *
+ * thread_id: index into groupSet->threads
+ * eventSet:  events to configure; threadCounter[thread_id].init is set to
+ *            TRUE for every event that is processed
+ *
+ * Returns 0 when the end of the function is reached.
+ */
+int perfmon_setupCounterThread_broadwell(
+        int thread_id,
+        PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    uint64_t flags;
+    uint64_t fixed_flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    /* haveLock gates the uncore accesses below (see BDW_FREEZE_UNCORE, UBOXFIX). */
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+    BDW_FREEZE_UNCORE;
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        /* Disable the core counters and reset overflow control and PEBS enable. */
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0xC00000070000000F));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        uint64_t reg = counter_map[index].configRegister;
+        PciDeviceIndex dev = counter_map[index].device;
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
+        flags = 0x0ULL;
+        switch (type)
+        {
+            case PMC:
+                bdw_pmc_setup(cpu_id, index, event);
+                break;
+
+            case FIXED:
+                /* Only collected here; written once after the loop. */
+                fixed_flags |= bdw_fixed_setup(cpu_id, index, event);
+                break;
+
+            case POWER:
+            case THERMAL:
+                /* No configuration register is written for these types. */
+                break;
+
+            case UBOX:
+                bdw_ubox_setup(cpu_id, index, event);
+                break;
+            case UBOXFIX:
+                if (haveLock)
+                {
+                    flags = (1ULL<<22)|(1ULL<<20);
+                    VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_UBOXFIX);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+                }
+                break;
+
+            /*
+             * CBOX16..CBOX23 are listed as well so that every CBOX type read
+             * by perfmon_stopCountersThread_broadwell is also configured.
+             */
+            case CBOX0:
+            case CBOX1:
+            case CBOX2:
+            case CBOX3:
+            case CBOX4:
+            case CBOX5:
+            case CBOX6:
+            case CBOX7:
+            case CBOX8:
+            case CBOX9:
+            case CBOX10:
+            case CBOX11:
+            case CBOX12:
+            case CBOX13:
+            case CBOX14:
+            case CBOX15:
+            case CBOX16:
+            case CBOX17:
+            case CBOX18:
+            case CBOX19:
+            case CBOX20:
+            case CBOX21:
+            case CBOX22:
+            case CBOX23:
+                broadwell_cbox_setup(cpu_id, index, event);
+                break;
+
+            case BBOX0:
+            case BBOX1:
+                bdw_bbox_setup(cpu_id, index, event);
+                break;
+
+            case WBOX:
+                bdw_wbox_setup(cpu_id, index, event);
+                break;
+            case WBOX0FIX:
+                break;
+
+            case MBOX0:
+            case MBOX1:
+            case MBOX2:
+            case MBOX3:
+            case MBOX4:
+            case MBOX5:
+            case MBOX6:
+            case MBOX7:
+                bdw_mbox_setup(cpu_id, index, event);
+                break;
+            case MBOX0FIX:
+            case MBOX1FIX:
+            case MBOX2FIX:
+            case MBOX3FIX:
+            case MBOX4FIX:
+            case MBOX5FIX:
+            case MBOX6FIX:
+            case MBOX7FIX:
+                bdw_mboxfix_setup(cpu_id, index, event);
+                break;
+
+            case PBOX:
+                bdw_pbox_setup(cpu_id, index, event);
+                break;
+
+            case IBOX0:
+            case IBOX1:
+                bdw_ibox_setup(cpu_id, index, event);
+                break;
+
+            case RBOX0:
+            case RBOX1:
+                bdw_rbox_setup(cpu_id, index, event);
+                break;
+
+            case SBOX0:
+            case SBOX1:
+            case SBOX2:
+            case SBOX3:
+                bdw_sbox_setup(cpu_id, index, event);
+                break;
+
+            /* Each QBOX gets the mask device of its own port for filtering. */
+            case QBOX0:
+                bdw_qbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_0);
+                break;
+            case QBOX1:
+                bdw_qbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_1);
+                break;
+            case QBOX2:
+                bdw_qbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_2);
+                break;
+
+            default:
+                break;
+        }
+    }
+    if (fixed_flags > 0x0ULL)
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
+    }
+    return 0;
+}
+
+/*
+ * Start counting for all initialised events of eventSet on the CPU that
+ * thread_id maps to.
+ *
+ * PMC and FIXED counters are zeroed and their enable bits collected in flags.
+ * For POWER, WBOX0FIX and QBOXnFIX the current register value is stored as
+ * startData. The remaining uncore counters are zeroed by
+ * BDW_UNFREEZE_UNCORE_AND_RESET_CTR, which also unfreezes the uncore.
+ * Finally the core overflow bits are cleared and the collected enable bits
+ * are written to MSR_PERF_GLOBAL_CTRL.
+ *
+ * thread_id: index into groupSet->threads
+ * eventSet:  events to start; startData and counterData of the processed
+ *            events are overwritten
+ *
+ * Returns 0 when the end of the function is reached.
+ */
+int perfmon_startCountersThread_broadwell(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    uint64_t flags = 0x0ULL;
+    uint64_t tmp = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            PciDeviceIndex dev = counter_map[index].device;
+            eventSet->events[i].threadCounter[thread_id].startData = 0;
+            eventSet->events[i].threadCounter[thread_id].counterData = 0;
+            switch (type)
+            {
+                case PMC:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+                    flags |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));  /* enable counter */
+                    break;
+
+                case FIXED:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+                    flags |= (1ULL<<(index+32));  /* enable fixed counter */
+                    break;
+
+                case POWER:
+                    if (haveLock)
+                    {
+                        tmp = 0x0ULL;
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1,(uint32_t*)&tmp));
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST tmp, START_POWER)
+                        eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
+                    }
+                    break;
+
+                case WBOX0FIX:
+                    if (haveLock)
+                    {
+                        tmp = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, counter1, &tmp));
+                        VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST tmp, START_WBOXFIX);
+                        eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
+                    }
+                    break;
+                /*
+                 * QBOX2FIX is listed as well so that every QBOXnFIX type read
+                 * by perfmon_stopCountersThread_broadwell gets a start value.
+                 */
+                case QBOX0FIX:
+                case QBOX1FIX:
+                case QBOX2FIX:
+                    if (haveLock && HPMcheck(dev, cpu_id))
+                    {
+                        /* Event 0x00 is not a running counter, see the stop/read routines. */
+                        if (eventSet->events[i].event.eventId != 0x00)
+                        {
+                            CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &tmp));
+                            VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST tmp, START_QBOXFIX);
+                            eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
+                        }
+                    }
+                    break;
+
+                default:
+                    break;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = eventSet->events[i].threadCounter[thread_id].startData;
+        }
+    }
+
+    BDW_UNFREEZE_UNCORE_AND_RESET_CTR;
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST (1ULL<<63)|(1ULL<<62)|flags, CLEAR_PMC_AND_FIXED_OVERFLOW)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|flags));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, UNFREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
+    }
+
+    return 0;
+}
+
+/*
+ * Read one uncore counter and update the caller's value and overflow count.
+ *
+ * Does nothing and returns 0 unless cpu_id is the CPU recorded in socket_lock
+ * for its node. Otherwise counterRegister is read; if counterRegister2 is
+ * non-zero the first value is shifted left by 32 bits and the second register
+ * is added. The result is cut to box_map[type].regWidth bits. When the new
+ * value is smaller than *cur_result, the global and/or the box status
+ * register is consulted and *overflows is incremented if the box reports an
+ * overflow for box_offset; the reported bits are written back to the status
+ * registers.
+ *
+ * cpu_id:        CPU used for the register accesses
+ * index:         index into counter_map
+ * event:         not used by this function
+ * cur_result:    in: previous counter value, out: newly read value
+ * overflows:     overflow counter to increment
+ * flags:         with FREEZE_FLAG_CLEAR_CTR set, each counter register is
+ *                zeroed after it has been read
+ * global_offset: not consulted; the bit position in the global status
+ *                register is taken from box_map[type].ovflOffset instead
+ * box_offset:    bit position of this counter in the box status register
+ *
+ * Returns 0 when the end of the function is reached.
+ */
+int bdw_uncore_read(int cpu_id, RegisterIndex index, PerfmonEvent *event,
+                     uint64_t* cur_result, int* overflows, int flags,
+                     int global_offset, int box_offset)
+{
+    uint64_t result = 0x0ULL;
+    uint64_t tmp = 0x0ULL;
+    RegisterType type = counter_map[index].type;
+    PciDeviceIndex dev = counter_map[index].device;
+    uint64_t counter1 = counter_map[index].counterRegister;
+    uint64_t counter2 = counter_map[index].counterRegister2;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+
+    CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &result));
+    VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST result, READ_REG_1);
+    if (flags & FREEZE_FLAG_CLEAR_CTR)
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST 0x0U, CLEAR_PCI_REG_1);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0U));
+    }
+    if (counter2 != 0x0)
+    {
+        /* Two-register counter: the first register supplies the upper part. */
+        result <<= 32;
+        CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter2, &tmp));
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter2, LLU_CAST tmp, READ_REG_2);
+        result += tmp;
+        if (flags & FREEZE_FLAG_CLEAR_CTR)
+        {
+            VERBOSEPRINTPCIREG(cpu_id, dev, counter2, LLU_CAST 0x0U, CLEAR_PCI_REG_2);
+            CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter2, 0x0U));
+        }
+    }
+    result = field64(result, 0, box_map[type].regWidth);
+    if (result < *cur_result)
+    {
+        uint64_t ovf_values = 0x0ULL;
+        /* Own name so the global_offset parameter is no longer shadowed. */
+        int global_bit = box_map[type].ovflOffset;
+        int test_local = 0;
+        if (global_bit != -1)
+        {
+            /*
+             * The masks are built from 1ULL: a plain int 1 shifted by 31 or
+             * more is not a valid 64-bit mask.
+             */
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV,
+                                           MSR_UNC_V3_U_PMON_GLOBAL_STATUS,
+                                           &ovf_values));
+            VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, LLU_CAST ovf_values, READ_GLOBAL_OVFL);
+            if (ovf_values & (1ULL<<global_bit))
+            {
+                VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, LLU_CAST (1ULL<<global_bit), CLEAR_GLOBAL_OVFL);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
+                                                 MSR_UNC_V3_U_PMON_GLOBAL_STATUS,
+                                                 (1ULL<<global_bit)));
+                test_local = 1;
+            }
+        }
+        else
+        {
+            /* No bit in the global status register: ask the box directly. */
+            test_local = 1;
+        }
+
+        if (test_local)
+        {
+            ovf_values = 0x0ULL;
+            CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev,
+                                              box_map[type].statusRegister,
+                                              &ovf_values));
+            VERBOSEPRINTPCIREG(cpu_id, dev, box_map[type].statusRegister, LLU_CAST ovf_values, READ_BOX_OVFL);
+            if (ovf_values & (1ULL<<box_offset))
+            {
+                (*overflows)++;
+                VERBOSEPRINTPCIREG(cpu_id, dev, box_map[type].statusRegister, LLU_CAST (1ULL<<box_offset), RESET_BOX_OVFL);
+                CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev,
+                                                    box_map[type].statusRegister,
+                                                    (1ULL<<box_offset)));
+            }
+        }
+    }
+    *cur_result = result;
+    return 0;
+}
+
+
+/*
+ * Overflow check for a core counter.
+ * Expands in place and relies on counter_result, eventSet, i, thread_id and
+ * cpu_id being visible at the expansion site. When the freshly read
+ * counter_result is smaller than the stored counterData, bit 'offset' of
+ * MSR_PERF_GLOBAL_STATUS is tested and the event's overflow count is
+ * incremented if it is set; the same bit is then written to
+ * MSR_PERF_GLOBAL_OVF_CTRL.
+ * 'offset' is parenthesised in the expansion so that any expression can be
+ * passed safely.
+ */
+#define BDW_CHECK_CORE_OVERFLOW(offset) \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+    { \
+        uint64_t ovf_values = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values)); \
+        if (ovf_values & (1ULL<<(offset))) \
+        { \
+            eventSet->events[i].threadCounter[thread_id].overflows++; \
+        } \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<(offset)))); \
+    }
+
+/*
+ * Stop counting and collect the final values of all initialised events of
+ * eventSet on the CPU that thread_id maps to.
+ *
+ * The core counters are disabled (when PMC or FIXED events are in the set)
+ * and the uncore is frozen. Each event is then read according to its register
+ * type; after the switch the value is cut to box_map[type].regWidth bits and
+ * stored in counterData. The init flag of every event is reset to FALSE.
+ *
+ * thread_id: index into groupSet->threads
+ * eventSet:  events to stop; counterData, overflows and init are updated
+ *
+ * Returns 0 when the end of the function is reached.
+ */
+int perfmon_stopCountersThread_broadwell(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    BDW_FREEZE_UNCORE;
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            PerfmonEvent *event = &(eventSet->events[i].event);
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            /* current aliases this event's counterData. */
+            uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
+            int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+            int ovf_offset = box_map[type].ovflOffset;
+            /*
+             * NOTE(review): the uncore cases below hand &counter_result (0 at
+             * that point) to bdw_uncore_read, so its "new value < old value"
+             * overflow test cannot fire from this routine - confirm intended.
+             */
+            switch (type)
+            {
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    BDW_CHECK_CORE_OVERFLOW(index-cpuid_info.perf_num_fixed_ctr);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
+                    break;
+
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    BDW_CHECK_CORE_OVERFLOW(index+32);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
+                    break;
+
+                case POWER:
+                    if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
+                    {
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_POWER)
+                        /* A value below the stored one is counted as one overflow. */
+                        if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                        }
+                    }
+                    break;
+
+                case THERMAL:
+                    CHECK_TEMP_READ_ERROR(thermal_read(cpu_id,(uint32_t*)&counter_result));
+                    break;
+
+                case BBOX0:
+                case BBOX1:
+                    bdw_uncore_read(cpu_id, index, event, &counter_result, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, ovf_offset, getCounterTypeOffset(index));
+                    break;
+
+                /* MBOX counters use a box status bit shifted by one. */
+                case MBOX0:
+                case MBOX1:
+                case MBOX2:
+                case MBOX3:
+                case MBOX4:
+                case MBOX5:
+                case MBOX6:
+                case MBOX7:
+                    bdw_uncore_read(cpu_id, index, event, &counter_result, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, ovf_offset, getCounterTypeOffset(index)+1);
+                    break;
+
+                case MBOX0FIX:
+                case MBOX1FIX:
+                case MBOX2FIX:
+                case MBOX3FIX:
+                case MBOX4FIX:
+                case MBOX5FIX:
+                case MBOX6FIX:
+                case MBOX7FIX:
+                    bdw_uncore_read(cpu_id, index, event, &counter_result, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, ovf_offset, 0);
+                    break;
+
+                /* IBOX1 uses a box status bit shifted by two. */
+                case IBOX1:
+                    bdw_uncore_read(cpu_id, index, event, &counter_result, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, ovf_offset, getCounterTypeOffset(index)+2);
+                    break;
+
+                case PBOX:
+                case IBOX0:
+                case WBOX:
+                case UBOX:
+                case UBOXFIX:
+                case CBOX0:
+                case CBOX1:
+                case CBOX2:
+                case CBOX3:
+                case CBOX4:
+                case CBOX5:
+                case CBOX6:
+                case CBOX7:
+                case CBOX8:
+                case CBOX9:
+                case CBOX10:
+                case CBOX11:
+                case CBOX12:
+                case CBOX13:
+                case CBOX14:
+                case CBOX15:
+                case CBOX16:
+                case CBOX17:
+                case CBOX18:
+                case CBOX19:
+                case CBOX20:
+                case CBOX21:
+                case CBOX22:
+                case CBOX23:
+                case RBOX0:
+                case RBOX1:
+                case SBOX0:
+                case SBOX1:
+                case SBOX2:
+                case SBOX3:
+                case QBOX0:
+                case QBOX1:
+                case QBOX2:
+                    bdw_uncore_read(cpu_id, index, event, &counter_result, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, ovf_offset, getCounterTypeOffset(index));
+                    break;
+
+                case QBOX0FIX:
+                case QBOX1FIX:
+                case QBOX2FIX:
+                    /*
+                     * Event 0x00: the bit field extracted from the register
+                     * selects one of the constants below (presumably the QPI
+                     * link rate - TODO confirm). Events 0x01 and 0x02 are
+                     * read as plain register values.
+                     */
+                    if (eventSet->events[i].event.eventId == 0x00)
+                    {
+                        HPMread(cpu_id, dev, counter1, &counter_result);
+                        switch(extractBitField(counter_result, 3, 0))
+                        {
+                            case 0x2:
+                                counter_result = 5.6E9;
+                                break;
+                            case 0x3:
+                                counter_result = 6.4E9;
+                                break;
+                            case 0x4:
+                                counter_result = 7.2E9;
+                                break;
+                            case 0x5:
+                                counter_result = 8.0E9;
+                                break;
+                            case 0x6:
+                                counter_result = 8.8E9;
+                                break;
+                            case 0x7:
+                                counter_result = 9.6E9;
+                                break;
+                            default:
+                                counter_result = 0;
+                                break;
+                        }
+                        
+                    }
+                    else if ((eventSet->events[i].event.eventId == 0x01) ||
+                             (eventSet->events[i].event.eventId == 0x02))
+                    {
+                        HPMread(cpu_id, dev, counter1, &counter_result);
+                        VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, STOP_QBOXFIX);
+                        counter_result = field64(counter_result, 0, box_map[type].regWidth);
+                    }
+                    /*
+                     * NOTE(review): this store is overwritten by the field64()
+                     * assignment to *current after the switch; if regWidth for
+                     * these types is 32, the constants above would be
+                     * truncated there - verify against box_map.
+                     */
+                    eventSet->events[i].threadCounter[thread_id].counterData = counter_result;
+                    break;
+
+                default:
+                    break;
+            }
+            *current = field64(counter_result, 0, box_map[type].regWidth);
+        }
+        /* Reset for every event, including those skipped above. */
+        eventSet->events[i].threadCounter[thread_id].init = FALSE;
+    }
+
+
+    return 0;
+}
+
+
+/*
+ * Take an intermediate reading of all initialised events of eventSet on the
+ * CPU that thread_id maps to, without ending the measurement.
+ *
+ * The current MSR_PERF_GLOBAL_CTRL value is saved and the core counters are
+ * disabled (when PMC or FIXED events are in the set), the uncore is frozen,
+ * every event is read into its counterData, and afterwards the uncore is
+ * unfrozen and the saved MSR_PERF_GLOBAL_CTRL value is restored. Uncore
+ * counters are read without clearing them (flags argument 0).
+ *
+ * thread_id: index into groupSet->threads
+ * eventSet:  events to read; counterData and overflows are updated
+ *
+ * Returns 0 when the end of the function is reached.
+ */
+int perfmon_readCountersThread_broadwell(int thread_id, PerfmonEventSet* eventSet)
+{
+    uint64_t flags = 0x0ULL;
+    int haveLock = 0;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        /* Remember the enable bits so they can be restored after reading. */
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &flags));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, SAFE_PMC_FLAGS)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, RESET_PMC_FLAGS)
+    }
+    BDW_FREEZE_UNCORE;
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            counter_result= 0x0ULL;
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            RegisterIndex index = eventSet->events[i].index;
+            PerfmonEvent *event = &(eventSet->events[i].event);
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            /* current aliases this event's counterData. */
+            uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
+            int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+            int ovf_offset = box_map[type].ovflOffset;
+            switch (type)
+            {
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    BDW_CHECK_CORE_OVERFLOW(index-cpuid_info.perf_num_fixed_ctr);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
+                    *current = field64(counter_result, 0, box_map[type].regWidth);
+                    break;
+
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    BDW_CHECK_CORE_OVERFLOW(index+32);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
+                    *current = field64(counter_result, 0, box_map[type].regWidth);
+                    break;
+
+                case POWER:
+                    if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
+                    {
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_POWER)
+                        /* A value below the stored one is counted as one overflow. */
+                        if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                        }
+                        *current = field64(counter_result, 0, box_map[type].regWidth);
+                    }
+                    break;
+
+                case THERMAL:
+                    CHECK_TEMP_READ_ERROR(thermal_read(cpu_id,(uint32_t*)&counter_result));
+                    *current = field64(counter_result, 0, box_map[type].regWidth);
+                    break;
+
+                /* MBOX counters use a box status bit shifted by one. */
+                case MBOX0:
+                case MBOX1:
+                case MBOX2:
+                case MBOX3:
+                case MBOX4:
+                case MBOX5:
+                case MBOX6:
+                case MBOX7:
+                    bdw_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, ovf_offset, getCounterTypeOffset(index)+1);
+                    break;
+
+                case MBOX0FIX:
+                case MBOX1FIX:
+                case MBOX2FIX:
+                case MBOX3FIX:
+                case MBOX4FIX:
+                case MBOX5FIX:
+                case MBOX6FIX:
+                case MBOX7FIX:
+                    bdw_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, ovf_offset, 0);
+                    break;
+
+                /* IBOX1 uses a box status bit shifted by two. */
+                case IBOX1:
+                    bdw_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, ovf_offset, getCounterTypeOffset(index)+2);
+                    break;
+
+                /*
+                 * CBOX16..CBOX23 are listed as well so that this routine
+                 * covers the same CBOX types as
+                 * perfmon_stopCountersThread_broadwell.
+                 */
+                case BBOX0:
+                case BBOX1:
+                case PBOX:
+                case IBOX0:
+                case WBOX:
+                case UBOX:
+                case UBOXFIX:
+                case CBOX0:
+                case CBOX1:
+                case CBOX2:
+                case CBOX3:
+                case CBOX4:
+                case CBOX5:
+                case CBOX6:
+                case CBOX7:
+                case CBOX8:
+                case CBOX9:
+                case CBOX10:
+                case CBOX11:
+                case CBOX12:
+                case CBOX13:
+                case CBOX14:
+                case CBOX15:
+                case CBOX16:
+                case CBOX17:
+                case CBOX18:
+                case CBOX19:
+                case CBOX20:
+                case CBOX21:
+                case CBOX22:
+                case CBOX23:
+                case RBOX0:
+                case RBOX1:
+                case SBOX0:
+                case SBOX1:
+                case SBOX2:
+                case SBOX3:
+                case QBOX0:
+                case QBOX1:
+                case QBOX2:
+                    bdw_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, ovf_offset, getCounterTypeOffset(index));
+                    break;
+
+                /*
+                 * QBOX2FIX is listed as well, matching
+                 * perfmon_stopCountersThread_broadwell. The verbose output is
+                 * emitted after the register has been read, so it shows the
+                 * register content rather than the initial 0.
+                 */
+                case QBOX0FIX:
+                case QBOX1FIX:
+                case QBOX2FIX:
+                    if (eventSet->events[i].event.eventId == 0x00)
+                    {
+                        /*
+                         * The bit field extracted from the register selects
+                         * one of the constants below (presumably the QPI link
+                         * rate - TODO confirm).
+                         */
+                        HPMread(cpu_id, dev, counter1, &counter_result);
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_QBOXFIX)
+                        switch(extractBitField(counter_result, 3, 0))
+                        {
+                            case 0x2:
+                                counter_result = 5.6E9;
+                                break;
+                            case 0x3:
+                                counter_result = 6.4E9;
+                                break;
+                            case 0x4:
+                                counter_result = 7.2E9;
+                                break;
+                            case 0x5:
+                                counter_result = 8.0E9;
+                                break;
+                            case 0x6:
+                                counter_result = 8.8E9;
+                                break;
+                            case 0x7:
+                                counter_result = 9.6E9;
+                                break;
+                            default:
+                                counter_result = 0;
+                                break;
+                        }
+                    }
+                    else if ((eventSet->events[i].event.eventId == 0x01) ||
+                             (eventSet->events[i].event.eventId == 0x02))
+                    {
+                        HPMread(cpu_id, dev, counter1, &counter_result);
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_QBOXFIX)
+                        counter_result = field64(counter_result, 0, box_map[type].regWidth);
+                    }
+                    *current = counter_result;
+                    break;
+
+                default:
+                    break;
+            }
+        }
+    }
+    BDW_UNFREEZE_UNCORE;
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, RESTORE_PMC_FLAGS)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
+    }
+
+    return 0;
+}
+
+int perfmon_finalizeCountersThread_broadwell(int thread_id, PerfmonEventSet* eventSet)
+{ /* Finalize one thread's counters: zero each active event's config register, then clear overflow state and the global control MSRs. */
+    int haveLock = 0; /* becomes 1 if socket_lock[] holds this CPU for its node; gates uncore register access below */
+    int haveTileLock = 0; /* becomes 1 if tile_lock[] holds this CPU for its tile; gates the OFFCORE_RESP clears below */
+    uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62); /* mask later written to MSR_PERF_GLOBAL_OVF_CTRL; bits 63 and 62 are always part of it */
+    uint64_t ovf_values_uncore = 0x0ULL; /* scratch for the config register read-back in the loop; reset to 0 after each use */
+    int cpu_id = groupSet->threads[thread_id].processorId; /* processor ID all register accesses in this call are issued for */
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+    {
+        haveTileLock = 1;
+    }
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* register type not enabled in this event set's mask */
+        {
+            continue; /* skipped events keep their threadCounter init flag untouched */
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PciDeviceIndex dev = counter_map[index].device;
+        uint64_t reg = counter_map[index].configRegister; /* 0 for counters without a config register; those are not cleared below */
+        switch (type)
+        {
+            case PMC:
+                ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr)); /* overflow bit = index minus fixed counter count; presumably PMC entries follow the fixed ones in counter_map - TODO confirm */
+                if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7)) /* event 0xB7: also zero MSR_OFFCORE_RESP0 */
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+                }
+                else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB)) /* event 0xBB: also zero MSR_OFFCORE_RESP1 */
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+                }
+                break;
+            case FIXED:
+                ovf_values_core |= (1ULL<<(index+32)); /* fixed counter with map index i contributes bit 32+i */
+                break;
+            default:
+                break;
+        }
+        if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UNCORE) && (haveLock)))) /* core types always; types >= UNCORE only when haveLock is set */
+        {
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, reg, &ovf_values_uncore)); /* read-back is only used for the verbose print on the next line */
+            VERBOSEPRINTPCIREG(cpu_id, dev, reg, ovf_values_uncore, SHOW_CTL);
+            ovf_values_uncore = 0x0ULL; /* discard the read-back so the variable is 0 again after the loop */
+            VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL)); /* zero the event's config register */
+        }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE; /* mark this event as no longer set up for the thread */
+    }
+
+    if (haveLock && eventSet->regTypeMask & ~(0xFULL)) /* & binds tighter than &&: haveLock set and any mask bit above the lowest four set (presumably the uncore types - TODO confirm) */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, LLU_CAST ovf_values_uncore, CLEAR_UNCORE_OVF)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, ovf_values_uncore)); /* NOTE(review): ovf_values_uncore is always 0 here, so this writes 0 - confirm that is the intended clear value */
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST 0x0ULL, CLEAR_UNCORE_CTRL)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, 0x0ULL));
+    }
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC))) /* only touch the core global MSRs if the set uses FIXED or PMC counters */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST ovf_values_core, CLEAR_GLOBAL_OVF)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core)); /* write the collected overflow bits plus bits 62/63 */
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, CLEAR_GLOBAL_CTRL)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    return 0; /* reached only if no CHECK_MSR_* macro left the function earlier; those macros are defined elsewhere - TODO confirm their error path */
+}
diff --git a/src/includes/perfmon_broadwellEP_counters.h b/src/includes/perfmon_broadwellEP_counters.h
new file mode 100644
index 0000000..d37c871
--- /dev/null
+++ b/src/includes/perfmon_broadwellEP_counters.h
@@ -0,0 +1,362 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfmon_broadwellEP_counters.h
+ *
+ *      Description:  Counter Header File of perfmon module for Broadwell EP/EN/EX.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+
+#define NUM_COUNTERS_BROADWELLEP 216 /* number of entries in broadwellEP_counter_map */
+#define NUM_COUNTERS_CORE_BROADWELLEP 8
+#define NUM_COUNTERS_UNCORE_BROADWELLEP 85
+/* Event option masks accepted per counter type; each list is parenthesized so it expands as a single operand in any expression. */
+#define BDW_EP_VALID_OPTIONS_FIXED (EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK)
+#define BDW_EP_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK| \
+            EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_IN_TRANS_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define BDW_EP_VALID_OPTIONS_UBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK)
+#define BDW_EP_VALID_OPTIONS_CBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+            EVENT_OPTION_TID_MASK|EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_STATE_MASK|\
+            EVENT_OPTION_MATCH0_MASK)
+#define BDW_EP_VALID_OPTIONS_WBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+            EVENT_OPTION_OCCUPANCY_MASK|EVENT_OPTION_OCCUPANCY_FILTER_MASK|EVENT_OPTION_OCCUPANCY_EDGE_MASK|\
+            EVENT_OPTION_OCCUPANCY_INVERT_MASK)
+#define BDW_EP_VALID_OPTIONS_BBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define BDW_EP_VALID_OPTIONS_MBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define BDW_EP_VALID_OPTIONS_IBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define BDW_EP_VALID_OPTIONS_PBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define BDW_EP_VALID_OPTIONS_RBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define BDW_EP_VALID_OPTIONS_SBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_TID_MASK)
+#define BDW_EP_VALID_OPTIONS_QBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+
+
+static RegisterMap broadwellEP_counter_map[NUM_COUNTERS_BROADWELLEP] = {
+    /* Fixed Counters: instructions retired, cycles unhalted core */
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_FIXED},
+    /* PMC Counters: 4 48bit wide */
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, BDW_EP_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, BDW_EP_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, BDW_EP_VALID_OPTIONS_PMC|EVENT_OPTION_IN_TRANS_ABORT_MASK},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, BDW_EP_VALID_OPTIONS_PMC},
+    /* Temperature Sensor*/
+    {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* RAPL counters */
+    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+    {"UBOX0", PMC12, UBOX, MSR_UNC_V3_U_PMON_CTL0, MSR_UNC_V3_U_PMON_CTR0,  0, 0, BDW_EP_VALID_OPTIONS_UBOX},
+    {"UBOX1", PMC13, UBOX, MSR_UNC_V3_U_PMON_CTL1, MSR_UNC_V3_U_PMON_CTR1,  0, 0, BDW_EP_VALID_OPTIONS_UBOX},
+    {"UBOXFIX", PMC14, UBOXFIX, MSR_UNC_V3_U_UCLK_FIXED_CTL, MSR_UNC_V3_U_UCLK_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX0C0", PMC15, CBOX0, MSR_UNC_V3_C0_PMON_CTL0, MSR_UNC_V3_C0_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX0C1", PMC16, CBOX0, MSR_UNC_V3_C0_PMON_CTL1, MSR_UNC_V3_C0_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX0C2", PMC17, CBOX0, MSR_UNC_V3_C0_PMON_CTL2, MSR_UNC_V3_C0_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX0C3", PMC18, CBOX0, MSR_UNC_V3_C0_PMON_CTL3, MSR_UNC_V3_C0_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX1C0", PMC19, CBOX1, MSR_UNC_V3_C1_PMON_CTL0, MSR_UNC_V3_C1_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX1C1", PMC20, CBOX1, MSR_UNC_V3_C1_PMON_CTL1, MSR_UNC_V3_C1_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX1C2", PMC21, CBOX1, MSR_UNC_V3_C1_PMON_CTL2, MSR_UNC_V3_C1_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX1C3", PMC22, CBOX1, MSR_UNC_V3_C1_PMON_CTL3, MSR_UNC_V3_C1_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX2C0", PMC23, CBOX2, MSR_UNC_V3_C2_PMON_CTL0, MSR_UNC_V3_C2_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX2C1", PMC24, CBOX2, MSR_UNC_V3_C2_PMON_CTL1, MSR_UNC_V3_C2_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX2C2", PMC25, CBOX2, MSR_UNC_V3_C2_PMON_CTL2, MSR_UNC_V3_C2_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX2C3", PMC26, CBOX2, MSR_UNC_V3_C2_PMON_CTL3, MSR_UNC_V3_C2_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX3C0", PMC27, CBOX3, MSR_UNC_V3_C3_PMON_CTL0, MSR_UNC_V3_C3_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX3C1", PMC28, CBOX3, MSR_UNC_V3_C3_PMON_CTL1, MSR_UNC_V3_C3_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX3C2", PMC29, CBOX3, MSR_UNC_V3_C3_PMON_CTL2, MSR_UNC_V3_C3_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX3C3", PMC30, CBOX3, MSR_UNC_V3_C3_PMON_CTL3, MSR_UNC_V3_C3_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX4C0", PMC31, CBOX4, MSR_UNC_V3_C4_PMON_CTL0, MSR_UNC_V3_C4_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX4C1", PMC32, CBOX4, MSR_UNC_V3_C4_PMON_CTL1, MSR_UNC_V3_C4_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX4C2", PMC33, CBOX4, MSR_UNC_V3_C4_PMON_CTL2, MSR_UNC_V3_C4_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX4C3", PMC34, CBOX4, MSR_UNC_V3_C4_PMON_CTL3, MSR_UNC_V3_C4_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX5C0", PMC35, CBOX5, MSR_UNC_V3_C5_PMON_CTL0, MSR_UNC_V3_C5_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX5C1", PMC36, CBOX5, MSR_UNC_V3_C5_PMON_CTL1, MSR_UNC_V3_C5_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX5C2", PMC37, CBOX5, MSR_UNC_V3_C5_PMON_CTL2, MSR_UNC_V3_C5_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX5C3", PMC38, CBOX5, MSR_UNC_V3_C5_PMON_CTL3, MSR_UNC_V3_C5_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX6C0", PMC39, CBOX6, MSR_UNC_V3_C6_PMON_CTL0, MSR_UNC_V3_C6_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX6C1", PMC40, CBOX6, MSR_UNC_V3_C6_PMON_CTL1, MSR_UNC_V3_C6_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX6C2", PMC41, CBOX6, MSR_UNC_V3_C6_PMON_CTL2, MSR_UNC_V3_C6_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX6C3", PMC42, CBOX6, MSR_UNC_V3_C6_PMON_CTL3, MSR_UNC_V3_C6_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX7C0", PMC43, CBOX7, MSR_UNC_V3_C7_PMON_CTL0, MSR_UNC_V3_C7_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX7C1", PMC44, CBOX7, MSR_UNC_V3_C7_PMON_CTL1, MSR_UNC_V3_C7_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX7C2", PMC45, CBOX7, MSR_UNC_V3_C7_PMON_CTL2, MSR_UNC_V3_C7_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX7C3", PMC46, CBOX7, MSR_UNC_V3_C7_PMON_CTL3, MSR_UNC_V3_C7_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX8C0", PMC47, CBOX8, MSR_UNC_V3_C8_PMON_CTL0, MSR_UNC_V3_C8_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX8C1", PMC48, CBOX8, MSR_UNC_V3_C8_PMON_CTL1, MSR_UNC_V3_C8_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX8C2", PMC49, CBOX8, MSR_UNC_V3_C8_PMON_CTL2, MSR_UNC_V3_C8_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX8C3", PMC50, CBOX8, MSR_UNC_V3_C8_PMON_CTL3, MSR_UNC_V3_C8_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX9C0", PMC51, CBOX9, MSR_UNC_V3_C9_PMON_CTL0, MSR_UNC_V3_C9_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX9C1", PMC52, CBOX9, MSR_UNC_V3_C9_PMON_CTL1, MSR_UNC_V3_C9_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX9C2", PMC53, CBOX9, MSR_UNC_V3_C9_PMON_CTL2, MSR_UNC_V3_C9_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX9C3", PMC54, CBOX9, MSR_UNC_V3_C9_PMON_CTL3, MSR_UNC_V3_C9_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX10C0", PMC55, CBOX10, MSR_UNC_V3_C10_PMON_CTL0, MSR_UNC_V3_C10_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX10C1", PMC56, CBOX10, MSR_UNC_V3_C10_PMON_CTL1, MSR_UNC_V3_C10_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX10C2", PMC57, CBOX10, MSR_UNC_V3_C10_PMON_CTL2, MSR_UNC_V3_C10_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX10C3", PMC58, CBOX10, MSR_UNC_V3_C10_PMON_CTL3, MSR_UNC_V3_C10_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX11C0", PMC59, CBOX11, MSR_UNC_V3_C11_PMON_CTL0, MSR_UNC_V3_C11_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX11C1", PMC60, CBOX11, MSR_UNC_V3_C11_PMON_CTL1, MSR_UNC_V3_C11_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX11C2", PMC61, CBOX11, MSR_UNC_V3_C11_PMON_CTL2, MSR_UNC_V3_C11_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX11C3", PMC62, CBOX11, MSR_UNC_V3_C11_PMON_CTL3, MSR_UNC_V3_C11_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX12C0", PMC63, CBOX12, MSR_UNC_V3_C12_PMON_CTL0, MSR_UNC_V3_C12_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX12C1", PMC64, CBOX12, MSR_UNC_V3_C12_PMON_CTL1, MSR_UNC_V3_C12_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX12C2", PMC65, CBOX12, MSR_UNC_V3_C12_PMON_CTL2, MSR_UNC_V3_C12_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX12C3", PMC66, CBOX12, MSR_UNC_V3_C12_PMON_CTL3, MSR_UNC_V3_C12_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX13C0", PMC67, CBOX13, MSR_UNC_V3_C13_PMON_CTL0, MSR_UNC_V3_C13_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX13C1", PMC68, CBOX13, MSR_UNC_V3_C13_PMON_CTL1, MSR_UNC_V3_C13_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX13C2", PMC69, CBOX13, MSR_UNC_V3_C13_PMON_CTL2, MSR_UNC_V3_C13_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX13C3", PMC70, CBOX13, MSR_UNC_V3_C13_PMON_CTL3, MSR_UNC_V3_C13_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX14C0", PMC71, CBOX14, MSR_UNC_V3_C14_PMON_CTL0, MSR_UNC_V3_C14_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX14C1", PMC72, CBOX14, MSR_UNC_V3_C14_PMON_CTL1, MSR_UNC_V3_C14_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX14C2", PMC73, CBOX14, MSR_UNC_V3_C14_PMON_CTL2, MSR_UNC_V3_C14_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX14C3", PMC74, CBOX14, MSR_UNC_V3_C14_PMON_CTL3, MSR_UNC_V3_C14_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX15C0", PMC75, CBOX15, MSR_UNC_V3_C15_PMON_CTL0, MSR_UNC_V3_C15_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX15C1", PMC76, CBOX15, MSR_UNC_V3_C15_PMON_CTL1, MSR_UNC_V3_C15_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX15C2", PMC77, CBOX15, MSR_UNC_V3_C15_PMON_CTL2, MSR_UNC_V3_C15_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX15C3", PMC78, CBOX15, MSR_UNC_V3_C15_PMON_CTL3, MSR_UNC_V3_C15_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX16C0", PMC79, CBOX16, MSR_UNC_V3_C16_PMON_CTL0, MSR_UNC_V3_C16_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX16C1", PMC80, CBOX16, MSR_UNC_V3_C16_PMON_CTL1, MSR_UNC_V3_C16_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX16C2", PMC81, CBOX16, MSR_UNC_V3_C16_PMON_CTL2, MSR_UNC_V3_C16_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX16C3", PMC82, CBOX16, MSR_UNC_V3_C16_PMON_CTL3, MSR_UNC_V3_C16_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX17C0", PMC83, CBOX17, MSR_UNC_V3_C17_PMON_CTL0, MSR_UNC_V3_C17_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX17C1", PMC84, CBOX17, MSR_UNC_V3_C17_PMON_CTL1, MSR_UNC_V3_C17_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX17C2", PMC85, CBOX17, MSR_UNC_V3_C17_PMON_CTL2, MSR_UNC_V3_C17_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX17C3", PMC86, CBOX17, MSR_UNC_V3_C17_PMON_CTL3, MSR_UNC_V3_C17_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX18C0", PMC87, CBOX18, MSR_UNC_V3_C18_PMON_CTL0, MSR_UNC_V3_C18_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX18C1", PMC88, CBOX18, MSR_UNC_V3_C18_PMON_CTL1, MSR_UNC_V3_C18_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX18C2", PMC89, CBOX18, MSR_UNC_V3_C18_PMON_CTL2, MSR_UNC_V3_C18_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX18C3", PMC90, CBOX18, MSR_UNC_V3_C18_PMON_CTL3, MSR_UNC_V3_C18_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX19C0", PMC91, CBOX19, MSR_UNC_V3_C19_PMON_CTL0, MSR_UNC_V3_C19_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX19C1", PMC92, CBOX19, MSR_UNC_V3_C19_PMON_CTL1, MSR_UNC_V3_C19_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX19C2", PMC93, CBOX19, MSR_UNC_V3_C19_PMON_CTL2, MSR_UNC_V3_C19_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX19C3", PMC94, CBOX19, MSR_UNC_V3_C19_PMON_CTL3, MSR_UNC_V3_C19_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX20C0", PMC95, CBOX20, MSR_UNC_V3_C20_PMON_CTL0, MSR_UNC_V3_C20_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX20C1", PMC96, CBOX20, MSR_UNC_V3_C20_PMON_CTL1, MSR_UNC_V3_C20_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX20C2", PMC97, CBOX20, MSR_UNC_V3_C20_PMON_CTL2, MSR_UNC_V3_C20_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX20C3", PMC98, CBOX20, MSR_UNC_V3_C20_PMON_CTL3, MSR_UNC_V3_C20_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX21C0", PMC99, CBOX21, MSR_UNC_V3_C21_PMON_CTL0, MSR_UNC_V3_C21_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX21C1", PMC100, CBOX21, MSR_UNC_V3_C21_PMON_CTL1, MSR_UNC_V3_C21_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX21C2", PMC101, CBOX21, MSR_UNC_V3_C21_PMON_CTL2, MSR_UNC_V3_C21_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX21C3", PMC102, CBOX21, MSR_UNC_V3_C21_PMON_CTL3, MSR_UNC_V3_C21_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX22C0", PMC103, CBOX22, MSR_UNC_V3_C22_PMON_CTL0, MSR_UNC_V3_C22_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX22C1", PMC104, CBOX22, MSR_UNC_V3_C22_PMON_CTL1, MSR_UNC_V3_C22_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX22C2", PMC105, CBOX22, MSR_UNC_V3_C22_PMON_CTL2, MSR_UNC_V3_C22_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX22C3", PMC106, CBOX22, MSR_UNC_V3_C22_PMON_CTL3, MSR_UNC_V3_C22_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX23C0", PMC107, CBOX23, MSR_UNC_V3_C23_PMON_CTL0, MSR_UNC_V3_C23_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX23C1", PMC108, CBOX23, MSR_UNC_V3_C23_PMON_CTL1, MSR_UNC_V3_C23_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX23C2", PMC109, CBOX23, MSR_UNC_V3_C23_PMON_CTL2, MSR_UNC_V3_C23_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"CBOX23C3", PMC110, CBOX23, MSR_UNC_V3_C23_PMON_CTL3, MSR_UNC_V3_C23_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_CBOX},
+    {"WBOX0", PMC111, WBOX, MSR_UNC_V3_PCU_PMON_CTL0, MSR_UNC_V3_PCU_PMON_CTR0, 0, 0, BDW_EP_VALID_OPTIONS_WBOX},
+    {"WBOX1", PMC112, WBOX, MSR_UNC_V3_PCU_PMON_CTL1, MSR_UNC_V3_PCU_PMON_CTR1, 0, 0, BDW_EP_VALID_OPTIONS_WBOX},
+    {"WBOX2", PMC113, WBOX, MSR_UNC_V3_PCU_PMON_CTL2, MSR_UNC_V3_PCU_PMON_CTR2, 0, 0, BDW_EP_VALID_OPTIONS_WBOX},
+    {"WBOX3", PMC114, WBOX, MSR_UNC_V3_PCU_PMON_CTL3, MSR_UNC_V3_PCU_PMON_CTR3, 0, 0, BDW_EP_VALID_OPTIONS_WBOX},
+    {"WBOX0FIX", PMC115, WBOX0FIX, 0, MSR_UNC_V3_PCU_CC3_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"WBOX1FIX", PMC116, WBOX0FIX, 0, MSR_UNC_V3_PCU_CC6_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"BBOX0C0", PMC117, BBOX0, PCI_UNC_HA_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_0, BDW_EP_VALID_OPTIONS_BBOX},
+    {"BBOX0C1", PMC118, BBOX0, PCI_UNC_HA_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_0, BDW_EP_VALID_OPTIONS_BBOX},
+    {"BBOX0C2", PMC119, BBOX0, PCI_UNC_HA_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_0, BDW_EP_VALID_OPTIONS_BBOX},
+    {"BBOX0C3", PMC120, BBOX0, PCI_UNC_HA_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_0, BDW_EP_VALID_OPTIONS_BBOX},
+    {"BBOX1C0", PMC121, BBOX1, PCI_UNC_HA_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_1, BDW_EP_VALID_OPTIONS_BBOX},
+    {"BBOX1C1", PMC122, BBOX1, PCI_UNC_HA_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_1, BDW_EP_VALID_OPTIONS_BBOX},
+    {"BBOX1C2", PMC123, BBOX1, PCI_UNC_HA_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_1, BDW_EP_VALID_OPTIONS_BBOX},
+    {"BBOX1C3", PMC124, BBOX1, PCI_UNC_HA_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_1, BDW_EP_VALID_OPTIONS_BBOX},
+    {"MBOX0C0", PMC125, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_0, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX0C1", PMC126, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_0, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX0C2", PMC127, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_0, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX0FIX", PMC128, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_INVERT_MASK},
+    {"MBOX0C3", PMC129, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_0, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX1C0", PMC130, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_1, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX1C1", PMC131, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_1, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX1C2", PMC132, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_1, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX1C3", PMC133, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_1, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX1FIX", PMC134, MBOX1FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_1, EVENT_OPTION_INVERT_MASK},
+    {"MBOX2C0", PMC135, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_2, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX2C1", PMC136, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_2, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX2C2", PMC137, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_2, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX2C3", PMC138, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_2, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX2FIX", PMC139, MBOX2FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_2, EVENT_OPTION_INVERT_MASK},
+    {"MBOX3C0", PMC140, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_3, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX3C1", PMC141, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_3, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX3C2", PMC142, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_3, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX3C3", PMC143, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_3, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX3FIX", PMC144, MBOX3FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_3, EVENT_OPTION_INVERT_MASK},
+    {"MBOX4C0", PMC145, MBOX4, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_0, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX4C1", PMC146, MBOX4, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_0, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX4C2", PMC147, MBOX4, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_0, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX4C3", PMC148, MBOX4, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_0, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX4FIX", PMC149, MBOX4FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_0, EVENT_OPTION_INVERT_MASK},
+    {"MBOX5C0", PMC150, MBOX5, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_1, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX5C1", PMC151, MBOX5, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_1, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX5C2", PMC152, MBOX5, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_1, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX5C3", PMC153, MBOX5, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_1, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX5FIX", PMC154, MBOX5FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_1, EVENT_OPTION_INVERT_MASK},
+    {"MBOX6C0", PMC155, MBOX6, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_2, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX6C1", PMC156, MBOX6, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_2, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX6C2", PMC157, MBOX6, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_2, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX6C3", PMC158, MBOX6, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_2, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX6FIX", PMC159, MBOX6FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_2, EVENT_OPTION_INVERT_MASK},
+    {"MBOX7C0", PMC160, MBOX7, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_3, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX7C1", PMC161, MBOX7, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_3, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX7C2", PMC162, MBOX7, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_3, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX7C3", PMC163, MBOX7, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_3, BDW_EP_VALID_OPTIONS_MBOX},
+    {"MBOX7FIX", PMC164, MBOX7FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_3, EVENT_OPTION_INVERT_MASK},
+    {"IBOX0C0", PMC165, IBOX0, PCI_UNC_IRP0_PMON_CTL_0, PCI_UNC_IRP0_PMON_CTR_0, 0, PCI_IRP_DEVICE, BDW_EP_VALID_OPTIONS_IBOX},
+    {"IBOX0C1", PMC166, IBOX0, PCI_UNC_IRP0_PMON_CTL_1, PCI_UNC_IRP0_PMON_CTR_1, 0, PCI_IRP_DEVICE, BDW_EP_VALID_OPTIONS_IBOX},
+    {"IBOX1C0", PMC167, IBOX1, PCI_UNC_IRP1_PMON_CTL_0, PCI_UNC_IRP1_PMON_CTR_0, 0, PCI_IRP_DEVICE, BDW_EP_VALID_OPTIONS_IBOX},
+    {"IBOX1C1", PMC168, IBOX1, PCI_UNC_IRP1_PMON_CTL_1, PCI_UNC_IRP1_PMON_CTR_1, 0, PCI_IRP_DEVICE, BDW_EP_VALID_OPTIONS_IBOX},
+    {"PBOX0", PMC169, PBOX, PCI_UNC_R2PCIE_PMON_CTL_0, PCI_UNC_R2PCIE_PMON_CTR_0_A, PCI_UNC_R2PCIE_PMON_CTR_0_B, PCI_R2PCIE_DEVICE, BDW_EP_VALID_OPTIONS_PBOX},
+    {"PBOX1", PMC170, PBOX, PCI_UNC_R2PCIE_PMON_CTL_1, PCI_UNC_R2PCIE_PMON_CTR_1_A, PCI_UNC_R2PCIE_PMON_CTR_1_B, PCI_R2PCIE_DEVICE, BDW_EP_VALID_OPTIONS_PBOX},
+    {"PBOX2", PMC171, PBOX, PCI_UNC_R2PCIE_PMON_CTL_2, PCI_UNC_R2PCIE_PMON_CTR_2_A, PCI_UNC_R2PCIE_PMON_CTR_2_B, PCI_R2PCIE_DEVICE, BDW_EP_VALID_OPTIONS_PBOX},
+    {"PBOX3", PMC172, PBOX, PCI_UNC_R2PCIE_PMON_CTL_3, PCI_UNC_R2PCIE_PMON_CTR_3_A, PCI_UNC_R2PCIE_PMON_CTR_3_B, PCI_R2PCIE_DEVICE, BDW_EP_VALID_OPTIONS_PBOX},
+    {"RBOX0C0", PMC173, RBOX0, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_0, BDW_EP_VALID_OPTIONS_RBOX},
+    {"RBOX0C1", PMC174, RBOX0, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_0, BDW_EP_VALID_OPTIONS_RBOX},
+    {"RBOX0C2", PMC175, RBOX0, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_0, BDW_EP_VALID_OPTIONS_RBOX},
+    {"RBOX1C0", PMC176, RBOX1, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_1, BDW_EP_VALID_OPTIONS_RBOX},
+    {"RBOX1C1", PMC177, RBOX1, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_1, BDW_EP_VALID_OPTIONS_RBOX},
+    {"RBOX1C2", PMC178, RBOX1, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_1, BDW_EP_VALID_OPTIONS_RBOX},
+    {"SBOX0C0", PMC179, SBOX0, MSR_UNC_V3_S0_PMON_CTL_0, MSR_UNC_V3_S0_PMON_CTR_0, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+    {"SBOX0C1", PMC180, SBOX0, MSR_UNC_V3_S0_PMON_CTL_1, MSR_UNC_V3_S0_PMON_CTR_1, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+    {"SBOX0C2", PMC181, SBOX0, MSR_UNC_V3_S0_PMON_CTL_2, MSR_UNC_V3_S0_PMON_CTR_2, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+    {"SBOX0C3", PMC182, SBOX0, MSR_UNC_V3_S0_PMON_CTL_3, MSR_UNC_V3_S0_PMON_CTR_3, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+    {"SBOX1C0", PMC183, SBOX1, MSR_UNC_V3_S1_PMON_CTL_0, MSR_UNC_V3_S1_PMON_CTR_0, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+    {"SBOX1C1", PMC184, SBOX1, MSR_UNC_V3_S1_PMON_CTL_1, MSR_UNC_V3_S1_PMON_CTR_1, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+    {"SBOX1C2", PMC185, SBOX1, MSR_UNC_V3_S1_PMON_CTL_2, MSR_UNC_V3_S1_PMON_CTR_2, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+    {"SBOX1C3", PMC186, SBOX1, MSR_UNC_V3_S1_PMON_CTL_3, MSR_UNC_V3_S1_PMON_CTR_3, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+    {"SBOX2C0", PMC187, SBOX2, MSR_UNC_V3_S2_PMON_CTL_0, MSR_UNC_V3_S2_PMON_CTR_0, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+    {"SBOX2C1", PMC188, SBOX2, MSR_UNC_V3_S2_PMON_CTL_1, MSR_UNC_V3_S2_PMON_CTR_1, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+    {"SBOX2C2", PMC189, SBOX2, MSR_UNC_V3_S2_PMON_CTL_2, MSR_UNC_V3_S2_PMON_CTR_2, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+    {"SBOX2C3", PMC190, SBOX2, MSR_UNC_V3_S2_PMON_CTL_3, MSR_UNC_V3_S2_PMON_CTR_3, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+    {"SBOX3C0", PMC191, SBOX3, MSR_UNC_V3_S3_PMON_CTL_0, MSR_UNC_V3_S3_PMON_CTR_0, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+    {"SBOX3C1", PMC192, SBOX3, MSR_UNC_V3_S3_PMON_CTL_1, MSR_UNC_V3_S3_PMON_CTR_1, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+    {"SBOX3C2", PMC193, SBOX3, MSR_UNC_V3_S3_PMON_CTL_2, MSR_UNC_V3_S3_PMON_CTR_2, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+    {"SBOX3C3", PMC194, SBOX3, MSR_UNC_V3_S3_PMON_CTL_3, MSR_UNC_V3_S3_PMON_CTR_3, 0, 0, BDW_EP_VALID_OPTIONS_SBOX},
+    {"QBOX0C0", PMC195, QBOX0, PCI_UNC_V3_QPI_PMON_CTL_0, PCI_UNC_V3_QPI_PMON_CTR_0_A, PCI_UNC_V3_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_0, BDW_EP_VALID_OPTIONS_QBOX},
+    {"QBOX0C1", PMC196, QBOX0, PCI_UNC_V3_QPI_PMON_CTL_1, PCI_UNC_V3_QPI_PMON_CTR_1_A, PCI_UNC_V3_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_0, BDW_EP_VALID_OPTIONS_QBOX},
+    {"QBOX0C2", PMC197, QBOX0, PCI_UNC_V3_QPI_PMON_CTL_2, PCI_UNC_V3_QPI_PMON_CTR_2_A, PCI_UNC_V3_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_0, BDW_EP_VALID_OPTIONS_QBOX},
+    {"QBOX0C3", PMC198, QBOX0, PCI_UNC_V3_QPI_PMON_CTL_3, PCI_UNC_V3_QPI_PMON_CTR_3_A, PCI_UNC_V3_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_0, BDW_EP_VALID_OPTIONS_QBOX},
+    {"QBOX1C0", PMC199, QBOX1, PCI_UNC_V3_QPI_PMON_CTL_0, PCI_UNC_V3_QPI_PMON_CTR_0_A, PCI_UNC_V3_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_1, BDW_EP_VALID_OPTIONS_QBOX},
+    {"QBOX1C1", PMC200, QBOX1, PCI_UNC_V3_QPI_PMON_CTL_1, PCI_UNC_V3_QPI_PMON_CTR_1_A, PCI_UNC_V3_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_1, BDW_EP_VALID_OPTIONS_QBOX},
+    {"QBOX1C2", PMC201, QBOX1, PCI_UNC_V3_QPI_PMON_CTL_2, PCI_UNC_V3_QPI_PMON_CTR_2_A, PCI_UNC_V3_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_1, BDW_EP_VALID_OPTIONS_QBOX},
+    {"QBOX1C3", PMC202, QBOX1, PCI_UNC_V3_QPI_PMON_CTL_3, PCI_UNC_V3_QPI_PMON_CTR_3_A, PCI_UNC_V3_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_1, BDW_EP_VALID_OPTIONS_QBOX},
+    {"QBOX2C0", PMC203, QBOX2, PCI_UNC_V3_QPI_PMON_CTL_0, PCI_UNC_V3_QPI_PMON_CTR_0_A, PCI_UNC_V3_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_2, BDW_EP_VALID_OPTIONS_QBOX},
+    {"QBOX2C1", PMC204, QBOX2, PCI_UNC_V3_QPI_PMON_CTL_1, PCI_UNC_V3_QPI_PMON_CTR_1_A, PCI_UNC_V3_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_2, BDW_EP_VALID_OPTIONS_QBOX},
+    {"QBOX2C2", PMC205, QBOX2, PCI_UNC_V3_QPI_PMON_CTL_2, PCI_UNC_V3_QPI_PMON_CTR_2_A, PCI_UNC_V3_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_2, BDW_EP_VALID_OPTIONS_QBOX},
+    {"QBOX2C3", PMC206, QBOX2, PCI_UNC_V3_QPI_PMON_CTL_3, PCI_UNC_V3_QPI_PMON_CTR_3_A, PCI_UNC_V3_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_2, BDW_EP_VALID_OPTIONS_QBOX},
+    {"QBOX0FIX0", PMC207, QBOX0FIX, 0x0, PCI_UNC_V3_QPI_RATE_STATUS, 0x0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+    {"QBOX0FIX1", PMC208, QBOX0FIX, 0x0, PCI_UNC_V3_QPI_LINK_IDLE, 0x0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+    {"QBOX0FIX2", PMC209, QBOX0FIX, 0x0, PCI_UNC_V3_QPI_LINK_LLR, 0x0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+    {"QBOX1FIX0", PMC210, QBOX1FIX, 0x0, PCI_UNC_V3_QPI_RATE_STATUS, 0x0, PCI_QPI_MISC_DEVICE_PORT_1, EVENT_OPTION_NONE_MASK},
+    {"QBOX1FIX1", PMC211, QBOX1FIX, 0x0, PCI_UNC_V3_QPI_LINK_IDLE, 0x0, PCI_QPI_MISC_DEVICE_PORT_1, EVENT_OPTION_NONE_MASK},
+    {"QBOX1FIX2", PMC212, QBOX1FIX, 0x0, PCI_UNC_V3_QPI_LINK_LLR, 0x0, PCI_QPI_MISC_DEVICE_PORT_1, EVENT_OPTION_NONE_MASK},
+    {"QBOX2FIX0", PMC213, QBOX2FIX, 0x0, PCI_UNC_V3_QPI_RATE_STATUS, 0x0, PCI_QPI_MISC_DEVICE_PORT_2, EVENT_OPTION_NONE_MASK},
+    {"QBOX2FIX1", PMC214, QBOX2FIX, 0x0, PCI_UNC_V3_QPI_LINK_IDLE, 0x0, PCI_QPI_MISC_DEVICE_PORT_2, EVENT_OPTION_NONE_MASK},
+    {"QBOX2FIX2", PMC215, QBOX2FIX, 0x0, PCI_UNC_V3_QPI_LINK_LLR, 0x0, PCI_QPI_MISC_DEVICE_PORT_2, EVENT_OPTION_NONE_MASK},
+};
+
+/*
+ * Per-unit (box) register description for Broadwell EP, indexed by the unit
+ * type that the counter table preceding this array assigns to each counter.
+ * NOTE(review): the initializer order presumably follows BoxMap in
+ * perfmon_types.h -- {ctrlRegister, statusRegister, ovflRegister, ovflOffset,
+ * isPci, device, regWidth, filterRegister1, filterRegister2} -- confirm
+ * against the struct definition.
+ * Units without an explicit entry are zero-initialized by the designated
+ * initializer, so every unit referenced by the counter table needs a line here.
+ */
+static BoxMap broadwellEP_box_map[NUM_UNITS] = {
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+    [THERMAL] = {0, 0, 0, 0, 0, 0, 8},
+    [FIXED] =  {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+    [POWER] = {0, 0, 0, 0, 0, 0, 32},
+    [UBOX] = {0, MSR_UNC_V3_U_PMON_BOX_STATUS, MSR_UNC_V3_U_PMON_BOX_STATUS, 1, 0, 0, 48},
+    [UBOXFIX] = {0, MSR_UNC_V3_U_PMON_BOX_STATUS, MSR_UNC_V3_U_PMON_BOX_STATUS, 0, 0, 0, 48},
+    [CBOX0] = {MSR_UNC_V3_C0_PMON_BOX_CTL, MSR_UNC_V3_C0_PMON_BOX_STATUS, MSR_UNC_V3_C0_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C0_PMON_BOX_FILTER0, MSR_UNC_V3_C0_PMON_BOX_FILTER1},
+    [CBOX1] = {MSR_UNC_V3_C1_PMON_BOX_CTL, MSR_UNC_V3_C1_PMON_BOX_STATUS, MSR_UNC_V3_C1_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C1_PMON_BOX_FILTER0, MSR_UNC_V3_C1_PMON_BOX_FILTER1},
+    [CBOX2] = {MSR_UNC_V3_C2_PMON_BOX_CTL, MSR_UNC_V3_C2_PMON_BOX_STATUS, MSR_UNC_V3_C2_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C2_PMON_BOX_FILTER0, MSR_UNC_V3_C2_PMON_BOX_FILTER1},
+    [CBOX3] = {MSR_UNC_V3_C3_PMON_BOX_CTL, MSR_UNC_V3_C3_PMON_BOX_STATUS, MSR_UNC_V3_C3_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C3_PMON_BOX_FILTER0, MSR_UNC_V3_C3_PMON_BOX_FILTER1},
+    [CBOX4] = {MSR_UNC_V3_C4_PMON_BOX_CTL, MSR_UNC_V3_C4_PMON_BOX_STATUS, MSR_UNC_V3_C4_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C4_PMON_BOX_FILTER0, MSR_UNC_V3_C4_PMON_BOX_FILTER1},
+    [CBOX5] = {MSR_UNC_V3_C5_PMON_BOX_CTL, MSR_UNC_V3_C5_PMON_BOX_STATUS, MSR_UNC_V3_C5_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C5_PMON_BOX_FILTER0, MSR_UNC_V3_C5_PMON_BOX_FILTER1},
+    [CBOX6] = {MSR_UNC_V3_C6_PMON_BOX_CTL, MSR_UNC_V3_C6_PMON_BOX_STATUS, MSR_UNC_V3_C6_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C6_PMON_BOX_FILTER0, MSR_UNC_V3_C6_PMON_BOX_FILTER1},
+    [CBOX7] = {MSR_UNC_V3_C7_PMON_BOX_CTL, MSR_UNC_V3_C7_PMON_BOX_STATUS, MSR_UNC_V3_C7_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C7_PMON_BOX_FILTER0, MSR_UNC_V3_C7_PMON_BOX_FILTER1},
+    [CBOX8] = {MSR_UNC_V3_C8_PMON_BOX_CTL, MSR_UNC_V3_C8_PMON_BOX_STATUS, MSR_UNC_V3_C8_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C8_PMON_BOX_FILTER0, MSR_UNC_V3_C8_PMON_BOX_FILTER1},
+    [CBOX9] = {MSR_UNC_V3_C9_PMON_BOX_CTL, MSR_UNC_V3_C9_PMON_BOX_STATUS, MSR_UNC_V3_C9_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C9_PMON_BOX_FILTER0, MSR_UNC_V3_C9_PMON_BOX_FILTER1},
+    [CBOX10] = {MSR_UNC_V3_C10_PMON_BOX_CTL, MSR_UNC_V3_C10_PMON_BOX_STATUS, MSR_UNC_V3_C10_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C10_PMON_BOX_FILTER0, MSR_UNC_V3_C10_PMON_BOX_FILTER1},
+    [CBOX11] = {MSR_UNC_V3_C11_PMON_BOX_CTL, MSR_UNC_V3_C11_PMON_BOX_STATUS, MSR_UNC_V3_C11_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C11_PMON_BOX_FILTER0, MSR_UNC_V3_C11_PMON_BOX_FILTER1},
+    [CBOX12] = {MSR_UNC_V3_C12_PMON_BOX_CTL, MSR_UNC_V3_C12_PMON_BOX_STATUS, MSR_UNC_V3_C12_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C12_PMON_BOX_FILTER0, MSR_UNC_V3_C12_PMON_BOX_FILTER1},
+    [CBOX13] = {MSR_UNC_V3_C13_PMON_BOX_CTL, MSR_UNC_V3_C13_PMON_BOX_STATUS, MSR_UNC_V3_C13_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C13_PMON_BOX_FILTER0, MSR_UNC_V3_C13_PMON_BOX_FILTER1},
+    [CBOX14] = {MSR_UNC_V3_C14_PMON_BOX_CTL, MSR_UNC_V3_C14_PMON_BOX_STATUS, MSR_UNC_V3_C14_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C14_PMON_BOX_FILTER0, MSR_UNC_V3_C14_PMON_BOX_FILTER1},
+    [CBOX15] = {MSR_UNC_V3_C15_PMON_BOX_CTL, MSR_UNC_V3_C15_PMON_BOX_STATUS, MSR_UNC_V3_C15_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C15_PMON_BOX_FILTER0, MSR_UNC_V3_C15_PMON_BOX_FILTER1},
+    [CBOX16] = {MSR_UNC_V3_C16_PMON_BOX_CTL, MSR_UNC_V3_C16_PMON_BOX_STATUS, MSR_UNC_V3_C16_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C16_PMON_BOX_FILTER0, MSR_UNC_V3_C16_PMON_BOX_FILTER1},
+    [CBOX17] = {MSR_UNC_V3_C17_PMON_BOX_CTL, MSR_UNC_V3_C17_PMON_BOX_STATUS, MSR_UNC_V3_C17_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C17_PMON_BOX_FILTER0, MSR_UNC_V3_C17_PMON_BOX_FILTER1},
+    [CBOX18] = {MSR_UNC_V3_C18_PMON_BOX_CTL, MSR_UNC_V3_C18_PMON_BOX_STATUS, MSR_UNC_V3_C18_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C18_PMON_BOX_FILTER0, MSR_UNC_V3_C18_PMON_BOX_FILTER1},
+    [CBOX19] = {MSR_UNC_V3_C19_PMON_BOX_CTL, MSR_UNC_V3_C19_PMON_BOX_STATUS, MSR_UNC_V3_C19_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C19_PMON_BOX_FILTER0, MSR_UNC_V3_C19_PMON_BOX_FILTER1},
+    [CBOX20] = {MSR_UNC_V3_C20_PMON_BOX_CTL, MSR_UNC_V3_C20_PMON_BOX_STATUS, MSR_UNC_V3_C20_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C20_PMON_BOX_FILTER0, MSR_UNC_V3_C20_PMON_BOX_FILTER1},
+    [CBOX21] = {MSR_UNC_V3_C21_PMON_BOX_CTL, MSR_UNC_V3_C21_PMON_BOX_STATUS, MSR_UNC_V3_C21_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C21_PMON_BOX_FILTER0, MSR_UNC_V3_C21_PMON_BOX_FILTER1},
+    [CBOX22] = {MSR_UNC_V3_C22_PMON_BOX_CTL, MSR_UNC_V3_C22_PMON_BOX_STATUS, MSR_UNC_V3_C22_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C22_PMON_BOX_FILTER0, MSR_UNC_V3_C22_PMON_BOX_FILTER1},
+    [CBOX23] = {MSR_UNC_V3_C23_PMON_BOX_CTL, MSR_UNC_V3_C23_PMON_BOX_STATUS, MSR_UNC_V3_C23_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C23_PMON_BOX_FILTER0, MSR_UNC_V3_C23_PMON_BOX_FILTER1},
+    [WBOX] = {MSR_UNC_V3_PCU_PMON_BOX_CTL, MSR_UNC_V3_PCU_PMON_BOX_STATUS,MSR_UNC_V3_PCU_PMON_BOX_STATUS, 2, 0, 0, 48, MSR_UNC_V3_PCU_PMON_BOX_FILTER},
+    [WBOX0FIX] = {0,0,0,-1,0,0,64},
+    [BBOX0] = {PCI_UNC_HA_PMON_BOX_CTL, PCI_UNC_HA_PMON_BOX_STATUS, PCI_UNC_HA_PMON_BOX_STATUS, 21, 1, PCI_HA_DEVICE_0, 48},
+    [BBOX1] = {PCI_UNC_HA_PMON_BOX_CTL, PCI_UNC_HA_PMON_BOX_STATUS, PCI_UNC_HA_PMON_BOX_STATUS, 22, 1, PCI_HA_DEVICE_1, 48},
+    [MBOX0] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_0, 48},
+    [MBOX1] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+    [MBOX2] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+    [MBOX3] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+    [MBOX4] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 24, 1, PCI_IMC_DEVICE_1_CH_0, 48},
+    [MBOX5] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 24, 1, PCI_IMC_DEVICE_1_CH_1, 48},
+    [MBOX6] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 24, 1, PCI_IMC_DEVICE_1_CH_2, 48},
+    [MBOX7] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 24, 1, PCI_IMC_DEVICE_1_CH_3, 48},
+    [IBOX0] = {PCI_UNC_IRP_PMON_BOX_CTL, PCI_UNC_IRP_PMON_BOX_STATUS, PCI_UNC_IRP_PMON_BOX_STATUS, 34, 1, PCI_IRP_DEVICE, 48},
+    [IBOX1] = {PCI_UNC_IRP_PMON_BOX_CTL, PCI_UNC_IRP_PMON_BOX_STATUS, PCI_UNC_IRP_PMON_BOX_STATUS, 34, 1, PCI_IRP_DEVICE, 48},
+    [PBOX] = {PCI_UNC_R2PCIE_PMON_BOX_CTL, PCI_UNC_R2PCIE_PMON_BOX_STATUS, PCI_UNC_R2PCIE_PMON_BOX_STATUS, 29, 1, PCI_R2PCIE_DEVICE, 48},
+    [RBOX0] = {PCI_UNC_R3QPI_PMON_BOX_CTL, PCI_UNC_R3QPI_PMON_BOX_STATUS, PCI_UNC_R3QPI_PMON_BOX_STATUS, 27, 1, PCI_R3QPI_DEVICE_LINK_0, 44},
+    [RBOX1] = {PCI_UNC_R3QPI_PMON_BOX_CTL, PCI_UNC_R3QPI_PMON_BOX_STATUS, PCI_UNC_R3QPI_PMON_BOX_STATUS, 28, 1, PCI_R3QPI_DEVICE_LINK_1, 44},
+    [SBOX0] = {MSR_UNC_V3_S0_PMON_BOX_CTL, MSR_UNC_V3_S0_PMON_BOX_STATUS, MSR_UNC_V3_S0_PMON_BOX_STATUS, -1, 0, 0, 48},
+    [SBOX1] = {MSR_UNC_V3_S1_PMON_BOX_CTL, MSR_UNC_V3_S1_PMON_BOX_STATUS, MSR_UNC_V3_S1_PMON_BOX_STATUS, -1, 0, 0, 48},
+    [SBOX2] = {MSR_UNC_V3_S2_PMON_BOX_CTL, MSR_UNC_V3_S2_PMON_BOX_STATUS, MSR_UNC_V3_S2_PMON_BOX_STATUS, -1, 0, 0, 48},
+    [SBOX3] = {MSR_UNC_V3_S3_PMON_BOX_CTL, MSR_UNC_V3_S3_PMON_BOX_STATUS, MSR_UNC_V3_S3_PMON_BOX_STATUS, -1, 0, 0, 48},
+    [QBOX0] = {PCI_UNC_V3_QPI_PMON_BOX_CTL, PCI_UNC_V3_QPI_PMON_BOX_STATUS, PCI_UNC_V3_QPI_PMON_BOX_STATUS, 25, 1, PCI_QPI_DEVICE_PORT_0, 48},
+    [QBOX1] = {PCI_UNC_V3_QPI_PMON_BOX_CTL, PCI_UNC_V3_QPI_PMON_BOX_STATUS, PCI_UNC_V3_QPI_PMON_BOX_STATUS, 26, 1, PCI_QPI_DEVICE_PORT_1, 48},
+    [QBOX2] = {PCI_UNC_V3_QPI_PMON_BOX_CTL, PCI_UNC_V3_QPI_PMON_BOX_STATUS, PCI_UNC_V3_QPI_PMON_BOX_STATUS, -1, 1, PCI_QPI_DEVICE_PORT_2, 48},
+    [QBOX0FIX] = {0x0, 0x0, 0x0, -1, 1, PCI_QPI_MISC_DEVICE_PORT_0, 32},
+    [QBOX1FIX] = {0x0, 0x0, 0x0, -1, 1, PCI_QPI_MISC_DEVICE_PORT_1, 32},
+    /* The counter table declares QBOX2FIX0..2 on PCI_QPI_MISC_DEVICE_PORT_2;
+     * without this entry the QBOX2FIX unit would be left zero-initialized. */
+    [QBOX2FIX] = {0x0, 0x0, 0x0, -1, 1, PCI_QPI_MISC_DEVICE_PORT_2, 32},
+};
+
+/*
+ * PCI devices that expose Broadwell EP uncore counters, indexed by PCI device
+ * index (MSR_DEV is the placeholder for MSR-based units).
+ * NOTE(review): fields are presumably {device type, "device.function" path,
+ * device name, likwid unit label, PCI device ID} -- confirm against the
+ * PciDevice struct definition.
+ */
+static PciDevice broadwellEP_pci_devices[MAX_NUM_PCI_DEVICES] = {
+ [MSR_DEV] = {NODEVTYPE, "", "MSR", ""},
+ [PCI_HA_DEVICE_0] = {HA, "12.1", "PCI_HA_DEVICE_0", "BBOX0", 0x6F30},
+ [PCI_HA_DEVICE_1] = {HA, "12.5", "PCI_HA_DEVICE_1", "BBOX1", 0x6F38},
+ [PCI_IMC_DEVICE_0_CH_0] = {IMC, "14.0", "PCI_IMC_DEVICE_0_CH_0", "MBOX0", 0x6FB4},
+ [PCI_IMC_DEVICE_0_CH_1] = {IMC, "14.1", "PCI_IMC_DEVICE_0_CH_1", "MBOX1", 0x6FB5},
+ [PCI_IMC_DEVICE_0_CH_2] = {IMC, "15.0", "PCI_IMC_DEVICE_0_CH_2", "MBOX2", 0x6FB0},
+ [PCI_IMC_DEVICE_0_CH_3] = {IMC, "15.1", "PCI_IMC_DEVICE_0_CH_3", "MBOX3", 0x6FB1},
+ [PCI_IMC_DEVICE_1_CH_0] = {IMC, "17.0", "PCI_IMC_DEVICE_1_CH_0", "MBOX4", 0x6FD4},
+ [PCI_IMC_DEVICE_1_CH_1] = {IMC, "17.1", "PCI_IMC_DEVICE_1_CH_1", "MBOX5", 0x6FD5},
+ [PCI_IMC_DEVICE_1_CH_2] = {IMC, "18.0", "PCI_IMC_DEVICE_1_CH_2", "MBOX6", 0x6FD0},
+ [PCI_IMC_DEVICE_1_CH_3] = {IMC, "18.1", "PCI_IMC_DEVICE_1_CH_3", "MBOX7", 0x6FD1},
+ [PCI_IRP_DEVICE] = {IRP, "05.6", "PCI_IRP_DEVICE", "IBOX0", 0x6F39},
+ [PCI_R2PCIE_DEVICE] = {R2PCIE, "10.1", "PCI_R2PCIE_DEVICE", "PBOX0", 0x6F34},
+ [PCI_QPI_DEVICE_PORT_0] = {QPI, "08.2", "PCI_QPI_DEVICE_PORT_0", "QBOX0", 0x6F32},
+ [PCI_QPI_DEVICE_PORT_1] = {QPI, "09.2", "PCI_QPI_DEVICE_PORT_1", "QBOX1", 0x6F33},
+ [PCI_QPI_DEVICE_PORT_2] = {QPI, "0a.2", "PCI_QPI_DEVICE_PORT_2", "QBOX2", 0x6F3A},
+ [PCI_QPI_MASK_DEVICE_PORT_0] = {QPI, "08.6", "PCI_QPI_MASK_DEVICE_PORT_0", NULL, 0x6F86},
+ [PCI_QPI_MASK_DEVICE_PORT_1] = {QPI, "09.6", "PCI_QPI_MASK_DEVICE_PORT_1", NULL, 0x6F96},
+ [PCI_QPI_MASK_DEVICE_PORT_2] = {QPI, "0a.6", "PCI_QPI_MASK_DEVICE_PORT_2", NULL, 0x6F46},
+ [PCI_QPI_MISC_DEVICE_PORT_0] = {QPI, "08.0", "PCI_QPI_MISC_DEVICE_PORT_0", "QBOX0FIX", 0x6F80},
+ /* Port 1 lives on PCI device 09 like the other QPI port-1 functions; it
+  * previously duplicated port 0's path and device ID. */
+ [PCI_QPI_MISC_DEVICE_PORT_1] = {QPI, "09.0", "PCI_QPI_MISC_DEVICE_PORT_1", "QBOX1FIX", 0x6F90},
+ [PCI_QPI_MISC_DEVICE_PORT_2] = {QPI, "0a.0", "PCI_QPI_MISC_DEVICE_PORT_2", "QBOX2FIX", 0x6F40},
+ [PCI_R3QPI_DEVICE_LINK_0] = {R3QPI, "0b.1", "PCI_R3QPI_DEVICE_LINK_0", "RBOX0", 0x6F36},
+ [PCI_R3QPI_DEVICE_LINK_1] = {R3QPI, "0b.2", "PCI_R3QPI_DEVICE_LINK_1", "RBOX1", 0x6F37},
+ /* NOTE(review): LINK_2 reuses the "RBOX1" label of LINK_1 -- confirm this is intended. */
+ [PCI_R3QPI_DEVICE_LINK_2] = {R3QPI, "0b.5", "PCI_R3QPI_DEVICE_LINK_2", "RBOX1", 0x6F3E},
+};
diff --git a/src/includes/perfmon_broadwellEP_events.txt b/src/includes/perfmon_broadwellEP_events.txt
new file mode 100644
index 0000000..0781ebe
--- /dev/null
+++ b/src/includes/perfmon_broadwellEP_events.txt
@@ -0,0 +1,2569 @@
+# =======================================================================================
+#
+#      Filename:  perfmon_broadwellEP_events.txt
+#
+#      Description:  Event list for Intel Broadwell EP/EN/EX.
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+#      Project:  likwid
+#
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+#
+#      This program is free software: you can redistribute it and/or modify it under
+#      the terms of the GNU General Public License as published by the Free Software
+#      Foundation, either version 3 of the License, or (at your option) any later
+#      version.
+#
+#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+#      You should have received a copy of the GNU General Public License along with
+#      this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+EVENT_TEMP_CORE          0x00   TMP0
+UMASK_TEMP_CORE          0x00
+
+EVENT_PWR_PKG_ENERGY          0x00   PWR0
+UMASK_PWR_PKG_ENERGY          0x00
+
+EVENT_PWR_PP0_ENERGY          0x00   PWR1
+UMASK_PWR_PP0_ENERGY          0x00
+
+EVENT_PWR_PP1_ENERGY          0x00   PWR2
+UMASK_PWR_PP1_ENERGY          0x00
+
+EVENT_PWR_DRAM_ENERGY          0x00   PWR3
+UMASK_PWR_DRAM_ENERGY          0x00
+
+EVENT_INSTR_RETIRED              0x00   FIXC0
+UMASK_INSTR_RETIRED_ANY          0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE      0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC2
+UMASK_CPU_CLK_UNHALTED_REF       0x00
+
+EVENT_LD_BLOCKS                 0x03  PMC
+UMASK_LD_BLOCKS_STORE_FORWARD   0x02
+UMASK_LD_BLOCKS_NO_SR           0x08
+
+EVENT_MISALIGN_MEM_REF            0x05  PMC
+UMASK_MISALIGN_MEM_REF_LOADS      0x01
+UMASK_MISALIGN_MEM_REF_STORES     0x02
+UMASK_MISALIGN_MEM_REF_ANY        0x03
+
+EVENT_LD_BLOCKS_PARTIAL                 0x07  PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS   0x01
+
+EVENT_DTLB_LOAD_MISSES                       0x08  PMC
+UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK         0x01
+UMASK_DTLB_LOAD_MISSES_STLB_HIT              0x60
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED        0x0E
+UMASK_DTLB_LOAD_MISSES_STLB_HIT_4K           0x20
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED_4K     0x02
+UMASK_DTLB_LOAD_MISSES_WALK_DURATION         0x10
+
+EVENT_INT_MISC                      0x0D  PMC
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RECOVERY_CYCLES      0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT EVENT_OPTION_EDGE=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RECOVERY_COUNT       0x03
+DEFAULT_OPTIONS_INT_MISC_RAT_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RAT_STALL_CYCLES     0x08
+DEFAULT_OPTIONS_INT_MISC_RAT_STALL_COUNT EVENT_OPTION_EDGE=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RAT_STALL_COUNT      0x08
+
+EVENT_UOPS_ISSUED                     0x0E  PMC
+UMASK_UOPS_ISSUED_ANY                 0x01
+UMASK_UOPS_ISSUED_FLAGS_MERGE         0x10
+UMASK_UOPS_ISSUED_SLOW_LEA            0x20
+UMASK_UOPS_ISSUED_SINGLE_MUL          0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_USED_CYCLES         0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES        0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES        0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_ANY            0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_FLAGS_MERGE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_FLAGS_MERGE    0x10
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_SLOW_LEA EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_SLOW_LEA       0x20
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_SINGLE_MUL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_SINGLE_MUL     0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_USED_CYCLES    0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_STALL_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_TOTAL_CYCLES   0x01
+
+EVENT_ARITH_FPU_DIV_ACTIVE       0x14  PMC
+UMASK_ARITH_FPU_DIV_ACTIVE       0x01
+
+EVENT_L2_RQSTS                     0x24   PMC
+UMASK_L2_RQSTS_DEMAND_DATA_RD_MISS 0x21
+UMASK_L2_RQSTS_DEMAND_DATA_RD_HIT  0x41
+UMASK_L2_RQSTS_RFO_MISS            0x22
+UMASK_L2_RQSTS_RFO_HIT             0x42
+UMASK_L2_RQSTS_CODE_RD_MISS        0x24
+UMASK_L2_RQSTS_CODE_RD_HIT         0x44
+UMASK_L2_RQSTS_L2_PF_HIT           0x50
+UMASK_L2_RQSTS_L2_PF_MISS          0x30
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD  0xE1
+UMASK_L2_RQSTS_ALL_DEMAND_MISS     0x27
+UMASK_L2_RQSTS_ALL_RFO             0xE2
+UMASK_L2_RQSTS_ALL_CODE_RD         0xE4
+UMASK_L2_RQSTS_ALL_DEMAND_REFERENCES 0xE7
+UMASK_L2_RQSTS_ALL_PF              0xF8
+UMASK_L2_RQSTS_MISS                0x3F
+UMASK_L2_RQSTS_REFERENCES          0xFF
+
+EVENT_L2_DEMAND_RQST_WB_HIT        0x27   PMC
+UMASK_L2_DEMAND_RQST_WB_HIT        0x50
+
+EVENT_LONGEST_LAT_CACHE            0x2E   PMC
+UMASK_LONGEST_LAT_CACHE_REFERENCE  0x4F
+UMASK_LONGEST_LAT_CACHE_MISS       0x41
+
+EVENT_CPU_CLOCK_UNHALTED           0x3C   PMC
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P  0x00
+UMASK_CPU_CLOCK_UNHALTED_REF_XCLK  0x01
+UMASK_CPU_CLOCK_UNHALTED_ONE_THREAD_ACTIVE  0x02
+
+EVENT_L1D_PEND_MISS                  0x48   PMC2
+UMASK_L1D_PEND_MISS_PENDING          0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_PENDING_CYCLES   EVENT_OPTION_THRESHOLD=0x01
+UMASK_L1D_PEND_MISS_PENDING_CYCLES   0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_OCCURRENCES EVENT_OPTION_EDGE=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_L1D_PEND_MISS_OCCURRENCES      0x01
+
+EVENT_DTLB_STORE_MISSES                    0x49   PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK         0x01
+UMASK_DTLB_STORE_MISSES_STLB_HIT              0x60
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED        0x0E
+UMASK_DTLB_STORE_MISSES_STLB_HIT_4K           0x20
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED_4K     0x02
+UMASK_DTLB_STORE_MISSES_WALK_DURATION         0x10
+
+EVENT_LOAD_HIT_PRE               0x4C    PMC
+UMASK_LOAD_HIT_PRE_HW_PF         0x02
+
+EVENT_EPT_WALK_CYCLES            0x4F PMC
+UMASK_EPT_WALK_CYCLES            0x10
+
+EVENT_L1D                        0x51   PMC
+UMASK_L1D_REPLACEMENT            0x01
+
+EVENT_TX_MEM                                        0x54 PMC
+UMASK_TX_MEM_ABORT_CONFLICT                         0x01
+UMASK_TX_MEM_ABORT_CAPACITY_WRITE                   0x02
+UMASK_TX_MEM_ABORT_HLE_STORE_TO_ELIDED_LOCK         0x04
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_NOT_EMPTY     0x08
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_MISMATCH      0x10
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_UNSUPPORTED_ALIGNMENT 0x20
+UMASK_TX_MEM_HLE_ELISION_BUFFER_FULL                0x40
+
+EVENT_MOVE_ELIMINATION                        0x58   PMC
+UMASK_MOVE_ELIMINATION_INT_NOT_ELIMINATED     0x04
+UMASK_MOVE_ELIMINATION_SIMD_NOT_ELIMINATED    0x08
+UMASK_MOVE_ELIMINATION_INT_ELIMINATED         0x01
+UMASK_MOVE_ELIMINATION_SIMD_ELIMINATED        0x02
+
+EVENT_CPL_CYCLES                   0x5C    PMC
+UMASK_CPL_CYCLES_RING0             0x01
+UMASK_CPL_CYCLES_RING123           0x02
+DEFAULT_OPTIONS_CPL_CYCLES_RING0_TRANS   EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_CPL_CYCLES_RING0_TRANS       0x01
+
+EVENT_TX_EXEC                       0x5D PMC
+UMASK_TX_EXEC_MISC1                 0x01
+UMASK_TX_EXEC_MISC2                 0x02
+UMASK_TX_EXEC_MISC3                 0x04
+UMASK_TX_EXEC_MISC4                 0x08
+UMASK_TX_EXEC_MISC5                 0x10
+
+EVENT_RS_EVENTS                 0x5E    PMC
+UMASK_RS_EVENTS_EMPTY_CYCLES    0x01
+
+EVENT_OFFCORE_REQUESTS_OUTSTANDING                  0x60   PMC
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD EVENT_OPTION_THRESHOLD=0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO       0x04
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD EVENT_OPTION_THRESHOLD=0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD      0x08
+
+EVENT_LOCK_CYCLES                             0x63   PMC
+UMASK_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION 0x01
+UMASK_LOCK_CYCLES_CACHE_LOCK_DURATION         0x02
+
+EVENT_IDQ                              0x79   PMC
+UMASK_IDQ_EMPTY                        0x02
+UMASK_IDQ_MITE_UOPS                    0x04
+DEFAULT_OPTIONS_IDQ_MITE_CYCLES        EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MITE_CYCLES                  0x04
+UMASK_IDQ_DSB_UOPS                     0x08
+DEFAULT_OPTIONS_IDQ_DSB_CYCLES         EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_DSB_CYCLES                   0x08
+UMASK_IDQ_MS_DSB_UOPS                  0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_CYCLES      EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_DSB_CYCLES                0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_OCCUR       EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_IDQ_MS_DSB_OCCUR                 0x10
+UMASK_IDQ_MS_MITE_UOPS                 0x20
+DEFAULT_OPTIONS_IDQ_MS_MITE_CYCLES     EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_MITE_CYCLES               0x20
+UMASK_IDQ_MS_UOPS                      0x30
+DEFAULT_OPTIONS_IDQ_MS_CYCLES          EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_CYCLES                    0x30
+DEFAULT_OPTIONS_IDQ_MS_SWITCHES        EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_IDQ_MS_SWITCHES                  0x30
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS      0x18
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x04
+UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS        0x18
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS     0x24
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x04
+UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS       0x24
+UMASK_IDQ_MITE_ALL_UOPS       0x3C
+
+EVENT_ICACHE                  0x80   PMC
+UMASK_ICACHE_HIT              0x01
+UMASK_ICACHE_MISSES           0x02
+UMASK_ICACHE_ACCESSES         0x03
+
+EVENT_ITLB_MISSES                   0x85      PMC
+UMASK_ITLB_MISSES_CAUSES_A_WALK     0x01
+UMASK_ITLB_MISSES_STLB_HIT          0x60
+UMASK_ITLB_MISSES_WALK_COMPLETED    0x0E
+UMASK_ITLB_MISSES_STLB_HIT_4K       0x20
+UMASK_ITLB_MISSES_WALK_COMPLETED_4K 0x02
+UMASK_ITLB_MISSES_WALK_DURATION     0x10
+
+
+EVENT_ILD_STALL                 0x87      PMC
+UMASK_ILD_STALL_LCP             0x01
+
+EVENT_BR_INST_EXEC                                      0x88   PMC
+UMASK_BR_INST_EXEC_COND_TAKEN                           0x81
+UMASK_BR_INST_EXEC_COND_NON_TAKEN                       0x41
+UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN                     0x82
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
+UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN                    0x88
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
+UMASK_BR_INST_EXEC_ALL_CONDITIONAL                      0xC1
+UMASK_BR_INST_EXEC_ALL_DIRECT_JMP                       0xC2
+UMASK_BR_INST_EXEC_ALL_DIRECT_NEAR_CALL                 0xD0
+UMASK_BR_INST_EXEC_ALL_INDIRECT_JUMP_NON_CALL_RET       0xC4
+UMASK_BR_INST_EXEC_ALL_INDIRECT_NEAR_RETURN             0xC8
+UMASK_BR_INST_EXEC_ALL_BRANCHES                         0xFF
+
+EVENT_BR_MISP_EXEC                                      0x89   PMC
+UMASK_BR_MISP_EXEC_COND_TAKEN                           0x81
+UMASK_BR_MISP_EXEC_COND_NON_TAKEN                       0x41
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
+UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN                    0x88
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
+UMASK_BR_MISP_EXEC_ALL_CONDITIONAL                      0xC1
+UMASK_BR_MISP_EXEC_ALL_INDIRECT_JUMP_NON_CALL_RET       0xC4
+UMASK_BR_MISP_EXEC_ALL_BRANCHES                         0xFF
+
+EVENT_IDQ_UOPS_NOT_DELIVERED                    0x9C   PMC
+UMASK_IDQ_UOPS_NOT_DELIVERED_CORE               0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE EVENT_OPTION_THRESHOLD=0x04
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x03
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x02
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_INVERT=1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK 0x01
+
+EVENT_UOP_DISPATCHES_CANCELLED_SIMD_PRF   0xA0 PMC
+UMASK_UOP_DISPATCHES_CANCELLED_SIMD_PRF   0x03
+
+EVENT_UOPS_EXECUTED_PORT                  0xA1   PMC
+UMASK_UOPS_EXECUTED_PORT_PORT_0           0x01
+UMASK_UOPS_EXECUTED_PORT_PORT_1           0x02
+UMASK_UOPS_EXECUTED_PORT_PORT_2           0x04
+UMASK_UOPS_EXECUTED_PORT_PORT_3           0x08
+UMASK_UOPS_EXECUTED_PORT_PORT_4           0x10
+UMASK_UOPS_EXECUTED_PORT_PORT_5           0x20
+UMASK_UOPS_EXECUTED_PORT_PORT_6           0x40
+UMASK_UOPS_EXECUTED_PORT_PORT_7           0x80
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_0_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_0_CORE      0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_1_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_1_CORE      0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_2_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_2_CORE      0x04
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_3_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_3_CORE      0x08
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_4_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_4_CORE      0x10
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_5_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_5_CORE      0x20
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_6_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_6_CORE      0x40
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_7_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_7_CORE      0x80
+
+EVENT_RESOURCE_STALLS                 0xA2   PMC
+UMASK_RESOURCE_STALLS_ANY             0x01
+UMASK_RESOURCE_STALLS_RS              0x04
+UMASK_RESOURCE_STALLS_SB              0x08
+UMASK_RESOURCE_STALLS_ROB             0x10
+
+EVENT_CYCLE_ACTIVITY_CYCLES_L1D_MISS    0xA3 PMC2
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L1D_MISS    EVENT_OPTION_THRESHOLD=0x08
+UMASK_CYCLE_ACTIVITY_CYCLES_L1D_MISS    0x08
+
+EVENT_CYCLE_ACTIVITY_CYCLES             0xA3 PMC
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L2_MISS    EVENT_OPTION_THRESHOLD=0x01
+UMASK_CYCLE_ACTIVITY_CYCLES_L2_MISS     0x01
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_MEM_ANY    EVENT_OPTION_THRESHOLD=0x02
+UMASK_CYCLE_ACTIVITY_CYCLES_MEM_ANY     0x02
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE EVENT_OPTION_THRESHOLD=0x04
+UMASK_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE  0x04
+
+EVENT_CYCLE_ACTIVITY_STALLS_L1D_MISS    0xA3 PMC2
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L1D_MISS    EVENT_OPTION_THRESHOLD=0x0C
+UMASK_CYCLE_ACTIVITY_STALLS_L1D_MISS    0x0C
+
+EVENT_CYCLE_ACTIVITY_STALLS             0xA3 PMC
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L2_MISS    EVENT_OPTION_THRESHOLD=0x05
+UMASK_CYCLE_ACTIVITY_STALLS_L2_MISS     0x05
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_MEM_ANY    EVENT_OPTION_THRESHOLD=0x06
+UMASK_CYCLE_ACTIVITY_STALLS_MEM_ANY     0x06
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_TOTAL    EVENT_OPTION_THRESHOLD=0x04
+UMASK_CYCLE_ACTIVITY_STALLS_TOTAL       0x04
+
+EVENT_LSD_UOPS                 0xA8   PMC
+UMASK_LSD_UOPS                 0x01
+DEFAULT_OPTIONS_LSD_CYCLES_ACTIVE EVENT_OPTION_THRESHOLD=0x01
+UMASK_LSD_CYCLES_ACTIVE        0x01
+DEFAULT_OPTIONS_LSD_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x04
+UMASK_LSD_CYCLES_4_UOPS        0x01
+
+EVENT_DSB2MITE_SWITCHES_PENALTY_CYCLES 0xAB PMC
+UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES 0x02
+
+EVENT_ITLB                       0xAE   PMC
+UMASK_ITLB_ITLB_FLUSH            0x01
+
+EVENT_OFFCORE_REQUESTS     0xB0   PMC
+UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_DEMAND_RFO       0x04
+UMASK_OFFCORE_REQUESTS_ALL_DATA_RD      0x08
+
+EVENT_UOPS_EXECUTED                       0xB1   PMC
+UMASK_UOPS_EXECUTED_THREAD                0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_USED_CYCLES           0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_STALL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_TOTAL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC 0x01
+UMASK_UOPS_EXECUTED_CORE                       0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_USED_CYCLES           0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES          0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_TOTAL_CYCLES          0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC 0x02
+
+EVENT_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0xB2 PMC
+UMASK_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0x01
+
+EVENT_PAGE_WALKER_LOADS             0xBC  PMC
+UMASK_PAGE_WALKER_LOADS_DTLB_L1     0x11
+UMASK_PAGE_WALKER_LOADS_ITLB_L1     0x21
+UMASK_PAGE_WALKER_LOADS_DTLB_L2     0x12
+UMASK_PAGE_WALKER_LOADS_ITLB_L2     0x22
+UMASK_PAGE_WALKER_LOADS_DTLB_L3     0x14
+UMASK_PAGE_WALKER_LOADS_ITLB_L3     0x24
+UMASK_PAGE_WALKER_LOADS_DTLB_MEMORY 0x18
+
+EVENT_INST_RETIRED_ANY_P            0xC0  PMC
+UMASK_INST_RETIRED_ANY_P            0x00
+UMASK_INST_RETIRED_X87              0x02
+
+EVENT_INST_RETIRED_PREC             0xC0  PMC1
+UMASK_INST_RETIRED_PREC_DIST        0x01
+
+EVENT_OTHER_ASSISTS                  0xC1  PMC
+UMASK_OTHER_ASSISTS_AVX_TO_SSE       0x08
+UMASK_OTHER_ASSISTS_SSE_TO_AVX       0x10
+UMASK_OTHER_ASSISTS_ANY_WB_ASSIST    0x40
+
+EVENT_UOPS_RETIRED                       0xC2  PMC
+UMASK_UOPS_RETIRED_ALL                   0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL              0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS          0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES           0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL              0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_USED_CYCLES      0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_STALL_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES     0x01
+
+EVENT_MACHINE_CLEARS                    0xC3  PMC
+DEFAULT_OPTIONS_MACHINE_CLEARS_COUNT    EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_MACHINE_CLEARS_COUNT              0x01
+UMASK_MACHINE_CLEARS_CYCLES             0x01
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING    0x02
+UMASK_MACHINE_CLEARS_SMC                0x04
+UMASK_MACHINE_CLEARS_MASKMOV            0x20
+
+EVENT_BR_INST_RETIRED               0xC4  PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x00
+UMASK_BR_INST_RETIRED_CONDITIONAL   0x01
+UMASK_BR_INST_RETIRED_NEAR_CALL     0x02
+UMASK_BR_INST_RETIRED_NEAR_RETURN   0x08
+UMASK_BR_INST_RETIRED_NOT_TAKEN     0x10
+UMASK_BR_INST_RETIRED_NEAR_TAKEN    0x20
+UMASK_BR_INST_RETIRED_FAR_BRANCH    0x40
+
+EVENT_BR_MISP_RETIRED                0xC5  PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES   0x00
+UMASK_BR_MISP_RETIRED_CONDITIONAL    0x01
+UMASK_BR_MISP_RETIRED_NEAR_TAKEN     0x20
+
+EVENT_FP_ARITH_INST_RETIRED               0xC7 PMC
+UMASK_FP_ARITH_INST_RETIRED_SCALAR_DOUBLE      0x01
+UMASK_FP_ARITH_INST_RETIRED_SCALAR_SINGLE      0x02
+UMASK_FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE 0x04
+UMASK_FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE 0x08
+UMASK_FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE 0x10
+UMASK_FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE 0x20
+UMASK_FP_ARITH_INST_RETIRED_SCALAR             0x03
+UMASK_FP_ARITH_INST_RETIRED_PACKED             0x3C
+UMASK_FP_ARITH_INST_RETIRED_DOUBLE             0x15
+UMASK_FP_ARITH_INST_RETIRED_SINGLE             0x2A
+
+EVENT_HLE_RETIRED                    0xC8 PMC
+UMASK_HLE_RETIRED_START              0x01
+UMASK_HLE_RETIRED_COMMIT             0x02
+UMASK_HLE_RETIRED_ABORTED            0x04
+UMASK_HLE_RETIRED_ABORTED_MISC1      0x08
+UMASK_HLE_RETIRED_ABORTED_MISC2      0x10
+UMASK_HLE_RETIRED_ABORTED_MISC3      0x20
+UMASK_HLE_RETIRED_ABORTED_MISC4      0x40
+UMASK_HLE_RETIRED_ABORTED_MISC5      0x80
+
+EVENT_RTM_RETIRED                    0xC9 PMC
+UMASK_RTM_RETIRED_START              0x01
+UMASK_RTM_RETIRED_COMMIT             0x02
+UMASK_RTM_RETIRED_ABORTED            0x04
+UMASK_RTM_RETIRED_ABORTED_MISC1      0x08
+UMASK_RTM_RETIRED_ABORTED_MISC2      0x10
+UMASK_RTM_RETIRED_ABORTED_MISC3      0x20
+UMASK_RTM_RETIRED_ABORTED_MISC4      0x40
+UMASK_RTM_RETIRED_ABORTED_MISC5      0x80
+
+EVENT_FP_ASSIST                      0xCA  PMC
+UMASK_FP_ASSIST_X87_OUTPUT           0x02
+UMASK_FP_ASSIST_X87_INPUT            0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT          0x08
+UMASK_FP_ASSIST_SIMD_INPUT           0x10
+UMASK_FP_ASSIST_ANY                  0x1E
+
+EVENT_ROB_MISC_EVENT_LBR_INSERTS     0xCC  PMC
+UMASK_ROB_MISC_EVENT_LBR_INSERTS     0x20
+
+EVENT_MEM_UOPS_RETIRED                  0xD0    PMC
+UMASK_MEM_UOPS_RETIRED_LOADS_ALL        0x81
+UMASK_MEM_UOPS_RETIRED_STORES_ALL       0x82
+UMASK_MEM_UOPS_RETIRED_LOADS_LOCK       0x21
+UMASK_MEM_UOPS_RETIRED_LOADS_STLB_MISS  0x11
+UMASK_MEM_UOPS_RETIRED_STORES_STLB_MISS 0x12
+UMASK_MEM_UOPS_RETIRED_LOADS_SPLIT      0x41
+UMASK_MEM_UOPS_RETIRED_STORES_SPLIT     0x42
+
+EVENT_MEM_LOAD_UOPS_RETIRED              0xD1   PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT       0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS      0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL       0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT       0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS      0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL       0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT       0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS      0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL       0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB      0x40
+
+EVENT_MEM_LOAD_UOPS_L3_HIT_RETIRED           0xD2   PMC
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_MISS 0x01
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HIT  0x02
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM 0x04
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_NONE 0x08
+
+EVENT_MEM_LOAD_UOPS_L3_MISS_RETIRED                 0xD3   PMC
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_LOCAL_DRAM      0x01
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_DRAM     0x04
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM     0x10
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_FWD      0x20
+
+EVENT_BACLEARS                0xE6 PMC
+UMASK_BACLEARS_ANY            0x1F
+
+EVENT_L2_TRANS                0xF0  PMC
+UMASK_L2_TRANS_DEMAND_DATA_RD 0x01
+UMASK_L2_TRANS_RFO            0x02
+UMASK_L2_TRANS_CODE_RD        0x04
+UMASK_L2_TRANS_ALL_PF         0x08
+UMASK_L2_TRANS_L1D_WB         0x10
+UMASK_L2_TRANS_L2_FILL        0x20
+UMASK_L2_TRANS_L2_WB          0x40
+UMASK_L2_TRANS_ALL_REQUESTS   0x80
+
+EVENT_L2_LINES_IN             0xF1   PMC
+UMASK_L2_LINES_IN_I           0x01
+UMASK_L2_LINES_IN_S           0x02
+UMASK_L2_LINES_IN_E           0x04
+UMASK_L2_LINES_IN_ALL         0x07
+
+EVENT_L2_LINES_OUT                  0xF2   PMC
+UMASK_L2_LINES_OUT_DEMAND_CLEAN     0x05
+UMASK_L2_LINES_OUT_DEMAND_DIRTY     0x06
+
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1                            0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_EVENT_MSG                     0x42 UBOX
+UMASK_EVENT_MSG_DOORBELL_RCVD       0x08
+
+EVENT_PHOLD_CYCLES                  0x45 UBOX
+UMASK_PHOLD_CYCLES_ASSERT_TO_ACK    0x01
+
+EVENT_RACU_REQUESTS                 0x46 UBOX
+UMASK_RACU_REQUESTS                 0x00
+
+EVENT_UNCORE_CLOCK                  0x00 UBOXFIX
+UMASK_UNCORE_CLOCK                  0x00
+
+EVENT_CBOX_CLOCKTICKS               0x00 CBOX
+UMASK_CBOX_CLOCKTICKS               0x00
+
+EVENT_TXR_INSERTS                   0x02 CBOX
+UMASK_TXR_INSERTS_AD_CACHE          0x01
+UMASK_TXR_INSERTS_AK_CACHE          0x02
+UMASK_TXR_INSERTS_BL_CACHE          0x04
+UMASK_TXR_INSERTS_IV_CACHE          0x08
+UMASK_TXR_INSERTS_AD_CORE           0x10
+UMASK_TXR_INSERTS_AK_CORE           0x20
+UMASK_TXR_INSERTS_BL_CORE           0x40
+
+EVENT_TXR_ADS_USED                  0x04 CBOX
+UMASK_TXR_ADS_USED_AD               0x01
+UMASK_TXR_ADS_USED_AK               0x02
+UMASK_TXR_ADS_USED_BL               0x04
+
+EVENT_RING_BOUNCES                  0x05 CBOX
+UMASK_RING_BOUNCES_AD               0x01
+UMASK_RING_BOUNCES_AK               0x02
+UMASK_RING_BOUNCES_BL               0x04
+UMASK_RING_BOUNCES_IV               0x10
+
+EVENT_RING_SRC_THRTL                0x07 CBOX
+UMASK_RING_SRC_THRTL                0x00
+
+EVENT_FAST_ASSERTED                 0x09 CBOX0C0|CBOX0C1|CBOX1C0|CBOX1C1|CBOX2C0|CBOX2C1|CBOX3C0|CBOX3C1|CBOX4C0|CBOX4C1|CBOX5C0|CBOX5C1|CBOX6C0|CBOX6C1|CBOX7C0|CBOX7C1|CBOX8C0|CBOX8C1|CBOX9C0|CBOX9C1|CBOX10C0|CBOX10C1|CBOX11C0|CBOX11C1|CBOX12C0|CBOX12C1|CBOX13C0|CBOX13C1|CBOX14C0|CBOX14C1|CBOX15C0|CBOX15C1|CBOX16C0|CBOX16C1|CBOX17C0|CBOX17C1|CBOX18C0|CBOX18C1|CBOX19C0|CBOX19C1|CBOX20C0|CBOX20C1|CBOX21C0|CBOX21C1|CBOX22C0|CBOX22C1|CBOX23C0|CBOX23C1
+UMASK_FAST_ASSERTED                 0x00
+
+EVENT_BOUNCE_CONTROL                0x0A CBOX
+UMASK_BOUNCE_CONTROL                0x00
+
+EVENT_RING_AD_USED                  0x1B CBOX
+UMASK_RING_AD_USED_UP_EVEN          0x01
+UMASK_RING_AD_USED_UP_ODD           0x02
+UMASK_RING_AD_USED_UP               0x03
+UMASK_RING_AD_USED_DOWN_EVEN        0x04
+UMASK_RING_AD_USED_DOWN_ODD         0x08
+UMASK_RING_AD_USED_DOWN             0x0C
+UMASK_RING_AD_USED_ANY              0x0F
+
+EVENT_RING_AK_USED                  0x1C CBOX
+UMASK_RING_AK_USED_UP_EVEN          0x01
+UMASK_RING_AK_USED_UP_ODD           0x02
+UMASK_RING_AK_USED_UP               0x03
+UMASK_RING_AK_USED_DOWN_EVEN        0x04
+UMASK_RING_AK_USED_DOWN_ODD         0x08
+UMASK_RING_AK_USED_DOWN             0x0C
+UMASK_RING_AK_USED_ANY              0x0F
+
+EVENT_RING_BL_USED                  0x1D CBOX
+UMASK_RING_BL_USED_UP_EVEN          0x01
+UMASK_RING_BL_USED_UP_ODD           0x02
+UMASK_RING_BL_USED_UP               0x03
+UMASK_RING_BL_USED_DOWN_EVEN        0x04
+UMASK_RING_BL_USED_DOWN_ODD         0x08
+UMASK_RING_BL_USED_DOWN             0x0C
+UMASK_RING_BL_USED_ANY              0x0F
+
+EVENT_RING_IV_USED                  0x1E CBOX
+UMASK_RING_IV_USED_UP               0x03
+UMASK_RING_IV_USED_DN               0x0C
+UMASK_RING_IV_USED_ANY              0x0F
+UMASK_RING_IV_USED_DOWN             0x33
+
+EVENT_COUNTER0_OCCUPANCY            0x1F CBOX
+UMASK_COUNTER0_OCCUPANCY            0x00
+DEFAULT_OPTIONS_COUNTER0_OCCUPANCY_COUNT EVENT_OPTION_THRESHOLD=0x01
+UMASK_COUNTER0_OCCUPANCY_COUNT      0x00
+
+EVENT_RXR_OCCUPANCY                 0x11 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX15C0|CBOX16C0|CBOX17C0|CBOX18C0|CBOX19C0|CBOX20C0|CBOX21C0|CBOX22C0|CBOX23C0
+UMASK_RXR_OCCUPANCY_IRQ             0x01
+UMASK_RXR_OCCUPANCY_IRQ_REJ         0x02
+UMASK_RXR_OCCUPANCY_IPQ             0x04
+UMASK_RXR_OCCUPANCY_PRQ_REJ         0x20
+
+EVENT_RXR_EXT_STARVED               0x12 CBOX
+UMASK_RXR_EXT_STARVED_IRQ           0x01
+UMASK_RXR_EXT_STARVED_IPQ           0x02
+UMASK_RXR_EXT_STARVED_PRQ           0x04
+UMASK_RXR_EXT_STARVED_ISMQ_BIDS     0x08
+
+EVENT_RXR_INSERTS                   0x13 CBOX
+UMASK_RXR_INSERTS_IRQ               0x01
+UMASK_RXR_INSERTS_IRQ_REJ           0x02
+UMASK_RXR_INSERTS_IPQ               0x04
+UMASK_RXR_INSERTS_PRQ               0x10
+UMASK_RXR_INSERTS_PRQ_REJ           0x20
+
+EVENT_RXR_IPQ_RETRY                 0x31 CBOX
+UMASK_RXR_IPQ_RETRY_ANY             0x01
+UMASK_RXR_IPQ_RETRY_FULL            0x02
+UMASK_RXR_IPQ_RETRY_ADDR_CONFLICT   0x04
+UMASK_RXR_IPQ_RETRY_QPI_CREDITS     0x10
+
+EVENT_RXR_IPQ_RETRY2                0x28 CBOX
+UMASK_RXR_IPQ_RETRY2_AD_SBO         0x01
+OPTIONS_RXR_IPQ_RETRY2_TARGET       EVENT_OPTION_NID_MASK
+UMASK_RXR_IPQ_RETRY2_TARGET         0x40
+
+EVENT_RXR_IRQ_RETRY                 0x32 CBOX
+UMASK_RXR_IRQ_RETRY_ANY             0x01
+UMASK_RXR_IRQ_RETRY_FULL            0x02
+UMASK_RXR_IRQ_RETRY_ADDR_CONFLICT   0x04
+UMASK_RXR_IRQ_RETRY_RTID            0x08
+UMASK_RXR_IRQ_RETRY_QPI_CREDITS     0x10
+UMASK_RXR_IRQ_RETRY_IIO_CREDITS     0x20
+OPTIONS_RXR_IRQ_RETRY_NID           EVENT_OPTION_NID_MASK
+UMASK_RXR_IRQ_RETRY_NID             0x40
+
+EVENT_RXR_IRQ_RETRY2                0x29 CBOX
+UMASK_RXR_IRQ_RETRY2_AD_SBO         0x01
+UMASK_RXR_IRQ_RETRY2_BL_SBO         0x02
+OPTIONS_RXR_IRQ_RETRY2_TARGET       EVENT_OPTION_NID_MASK
+UMASK_RXR_IRQ_RETRY2_TARGET         0x40
+
+EVENT_RXR_ISMQ_RETRY                0x33 CBOX
+UMASK_RXR_ISMQ_RETRY_ANY            0x01
+UMASK_RXR_ISMQ_RETRY_FULL           0x02
+UMASK_RXR_ISMQ_RETRY_RTID           0x08
+UMASK_RXR_ISMQ_RETRY_QPI_CREDITS    0x10
+UMASK_RXR_ISMQ_RETRY_IIO_CREDITS    0x20
+OPTIONS_RXR_ISMQ_RETRY_NID          EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY_NID            0x40
+OPTIONS_RXR_ISMQ_RETRY_WB_CREDITS   EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY_WB_CREDITS     0x80
+
+EVENT_RXR_ISMQ_RETRY2                0x2A CBOX
+UMASK_RXR_ISMQ_RETRY2_AD_SBO         0x01
+UMASK_RXR_ISMQ_RETRY2_BL_SBO         0x02
+OPTIONS_RXR_ISMQ_RETRY2_TARGET       EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY2_TARGET         0x40
+
+EVENT_LLC_LOOKUP                    0x34 CBOX
+OPTIONS_LLC_LOOKUP_DATA_READ        EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_DATA_READ          0x03
+OPTIONS_LLC_LOOKUP_WRITE            EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_WRITE              0x05
+OPTIONS_LLC_LOOKUP_REMOTE_SNOOP     EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_REMOTE_SNOOP       0x09
+OPTIONS_LLC_LOOKUP_ANY              EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_ANY                0x11
+OPTIONS_LLC_LOOKUP_READ             EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_READ               0x21
+OPTIONS_LLC_LOOKUP_NID              EVENT_OPTION_STATE_MASK|EVENT_OPTION_NID_MASK
+UMASK_LLC_LOOKUP_NID                0x41
+
+EVENT_LLC_VICTIMS                   0x37 CBOX
+UMASK_LLC_VICTIMS_M                 0x01
+UMASK_LLC_VICTIMS_E                 0x02
+UMASK_LLC_VICTIMS_S                 0x04
+UMASK_LLC_VICTIMS_F                 0x08
+UMASK_LLC_VICTIMS_MISS              0x10
+OPTIONS_LLC_VICTIMS_NID             EVENT_OPTION_STATE_MASK|EVENT_OPTION_NID_MASK
+UMASK_LLC_VICTIMS_NID               0x40
+
+
+EVENT_TOR_INSERTS                   0x35 CBOX
+OPTIONS_TOR_INSERTS_OPCODE          EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_OPCODE            0x01
+OPTIONS_TOR_INSERTS_MISS_OPCODE     EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_OPCODE       0x03
+UMASK_TOR_INSERTS_EVICTION          0x04
+UMASK_TOR_INSERTS_ALL               0x08
+UMASK_TOR_INSERTS_WB                0x10
+OPTIONS_TOR_INSERTS_LOCAL_OPCODE    EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_LOCAL_OPCODE      0x21
+OPTIONS_TOR_INSERTS_MISS_LOCAL_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_LOCAL_OPCODE 0x23
+UMASK_TOR_INSERTS_LOCAL             0x28
+UMASK_TOR_INSERTS_MISS_LOCAL        0x2A
+OPTIONS_TOR_INSERTS_NID_OPCODE      EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_NID_OPCODE        0x41
+OPTIONS_TOR_INSERTS_NID_MISS_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_NID_MISS_OPCODE   0x43
+OPTIONS_TOR_INSERTS_NID_EVICTION    EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_EVICTION      0x44
+DEFAULT_OPTIONS_TOR_INSERTS_NID_ALL EVENT_OPTION_STATE=0x01
+OPTIONS_TOR_INSERTS_NID_ALL         EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_ALL           0x48
+OPTIONS_TOR_INSERTS_NID_MISS_ALL    EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_MISS_ALL      0x4A
+OPTIONS_TOR_INSERTS_NID_WB          EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_WB            0x50
+OPTIONS_TOR_INSERTS_REMOTE_OPCODE   EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_REMOTE_OPCODE     0x81
+OPTIONS_TOR_INSERTS_MISS_REMOTE_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_REMOTE_OPCODE 0x83
+UMASK_TOR_INSERTS_REMOTE            0x88
+UMASK_TOR_INSERTS_MISS_REMOTE       0x8A
+
+EVENT_TOR_OCCUPANCY                 0x36 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX15C0|CBOX16C0|CBOX17C0|CBOX18C0|CBOX19C0|CBOX20C0|CBOX21C0|CBOX22C0|CBOX23C0
+OPTIONS_TOR_OCCUPANCY_OPCODE        EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_OPCODE          0x01
+OPTIONS_TOR_OCCUPANCY_MISS_OPCODE   EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_MISS_OPCODE     0x03
+UMASK_TOR_OCCUPANCY_EVICTION        0x04
+UMASK_TOR_OCCUPANCY_ALL             0x08
+UMASK_TOR_OCCUPANCY_MISS_ALL        0x0A
+UMASK_TOR_OCCUPANCY_WB              0x10
+UMASK_TOR_OCCUPANCY_LOCAL_OPCODE    0x21
+UMASK_TOR_OCCUPANCY_MISS_LOCAL_OPCODE 0x23
+UMASK_TOR_OCCUPANCY_LOCAL           0x28
+UMASK_TOR_OCCUPANCY_MISS_LOCAL      0x2A
+OPTIONS_TOR_OCCUPANCY_NID_OPCODE    EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_NID_OPCODE      0x41
+OPTIONS_TOR_OCCUPANCY_NID_MISS_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_NID_MISS_OPCODE 0x43
+OPTIONS_TOR_OCCUPANCY_NID_EVICTION  EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_EVICTION    0x44
+DEFAULT_OPTIONS_TOR_OCCUPANCY_NID_ALL EVENT_OPTION_STATE=0x01
+OPTIONS_TOR_OCCUPANCY_NID_ALL       EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_ALL         0x48
+OPTIONS_TOR_OCCUPANCY_NID_MISS_ALL  EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_MISS_ALL    0x4A
+OPTIONS_TOR_OCCUPANCY_NID_WB        EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_WB          0x50
+OPTIONS_TOR_OCCUPANCY_REMOTE_OPCODE   EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_REMOTE_OPCODE   0x81
+OPTIONS_TOR_OCCUPANCY_MISS_REMOTE_OPCODE   EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_MISS_REMOTE_OPCODE 0x83
+UMASK_TOR_OCCUPANCY_REMOTE          0x88
+UMASK_TOR_OCCUPANCY_MISS_REMOTE     0x8A
+
+EVENT_MISC                          0x39 CBOX
+UMASK_MISC_RSPI_WAS_FSE             0x01
+UMASK_MISC_WC_ALIASING              0x02
+UMASK_MISC_STARTED                  0x04
+UMASK_MISC_RFO_HIT_S                0x08
+UMASK_MISC_CVZERO_PREFETCH_VICTIM   0x10
+UMASK_MISC_CVZERO_PREFETCH_MISS     0x20
+
+EVENT_SBO_CREDITS_ACQUIRED          0x3D CBOX
+UMASK_SBO_CREDITS_ACQUIRED_AD       0x01
+UMASK_SBO_CREDITS_ACQUIRED_BL       0x02
+UMASK_SBO_CREDITS_ACQUIRED_ANY      0x03
+
+EVENT_SBO_CREDIT_OCCUPANCY          0x3E CBOX
+UMASK_SBO_CREDIT_OCCUPANCY_AD       0x01
+UMASK_SBO_CREDIT_OCCUPANCY_BL       0x02
+UMASK_SBO_CREDIT_OCCUPANCY_ANY      0x03
+
+EVENT_WBOX_CLOCKTICKS               0x00 WBOX
+UMASK_WBOX_CLOCKTICKS               0x00
+
+EVENT_CORE0_TRANSITION_CYCLES       0x60 WBOX
+UMASK_CORE0_TRANSITION_CYCLES       0x00
+
+EVENT_CORE1_TRANSITION_CYCLES       0x61 WBOX
+UMASK_CORE1_TRANSITION_CYCLES       0x00
+
+EVENT_CORE2_TRANSITION_CYCLES       0x62 WBOX
+UMASK_CORE2_TRANSITION_CYCLES       0x00
+
+EVENT_CORE3_TRANSITION_CYCLES       0x63 WBOX
+UMASK_CORE3_TRANSITION_CYCLES       0x00
+
+EVENT_CORE4_TRANSITION_CYCLES       0x64 WBOX
+UMASK_CORE4_TRANSITION_CYCLES       0x00
+
+EVENT_CORE5_TRANSITION_CYCLES       0x65 WBOX
+UMASK_CORE5_TRANSITION_CYCLES       0x00
+
+EVENT_CORE6_TRANSITION_CYCLES       0x66 WBOX
+UMASK_CORE6_TRANSITION_CYCLES       0x00
+
+EVENT_CORE7_TRANSITION_CYCLES       0x67 WBOX
+UMASK_CORE7_TRANSITION_CYCLES       0x00
+
+EVENT_CORE8_TRANSITION_CYCLES       0x68 WBOX
+UMASK_CORE8_TRANSITION_CYCLES       0x00
+
+EVENT_CORE9_TRANSITION_CYCLES       0x69 WBOX
+UMASK_CORE9_TRANSITION_CYCLES       0x00
+
+EVENT_CORE10_TRANSITION_CYCLES       0x6A WBOX
+UMASK_CORE10_TRANSITION_CYCLES       0x00
+
+EVENT_CORE11_TRANSITION_CYCLES       0x6B WBOX
+UMASK_CORE11_TRANSITION_CYCLES       0x00
+
+EVENT_CORE12_TRANSITION_CYCLES       0x6C WBOX
+UMASK_CORE12_TRANSITION_CYCLES       0x00
+
+EVENT_CORE13_TRANSITION_CYCLES       0x6D WBOX
+UMASK_CORE13_TRANSITION_CYCLES       0x00
+
+EVENT_CORE14_TRANSITION_CYCLES       0x6E WBOX
+UMASK_CORE14_TRANSITION_CYCLES       0x00
+
+EVENT_CORE15_TRANSITION_CYCLES       0x6F WBOX
+UMASK_CORE15_TRANSITION_CYCLES       0x00
+
+EVENT_CORE16_TRANSITION_CYCLES       0x70 WBOX
+UMASK_CORE16_TRANSITION_CYCLES       0x00
+
+EVENT_CORE17_TRANSITION_CYCLES       0x71 WBOX
+UMASK_CORE17_TRANSITION_CYCLES       0x00
+
+EVENT_FIVR_PS_PS0_CYCLES             0x75 WBOX
+UMASK_FIVR_PS_PS0_CYCLES             0x00
+
+EVENT_FIVR_PS_PS1_CYCLES             0x76 WBOX
+UMASK_FIVR_PS_PS1_CYCLES             0x00
+
+EVENT_FIVR_PS_PS2_CYCLES             0x77 WBOX
+UMASK_FIVR_PS_PS2_CYCLES             0x00
+
+EVENT_FIVR_PS_PS3_CYCLES             0x78 WBOX
+UMASK_FIVR_PS_PS3_CYCLES             0x00
+
+EVENT_DEMOTIONS_CORE0                0x30 WBOX
+UMASK_DEMOTIONS_CORE0                0x00
+
+EVENT_DEMOTIONS_CORE1                0x31 WBOX
+UMASK_DEMOTIONS_CORE1                0x00
+
+EVENT_DEMOTIONS_CORE2                0x32 WBOX
+UMASK_DEMOTIONS_CORE2                0x00
+
+EVENT_DEMOTIONS_CORE3                0x33 WBOX
+UMASK_DEMOTIONS_CORE3                0x00
+
+EVENT_DEMOTIONS_CORE4                0x34 WBOX
+UMASK_DEMOTIONS_CORE4                0x00
+
+EVENT_DEMOTIONS_CORE5                0x35 WBOX
+UMASK_DEMOTIONS_CORE5                0x00
+
+EVENT_DEMOTIONS_CORE6                0x36 WBOX
+UMASK_DEMOTIONS_CORE6                0x00
+
+EVENT_DEMOTIONS_CORE7                0x37 WBOX
+UMASK_DEMOTIONS_CORE7                0x00
+
+EVENT_DEMOTIONS_CORE8                0x38 WBOX
+UMASK_DEMOTIONS_CORE8                0x00
+
+EVENT_DEMOTIONS_CORE9                0x39 WBOX
+UMASK_DEMOTIONS_CORE9                0x00
+
+EVENT_DEMOTIONS_CORE10                0x3A WBOX
+UMASK_DEMOTIONS_CORE10                0x00
+
+EVENT_DEMOTIONS_CORE11                0x3B WBOX
+UMASK_DEMOTIONS_CORE11                0x00
+
+EVENT_DEMOTIONS_CORE12                0x3C WBOX
+UMASK_DEMOTIONS_CORE12                0x00
+
+EVENT_DEMOTIONS_CORE13                0x3D WBOX
+UMASK_DEMOTIONS_CORE13                0x00
+
+EVENT_DEMOTIONS_CORE14                0x3E WBOX
+UMASK_DEMOTIONS_CORE14                0x00
+
+EVENT_DEMOTIONS_CORE15                0x3F WBOX
+UMASK_DEMOTIONS_CORE15                0x00
+
+EVENT_DEMOTIONS_CORE16                0x40 WBOX
+UMASK_DEMOTIONS_CORE16                0x00
+
+EVENT_DEMOTIONS_CORE17                0x41 WBOX
+UMASK_DEMOTIONS_CORE17                0x00
+
+EVENT_FREQ_BAND0_CYCLES                 0x0B WBOX
+OPTIONS_FREQ_BAND0_CYCLES               EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND0_CYCLES                 0x00
+
+EVENT_FREQ_BAND1_CYCLES                 0x0C WBOX
+OPTIONS_FREQ_BAND1_CYCLES               EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND1_CYCLES                 0x00
+
+EVENT_FREQ_BAND2_CYCLES                 0x0D WBOX
+OPTIONS_FREQ_BAND2_CYCLES               EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND2_CYCLES                 0x00
+
+EVENT_FREQ_BAND3_CYCLES                 0x0E WBOX
+OPTIONS_FREQ_BAND3_CYCLES               EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND3_CYCLES                 0x00
+
+EVENT_FREQ_MAX_LIMIT_THERMAL_CYCLES     0x04 WBOX
+UMASK_FREQ_MAX_LIMIT_THERMAL_CYCLES     0x00
+
+EVENT_FREQ_MAX_OS_CYCLES                0x06 WBOX
+UMASK_FREQ_MAX_OS_CYCLES                0x00
+
+EVENT_FREQ_MAX_POWER_CYCLES             0x05 WBOX
+UMASK_FREQ_MAX_POWER_CYCLES             0x00
+
+EVENT_FREQ_MIN_IO_P_CYCLES              0x73 WBOX
+UMASK_FREQ_MIN_IO_P_CYCLES              0x00
+
+EVENT_FREQ_TRANS_CYCLES                 0x74 WBOX
+UMASK_FREQ_TRANS_CYCLES                 0x00
+
+EVENT_MEMORY_PHASE_SHEDDING_CYCLES      0x2F WBOX
+UMASK_MEMORY_PHASE_SHEDDING_CYCLES      0x00
+
+EVENT_POWER_STATE_OCCUPANCY             0x80 WBOX
+UMASK_POWER_STATE_OCCUPANCY_CORES_C0    0x40
+UMASK_POWER_STATE_OCCUPANCY_CORES_C3    0x80
+UMASK_POWER_STATE_OCCUPANCY_CORES_C6    0xC0
+
+EVENT_PROCHOT_EXTERNAL_CYCLES           0x0A WBOX
+UMASK_PROCHOT_EXTERNAL_CYCLES           0x00
+
+EVENT_PROCHOT_INTERNAL_CYCLES           0x09 WBOX
+UMASK_PROCHOT_INTERNAL_CYCLES           0x00
+
+EVENT_TOTAL_TRANSITION_CYCLES           0x72 WBOX
+UMASK_TOTAL_TRANSITION_CYCLES           0x00
+
+EVENT_VR_HOT_CYCLES                     0x42 WBOX
+UMASK_VR_HOT_CYCLES                     0x00
+
+EVENT_UFS_BANDWIDTH_MAX_RANGE           0x7E WBOX
+UMASK_UFS_BANDWIDTH_MAX_RANGE           0x00
+
+EVENT_UFS_TRANSITIONS_DOWN              0x7C WBOX
+UMASK_UFS_TRANSITIONS_DOWN              0x00
+
+EVENT_UFS_TRANSITIONS_IO_P_LIMIT        0x7D WBOX
+UMASK_UFS_TRANSITIONS_IO_P_LIMIT        0x00
+
+EVENT_UFS_TRANSITIONS_NO_CHANGE         0x79 WBOX
+UMASK_UFS_TRANSITIONS_NO_CHANGE         0x00
+
+EVENT_UFS_TRANSITIONS_UP_RING           0x7A WBOX
+UMASK_UFS_TRANSITIONS_UP_RING           0x00
+
+EVENT_UFS_TRANSITIONS_UP_STALL          0x7B WBOX
+UMASK_UFS_TRANSITIONS_UP_STALL          0x00
+
+EVENT_CORES_IN_C3                       0x00 WBOX0FIX
+UMASK_CORES_IN_C3                       0x00
+
+EVENT_CORES_IN_C6                       0x00 WBOX1FIX
+UMASK_CORES_IN_C6                       0x00
+
+EVENT_BBOX_CLOCKTICKS                   0x00 BBOX
+UMASK_BBOX_CLOCKTICKS                   0x00
+
+EVENT_ADDR_OPC_MATCH                    0x20 BBOX
+OPTIONS_ADDR_OPC_MATCH_ADDR             EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_ADDR_OPC_MATCH_ADDR               0x01
+OPTIONS_ADDR_OPC_MATCH_OPC              EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_OPC                0x02
+OPTIONS_ADDR_OPC_MATCH_FILT             EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_ADDR_OPC_MATCH_FILT               0x03
+OPTIONS_ADDR_OPC_MATCH_AD               EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_AD                 0x04
+OPTIONS_ADDR_OPC_MATCH_BL               EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_BL                 0x08
+OPTIONS_ADDR_OPC_MATCH_AK               EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_AK                 0x10
+
+EVENT_BT_CYCLES_NE                      0x42 BBOX
+UMASK_BT_CYCLES_NE                      0x00
+
+EVENT_BT_OCCUPANCY                      0x43 BBOX
+UMASK_BT_OCCUPANCY                      0x00
+
+EVENT_BYPASS_IMC                        0x14 BBOX
+UMASK_BYPASS_IMC_TAKEN                  0x01
+UMASK_BYPASS_IMC_NOT_TAKEN              0x02
+
+EVENT_CONFLICT_CYCLES                   0x0B BBOX0C1|BBOX1C1
+UMASK_CONFLICT_CYCLES                   0x00
+
+EVENT_DIRECT2CORE_COUNT                 0x11 BBOX
+UMASK_DIRECT2CORE_COUNT                 0x00
+
+EVENT_DIRECT2CORE_CYCLES_DISABLED       0x12 BBOX
+UMASK_DIRECT2CORE_CYCLES_DISABLED       0x00
+
+EVENT_DIRECT2CORE_TXN_OVERRIDE          0x13 BBOX
+UMASK_DIRECT2CORE_TXN_OVERRIDE          0x00
+
+EVENT_DIRECTORY_LAT_OPT                 0x41 BBOX
+UMASK_DIRECTORY_LAT_OPT                 0x00
+
+EVENT_DIRECTORY_LOOKUP                  0x0C BBOX
+UMASK_DIRECTORY_LOOKUP_SNP              0x01
+UMASK_DIRECTORY_LOOKUP_NO_SNP           0x02
+
+EVENT_DIRECTORY_UPDATE                  0x0D BBOX
+UMASK_DIRECTORY_UPDATE_SET              0x01
+UMASK_DIRECTORY_UPDATE_CLEAR            0x02
+UMASK_DIRECTORY_UPDATE_ANY              0x03
+
+EVENT_HITME_LOOKUP                      0x70 BBOX
+UMASK_HITME_LOOKUP_READ_OR_INVITOE      0x01
+UMASK_HITME_LOOKUP_WBMTOI               0x02
+UMASK_HITME_LOOKUP_ACKCNFLTWBI          0x04
+UMASK_HITME_LOOKUP_WBMTOE_OR_S          0x08
+UMASK_HITME_LOOKUP_HOM                  0x0F
+UMASK_HITME_LOOKUP_RSPFWDI_REMOTE       0x10
+UMASK_HITME_LOOKUP_RSPFWDI_LOCAL        0x20
+UMASK_HITME_LOOKUP_INVALS               0x26
+UMASK_HITME_LOOKUP_RSPFWDS              0x40
+UMASK_HITME_LOOKUP_ALLOCS               0x70
+UMASK_HITME_LOOKUP_RSP                  0x80
+UMASK_HITME_LOOKUP_ALL                  0xFF
+
+EVENT_HITME_HIT                         0x71 BBOX
+UMASK_HITME_HIT_READ_OR_INVITOE         0x01
+UMASK_HITME_HIT_WBMTOI                  0x02
+UMASK_HITME_HIT_ACKCNFLTWBI             0x04
+UMASK_HITME_HIT_WBMTOE_OR_S             0x08
+UMASK_HITME_HIT_HOM                     0x0F
+UMASK_HITME_HIT_RSPFWDI_REMOTE          0x10
+UMASK_HITME_HIT_RSPFWDI_LOCAL           0x20
+UMASK_HITME_HIT_INVALS                  0x26
+UMASK_HITME_HIT_RSPFWDS                 0x40
+UMASK_HITME_HIT_EVICTS                  0x42
+UMASK_HITME_HIT_ALLOCS                  0x70
+UMASK_HITME_HIT_RSP                     0x80
+UMASK_HITME_HIT_ALL                     0xFF
+
+EVENT_HITME_HIT_PV_BITS_SET             0x72 BBOX
+UMASK_HITME_HIT_PV_BITS_SET_READ_OR_INVITOE 0x01
+UMASK_HITME_HIT_PV_BITS_SET_WBMTOI          0x02
+UMASK_HITME_HIT_PV_BITS_SET_ACKCNFLTWBI     0x04
+UMASK_HITME_HIT_PV_BITS_SET_WBMTOE_OR_S     0x08
+UMASK_HITME_HIT_PV_BITS_SET_HOM             0x0F
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDI_REMOTE  0x10
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDI_LOCAL   0x20
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDS         0x40
+UMASK_HITME_HIT_PV_BITS_SET_RSP             0x80
+UMASK_HITME_HIT_PV_BITS_SET_ALL             0xFF
+
+EVENT_IGR_NO_CREDIT_CYCLES              0x22 BBOX
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI0      0x01
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI1      0x02
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI2      0x10
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI0      0x04
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI1      0x08
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI2      0x20
+
+EVENT_IMC_READS                         0x17 BBOX
+UMASK_IMC_READS_NORMAL                  0x01
+
+EVENT_IMC_RETRY                         0x1E BBOX
+UMASK_IMC_RETRY                         0x00
+
+EVENT_IMC_WRITES                        0x1A BBOX
+UMASK_IMC_WRITES_FULL                   0x01
+UMASK_IMC_WRITES_PARTIAL                0x02
+UMASK_IMC_WRITES_FULL_ISOCH             0x04
+UMASK_IMC_WRITES_PARTIAL_ISOCH          0x08
+UMASK_IMC_WRITES_ALL                    0x0F
+
+EVENT_OSB                               0x53 BBOX
+UMASK_OSB_READS_LOCAL                   0x02
+UMASK_OSB_INVITOE_LOCAL                 0x04
+UMASK_OSB_REMOTE                        0x08
+UMASK_OSB_CANCELLED                     0x10
+UMASK_OSB_READS_LOCAL_USEFUL            0x20
+UMASK_OSB_REMOTE_USEFUL                 0x40
+
+EVENT_OSB_EDR                           0x54 BBOX
+UMASK_OSB_EDR_ALL                       0x01
+UMASK_OSB_EDR_READS_LOCAL_I             0x02
+UMASK_OSB_EDR_READS_REMOTE_I            0x04
+UMASK_OSB_EDR_READS_LOCAL_S             0x08
+UMASK_OSB_EDR_READS_REMOTE_S            0x10
+
+EVENT_REQUESTS                          0x01 BBOX
+UMASK_REQUESTS_READS_LOCAL              0x01
+UMASK_REQUESTS_READS_REMOTE             0x02
+UMASK_REQUESTS_READS                    0x03
+UMASK_REQUESTS_WRITES_LOCAL             0x04
+UMASK_REQUESTS_WRITES_REMOTE            0x08
+UMASK_REQUESTS_WRITES                   0x0C
+UMASK_REQUESTS_INVITOE_LOCAL            0x10
+UMASK_REQUESTS_INVITOE_REMOTE           0x20
+
+EVENT_RING_AD_USED                      0x3E BBOX
+UMASK_RING_AD_USED_CW_EVEN              0x01
+UMASK_RING_AD_USED_CW_ODD               0x02
+UMASK_RING_AD_USED_CW                   0x03
+UMASK_RING_AD_USED_CCW_EVEN             0x04
+UMASK_RING_AD_USED_CCW_ODD              0x08
+UMASK_RING_AD_USED_CCW                  0x0C
+
+EVENT_RING_AK_USED                      0x3F BBOX
+UMASK_RING_AK_USED_CW_EVEN              0x01
+UMASK_RING_AK_USED_CW_ODD               0x02
+UMASK_RING_AK_USED_CW                   0x03
+UMASK_RING_AK_USED_CCW_EVEN             0x04
+UMASK_RING_AK_USED_CCW_ODD              0x08
+UMASK_RING_AK_USED_CCW                  0x0C
+
+EVENT_RING_BL_USED                      0x40 BBOX
+UMASK_RING_BL_USED_CW_EVEN              0x01
+UMASK_RING_BL_USED_CW_ODD               0x02
+UMASK_RING_BL_USED_CW                   0x03
+UMASK_RING_BL_USED_CCW_EVEN             0x04
+UMASK_RING_BL_USED_CCW_ODD              0x08
+UMASK_RING_BL_USED_CCW                  0x0C
+
+EVENT_RPQ_CYCLES_NO_REG_CREDITS         0x15 BBOX
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN0    0x01
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN1    0x02
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN2    0x04
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN3    0x08
+
+EVENT_WPQ_CYCLES_NO_REG_CREDITS         0x18 BBOX
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0    0x01
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN1    0x02
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN2    0x04
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN3    0x08
+
+EVENT_SBO0_CREDITS_ACQUIRED             0x68 BBOX
+UMASK_SBO0_CREDITS_ACQUIRED_AD          0x01
+UMASK_SBO0_CREDITS_ACQUIRED_BL          0x02
+
+EVENT_SBO0_CREDIT_OCCUPANCY             0x6A BBOX
+UMASK_SBO0_CREDIT_OCCUPANCY_AD          0x01
+UMASK_SBO0_CREDIT_OCCUPANCY_BL          0x02
+
+EVENT_SBO1_CREDITS_ACQUIRED             0x69 BBOX
+UMASK_SBO1_CREDITS_ACQUIRED_AD          0x01
+UMASK_SBO1_CREDITS_ACQUIRED_BL          0x02
+
+EVENT_SBO1_CREDIT_OCCUPANCY             0x6B BBOX
+UMASK_SBO1_CREDIT_OCCUPANCY_AD          0x01
+UMASK_SBO1_CREDIT_OCCUPANCY_BL          0x02
+
+EVENT_SNOOPS_RSP_AFTER_DATA             0x0A BBOX
+UMASK_SNOOPS_RSP_AFTER_DATA_LOCAL       0x01
+UMASK_SNOOPS_RSP_AFTER_DATA_REMOTE      0x02
+
+EVENT_SNOOP_CYCLES_NE                   0x08 BBOX
+UMASK_SNOOP_CYCLES_NE_LOCAL             0x01
+UMASK_SNOOP_CYCLES_NE_REMOTE            0x02
+UMASK_SNOOP_CYCLES_NE_ALL               0x03
+
+EVENT_SNOOP_OCCUPANCY                   0x09 BBOX
+UMASK_SNOOP_OCCUPANCY_LOCAL             0x01
+UMASK_SNOOP_OCCUPANCY_REMOTE            0x02
+
+EVENT_SNOOP_RESP                        0x21 BBOX
+UMASK_SNOOP_RESP_RSPI                   0x01
+UMASK_SNOOP_RESP_RSPS                   0x02
+UMASK_SNOOP_RESP_RSPIFWD                0x04
+UMASK_SNOOP_RESP_RSPSFWD                0x08
+UMASK_SNOOP_RESP_RSP_WB                 0x10
+UMASK_SNOOP_RESP_RSP_FWD_WB             0x20
+UMASK_SNOOP_RESP_RSPCNFLCT              0x40
+
+EVENT_SNP_RESP_RECV_LOCAL               0x60 BBOX
+UMASK_SNP_RESP_RECV_LOCAL_RSPI          0x01
+UMASK_SNP_RESP_RECV_LOCAL_RSPS          0x02
+UMASK_SNP_RESP_RECV_LOCAL_RSPIFWD       0x04
+UMASK_SNP_RESP_RECV_LOCAL_RSPSFWD       0x08
+UMASK_SNP_RESP_RECV_LOCAL_RSPXWB        0x10
+UMASK_SNP_RESP_RECV_LOCAL_RSPXFWDXWB    0x20
+UMASK_SNP_RESP_RECV_LOCAL_RSPCNFLCT     0x40
+UMASK_SNP_RESP_RECV_LOCAL_OTHER         0x80
+
+EVENT_STALL_NO_SBO_CREDIT               0x6C BBOX
+UMASK_STALL_NO_SBO_CREDIT_SBO0_AD       0x01
+UMASK_STALL_NO_SBO_CREDIT_SBO1_AD       0x02
+UMASK_STALL_NO_SBO_CREDIT_SBO0_BL       0x04
+UMASK_STALL_NO_SBO_CREDIT_SBO1_BL       0x08
+
+EVENT_TAD_REQUESTS_G0                   0x1B BBOX
+UMASK_TAD_REQUESTS_G0_REGION0           0x01
+UMASK_TAD_REQUESTS_G0_REGION1           0x02
+UMASK_TAD_REQUESTS_G0_REGION2           0x04
+UMASK_TAD_REQUESTS_G0_REGION3           0x08
+UMASK_TAD_REQUESTS_G0_REGION4           0x10
+UMASK_TAD_REQUESTS_G0_REGION5           0x20
+UMASK_TAD_REQUESTS_G0_REGION6           0x40
+UMASK_TAD_REQUESTS_G0_REGION7           0x80
+
+EVENT_TAD_REQUESTS_G1                   0x1C BBOX
+UMASK_TAD_REQUESTS_G1_REGION8           0x01
+UMASK_TAD_REQUESTS_G1_REGION9           0x02
+UMASK_TAD_REQUESTS_G1_REGION10          0x04
+UMASK_TAD_REQUESTS_G1_REGION11          0x08
+
+EVENT_TRACKER_CYCLES_FULL               0x02 BBOX
+UMASK_TRACKER_CYCLES_FULL_GP            0x01
+UMASK_TRACKER_CYCLES_FULL_ALL           0x02
+
+EVENT_TRACKER_CYCLES_NE                 0x03 BBOX
+UMASK_TRACKER_CYCLES_NE_LOCAL           0x01
+UMASK_TRACKER_CYCLES_NE_REMOTE          0x02
+UMASK_TRACKER_CYCLES_NE_ALL             0x03
+
+EVENT_TRACKER_OCCUPANCY                 0x04 BBOX
+UMASK_TRACKER_OCCUPANCY_READS_LOCAL     0x04
+UMASK_TRACKER_OCCUPANCY_READS_REMOTE    0x08
+UMASK_TRACKER_OCCUPANCY_WRITES_LOCAL    0x10
+UMASK_TRACKER_OCCUPANCY_WRITES_REMOTE   0x20
+UMASK_TRACKER_OCCUPANCY_INVITOE_LOCAL   0x40
+UMASK_TRACKER_OCCUPANCY_INVITOE_REMOTE  0x80
+
+EVENT_TRACKER_PENDING_OCCUPANCY         0x05 BBOX
+UMASK_TRACKER_PENDING_OCCUPANCY_LOCAL   0x01
+UMASK_TRACKER_PENDING_OCCUPANCY_REMOTE  0x02
+
+EVENT_TXR_AD_CYCLES_FULL                0x2A BBOX
+UMASK_TXR_AD_CYCLES_FULL_SCHED0         0x01
+UMASK_TXR_AD_CYCLES_FULL_SCHED1         0x02
+UMASK_TXR_AD_CYCLES_FULL_ALL            0x03
+
+EVENT_TXR_AK                            0x0E BBOX
+UMASK_TXR_AK                            0x00
+
+EVENT_TXR_AK_CYCLES_FULL                0x32 BBOX
+UMASK_TXR_AK_CYCLES_FULL_SCHED0         0x01
+UMASK_TXR_AK_CYCLES_FULL_SCHED1         0x02
+UMASK_TXR_AK_CYCLES_FULL_ALL            0x03
+
+EVENT_TXR_BL                            0x10 BBOX
+UMASK_TXR_BL_DRS_CACHE                  0x01
+UMASK_TXR_BL_DRS_CORE                   0x02
+UMASK_TXR_BL_DRS_QPI                    0x04
+
+EVENT_TXR_BL_CYCLES_FULL                0x36 BBOX
+UMASK_TXR_BL_CYCLES_FULL_SCHED0         0x01
+UMASK_TXR_BL_CYCLES_FULL_SCHED1         0x02
+UMASK_TXR_BL_CYCLES_FULL_ALL            0x03
+
+EVENT_TXR_BL_OCCUPANCY                  0x34 BBOX
+UMASK_TXR_BL_OCCUPANCY                  0x00
+
+EVENT_TXR_STARVED                       0x6D BBOX
+UMASK_TXR_STARVED_AK                    0x01
+UMASK_TXR_STARVED_BL                    0x02
+
+EVENT_DRAM_CLOCKTICKS                   0x00 MBOX
+UMASK_DRAM_CLOCKTICKS                   0x00
+
+EVENT_ACT_COUNT                         0x01 MBOX
+UMASK_ACT_COUNT_RD                      0x01
+UMASK_ACT_COUNT_WR                      0x02
+UMASK_ACT_COUNT_BYP                     0x08
+
+EVENT_BYP_CMDS                          0xA1 MBOX
+UMASK_BYP_CMDS_ACT                      0x01
+UMASK_BYP_CMDS_CAS                      0x02
+UMASK_BYP_CMDS_PRE                      0x04
+
+EVENT_CAS_COUNT                         0x04 MBOX
+UMASK_CAS_COUNT_RD_REG                  0x01
+UMASK_CAS_COUNT_RD_UNDERFILL            0x02
+UMASK_CAS_COUNT_RD                      0x03
+UMASK_CAS_COUNT_RD_WMM                  0x10
+UMASK_CAS_COUNT_RD_RMM                  0x20
+UMASK_CAS_COUNT_WR_WMM                  0x04
+UMASK_CAS_COUNT_WR_RMM                  0x08
+UMASK_CAS_COUNT_WR                      0x0C
+UMASK_CAS_COUNT_ALL                     0x0F
+
+EVENT_DRAM_PRE_ALL                      0x06 MBOX
+UMASK_DRAM_PRE_ALL                      0x00
+
+EVENT_DRAM_REFRESH                      0x05 MBOX
+UMASK_DRAM_REFRESH_PANIC                0x02
+UMASK_DRAM_REFRESH_HIGH                 0x04
+
+EVENT_ECC_CORRECTABLE_ERRORS            0x09 MBOX
+UMASK_ECC_CORRECTABLE_ERRORS            0x00
+
+EVENT_MAJOR_MODES                       0x07 MBOX
+UMASK_MAJOR_MODES_READ                  0x01
+UMASK_MAJOR_MODES_WRITE                 0x02
+UMASK_MAJOR_MODES_PARTIAL               0x04
+UMASK_MAJOR_MODES_ISOCH                 0x08
+
+EVENT_POWER_CHANNEL_DLLOFF              0x84 MBOX
+UMASK_POWER_CHANNEL_DLLOFF              0x00
+
+EVENT_POWER_CHANNEL_PPD                 0x85 MBOX
+UMASK_POWER_CHANNEL_PPD                 0x00
+
+EVENT_POWER_CKE_CYCLES                  0x83 MBOX
+UMASK_POWER_CKE_CYCLES_RANK0            0x01
+UMASK_POWER_CKE_CYCLES_RANK1            0x02
+UMASK_POWER_CKE_CYCLES_RANK2            0x04
+UMASK_POWER_CKE_CYCLES_RANK3            0x08
+UMASK_POWER_CKE_CYCLES_RANK4            0x10
+UMASK_POWER_CKE_CYCLES_RANK5            0x20
+UMASK_POWER_CKE_CYCLES_RANK6            0x40
+UMASK_POWER_CKE_CYCLES_RANK7            0x80
+
+EVENT_POWER_CRITICAL_THROTTLE_CYCLES    0x86 MBOX
+UMASK_POWER_CRITICAL_THROTTLE_CYCLES    0x00
+
+EVENT_POWER_PCU_THROTTLING              0x42 MBOX
+UMASK_POWER_PCU_THROTTLING              0x00
+
+EVENT_POWER_SELF_REFRESH                0x43 MBOX
+UMASK_POWER_SELF_REFRESH                0x00
+
+EVENT_POWER_THROTTLE_CYCLES             0x41 MBOX
+UMASK_POWER_THROTTLE_CYCLES_RANK0       0x01
+UMASK_POWER_THROTTLE_CYCLES_RANK1       0x02
+UMASK_POWER_THROTTLE_CYCLES_RANK2       0x04
+UMASK_POWER_THROTTLE_CYCLES_RANK3       0x08
+UMASK_POWER_THROTTLE_CYCLES_RANK4       0x10
+UMASK_POWER_THROTTLE_CYCLES_RANK5       0x20
+UMASK_POWER_THROTTLE_CYCLES_RANK6       0x40
+UMASK_POWER_THROTTLE_CYCLES_RANK7       0x80
+
+EVENT_PREEMPTION                        0x08 MBOX
+UMASK_PREEMPTION_RD_PREEMPT_RD          0x01
+UMASK_PREEMPTION_RD_PREEMPT_WR          0x02
+
+EVENT_PRE_COUNT                         0x02 MBOX
+UMASK_PRE_COUNT_PAGE_MISS               0x01
+UMASK_PRE_COUNT_PAGE_CLOSE              0x02
+UMASK_PRE_COUNT_RD                      0x04
+UMASK_PRE_COUNT_WR                      0x08
+UMASK_PRE_COUNT_BYP                     0x10
+
+EVENT_RD_CAS_PRIO                       0xA0 MBOX
+UMASK_RD_CAS_PRIO_LOW                   0x01
+UMASK_RD_CAS_PRIO_MED                   0x02
+UMASK_RD_CAS_PRIO_HIGH                  0x04
+UMASK_RD_CAS_PRIO_PANIC                 0x08
+
+EVENT_RD_CAS_RANK0                      0xB0 MBOX
+UMASK_RD_CAS_RANK0_BANK0                0x00
+UMASK_RD_CAS_RANK0_BANK1                0x01
+UMASK_RD_CAS_RANK0_BANK2                0x02
+UMASK_RD_CAS_RANK0_BANK3                0x03
+UMASK_RD_CAS_RANK0_BANK4                0x04
+UMASK_RD_CAS_RANK0_BANK5                0x05
+UMASK_RD_CAS_RANK0_BANK6                0x06
+UMASK_RD_CAS_RANK0_BANK7                0x07
+UMASK_RD_CAS_RANK0_BANK8                0x08
+UMASK_RD_CAS_RANK0_BANK9                0x09
+UMASK_RD_CAS_RANK0_BANK10               0x0A
+UMASK_RD_CAS_RANK0_BANK11               0x0B
+UMASK_RD_CAS_RANK0_BANK12               0x0C
+UMASK_RD_CAS_RANK0_BANK13               0x0D
+UMASK_RD_CAS_RANK0_BANK14               0x0E
+UMASK_RD_CAS_RANK0_BANK15               0x0F
+UMASK_RD_CAS_RANK0_ALLBANKS             0x10
+UMASK_RD_CAS_RANK0_BANKG0               0x11
+UMASK_RD_CAS_RANK0_BANKG1               0x12
+UMASK_RD_CAS_RANK0_BANKG2               0x13
+UMASK_RD_CAS_RANK0_BANKG3               0x14
+
+EVENT_RD_CAS_RANK1                      0xB1 MBOX
+UMASK_RD_CAS_RANK1_BANK0                0x00
+UMASK_RD_CAS_RANK1_BANK1                0x01
+UMASK_RD_CAS_RANK1_BANK2                0x02
+UMASK_RD_CAS_RANK1_BANK3                0x03
+UMASK_RD_CAS_RANK1_BANK4                0x04
+UMASK_RD_CAS_RANK1_BANK5                0x05
+UMASK_RD_CAS_RANK1_BANK6                0x06
+UMASK_RD_CAS_RANK1_BANK7                0x07
+UMASK_RD_CAS_RANK1_BANK8                0x08
+UMASK_RD_CAS_RANK1_BANK9                0x09
+UMASK_RD_CAS_RANK1_BANK10               0x0A
+UMASK_RD_CAS_RANK1_BANK11               0x0B
+UMASK_RD_CAS_RANK1_BANK12               0x0C
+UMASK_RD_CAS_RANK1_BANK13               0x0D
+UMASK_RD_CAS_RANK1_BANK14               0x0E
+UMASK_RD_CAS_RANK1_BANK15               0x0F
+UMASK_RD_CAS_RANK1_ALLBANKS             0x10
+UMASK_RD_CAS_RANK1_BANKG0               0x11
+UMASK_RD_CAS_RANK1_BANKG1               0x12
+UMASK_RD_CAS_RANK1_BANKG2               0x13
+UMASK_RD_CAS_RANK1_BANKG3               0x14
+
+EVENT_RD_CAS_RANK2                      0xB2 MBOX
+UMASK_RD_CAS_RANK2_BANK0                0x00
+UMASK_RD_CAS_RANK2_BANK1                0x01
+UMASK_RD_CAS_RANK2_BANK2                0x02
+UMASK_RD_CAS_RANK2_BANK3                0x03
+UMASK_RD_CAS_RANK2_BANK4                0x04
+UMASK_RD_CAS_RANK2_BANK5                0x05
+UMASK_RD_CAS_RANK2_BANK6                0x06
+UMASK_RD_CAS_RANK2_BANK7                0x07
+UMASK_RD_CAS_RANK2_BANK8                0x08
+UMASK_RD_CAS_RANK2_BANK9                0x09
+UMASK_RD_CAS_RANK2_BANK10               0x0A
+UMASK_RD_CAS_RANK2_BANK11               0x0B
+UMASK_RD_CAS_RANK2_BANK12               0x0C
+UMASK_RD_CAS_RANK2_BANK13               0x0D
+UMASK_RD_CAS_RANK2_BANK14               0x0E
+UMASK_RD_CAS_RANK2_BANK15               0x0F
+UMASK_RD_CAS_RANK2_ALLBANKS             0x10
+UMASK_RD_CAS_RANK2_BANKG0               0x11
+UMASK_RD_CAS_RANK2_BANKG1               0x12
+UMASK_RD_CAS_RANK2_BANKG2               0x13
+UMASK_RD_CAS_RANK2_BANKG3               0x14
+
+EVENT_RD_CAS_RANK3                      0xB3 MBOX
+UMASK_RD_CAS_RANK3_BANK0                0x00
+UMASK_RD_CAS_RANK3_BANK1                0x01
+UMASK_RD_CAS_RANK3_BANK2                0x02
+UMASK_RD_CAS_RANK3_BANK3                0x03
+UMASK_RD_CAS_RANK3_BANK4                0x04
+UMASK_RD_CAS_RANK3_BANK5                0x05
+UMASK_RD_CAS_RANK3_BANK6                0x06
+UMASK_RD_CAS_RANK3_BANK7                0x07
+UMASK_RD_CAS_RANK3_BANK8                0x08
+UMASK_RD_CAS_RANK3_BANK9                0x09
+UMASK_RD_CAS_RANK3_BANK10               0x0A
+UMASK_RD_CAS_RANK3_BANK11               0x0B
+UMASK_RD_CAS_RANK3_BANK12               0x0C
+UMASK_RD_CAS_RANK3_BANK13               0x0D
+UMASK_RD_CAS_RANK3_BANK14               0x0E
+UMASK_RD_CAS_RANK3_BANK15               0x0F
+UMASK_RD_CAS_RANK3_ALLBANKS             0x10
+UMASK_RD_CAS_RANK3_BANKG0               0x11
+UMASK_RD_CAS_RANK3_BANKG1               0x12
+UMASK_RD_CAS_RANK3_BANKG2               0x13
+UMASK_RD_CAS_RANK3_BANKG3               0x14
+
+EVENT_RD_CAS_RANK4                      0xB4 MBOX
+UMASK_RD_CAS_RANK4_BANK0                0x00
+UMASK_RD_CAS_RANK4_BANK1                0x01
+UMASK_RD_CAS_RANK4_BANK2                0x02
+UMASK_RD_CAS_RANK4_BANK3                0x03
+UMASK_RD_CAS_RANK4_BANK4                0x04
+UMASK_RD_CAS_RANK4_BANK5                0x05
+UMASK_RD_CAS_RANK4_BANK6                0x06
+UMASK_RD_CAS_RANK4_BANK7                0x07
+UMASK_RD_CAS_RANK4_BANK8                0x08
+UMASK_RD_CAS_RANK4_BANK9                0x09
+UMASK_RD_CAS_RANK4_BANK10               0x0A
+UMASK_RD_CAS_RANK4_BANK11               0x0B
+UMASK_RD_CAS_RANK4_BANK12               0x0C
+UMASK_RD_CAS_RANK4_BANK13               0x0D
+UMASK_RD_CAS_RANK4_BANK14               0x0E
+UMASK_RD_CAS_RANK4_BANK15               0x0F
+UMASK_RD_CAS_RANK4_ALLBANKS             0x10
+UMASK_RD_CAS_RANK4_BANKG0               0x11
+UMASK_RD_CAS_RANK4_BANKG1               0x12
+UMASK_RD_CAS_RANK4_BANKG2               0x13
+UMASK_RD_CAS_RANK4_BANKG3               0x14
+
+EVENT_RD_CAS_RANK5                      0xB5 MBOX
+UMASK_RD_CAS_RANK5_BANK0                0x00
+UMASK_RD_CAS_RANK5_BANK1                0x01
+UMASK_RD_CAS_RANK5_BANK2                0x02
+UMASK_RD_CAS_RANK5_BANK3                0x03
+UMASK_RD_CAS_RANK5_BANK4                0x04
+UMASK_RD_CAS_RANK5_BANK5                0x05
+UMASK_RD_CAS_RANK5_BANK6                0x06
+UMASK_RD_CAS_RANK5_BANK7                0x07
+UMASK_RD_CAS_RANK5_BANK8                0x08
+UMASK_RD_CAS_RANK5_BANK9                0x09
+UMASK_RD_CAS_RANK5_BANK10               0x0A
+UMASK_RD_CAS_RANK5_BANK11               0x0B
+UMASK_RD_CAS_RANK5_BANK12               0x0C
+UMASK_RD_CAS_RANK5_BANK13               0x0D
+UMASK_RD_CAS_RANK5_BANK14               0x0E
+UMASK_RD_CAS_RANK5_BANK15               0x0F
+UMASK_RD_CAS_RANK5_ALLBANKS             0x10
+UMASK_RD_CAS_RANK5_BANKG0               0x11
+UMASK_RD_CAS_RANK5_BANKG1               0x12
+UMASK_RD_CAS_RANK5_BANKG2               0x13
+UMASK_RD_CAS_RANK5_BANKG3               0x14
+
+EVENT_RD_CAS_RANK6                      0xB6 MBOX
+UMASK_RD_CAS_RANK6_BANK0                0x00
+UMASK_RD_CAS_RANK6_BANK1                0x01
+UMASK_RD_CAS_RANK6_BANK2                0x02
+UMASK_RD_CAS_RANK6_BANK3                0x03
+UMASK_RD_CAS_RANK6_BANK4                0x04
+UMASK_RD_CAS_RANK6_BANK5                0x05
+UMASK_RD_CAS_RANK6_BANK6                0x06
+UMASK_RD_CAS_RANK6_BANK7                0x07
+UMASK_RD_CAS_RANK6_BANK8                0x08
+UMASK_RD_CAS_RANK6_BANK9                0x09
+UMASK_RD_CAS_RANK6_BANK10               0x0A
+UMASK_RD_CAS_RANK6_BANK11               0x0B
+UMASK_RD_CAS_RANK6_BANK12               0x0C
+UMASK_RD_CAS_RANK6_BANK13               0x0D
+UMASK_RD_CAS_RANK6_BANK14               0x0E
+UMASK_RD_CAS_RANK6_BANK15               0x0F
+UMASK_RD_CAS_RANK6_ALLBANKS             0x10
+UMASK_RD_CAS_RANK6_BANKG0               0x11
+UMASK_RD_CAS_RANK6_BANKG1               0x12
+UMASK_RD_CAS_RANK6_BANKG2               0x13
+UMASK_RD_CAS_RANK6_BANKG3               0x14
+
+EVENT_RD_CAS_RANK7                      0xB7 MBOX
+UMASK_RD_CAS_RANK7_BANK0                0x00
+UMASK_RD_CAS_RANK7_BANK1                0x01
+UMASK_RD_CAS_RANK7_BANK2                0x02
+UMASK_RD_CAS_RANK7_BANK3                0x03
+UMASK_RD_CAS_RANK7_BANK4                0x04
+UMASK_RD_CAS_RANK7_BANK5                0x05
+UMASK_RD_CAS_RANK7_BANK6                0x06
+UMASK_RD_CAS_RANK7_BANK7                0x07
+UMASK_RD_CAS_RANK7_BANK8                0x08
+UMASK_RD_CAS_RANK7_BANK9                0x09
+UMASK_RD_CAS_RANK7_BANK10               0x0A
+UMASK_RD_CAS_RANK7_BANK11               0x0B
+UMASK_RD_CAS_RANK7_BANK12               0x0C
+UMASK_RD_CAS_RANK7_BANK13               0x0D
+UMASK_RD_CAS_RANK7_BANK14               0x0E
+UMASK_RD_CAS_RANK7_BANK15               0x0F
+UMASK_RD_CAS_RANK7_ALLBANKS             0x10
+UMASK_RD_CAS_RANK7_BANKG0               0x11
+UMASK_RD_CAS_RANK7_BANKG1               0x12
+UMASK_RD_CAS_RANK7_BANKG2               0x13
+UMASK_RD_CAS_RANK7_BANKG3               0x14
+
+EVENT_RPQ_CYCLES_NE                     0x11 MBOX
+UMASK_RPQ_CYCLES_NE                     0x00
+
+EVENT_RPQ_INSERTS                       0x10 MBOX
+UMASK_RPQ_INSERTS                       0x00
+
+EVENT_VMSE_MXB_WR_OCCUPANCY             0x91 MBOX
+UMASK_VMSE_MXB_WR_OCCUPANCY             0x00
+
+EVENT_VMSE_WR_PUSH                      0x90 MBOX
+UMASK_VMSE_WR_PUSH_WMM                  0x01
+UMASK_VMSE_WR_PUSH_RMM                  0x02
+
+EVENT_WMM_TO_RMM                        0xC0 MBOX
+UMASK_WMM_TO_RMM_LOW_THRESH             0x01
+UMASK_WMM_TO_RMM_STARVE                 0x02
+UMASK_WMM_TO_RMM_VMSE_RETRY             0x04
+
+# Undocumented event, mentioned in metrics table but not defined
+EVENT_WPQ_INSERTS                       0x20 MBOX
+UMASK_WPQ_INSERTS                       0x00
+
+EVENT_WPQ_CYCLES_FULL                   0x22 MBOX
+UMASK_WPQ_CYCLES_FULL                   0x00
+
+EVENT_WPQ_CYCLES_NE                     0x21 MBOX
+UMASK_WPQ_CYCLES_NE                     0x00
+
+EVENT_WPQ_READ_HIT                      0x23 MBOX
+UMASK_WPQ_READ_HIT                      0x00
+
+EVENT_WPQ_WRITE_HIT                     0x24 MBOX
+UMASK_WPQ_WRITE_HIT                     0x00
+
+EVENT_WRONG_MM                          0xC1 MBOX
+UMASK_WRONG_MM                          0x00
+
+EVENT_WR_CAS_RANK0                      0xB8 MBOX
+UMASK_WR_CAS_RANK0_BANK0                0x00
+UMASK_WR_CAS_RANK0_BANK1                0x01
+UMASK_WR_CAS_RANK0_BANK2                0x02
+UMASK_WR_CAS_RANK0_BANK3                0x03
+UMASK_WR_CAS_RANK0_BANK4                0x04
+UMASK_WR_CAS_RANK0_BANK5                0x05
+UMASK_WR_CAS_RANK0_BANK6                0x06
+UMASK_WR_CAS_RANK0_BANK7                0x07
+UMASK_WR_CAS_RANK0_BANK8                0x08
+UMASK_WR_CAS_RANK0_BANK9                0x09
+UMASK_WR_CAS_RANK0_BANK10               0x0A
+UMASK_WR_CAS_RANK0_BANK11               0x0B
+UMASK_WR_CAS_RANK0_BANK12               0x0C
+UMASK_WR_CAS_RANK0_BANK13               0x0D
+UMASK_WR_CAS_RANK0_BANK14               0x0E
+UMASK_WR_CAS_RANK0_BANK15               0x0F
+UMASK_WR_CAS_RANK0_ALLBANKS             0x10
+UMASK_WR_CAS_RANK0_BANKG0               0x11
+UMASK_WR_CAS_RANK0_BANKG1               0x12
+UMASK_WR_CAS_RANK0_BANKG2               0x13
+UMASK_WR_CAS_RANK0_BANKG3               0x14
+
+EVENT_WR_CAS_RANK1                      0xB9 MBOX
+UMASK_WR_CAS_RANK1_BANK0                0x00
+UMASK_WR_CAS_RANK1_BANK1                0x01
+UMASK_WR_CAS_RANK1_BANK2                0x02
+UMASK_WR_CAS_RANK1_BANK3                0x03
+UMASK_WR_CAS_RANK1_BANK4                0x04
+UMASK_WR_CAS_RANK1_BANK5                0x05
+UMASK_WR_CAS_RANK1_BANK6                0x06
+UMASK_WR_CAS_RANK1_BANK7                0x07
+UMASK_WR_CAS_RANK1_BANK8                0x08
+UMASK_WR_CAS_RANK1_BANK9                0x09
+UMASK_WR_CAS_RANK1_BANK10               0x0A
+UMASK_WR_CAS_RANK1_BANK11               0x0B
+UMASK_WR_CAS_RANK1_BANK12               0x0C
+UMASK_WR_CAS_RANK1_BANK13               0x0D
+UMASK_WR_CAS_RANK1_BANK14               0x0E
+UMASK_WR_CAS_RANK1_BANK15               0x0F
+UMASK_WR_CAS_RANK1_ALLBANKS             0x10
+UMASK_WR_CAS_RANK1_BANKG0               0x11
+UMASK_WR_CAS_RANK1_BANKG1               0x12
+UMASK_WR_CAS_RANK1_BANKG2               0x13
+UMASK_WR_CAS_RANK1_BANKG3               0x14
+
+EVENT_WR_CAS_RANK2                      0xBA MBOX
+UMASK_WR_CAS_RANK2_BANK0                0x00
+UMASK_WR_CAS_RANK2_BANK1                0x01
+UMASK_WR_CAS_RANK2_BANK2                0x02
+UMASK_WR_CAS_RANK2_BANK3                0x03
+UMASK_WR_CAS_RANK2_BANK4                0x04
+UMASK_WR_CAS_RANK2_BANK5                0x05
+UMASK_WR_CAS_RANK2_BANK6                0x06
+UMASK_WR_CAS_RANK2_BANK7                0x07
+UMASK_WR_CAS_RANK2_BANK8                0x08
+UMASK_WR_CAS_RANK2_BANK9                0x09
+UMASK_WR_CAS_RANK2_BANK10               0x0A
+UMASK_WR_CAS_RANK2_BANK11               0x0B
+UMASK_WR_CAS_RANK2_BANK12               0x0C
+UMASK_WR_CAS_RANK2_BANK13               0x0D
+UMASK_WR_CAS_RANK2_BANK14               0x0E
+UMASK_WR_CAS_RANK2_BANK15               0x0F
+UMASK_WR_CAS_RANK2_ALLBANKS             0x10
+UMASK_WR_CAS_RANK2_BANKG0               0x11
+UMASK_WR_CAS_RANK2_BANKG1               0x12
+UMASK_WR_CAS_RANK2_BANKG2               0x13
+UMASK_WR_CAS_RANK2_BANKG3               0x14
+
+EVENT_WR_CAS_RANK3                      0xBB MBOX
+UMASK_WR_CAS_RANK3_BANK0                0x00
+UMASK_WR_CAS_RANK3_BANK1                0x01
+UMASK_WR_CAS_RANK3_BANK2                0x02
+UMASK_WR_CAS_RANK3_BANK3                0x03
+UMASK_WR_CAS_RANK3_BANK4                0x04
+UMASK_WR_CAS_RANK3_BANK5                0x05
+UMASK_WR_CAS_RANK3_BANK6                0x06
+UMASK_WR_CAS_RANK3_BANK7                0x07
+UMASK_WR_CAS_RANK3_BANK8                0x08
+UMASK_WR_CAS_RANK3_BANK9                0x09
+UMASK_WR_CAS_RANK3_BANK10               0x0A
+UMASK_WR_CAS_RANK3_BANK11               0x0B
+UMASK_WR_CAS_RANK3_BANK12               0x0C
+UMASK_WR_CAS_RANK3_BANK13               0x0D
+UMASK_WR_CAS_RANK3_BANK14               0x0E
+UMASK_WR_CAS_RANK3_BANK15               0x0F
+UMASK_WR_CAS_RANK3_ALLBANKS             0x10
+UMASK_WR_CAS_RANK3_BANKG0               0x11
+UMASK_WR_CAS_RANK3_BANKG1               0x12
+UMASK_WR_CAS_RANK3_BANKG2               0x13
+UMASK_WR_CAS_RANK3_BANKG3               0x14
+
+EVENT_WR_CAS_RANK4                      0xBC MBOX
+UMASK_WR_CAS_RANK4_BANK0                0x00
+UMASK_WR_CAS_RANK4_BANK1                0x01
+UMASK_WR_CAS_RANK4_BANK2                0x02
+UMASK_WR_CAS_RANK4_BANK3                0x03
+UMASK_WR_CAS_RANK4_BANK4                0x04
+UMASK_WR_CAS_RANK4_BANK5                0x05
+UMASK_WR_CAS_RANK4_BANK6                0x06
+UMASK_WR_CAS_RANK4_BANK7                0x07
+UMASK_WR_CAS_RANK4_BANK8                0x08
+UMASK_WR_CAS_RANK4_BANK9                0x09
+UMASK_WR_CAS_RANK4_BANK10               0x0A
+UMASK_WR_CAS_RANK4_BANK11               0x0B
+UMASK_WR_CAS_RANK4_BANK12               0x0C
+UMASK_WR_CAS_RANK4_BANK13               0x0D
+UMASK_WR_CAS_RANK4_BANK14               0x0E
+UMASK_WR_CAS_RANK4_BANK15               0x0F
+UMASK_WR_CAS_RANK4_ALLBANKS             0x10
+UMASK_WR_CAS_RANK4_BANKG0               0x11
+UMASK_WR_CAS_RANK4_BANKG1               0x12
+UMASK_WR_CAS_RANK4_BANKG2               0x13
+UMASK_WR_CAS_RANK4_BANKG3               0x14
+
+EVENT_WR_CAS_RANK5                      0xBD MBOX
+UMASK_WR_CAS_RANK5_BANK0                0x00
+UMASK_WR_CAS_RANK5_BANK1                0x01
+UMASK_WR_CAS_RANK5_BANK2                0x02
+UMASK_WR_CAS_RANK5_BANK3                0x03
+UMASK_WR_CAS_RANK5_BANK4                0x04
+UMASK_WR_CAS_RANK5_BANK5                0x05
+UMASK_WR_CAS_RANK5_BANK6                0x06
+UMASK_WR_CAS_RANK5_BANK7                0x07
+UMASK_WR_CAS_RANK5_BANK8                0x08
+UMASK_WR_CAS_RANK5_BANK9                0x09
+UMASK_WR_CAS_RANK5_BANK10               0x0A
+UMASK_WR_CAS_RANK5_BANK11               0x0B
+UMASK_WR_CAS_RANK5_BANK12               0x0C
+UMASK_WR_CAS_RANK5_BANK13               0x0D
+UMASK_WR_CAS_RANK5_BANK14               0x0E
+UMASK_WR_CAS_RANK5_BANK15               0x0F
+UMASK_WR_CAS_RANK5_ALLBANKS             0x10
+UMASK_WR_CAS_RANK5_BANKG0               0x11
+UMASK_WR_CAS_RANK5_BANKG1               0x12
+UMASK_WR_CAS_RANK5_BANKG2               0x13
+UMASK_WR_CAS_RANK5_BANKG3               0x14
+
+EVENT_WR_CAS_RANK6                      0xBE MBOX
+UMASK_WR_CAS_RANK6_BANK0                0x00
+UMASK_WR_CAS_RANK6_BANK1                0x01
+UMASK_WR_CAS_RANK6_BANK2                0x02
+UMASK_WR_CAS_RANK6_BANK3                0x03
+UMASK_WR_CAS_RANK6_BANK4                0x04
+UMASK_WR_CAS_RANK6_BANK5                0x05
+UMASK_WR_CAS_RANK6_BANK6                0x06
+UMASK_WR_CAS_RANK6_BANK7                0x07
+UMASK_WR_CAS_RANK6_BANK8                0x08
+UMASK_WR_CAS_RANK6_BANK9                0x09
+UMASK_WR_CAS_RANK6_BANK10               0x0A
+UMASK_WR_CAS_RANK6_BANK11               0x0B
+UMASK_WR_CAS_RANK6_BANK12               0x0C
+UMASK_WR_CAS_RANK6_BANK13               0x0D
+UMASK_WR_CAS_RANK6_BANK14               0x0E
+UMASK_WR_CAS_RANK6_BANK15               0x0F
+UMASK_WR_CAS_RANK6_ALLBANKS             0x10
+UMASK_WR_CAS_RANK6_BANKG0               0x11
+UMASK_WR_CAS_RANK6_BANKG1               0x12
+UMASK_WR_CAS_RANK6_BANKG2               0x13
+UMASK_WR_CAS_RANK6_BANKG3               0x14
+
+EVENT_WR_CAS_RANK7                      0xBF MBOX
+UMASK_WR_CAS_RANK7_BANK0                0x00
+UMASK_WR_CAS_RANK7_BANK1                0x01
+UMASK_WR_CAS_RANK7_BANK2                0x02
+UMASK_WR_CAS_RANK7_BANK3                0x03
+UMASK_WR_CAS_RANK7_BANK4                0x04
+UMASK_WR_CAS_RANK7_BANK5                0x05
+UMASK_WR_CAS_RANK7_BANK6                0x06
+UMASK_WR_CAS_RANK7_BANK7                0x07
+UMASK_WR_CAS_RANK7_BANK8                0x08
+UMASK_WR_CAS_RANK7_BANK9                0x09
+UMASK_WR_CAS_RANK7_BANK10               0x0A
+UMASK_WR_CAS_RANK7_BANK11               0x0B
+UMASK_WR_CAS_RANK7_BANK12               0x0C
+UMASK_WR_CAS_RANK7_BANK13               0x0D
+UMASK_WR_CAS_RANK7_BANK14               0x0E
+UMASK_WR_CAS_RANK7_BANK15               0x0F
+UMASK_WR_CAS_RANK7_ALLBANKS             0x10
+UMASK_WR_CAS_RANK7_BANKG0               0x11
+UMASK_WR_CAS_RANK7_BANKG1               0x12
+UMASK_WR_CAS_RANK7_BANKG2               0x13
+UMASK_WR_CAS_RANK7_BANKG3               0x14
+
+EVENT_PBOX_CLOCKTICKS                   0x01 PBOX
+UMASK_PBOX_CLOCKTICKS                   0x00
+
+EVENT_IIO_CREDIT                        0x2D PBOX0|PBOX1
+UMASK_IIO_CREDIT_PRQ_QPI0               0x01
+UMASK_IIO_CREDIT_PRQ_QPI1               0x02
+UMASK_IIO_CREDIT_ISOCH_QPI0             0x04
+UMASK_IIO_CREDIT_ISOCH_QPI1             0x08
+
+EVENT_RING_AD_USED                      0x07 PBOX
+UMASK_RING_AD_USED_CW_EVEN              0x01
+UMASK_RING_AD_USED_CW_ODD               0x02
+UMASK_RING_AD_USED_CW                   0x03
+UMASK_RING_AD_USED_CCW_EVEN             0x04
+UMASK_RING_AD_USED_CCW_ODD              0x08
+UMASK_RING_AD_USED_CCW                  0x0C
+
+EVENT_RING_AK_BOUNCES                   0x12 PBOX
+UMASK_RING_AK_BOUNCES_UP                0x01
+UMASK_RING_AK_BOUNCES_DN                0x02
+
+EVENT_RING_AK_USED                      0x08 PBOX
+UMASK_RING_AK_USED_CW_EVEN              0x01
+UMASK_RING_AK_USED_CW_ODD               0x02
+UMASK_RING_AK_USED_CW                   0x03
+UMASK_RING_AK_USED_CCW_EVEN             0x04
+UMASK_RING_AK_USED_CCW_ODD              0x08
+UMASK_RING_AK_USED_CCW                  0x0C
+
+EVENT_RING_BL_USED                      0x09 PBOX
+UMASK_RING_BL_USED_CW_EVEN              0x01
+UMASK_RING_BL_USED_CW_ODD               0x02
+UMASK_RING_BL_USED_CW                   0x03
+UMASK_RING_BL_USED_CCW_EVEN             0x04
+UMASK_RING_BL_USED_CCW_ODD              0x08
+UMASK_RING_BL_USED_CCW                  0x0C
+
+EVENT_RING_IV_USED                      0x0A PBOX
+UMASK_RING_IV_USED_CW                   0x03
+UMASK_RING_IV_USED_CCW                  0x0C
+UMASK_RING_IV_USED_ANY                  0x0F
+
+EVENT_RXR_CYCLES_NE                     0x10 PBOX0|PBOX1
+UMASK_RXR_CYCLES_NE_NCB                 0x10
+UMASK_RXR_CYCLES_NE_NCS                 0x20
+
+EVENT_RXR_INSERTS                       0x11 PBOX0|PBOX1
+UMASK_RXR_INSERTS_NCB                   0x10
+UMASK_RXR_INSERTS_NCS                   0x20
+
+EVENT_RXR_OCCUPANCY                     0x13 PBOX0
+UMASK_RXR_OCCUPANCY_DRS                 0x08
+
+EVENT_TXR_CYCLES_FULL                   0x25 PBOX0
+UMASK_TXR_CYCLES_FULL_AD                0x01
+UMASK_TXR_CYCLES_FULL_AK                0x02
+UMASK_TXR_CYCLES_FULL_BL                0x04
+
+EVENT_TXR_CYCLES_NE                     0x23 PBOX0
+UMASK_TXR_CYCLES_NE_AD                  0x01
+UMASK_TXR_CYCLES_NE_AK                  0x02
+UMASK_TXR_CYCLES_NE_BL                  0x04
+
+EVENT_TXR_NACK_CW                       0x26 PBOX0|PBOX1
+UMASK_TXR_NACK_CW_DN_AD                 0x01
+UMASK_TXR_NACK_CW_DN_BL                 0x02
+UMASK_TXR_NACK_CW_DN_AK                 0x04
+UMASK_TXR_NACK_CW_UP_AD                 0x08
+UMASK_TXR_NACK_CW_UP_BL                 0x10
+UMASK_TXR_NACK_CW_UP_AK                 0x20
+
+EVENT_SBO0_CREDITS_ACQUIRED             0x28 PBOX0|PBOX1
+UMASK_SBO0_CREDITS_ACQUIRED_AD          0x01
+UMASK_SBO0_CREDITS_ACQUIRED_BL          0x02
+
+EVENT_STALL_NO_SBO_CREDIT               0x2C PBOX0|PBOX1
+UMASK_STALL_NO_SBO_CREDIT_SBO0_AD       0x01
+UMASK_STALL_NO_SBO_CREDIT_SBO1_AD       0x02
+UMASK_STALL_NO_SBO_CREDIT_SBO0_BL       0x04
+UMASK_STALL_NO_SBO_CREDIT_SBO1_BL       0x08
+
+EVENT_CACHE_TOTAL_OCCUPANCY             0x12 IBOX
+UMASK_CACHE_TOTAL_OCCUPANCY_ANY         0x01
+UMASK_CACHE_TOTAL_OCCUPANCY_SOURCE      0x02
+
+EVENT_COHERENT_OPS                      0x13 IBOX
+UMASK_COHERENT_OPS_PCIRDCUR             0x01
+UMASK_COHERENT_OPS_CRD                  0x02
+UMASK_COHERENT_OPS_DRD                  0x04
+UMASK_COHERENT_OPS_RFO                  0x08
+UMASK_COHERENT_OPS_PCITOM               0x10
+UMASK_COHERENT_OPS_PCIDCAHINT           0x20
+UMASK_COHERENT_OPS_WBMTOI               0x40
+UMASK_COHERENT_OPS_CLFLUSH              0x80
+
+EVENT_MISC0                             0x14 IBOX
+UMASK_MISC0_FAST_REQ                    0x01
+UMASK_MISC0_FAST_REJ                    0x02
+UMASK_MISC0_2ND_RD_INSERT               0x04
+UMASK_MISC0_2ND_WR_INSERT               0x08
+UMASK_MISC0_2ND_ATOMIC_INSERT           0x10
+UMASK_MISC0_FAST_XFER                   0x20
+UMASK_MISC0_PF_ACK_HINT                 0x40
+UMASK_MISC0_PF_TIMEOUT                  0x80
+
+EVENT_MISC1                             0x15 IBOX
+UMASK_MISC1_SLOW_I                      0x01
+UMASK_MISC1_SLOW_S                      0x02
+UMASK_MISC1_SLOW_E                      0x04
+UMASK_MISC1_SLOW_M                      0x08
+UMASK_MISC1_LOST_FWD                    0x10
+UMASK_MISC1_SEC_RCVD_INVLD              0x20
+UMASK_MISC1_SEC_RCVD_VLD                0x40
+UMASK_MISC1_DATA_THROTTLE               0x80
+
+EVENT_SNOOP_RESP                        0x17 IBOX
+UMASK_SNOOP_RESP_MISS                   0x01
+UMASK_SNOOP_RESP_HIT_I                  0x02
+UMASK_SNOOP_RESP_HIT_ES                 0x04
+UMASK_SNOOP_RESP_HIT_M                  0x08
+UMASK_SNOOP_RESP_SNPCODE                0x10
+UMASK_SNOOP_RESP_SNPDATA                0x20
+UMASK_SNOOP_RESP_SNPINV                 0x40
+
+EVENT_TRANSACTIONS                      0x16 IBOX
+UMASK_TRANSACTIONS_READS                0x01
+UMASK_TRANSACTIONS_WRITES               0x02
+UMASK_TRANSACTIONS_RD_PREF              0x04
+UMASK_TRANSACTIONS_WR_PREF              0x08
+UMASK_TRANSACTIONS_ALL_READS            0x05
+UMASK_TRANSACTIONS_ALL_WRITES           0x0A
+UMASK_TRANSACTIONS_ATOMIC               0x10
+UMASK_TRANSACTIONS_OTHER                0x20
+UMASK_TRANSACTIONS_ORDERINGQ            0x40
+
+EVENT_RXR_AK_INSERTS                    0x0A IBOX
+UMASK_RXR_AK_INSERTS                    0x00
+
+EVENT_RXR_BL_DRS_CYCLES_FULL            0x04 IBOX
+UMASK_RXR_BL_DRS_CYCLES_FULL            0x00
+
+EVENT_RXR_BL_DRS_INSERTS                0x01 IBOX
+UMASK_RXR_BL_DRS_INSERTS                0x00
+
+EVENT_RXR_BL_DRS_OCCUPANCY              0x07 IBOX
+UMASK_RXR_BL_DRS_OCCUPANCY              0x00
+
+EVENT_RXR_BL_NCB_CYCLES_FULL            0x05 IBOX
+UMASK_RXR_BL_NCB_CYCLES_FULL            0x00
+
+EVENT_RXR_BL_NCB_INSERTS                0x02 IBOX
+UMASK_RXR_BL_NCB_INSERTS                0x00
+
+EVENT_RXR_BL_NCB_OCCUPANCY              0x08 IBOX
+UMASK_RXR_BL_NCB_OCCUPANCY              0x00
+
+EVENT_RXR_BL_NCS_CYCLES_FULL            0x06 IBOX
+UMASK_RXR_BL_NCS_CYCLES_FULL            0x00
+
+EVENT_RXR_BL_NCS_INSERTS                0x03 IBOX
+UMASK_RXR_BL_NCS_INSERTS                0x00
+
+EVENT_RXR_BL_NCS_OCCUPANCY              0x09 IBOX
+UMASK_RXR_BL_NCS_OCCUPANCY              0x00
+
+EVENT_TXR_AD_STALL_CREDIT_CYCLES        0x18 IBOX
+UMASK_TXR_AD_STALL_CREDIT_CYCLES        0x00
+
+EVENT_TXR_BL_STALL_CREDIT_CYCLES        0x19 IBOX
+UMASK_TXR_BL_STALL_CREDIT_CYCLES        0x00
+
+EVENT_TXR_DATA_INSERTS_NCB              0x0E IBOX
+UMASK_TXR_DATA_INSERTS_NCB              0x00
+
+EVENT_TXR_DATA_INSERTS_NCS              0x0F IBOX
+UMASK_TXR_DATA_INSERTS_NCS              0x00
+
+EVENT_TXR_REQUEST_OCCUPANCY             0x0D IBOX
+UMASK_TXR_REQUEST_OCCUPANCY             0x00
+
+EVENT_RBOX_CLOCKTICK                    0x01 RBOX
+UMASK_RBOX_CLOCKTICK                    0x00
+
+EVENT_C_HI_AD_CREDITS_EMPTY             0x1F RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO8        0x01
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO9        0x02
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO10       0x04
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO11       0x08
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO12       0x10
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO13       0x20
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO14_16    0x40
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO15_17    0x80
+
+EVENT_C_LO_AD_CREDITS_EMPTY             0x22 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO0        0x01
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO1        0x02
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO2        0x04
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO3        0x08
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO4        0x10
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO5        0x20
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO6        0x40
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO7        0x80
+
+EVENT_HA_R2_BL_CREDITS_EMPTY            0x2D RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_HA_R2_BL_CREDITS_EMPTY_HA0        0x01
+UMASK_HA_R2_BL_CREDITS_EMPTY_HA1        0x02
+UMASK_HA_R2_BL_CREDITS_EMPTY_R2_NCB     0x04
+UMASK_HA_R2_BL_CREDITS_EMPTY_R2_NCS     0x08
+
+EVENT_QPI0_AD_CREDITS_EMPTY             0x20 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI0_AD_CREDITS_EMPTY_VNA         0x01
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_HOM     0x02
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_SNP     0x04
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_NDR     0x08
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_HOM     0x10
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_SNP     0x20
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_NDR     0x40
+
+EVENT_QPI1_AD_CREDITS_EMPTY             0x2E RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI1_AD_CREDITS_EMPTY_VNA         0x01
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_HOM     0x10
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_SNP     0x20
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_NDR     0x40
+
+EVENT_QPI0_BL_CREDITS_EMPTY             0x21 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI0_BL_CREDITS_EMPTY_VNA         0x01
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_HOM     0x10
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_SNP     0x20
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_NDR     0x40
+
+EVENT_QPI1_BL_CREDITS_EMPTY             0x2F RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI1_BL_CREDITS_EMPTY_VNA         0x01
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_HOM     0x02
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_SNP     0x04
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_NDR     0x08
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_HOM     0x10
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_SNP     0x20
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_NDR     0x40
+
+EVENT_RING_AD_USED                      0x07 RBOX
+UMASK_RING_AD_USED_CW_EVEN              0x01
+UMASK_RING_AD_USED_CW_ODD               0x02
+UMASK_RING_AD_USED_CW                   0x03
+UMASK_RING_AD_USED_CCW_EVEN             0x04
+UMASK_RING_AD_USED_CCW_ODD              0x08
+UMASK_RING_AD_USED_CCW                  0x0C
+
+EVENT_RING_AK_USED                      0x08 RBOX
+UMASK_RING_AK_USED_CW_EVEN              0x01
+UMASK_RING_AK_USED_CW_ODD               0x02
+UMASK_RING_AK_USED_CW                   0x03
+UMASK_RING_AK_USED_CCW_EVEN             0x04
+UMASK_RING_AK_USED_CCW_ODD              0x08
+UMASK_RING_AK_USED_CCW                  0x0C
+
+EVENT_RING_BL_USED                      0x09 RBOX
+UMASK_RING_BL_USED_CW_EVEN              0x01
+UMASK_RING_BL_USED_CW_ODD               0x02
+UMASK_RING_BL_USED_CW                   0x03
+UMASK_RING_BL_USED_CCW_EVEN             0x04
+UMASK_RING_BL_USED_CCW_ODD              0x08
+UMASK_RING_BL_USED_CCW                  0x0C
+
+EVENT_RING_IV_USED                      0x0A RBOX
+UMASK_RING_IV_USED_CW                   0x03
+UMASK_RING_IV_USED_CCW                  0x0C
+UMASK_RING_IV_USED_ANY                  0x0F
+
+EVENT_RING_SINK_STARVED                 0x0E RBOX
+UMASK_RING_SINK_STARVED_AK              0x02
+
+EVENT_RXR_CYCLES_NE                     0x10 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_CYCLES_NE_HOM                 0x01
+UMASK_RXR_CYCLES_NE_SNP                 0x02
+UMASK_RXR_CYCLES_NE_NDR                 0x04
+
+EVENT_RXR_CYCLES_NE_VN1                 0x14 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_CYCLES_NE_VN1_HOM             0x01
+UMASK_RXR_CYCLES_NE_VN1_SNP             0x02
+UMASK_RXR_CYCLES_NE_VN1_NDR             0x04
+UMASK_RXR_CYCLES_NE_VN1_DRS             0x08
+UMASK_RXR_CYCLES_NE_VN1_NCB             0x10
+UMASK_RXR_CYCLES_NE_VN1_NCS             0x20
+
+EVENT_RXR_INSERTS                       0x11 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_INSERTS_HOM                   0x01
+UMASK_RXR_INSERTS_SNP                   0x02
+UMASK_RXR_INSERTS_NDR                   0x04
+UMASK_RXR_INSERTS_DRS                   0x08
+UMASK_RXR_INSERTS_NCB                   0x10
+UMASK_RXR_INSERTS_NCS                   0x20
+
+EVENT_RXR_INSERTS_VN1                       0x15 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_INSERTS_VN1_HOM                   0x01
+UMASK_RXR_INSERTS_VN1_SNP                   0x02
+UMASK_RXR_INSERTS_VN1_NDR                   0x04
+UMASK_RXR_INSERTS_VN1_DRS                   0x08
+UMASK_RXR_INSERTS_VN1_NCB                   0x10
+UMASK_RXR_INSERTS_VN1_NCS                   0x20
+
+EVENT_RXR_OCCUPANCY_VN1                       0x13 RBOX0C0|RBOX1C0
+UMASK_RXR_OCCUPANCY_VN1_HOM                   0x01
+UMASK_RXR_OCCUPANCY_VN1_SNP                   0x02
+UMASK_RXR_OCCUPANCY_VN1_NDR                   0x04
+UMASK_RXR_OCCUPANCY_VN1_DRS                   0x08
+UMASK_RXR_OCCUPANCY_VN1_NCB                   0x10
+UMASK_RXR_OCCUPANCY_VN1_NCS                   0x20
+
+EVENT_TXR_CYCLES_FULL                       0x25 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_CYCLES_FULL                       0x00
+
+EVENT_TXR_CYCLES_NE                       0x23 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_CYCLES_NE                       0x00
+
+EVENT_TXR_NACK                              0x26 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_NACK_DN_AD                        0x01
+UMASK_TXR_NACK_DN_BL                        0x02
+UMASK_TXR_NACK_DN_AK                        0x04
+UMASK_TXR_NACK_UP_AD                        0x08
+UMASK_TXR_NACK_UP_BL                        0x10
+UMASK_TXR_NACK_UP_AK                        0x20
+
+EVENT_SBO0_CREDITS_ACQUIRED                 0x28 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_SBO0_CREDITS_ACQUIRED_AD              0x01
+UMASK_SBO0_CREDITS_ACQUIRED_BL              0x02
+
+EVENT_SBO1_CREDITS_ACQUIRED                 0x29 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_SBO1_CREDITS_ACQUIRED_AD              0x01
+UMASK_SBO1_CREDITS_ACQUIRED_BL              0x02
+
+EVENT_STALL_NO_SBO_CREDIT                   0x2C RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_STALL_NO_SBO_CREDIT_SBO0_AD           0x01
+UMASK_STALL_NO_SBO_CREDIT_SBO1_AD           0x02
+UMASK_STALL_NO_SBO_CREDIT_SBO0_BL           0x04
+UMASK_STALL_NO_SBO_CREDIT_SBO1_BL           0x08
+
+EVENT_VN0_CREDITS_REJECT                    0x37 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_REJECT_HOM                0x01
+UMASK_VN0_CREDITS_REJECT_SNP                0x02
+UMASK_VN0_CREDITS_REJECT_NDR                0x04
+UMASK_VN0_CREDITS_REJECT_DRS                0x08
+UMASK_VN0_CREDITS_REJECT_NCB                0x10
+UMASK_VN0_CREDITS_REJECT_NCS                0x20
+
+EVENT_VN1_CREDITS_REJECT                    0x39 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN1_CREDITS_REJECT_HOM                0x01
+UMASK_VN1_CREDITS_REJECT_SNP                0x02
+UMASK_VN1_CREDITS_REJECT_NDR                0x04
+UMASK_VN1_CREDITS_REJECT_DRS                0x08
+UMASK_VN1_CREDITS_REJECT_NCB                0x10
+UMASK_VN1_CREDITS_REJECT_NCS                0x20
+
+EVENT_VNA_CREDITS_REJECT                    0x34 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_REJECT_HOM                0x01
+UMASK_VNA_CREDITS_REJECT_SNP                0x02
+UMASK_VNA_CREDITS_REJECT_NDR                0x04
+UMASK_VNA_CREDITS_REJECT_DRS                0x08
+UMASK_VNA_CREDITS_REJECT_NCB                0x10
+UMASK_VNA_CREDITS_REJECT_NCS                0x20
+
+EVENT_VN0_CREDITS_USED                    0x36 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_USED_HOM                0x01
+UMASK_VN0_CREDITS_USED_SNP                0x02
+UMASK_VN0_CREDITS_USED_NDR                0x04
+UMASK_VN0_CREDITS_USED_DRS                0x08
+UMASK_VN0_CREDITS_USED_NCB                0x10
+UMASK_VN0_CREDITS_USED_NCS                0x20
+
+EVENT_VN1_CREDITS_USED                    0x38 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN1_CREDITS_USED_HOM                0x01
+UMASK_VN1_CREDITS_USED_SNP                0x02
+UMASK_VN1_CREDITS_USED_NDR                0x04
+UMASK_VN1_CREDITS_USED_DRS                0x08
+UMASK_VN1_CREDITS_USED_NCB                0x10
+UMASK_VN1_CREDITS_USED_NCS                0x20
+
+EVENT_VNA_CREDITS_ACQUIRED              0x33 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_ACQUIRED_AD           0x01
+UMASK_VNA_CREDITS_ACQUIRED_BL           0x04
+
+EVENT_BOUNCE_CONTROL                0x0A SBOX
+UMASK_BOUNCE_CONTROL                0x00
+
+EVENT_SBOX_CLOCKTICKS               0x00 SBOX
+UMASK_SBOX_CLOCKTICKS               0x00
+
+EVENT_FAST_ASSERTED                 0x09 SBOX
+UMASK_FAST_ASSERTED                 0x00
+
+EVENT_RING_AD_USED                  0x1B SBOX
+UMASK_RING_AD_USED_ANY              0x0F
+UMASK_RING_AD_USED_UP_EVEN          0x01
+UMASK_RING_AD_USED_UP_ODD           0x02
+UMASK_RING_AD_USED_UP               0x03
+UMASK_RING_AD_USED_DOWN_EVEN        0x04
+UMASK_RING_AD_USED_DOWN_ODD         0x08
+UMASK_RING_AD_USED_DOWN             0x0C
+
+EVENT_RING_AK_USED                  0x1C SBOX
+UMASK_RING_AK_USED_ANY              0x0F
+UMASK_RING_AK_USED_UP_EVEN          0x01
+UMASK_RING_AK_USED_UP_ODD           0x02
+UMASK_RING_AK_USED_UP               0x03
+UMASK_RING_AK_USED_DOWN_EVEN        0x04
+UMASK_RING_AK_USED_DOWN_ODD         0x08
+UMASK_RING_AK_USED_DOWN             0x0C
+
+EVENT_RING_BL_USED                  0x1D SBOX
+UMASK_RING_BL_USED_ANY              0x0F
+UMASK_RING_BL_USED_UP_EVEN          0x01
+UMASK_RING_BL_USED_UP_ODD           0x02
+UMASK_RING_BL_USED_UP               0x03
+UMASK_RING_BL_USED_DOWN_EVEN        0x04
+UMASK_RING_BL_USED_DOWN_ODD         0x08
+UMASK_RING_BL_USED_DOWN             0x0C
+
+EVENT_RING_BOUNCES                  0x05 SBOX
+UMASK_RING_BOUNCES_AD_CACHE         0x01
+UMASK_RING_BOUNCES_AK_CORE          0x02
+UMASK_RING_BOUNCES_BL_CORE          0x04
+UMASK_RING_BOUNCES_IV_CORE          0x08
+
+EVENT_RING_IV_USED                  0x1E SBOX
+UMASK_RING_IV_USED_ANY              0x0F
+UMASK_RING_IV_USED_UP               0x03
+UMASK_RING_IV_USED_DOWN             0x0C
+
+EVENT_RXR_BYPASS                    0x12 SBOX
+UMASK_RXR_BYPASS_AD_CRD             0x01
+UMASK_RXR_BYPASS_AD_BNC             0x02
+UMASK_RXR_BYPASS_BL_CRD             0x04
+UMASK_RXR_BYPASS_BL_BNC             0x08
+UMASK_RXR_BYPASS_AK                 0x10
+UMASK_RXR_BYPASS_IV                 0x20
+
+EVENT_RXR_INSERTS                   0x13 SBOX
+UMASK_RXR_INSERTS_AD_CRD            0x01
+UMASK_RXR_INSERTS_AD_BNC            0x02
+UMASK_RXR_INSERTS_BL_CRD            0x04
+UMASK_RXR_INSERTS_BL_BNC            0x08
+UMASK_RXR_INSERTS_AK                0x10
+UMASK_RXR_INSERTS_IV                0x20
+
+EVENT_RXR_OCCUPANCY                 0x11 SBOX
+UMASK_RXR_OCCUPANCY_AD_CRD          0x01
+UMASK_RXR_OCCUPANCY_AD_BNC          0x02
+UMASK_RXR_OCCUPANCY_BL_CRD          0x04
+UMASK_RXR_OCCUPANCY_BL_BNC          0x08
+UMASK_RXR_OCCUPANCY_AK              0x10
+UMASK_RXR_OCCUPANCY_IV              0x20
+
+EVENT_TXR_ADS_USED                  0x04 SBOX
+UMASK_TXR_ADS_USED_AD               0x01
+UMASK_TXR_ADS_USED_AK               0x02
+UMASK_TXR_ADS_USED_BL               0x04
+
+EVENT_TXR_INSERTS                   0x02 SBOX
+UMASK_TXR_INSERTS_AD_CRD            0x01
+UMASK_TXR_INSERTS_AD_BNC            0x02
+UMASK_TXR_INSERTS_BL_CRD            0x04
+UMASK_TXR_INSERTS_BL_BNC            0x08
+UMASK_TXR_INSERTS_AK                0x10
+UMASK_TXR_INSERTS_IV                0x20
+
+EVENT_TXR_OCCUPANCY                 0x01 SBOX
+UMASK_TXR_OCCUPANCY_AD_CRD          0x01
+UMASK_TXR_OCCUPANCY_AD_BNC          0x02
+UMASK_TXR_OCCUPANCY_BL_CRD          0x04
+UMASK_TXR_OCCUPANCY_BL_BNC          0x08
+UMASK_TXR_OCCUPANCY_AK              0x10
+UMASK_TXR_OCCUPANCY_IV              0x20
+
+EVENT_TXR_ORDERING                  0x07 SBOX
+UMASK_TXR_ORDERING_IV_SNOOPGO_UP    0x01
+UMASK_TXR_ORDERING_IV_SNOOPGO_DN    0x02
+UMASK_TXR_ORDERING_AK_U2C_UP_EVEN   0x04
+UMASK_TXR_ORDERING_AK_U2C_UP_ODD    0x08
+UMASK_TXR_ORDERING_AK_U2C_DN_EVEN   0x10
+UMASK_TXR_ORDERING_AK_U2C_DN_ODD    0x20
+
+EVENT_QBOX_CLOCKTICKS                   0x14 QBOX
+UMASK_QBOX_CLOCKTICKS                   0x00
+
+EVENT_CTO_COUNT                         0x38 QBOX
+OPTIONS_CTO_COUNT                       EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MASK2_MASK|EVENT_OPTION_MASK3_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK|EVENT_OPTION_MATCH2_MASK|EVENT_OPTION_MATCH3_MASK
+UMASK_CTO_COUNT                         0x00 0x01 0x00
+
+EVENT_DIRECT2CORE                       0x13 QBOX
+UMASK_DIRECT2CORE_SUCCESS_RBT_HIT       0x01
+UMASK_DIRECT2CORE_FAILURE_CREDITS       0x02
+UMASK_DIRECT2CORE_FAILURE_RBT_HIT       0x04
+UMASK_DIRECT2CORE_FAILURE_CREDITS_RBT   0x08
+UMASK_DIRECT2CORE_FAILURE_MISS          0x10
+UMASK_DIRECT2CORE_FAILURE_CREDITS_MISS  0x20
+UMASK_DIRECT2CORE_FAILURE_RBT_MISS      0x40
+UMASK_DIRECT2CORE_FAILURE_CREDITS_RBT_MISS 0x80
+
+EVENT_L1_POWER_CYCLES                   0x12 QBOX
+UMASK_L1_POWER_CYCLES                   0x00
+
+EVENT_RXL0P_POWER_CYCLES                0x10 QBOX
+UMASK_RXL0P_POWER_CYCLES                0x00
+
+EVENT_RXL0_POWER_CYCLES                 0x0F QBOX
+UMASK_RXL0_POWER_CYCLES                 0x00
+
+EVENT_RXL_BYPASSED                      0x09 QBOX
+UMASK_RXL_BYPASSED                      0x00
+
+EVENT_RXL_CREDITS_CONSUMED_VN0          0x1E QBOX
+UMASK_RXL_CREDITS_CONSUMED_VN0_DRS      0x01 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCB      0x02 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCS      0x04 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN0_HOM      0x08 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN0_SNP      0x10 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN0_NDR      0x20 0x01 0x00
+
+EVENT_RXL_CREDITS_CONSUMED_VN1          0x39 QBOX
+UMASK_RXL_CREDITS_CONSUMED_VN1_DRS      0x01 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN1_NCB      0x02 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN1_NCS      0x04 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN1_HOM      0x08 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN1_SNP      0x10 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN1_NDR      0x20 0x01 0x00
+
+EVENT_RXL_CREDITS_CONSUMED_VNA          0x1D QBOX
+UMASK_RXL_CREDITS_CONSUMED_VNA          0x00 0x01 0x00
+
+EVENT_RXL_CYCLES_NE                     0x0A QBOX
+UMASK_RXL_CYCLES_NE                     0x00
+
+EVENT_RXL_FLITS_G0                      0x01 QBOX
+UMASK_RXL_FLITS_G0_IDLE                 0x01
+UMASK_RXL_FLITS_G0_DATA                 0x02
+UMASK_RXL_FLITS_G0_NON_DATA             0x04
+
+EVENT_RXL_FLITS_G1                      0x02 QBOX
+UMASK_RXL_FLITS_G1_SNP                  0x01 0x01 0x00
+UMASK_RXL_FLITS_G1_HOM_REQ              0x02 0x01 0x00
+UMASK_RXL_FLITS_G1_HOM_NONREQ           0x04 0x01 0x00
+UMASK_RXL_FLITS_G1_HOM                  0x06 0x01 0x00
+UMASK_RXL_FLITS_G1_DRS_DATA             0x08 0x01 0x00
+UMASK_RXL_FLITS_G1_DRS_NONDATA          0x10 0x01 0x00
+UMASK_RXL_FLITS_G1_DRS                  0x18 0x01 0x00
+
+EVENT_RXL_FLITS_G2                      0x03 QBOX
+UMASK_RXL_FLITS_G2_NDR_AD               0x01 0x01 0x00
+UMASK_RXL_FLITS_G2_NDR_AK               0x02 0x01 0x00
+UMASK_RXL_FLITS_G2_NCB_DATA             0x04 0x01 0x00
+UMASK_RXL_FLITS_G2_NCB_NONDATA          0x08 0x01 0x00
+UMASK_RXL_FLITS_G2_NCB                  0x0C 0x01 0x00
+UMASK_RXL_FLITS_G2_NCS                  0x10 0x01 0x00
+
+EVENT_RXL_INSERTS                       0x08 QBOX
+UMASK_RXL_INSERTS                       0x00
+
+EVENT_RXL_INSERTS_DRS                   0x09 QBOX
+UMASK_RXL_INSERTS_DRS_VN0               0x01 0x01 0x00
+UMASK_RXL_INSERTS_DRS_VN1               0x02 0x01 0x00
+
+EVENT_RXL_INSERTS_HOM                   0x0C QBOX
+UMASK_RXL_INSERTS_HOM_VN0               0x01 0x01 0x00
+UMASK_RXL_INSERTS_HOM_VN1               0x02 0x01 0x00
+
+EVENT_RXL_INSERTS_NCB                   0x0A QBOX
+UMASK_RXL_INSERTS_NCB_VN0               0x01 0x01 0x00
+UMASK_RXL_INSERTS_NCB_VN1               0x02 0x01 0x00
+
+EVENT_RXL_INSERTS_NCS                   0x0B QBOX
+UMASK_RXL_INSERTS_NCS_VN0               0x01 0x01 0x00
+UMASK_RXL_INSERTS_NCS_VN1               0x02 0x01 0x00
+
+EVENT_RXL_INSERTS_NDR                   0x0E QBOX
+UMASK_RXL_INSERTS_NDR_VN0               0x01 0x01 0x00
+UMASK_RXL_INSERTS_NDR_VN1               0x02 0x01 0x00
+
+EVENT_RXL_INSERTS_SNP                   0x0D QBOX
+UMASK_RXL_INSERTS_SNP_VN0               0x01 0x01 0x00
+UMASK_RXL_INSERTS_SNP_VN1               0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY                     0x0B QBOX
+UMASK_RXL_OCCUPANCY                     0x00
+
+EVENT_RXL_OCCUPANCY_DRS                 0x15 QBOX
+UMASK_RXL_OCCUPANCY_DRS_VN0             0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_DRS_VN1             0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY_HOM                 0x18 QBOX
+UMASK_RXL_OCCUPANCY_HOM_VN0             0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_HOM_VN1             0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY_NCB                 0x16 QBOX
+UMASK_RXL_OCCUPANCY_NCB_VN0             0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_NCB_VN1             0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY_NCS                 0x17 QBOX
+UMASK_RXL_OCCUPANCY_NCS_VN0             0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_NCS_VN1             0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY_NDR                 0x1A QBOX
+UMASK_RXL_OCCUPANCY_NDR_VN0             0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_NDR_VN1             0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY_SNP                 0x19 QBOX
+UMASK_RXL_OCCUPANCY_SNP_VN0             0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_SNP_VN1             0x02 0x01 0x00
+
+EVENT_TXL0P_POWER_CYCLES                0x0D QBOX
+UMASK_TXL0P_POWER_CYCLES                0x00
+
+EVENT_TXL0_POWER_CYCLES                 0x0C QBOX
+UMASK_TXL0_POWER_CYCLES                 0x00
+
+EVENT_TXL_BYPASSED                      0x05 QBOX
+UMASK_TXL_BYPASSED                      0x00
+
+EVENT_TXL_CYCLES_NE                     0x06 QBOX
+UMASK_TXL_CYCLES_NE                     0x00
+
+EVENT_TXL_FLITS_G0                      0x00 QBOX
+UMASK_TXL_FLITS_G0_IDLE                 0x01
+UMASK_TXL_FLITS_G0_DATA                 0x02
+UMASK_TXL_FLITS_G0_NON_DATA             0x04
+
+EVENT_TXL_FLITS_G1                      0x00 QBOX
+UMASK_TXL_FLITS_G1_SNP                  0x01 0x01 0x00
+UMASK_TXL_FLITS_G1_HOM_REQ              0x02 0x01 0x00
+UMASK_TXL_FLITS_G1_HOM_NONREQ           0x04 0x01 0x00
+UMASK_TXL_FLITS_G1_HOM                  0x06 0x01 0x00
+UMASK_TXL_FLITS_G1_DRS_DATA             0x08 0x01 0x00
+UMASK_TXL_FLITS_G1_DRS_NONDATA          0x10 0x01 0x00
+UMASK_TXL_FLITS_G1_DRS                  0x18 0x01 0x00
+
+EVENT_TXL_FLITS_G2                      0x01 QBOX
+UMASK_TXL_FLITS_G2_NDR_AD               0x01 0x01 0x00
+UMASK_TXL_FLITS_G2_NDR_AK               0x02 0x01 0x00
+UMASK_TXL_FLITS_G2_NCB_DATA             0x04 0x01 0x00
+UMASK_TXL_FLITS_G2_NCB_NONDATA          0x08 0x01 0x00
+UMASK_TXL_FLITS_G2_NCB                  0x0C 0x01 0x00
+UMASK_TXL_FLITS_G2_NCS                  0x10 0x01 0x00
+
+EVENT_TXL_INSERTS                       0x04 QBOX
+UMASK_TXL_INSERTS                       0x00
+
+EVENT_TXL_OCCUPANCY                     0x07 QBOX
+UMASK_TXL_OCCUPANCY                     0x00
+
+EVENT_TXR_AD_HOM_CREDIT_ACQUIRED        0x26 QBOX
+UMASK_TXR_AD_HOM_CREDIT_ACQUIRED_VN0    0x01 0x01 0x00
+UMASK_TXR_AD_HOM_CREDIT_ACQUIRED_VN1    0x02 0x01 0x00
+
+EVENT_TXR_AD_HOM_CREDIT_OCCUPANCY       0x22 QBOX
+UMASK_TXR_AD_HOM_CREDIT_OCCUPANCY_VN0   0x01 0x01 0x00
+UMASK_TXR_AD_HOM_CREDIT_OCCUPANCY_VN1   0x02 0x01 0x00
+
+EVENT_TXR_AD_NDR_CREDIT_ACQUIRED        0x28 QBOX
+UMASK_TXR_AD_NDR_CREDIT_ACQUIRED_VN0    0x01 0x01 0x00
+UMASK_TXR_AD_NDR_CREDIT_ACQUIRED_VN1    0x02 0x01 0x00
+
+EVENT_TXR_AD_NDR_CREDIT_OCCUPANCY       0x24 QBOX
+UMASK_TXR_AD_NDR_CREDIT_OCCUPANCY_VN0   0x01 0x01 0x00
+UMASK_TXR_AD_NDR_CREDIT_OCCUPANCY_VN1   0x02 0x01 0x00
+
+EVENT_TXR_AD_SNP_CREDIT_ACQUIRED        0x27 QBOX
+UMASK_TXR_AD_SNP_CREDIT_ACQUIRED_VN0    0x01 0x01 0x00
+UMASK_TXR_AD_SNP_CREDIT_ACQUIRED_VN1    0x02 0x01 0x00
+
+EVENT_TXR_AD_SNP_CREDIT_OCCUPANCY       0x23 QBOX
+UMASK_TXR_AD_SNP_CREDIT_OCCUPANCY_VN0   0x01 0x01 0x00
+UMASK_TXR_AD_SNP_CREDIT_OCCUPANCY_VN1   0x02 0x01 0x00
+
+EVENT_TXR_AK_NDR_CREDIT_ACQUIRED        0x29 QBOX
+UMASK_TXR_AK_NDR_CREDIT_ACQUIRED        0x00 0x01 0x00
+
+EVENT_TXR_AK_NDR_CREDIT_OCCUPANCY       0x25 QBOX
+UMASK_TXR_AK_NDR_CREDIT_OCCUPANCY       0x00 0x01 0x00
+
+EVENT_TXR_BL_DRS_CREDIT_ACQUIRED        0x2A QBOX
+UMASK_TXR_BL_DRS_CREDIT_ACQUIRED_VN0    0x01 0x01 0x00
+UMASK_TXR_BL_DRS_CREDIT_ACQUIRED_VN1    0x02 0x01 0x00
+UMASK_TXR_BL_DRS_CREDIT_ACQUIRED_VN_SHR 0x04 0x01 0x00
+
+EVENT_TXR_BL_DRS_CREDIT_OCCUPANCY       0x1F QBOX
+UMASK_TXR_BL_DRS_CREDIT_OCCUPANCY_VN0   0x01 0x01 0x00
+UMASK_TXR_BL_DRS_CREDIT_OCCUPANCY_VN1   0x02 0x01 0x00
+UMASK_TXR_BL_DRS_CREDIT_OCCUPANCY_VN_SHR 0x04 0x01 0x00
+
+EVENT_TXR_BL_NCB_CREDIT_ACQUIRED        0x2B QBOX
+UMASK_TXR_BL_NCB_CREDIT_ACQUIRED_VN0    0x01 0x01 0x00
+UMASK_TXR_BL_NCB_CREDIT_ACQUIRED_VN1    0x02 0x01 0x00
+
+EVENT_TXR_BL_NCB_CREDIT_OCCUPANCY       0x20 QBOX
+UMASK_TXR_BL_NCB_CREDIT_OCCUPANCY_VN0   0x01 0x01 0x00
+UMASK_TXR_BL_NCB_CREDIT_OCCUPANCY_VN1   0x02 0x01 0x00
+
+EVENT_TXR_BL_NCS_CREDIT_ACQUIRED        0x2C QBOX
+UMASK_TXR_BL_NCS_CREDIT_ACQUIRED_VN0    0x01 0x01 0x00
+UMASK_TXR_BL_NCS_CREDIT_ACQUIRED_VN1    0x02 0x01 0x00
+
+EVENT_TXR_BL_NCS_CREDIT_OCCUPANCY       0x21 QBOX
+UMASK_TXR_BL_NCS_CREDIT_OCCUPANCY_VN0   0x01 0x01 0x00
+UMASK_TXR_BL_NCS_CREDIT_OCCUPANCY_VN1   0x02 0x01 0x00
+
+EVENT_VNA_CREDIT_RETURNS                0x1C QBOX
+UMASK_VNA_CREDIT_RETURNS                0x00 0x01 0x00
+
+EVENT_VNA_CREDIT_RETURN_OCCUPANCY       0x1B QBOX
+UMASK_VNA_CREDIT_RETURN_OCCUPANCY       0x00 0x01 0x00
+
+EVENT_QPI_RATE                          0x00 QBOX0FIX0|QBOX1FIX0
+UMASK_QPI_RATE                          0x00
+
+EVENT_QPI_RX_IDLE                       0x01 QBOX0FIX1|QBOX1FIX1
+UMASK_QPI_RX_IDLE                       0x00
+
+EVENT_QPI_RX_LLR                        0x02 QBOX0FIX2|QBOX1FIX2
+UMASK_QPI_RX_LLR                        0x00
diff --git a/src/includes/perfmon_broadwell_counters.h b/src/includes/perfmon_broadwell_counters.h
new file mode 100644
index 0000000..d5608ba
--- /dev/null
+++ b/src/includes/perfmon_broadwell_counters.h
@@ -0,0 +1,83 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfmon_broadwell_counters.h
+ *
+ *      Description:  Counter Header File of perfmon module for Broadwell.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#define NUM_COUNTERS_BROADWELL 23 /* number of entries in broadwell_counter_map (8 + 15) */
+#define NUM_COUNTERS_CORE_BROADWELL 8 /* equals the count of the FIXC0-2, PMC0-3 and TMP0 entries */
+#define NUM_COUNTERS_UNCORE_BROADWELL 15 /* equals the count of the PWR0-3, CBOXnCm, UBOX0-1 and UBOXFIX entries */
+
+#define BDW_VALID_OPTIONS_FIXED EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK /* option mask given to the FIXCn entries; NOTE: expansion is not parenthesized, so combine it only with '|' */
+#define BDW_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK| \
+            EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_IN_TRANS_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define BDW_VALID_OPTIONS_CBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK /* option mask given to the CBOXnCm entries */
+#define BDW_VALID_OPTIONS_UBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK /* option mask given to UBOX0/UBOX1; same set as BDW_VALID_OPTIONS_CBOX */
+
+static RegisterMap broadwell_counter_map[NUM_COUNTERS_BROADWELL] = { /* one entry per counter; the first field is the counter name referenced by the event files (e.g. FIXC0, TMP0, PWR0) */
+    /* Fixed Counters: instructions retired, cycles unhalted core, cycles unhalted reference */
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, BDW_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, BDW_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, BDW_VALID_OPTIONS_FIXED},
+    /* PMC Counters: 4 general-purpose counters, each 48 bit wide */
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, BDW_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, BDW_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, BDW_VALID_OPTIONS_PMC|EVENT_OPTION_IN_TRANS_ABORT_MASK}, /* only PMC2 additionally accepts the IN_TRANS_ABORT option */
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, BDW_VALID_OPTIONS_PMC},
+    /* Temperature Sensor (read from IA32_THERM_STATUS, no config register, no options) */
+    {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* RAPL energy counters: package, PP0, PP1 and DRAM (no config register, no options) */
+    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX0C0", PMC12, CBOX0, MSR_UNC_CBO_0_PERFEVTSEL0, MSR_UNC_CBO_0_CTR0, 0, 0, BDW_VALID_OPTIONS_CBOX}, /* CBOX0-3: two counters (C0, C1) per box */
+    {"CBOX0C1", PMC13, CBOX0, MSR_UNC_CBO_0_PERFEVTSEL1, MSR_UNC_CBO_0_CTR1, 0, 0, BDW_VALID_OPTIONS_CBOX},
+    {"CBOX1C0", PMC14, CBOX1, MSR_UNC_CBO_1_PERFEVTSEL0, MSR_UNC_CBO_1_CTR0, 0, 0, BDW_VALID_OPTIONS_CBOX},
+    {"CBOX1C1", PMC15, CBOX1, MSR_UNC_CBO_1_PERFEVTSEL1, MSR_UNC_CBO_1_CTR1, 0, 0, BDW_VALID_OPTIONS_CBOX},
+    {"CBOX2C0", PMC16, CBOX2, MSR_UNC_CBO_2_PERFEVTSEL0, MSR_UNC_CBO_2_CTR0, 0, 0, BDW_VALID_OPTIONS_CBOX},
+    {"CBOX2C1", PMC17, CBOX2, MSR_UNC_CBO_2_PERFEVTSEL1, MSR_UNC_CBO_2_CTR1, 0, 0, BDW_VALID_OPTIONS_CBOX},
+    {"CBOX3C0", PMC18, CBOX3, MSR_UNC_CBO_3_PERFEVTSEL0, MSR_UNC_CBO_3_CTR0, 0, 0, BDW_VALID_OPTIONS_CBOX},
+    {"CBOX3C1", PMC19, CBOX3, MSR_UNC_CBO_3_PERFEVTSEL1, MSR_UNC_CBO_3_CTR1, 0, 0, BDW_VALID_OPTIONS_CBOX},
+    {"UBOX0", PMC20, UBOX, MSR_UNC_ARB_PERFEVTSEL0, MSR_UNC_ARB_CTR0, 0, 0, BDW_VALID_OPTIONS_UBOX}, /* UBOX: two counters driven by the MSR_UNC_ARB_* registers, plus UBOXFIX below */
+    {"UBOX1", PMC21, UBOX, MSR_UNC_ARB_PERFEVTSEL1, MSR_UNC_ARB_CTR1, 0, 0, BDW_VALID_OPTIONS_UBOX},
+    {"UBOXFIX", PMC22, UBOXFIX, MSR_UNC_PERF_FIXED_CTRL, MSR_UNC_PERF_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+};
+
+
+static BoxMap broadwell_box_map[NUM_UNITS] = { /* per-unit settings, indexed by unit type through designated initializers; units not listed stay zero-initialized */
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48}, /* first three fields: global control, status and overflow-control registers */
+    [THERMAL] = {0, 0, 0, 0, 0, 0, 8}, /* no global registers; NOTE(review): last field looks like the counter width in bits - confirm against the BoxMap definition */
+    [FIXED] =  {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48}, /* shares the core global registers with PMC */
+    [POWER] = {0, 0, 0, 0, 0, 0, 32}, /* no global registers */
+    [CBOX0] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44}, /* all uncore units use the MSR_UNC_PERF_GLOBAL_* registers and MSR_DEV */
+    [CBOX1] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+    [CBOX2] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+    [CBOX3] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+    [UBOX] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 1, 0, MSR_DEV, 44}, /* fourth field differs per unit (3 for CBOXn, 1 for UBOX, 0 for UBOXFIX); presumably a bit position in the global status/overflow register - TODO confirm */
+    [UBOXFIX] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 0, 0, MSR_DEV, 44},
+};
diff --git a/src/includes/perfmon_broadwell_events.txt b/src/includes/perfmon_broadwell_events.txt
new file mode 100644
index 0000000..023bc01
--- /dev/null
+++ b/src/includes/perfmon_broadwell_events.txt
@@ -0,0 +1,665 @@
+# =======================================================================================
+#
+#      Filename:  perfmon_broadwell_events.txt
+#
+#      Description:  Event list for Intel Broadwell
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+#      Project:  likwid
+#
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+#
+#      This program is free software: you can redistribute it and/or modify it under
+#      the terms of the GNU General Public License as published by the Free Software
+#      Foundation, either version 3 of the License, or (at your option) any later
+#      version.
+#
+#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+#      You should have received a copy of the GNU General Public License along with
+#      this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+EVENT_TEMP_CORE          0x00   TMP0
+UMASK_TEMP_CORE          0x00
+
+EVENT_PWR_PKG_ENERGY          0x00   PWR0
+UMASK_PWR_PKG_ENERGY          0x00
+
+EVENT_PWR_PP0_ENERGY          0x00   PWR1
+UMASK_PWR_PP0_ENERGY          0x00
+
+EVENT_PWR_PP1_ENERGY          0x00   PWR2
+UMASK_PWR_PP1_ENERGY          0x00
+
+EVENT_PWR_DRAM_ENERGY          0x00   PWR3
+UMASK_PWR_DRAM_ENERGY          0x00
+
+EVENT_INSTR_RETIRED              0x00   FIXC0
+UMASK_INSTR_RETIRED_ANY          0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE      0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC2
+UMASK_CPU_CLK_UNHALTED_REF       0x00
+
+EVENT_LD_BLOCKS                 0x03  PMC
+UMASK_LD_BLOCKS_STORE_FORWARD   0x02
+UMASK_LD_BLOCKS_NO_SR           0x08
+
+EVENT_MISALIGN_MEM_REF            0x05  PMC
+UMASK_MISALIGN_MEM_REF_LOADS      0x01
+UMASK_MISALIGN_MEM_REF_STORES     0x02
+UMASK_MISALIGN_MEM_REF_ANY        0x03
+
+EVENT_LD_BLOCKS_PARTIAL                 0x07  PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS   0x01
+
+EVENT_DTLB_LOAD_MISSES                       0x08  PMC
+UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK         0x01
+UMASK_DTLB_LOAD_MISSES_STLB_HIT              0x60
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED        0x0E
+UMASK_DTLB_LOAD_MISSES_STLB_HIT_4K           0x20
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED_4K     0x02
+UMASK_DTLB_LOAD_MISSES_WALK_DURATION         0x10
+
+EVENT_INT_MISC                      0x0D  PMC
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RECOVERY_CYCLES      0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT EVENT_OPTION_EDGE=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RECOVERY_COUNT       0x03
+DEFAULT_OPTIONS_INT_MISC_RAT_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RAT_STALL_CYCLES     0x08
+DEFAULT_OPTIONS_INT_MISC_RAT_STALL_COUNT EVENT_OPTION_EDGE=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RAT_STALL_COUNT      0x08
+
+EVENT_UOPS_ISSUED                     0x0E  PMC
+UMASK_UOPS_ISSUED_ANY                 0x01
+UMASK_UOPS_ISSUED_FLAGS_MERGE         0x10
+UMASK_UOPS_ISSUED_SLOW_LEA            0x20
+UMASK_UOPS_ISSUED_SINGLE_MUL          0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_USED_CYCLES         0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES        0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES        0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_ANY            0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_FLAGS_MERGE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_FLAGS_MERGE    0x10
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_SLOW_LEA EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_SLOW_LEA       0x20
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_SINGLE_MUL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_SINGLE_MUL     0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_USED_CYCLES    0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_STALL_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_TOTAL_CYCLES   0x01
+
+EVENT_ARITH_FPU_DIV_ACTIVE       0x14  PMC
+UMASK_ARITH_FPU_DIV_ACTIVE       0x01
+
+EVENT_L2_RQSTS                     0x24   PMC
+UMASK_L2_RQSTS_DEMAND_DATA_RD_MISS 0x21
+UMASK_L2_RQSTS_DEMAND_DATA_RD_HIT  0x41
+UMASK_L2_RQSTS_RFO_MISS            0x22
+UMASK_L2_RQSTS_RFO_HIT             0x42
+UMASK_L2_RQSTS_CODE_RD_MISS        0x24
+UMASK_L2_RQSTS_CODE_RD_HIT         0x44
+UMASK_L2_RQSTS_L2_PF_HIT           0x50
+UMASK_L2_RQSTS_L2_PF_MISS          0x30
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD  0xE1
+UMASK_L2_RQSTS_ALL_DEMAND_MISS     0x27
+UMASK_L2_RQSTS_ALL_RFO             0xE2
+UMASK_L2_RQSTS_ALL_CODE_RD         0xE4
+UMASK_L2_RQSTS_ALL_DEMAND_REFERENCES 0xE7
+UMASK_L2_RQSTS_ALL_PF              0xF8
+UMASK_L2_RQSTS_MISS                0x3F
+UMASK_L2_RQSTS_REFERENCES          0xFF
+
+EVENT_L2_DEMAND_RQST_WB_HIT        0x27   PMC
+UMASK_L2_DEMAND_RQST_WB_HIT        0x50
+
+EVENT_LONGEST_LAT_CACHE            0x2E   PMC
+UMASK_LONGEST_LAT_CACHE_REFERENCE  0x4F
+UMASK_LONGEST_LAT_CACHE_MISS       0x41
+
+EVENT_CPU_CLOCK_UNHALTED           0x3C   PMC
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P  0x00
+UMASK_CPU_CLOCK_UNHALTED_REF_XCLK  0x01
+UMASK_CPU_CLOCK_UNHALTED_ONE_THREAD_ACTIVE  0x02
+
+EVENT_L1D_PEND_MISS                  0x48   PMC2
+UMASK_L1D_PEND_MISS_PENDING          0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_PENDING_CYCLES   EVENT_OPTION_THRESHOLD=0x01
+UMASK_L1D_PEND_MISS_PENDING_CYCLES   0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_OCCURRENCES EVENT_OPTION_EDGE=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_L1D_PEND_MISS_OCCURRENCES      0x01
+
+EVENT_DTLB_STORE_MISSES                    0x49   PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK         0x01
+UMASK_DTLB_STORE_MISSES_STLB_HIT              0x60
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED        0x0E
+UMASK_DTLB_STORE_MISSES_STLB_HIT_4K           0x20
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED_4K     0x02
+UMASK_DTLB_STORE_MISSES_WALK_DURATION         0x10
+
+EVENT_LOAD_HIT_PRE               0x4C    PMC
+UMASK_LOAD_HIT_PRE_HW_PF         0x02
+
+EVENT_EPT_WALK_CYCLES            0x4F PMC
+UMASK_EPT_WALK_CYCLES            0x10
+
+EVENT_L1D                        0x51   PMC
+UMASK_L1D_REPLACEMENT            0x01
+
+EVENT_TX_MEM                                        0x54 PMC
+UMASK_TX_MEM_ABORT_CONFLICT                         0x01
+UMASK_TX_MEM_ABORT_CAPACITY_WRITE                   0x02
+UMASK_TX_MEM_ABORT_HLE_STORE_TO_ELIDED_LOCK         0x04
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_NOT_EMPTY     0x08
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_MISMATCH      0x10
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_UNSUPPORTED_ALIGNMENT 0x20
+UMASK_TX_MEM_HLE_ELISION_BUFFER_FULL                0x40
+
+EVENT_MOVE_ELIMINATION                        0x58   PMC
+UMASK_MOVE_ELIMINATION_INT_NOT_ELIMINATED     0x04
+UMASK_MOVE_ELIMINATION_SIMD_NOT_ELIMINATED    0x08
+UMASK_MOVE_ELIMINATION_INT_ELIMINATED         0x01
+UMASK_MOVE_ELIMINATION_SIMD_ELIMINATED        0x02
+
+EVENT_CPL_CYCLES                   0x5C    PMC
+UMASK_CPL_CYCLES_RING0             0x01
+UMASK_CPL_CYCLES_RING123           0x02
+DEFAULT_OPTIONS_CPL_CYCLES_RING0_TRANS   EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_CPL_CYCLES_RING0_TRANS       0x01
+
+EVENT_TX_EXEC                       0x5D PMC
+UMASK_TX_EXEC_MISC1                 0x01
+UMASK_TX_EXEC_MISC2                 0x02
+UMASK_TX_EXEC_MISC3                 0x04
+UMASK_TX_EXEC_MISC4                 0x08
+UMASK_TX_EXEC_MISC5                 0x10
+
+EVENT_RS_EVENTS                 0x5E    PMC
+UMASK_RS_EVENTS_EMPTY_CYCLES    0x01
+
+EVENT_OFFCORE_REQUESTS_OUTSTANDING                  0x60   PMC
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD EVENT_OPTION_THRESHOLD=0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO       0x04
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD EVENT_OPTION_THRESHOLD=0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD      0x08
+
+EVENT_LOCK_CYCLES                             0x63   PMC
+UMASK_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION 0x01
+UMASK_LOCK_CYCLES_CACHE_LOCK_DURATION         0x02
+
+EVENT_IDQ                              0x79   PMC
+UMASK_IDQ_EMPTY                        0x02
+UMASK_IDQ_MITE_UOPS                    0x04
+DEFAULT_OPTIONS_IDQ_MITE_CYCLES        EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MITE_CYCLES                  0x04
+UMASK_IDQ_DSB_UOPS                     0x08
+DEFAULT_OPTIONS_IDQ_DSB_CYCLES         EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_DSB_CYCLES                   0x08
+UMASK_IDQ_MS_DSB_UOPS                  0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_CYCLES      EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_DSB_CYCLES                0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_OCCUR       EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_IDQ_MS_DSB_OCCUR                 0x10
+UMASK_IDQ_MS_MITE_UOPS                 0x20
+DEFAULT_OPTIONS_IDQ_MS_MITE_CYCLES     EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_MITE_CYCLES               0x20
+UMASK_IDQ_MS_UOPS                      0x30
+DEFAULT_OPTIONS_IDQ_MS_CYCLES          EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_CYCLES                    0x30
+DEFAULT_OPTIONS_IDQ_MS_SWITCHES        EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_IDQ_MS_SWITCHES                  0x30
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS      0x18
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x04
+UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS        0x18
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS     0x24
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x04
+UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS       0x24
+UMASK_IDQ_MITE_ALL_UOPS       0x3C
+
+EVENT_ICACHE                  0x80   PMC
+UMASK_ICACHE_HIT              0x01
+UMASK_ICACHE_MISSES           0x02
+UMASK_ICACHE_ACCESSES         0x03
+
+EVENT_ITLB_MISSES                   0x85      PMC
+UMASK_ITLB_MISSES_CAUSES_A_WALK     0x01
+UMASK_ITLB_MISSES_STLB_HIT          0x60
+UMASK_ITLB_MISSES_WALK_COMPLETED    0x0E
+UMASK_ITLB_MISSES_STLB_HIT_4K       0x20
+UMASK_ITLB_MISSES_WALK_COMPLETED_4K 0x02
+UMASK_ITLB_MISSES_WALK_DURATION     0x10
+
+
+EVENT_ILD_STALL                 0x87      PMC
+UMASK_ILD_STALL_LCP             0x01
+
+EVENT_BR_INST_EXEC                                      0x88   PMC
+UMASK_BR_INST_EXEC_COND_TAKEN                           0x81
+UMASK_BR_INST_EXEC_COND_NON_TAKEN                       0x41
+UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN                     0x82
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
+UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN                    0x88
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
+UMASK_BR_INST_EXEC_ALL_CONDITIONAL                      0xC1
+UMASK_BR_INST_EXEC_ALL_DIRECT_JMP                       0xC2
+UMASK_BR_INST_EXEC_ALL_DIRECT_NEAR_CALL                 0xD0
+UMASK_BR_INST_EXEC_ALL_INDIRECT_JUMP_NON_CALL_RET       0xC4
+UMASK_BR_INST_EXEC_ALL_INDIRECT_NEAR_RETURN             0xC8
+UMASK_BR_INST_EXEC_ALL_BRANCHES                         0xFF
+
+EVENT_BR_MISP_EXEC                                      0x89   PMC
+UMASK_BR_MISP_EXEC_COND_TAKEN                           0x81
+UMASK_BR_MISP_EXEC_COND_NON_TAKEN                       0x41
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
+UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN                    0x88
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
+UMASK_BR_MISP_EXEC_ALL_CONDITIONAL                      0xC1
+UMASK_BR_MISP_EXEC_ALL_INDIRECT_JUMP_NON_CALL_RET       0xC4
+UMASK_BR_MISP_EXEC_ALL_BRANCHES                         0xFF
+
+EVENT_IDQ_UOPS_NOT_DELIVERED                    0x9C   PMC
+UMASK_IDQ_UOPS_NOT_DELIVERED_CORE               0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE EVENT_OPTION_THRESHOLD=0x04
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x03
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x02
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_INVERT=1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK 0x01
+
+EVENT_UOP_DISPATCHES_CANCELLED_SIMD_PRF   0xA0 PMC
+UMASK_UOP_DISPATCHES_CANCELLED_SIMD_PRF   0x03
+
+EVENT_UOPS_EXECUTED_PORT                  0xA1   PMC
+UMASK_UOPS_EXECUTED_PORT_PORT_0           0x01
+UMASK_UOPS_EXECUTED_PORT_PORT_1           0x02
+UMASK_UOPS_EXECUTED_PORT_PORT_2           0x04
+UMASK_UOPS_EXECUTED_PORT_PORT_3           0x08
+UMASK_UOPS_EXECUTED_PORT_PORT_4           0x10
+UMASK_UOPS_EXECUTED_PORT_PORT_5           0x20
+UMASK_UOPS_EXECUTED_PORT_PORT_6           0x40
+UMASK_UOPS_EXECUTED_PORT_PORT_7           0x80
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_0_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_0_CORE      0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_1_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_1_CORE      0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_2_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_2_CORE      0x04
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_3_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_3_CORE      0x08
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_4_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_4_CORE      0x10
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_5_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_5_CORE      0x20
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_6_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_6_CORE      0x40
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_7_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_7_CORE      0x80
+
+EVENT_RESOURCE_STALLS                 0xA2   PMC
+UMASK_RESOURCE_STALLS_ANY             0x01
+UMASK_RESOURCE_STALLS_RS              0x04
+UMASK_RESOURCE_STALLS_SB              0x08
+UMASK_RESOURCE_STALLS_ROB             0x10
+
+EVENT_CYCLE_ACTIVITY_CYCLES_L1D_MISS    0xA3 PMC2
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L1D_MISS    EVENT_OPTION_THRESHOLD=0x08
+UMASK_CYCLE_ACTIVITY_CYCLES_L1D_MISS    0x08
+
+EVENT_CYCLE_ACTIVITY_CYCLES             0xA3 PMC
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L2_MISS    EVENT_OPTION_THRESHOLD=0x01
+UMASK_CYCLE_ACTIVITY_CYCLES_L2_MISS     0x01
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_MEM_ANY    EVENT_OPTION_THRESHOLD=0x02
+UMASK_CYCLE_ACTIVITY_CYCLES_MEM_ANY     0x02
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE EVENT_OPTION_THRESHOLD=0x04
+UMASK_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE  0x04
+
+EVENT_CYCLE_ACTIVITY_STALLS_L1D_MISS    0xA3 PMC2
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L1D_MISS    EVENT_OPTION_THRESHOLD=0x0C
+UMASK_CYCLE_ACTIVITY_STALLS_L1D_MISS    0x0C
+
+EVENT_CYCLE_ACTIVITY_STALLS             0xA3 PMC
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L2_MISS    EVENT_OPTION_THRESHOLD=0x05
+UMASK_CYCLE_ACTIVITY_STALLS_L2_MISS     0x05
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_MEM_ANY    EVENT_OPTION_THRESHOLD=0x06
+UMASK_CYCLE_ACTIVITY_STALLS_MEM_ANY     0x06
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_TOTAL    EVENT_OPTION_THRESHOLD=0x04
+UMASK_CYCLE_ACTIVITY_STALLS_TOTAL       0x04
+
+EVENT_LSD_UOPS                 0xA8   PMC
+UMASK_LSD_UOPS                 0x01
+DEFAULT_OPTIONS_LSD_CYCLES_ACTIVE EVENT_OPTION_THRESHOLD=0x01
+UMASK_LSD_CYCLES_ACTIVE        0x01
+DEFAULT_OPTIONS_LSD_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x04
+UMASK_LSD_CYCLES_4_UOPS        0x01
+
+EVENT_DSB2MITE_SWITCHES_PENALTY_CYCLES 0xAB PMC
+UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES 0x02
+
+EVENT_ITLB                       0xAE   PMC
+UMASK_ITLB_ITLB_FLUSH            0x01
+
+EVENT_OFFCORE_REQUESTS     0xB0   PMC
+UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_DEMAND_RFO       0x04
+UMASK_OFFCORE_REQUESTS_ALL_DATA_RD      0x08
+
+EVENT_UOPS_EXECUTED                       0xB1   PMC
+UMASK_UOPS_EXECUTED_THREAD                0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_USED_CYCLES           0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_STALL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_TOTAL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC 0x01
+UMASK_UOPS_EXECUTED_CORE                       0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_USED_CYCLES           0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES          0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_TOTAL_CYCLES          0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC 0x02
+
+EVENT_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0xB2 PMC
+UMASK_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0x01
+
+EVENT_PAGE_WALKER_LOADS             0xBC  PMC
+UMASK_PAGE_WALKER_LOADS_DTLB_L1     0x11
+UMASK_PAGE_WALKER_LOADS_ITLB_L1     0x21
+UMASK_PAGE_WALKER_LOADS_DTLB_L2     0x12
+UMASK_PAGE_WALKER_LOADS_ITLB_L2     0x22
+UMASK_PAGE_WALKER_LOADS_DTLB_L3     0x14
+UMASK_PAGE_WALKER_LOADS_ITLB_L3     0x24
+UMASK_PAGE_WALKER_LOADS_DTLB_MEMORY 0x18
+
+EVENT_INST_RETIRED_ANY_P            0xC0  PMC
+UMASK_INST_RETIRED_ANY_P            0x00
+UMASK_INST_RETIRED_X87              0x02
+
+EVENT_INST_RETIRED_PREC             0xC0  PMC1
+UMASK_INST_RETIRED_PREC_DIST        0x01
+
+EVENT_OTHER_ASSISTS                  0xC1  PMC
+UMASK_OTHER_ASSISTS_AVX_TO_SSE       0x08
+UMASK_OTHER_ASSISTS_SSE_TO_AVX       0x10
+UMASK_OTHER_ASSISTS_ANY_WB_ASSIST    0x40
+
+EVENT_UOPS_RETIRED                       0xC2  PMC
+UMASK_UOPS_RETIRED_ALL                   0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL              0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS          0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES           0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL              0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_USED_CYCLES      0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_STALL_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES     0x01
+
+EVENT_MACHINE_CLEARS                    0xC3  PMC
+DEFAULT_OPTIONS_MACHINE_CLEARS_COUNT    EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_MACHINE_CLEARS_COUNT              0x01
+UMASK_MACHINE_CLEARS_CYCLES             0x01
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING    0x02
+UMASK_MACHINE_CLEARS_SMC                0x04
+UMASK_MACHINE_CLEARS_MASKMOV            0x20
+
+EVENT_BR_INST_RETIRED               0xC4  PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x00
+UMASK_BR_INST_RETIRED_CONDITIONAL   0x01
+UMASK_BR_INST_RETIRED_NEAR_CALL     0x02
+UMASK_BR_INST_RETIRED_NEAR_RETURN   0x08
+UMASK_BR_INST_RETIRED_NOT_TAKEN     0x10
+UMASK_BR_INST_RETIRED_NEAR_TAKEN    0x20
+UMASK_BR_INST_RETIRED_FAR_BRANCH    0x40
+
+EVENT_BR_MISP_RETIRED                0xC5  PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES   0x00
+UMASK_BR_MISP_RETIRED_CONDITIONAL    0x01
+UMASK_BR_MISP_RETIRED_NEAR_TAKEN     0x20
+
+EVENT_FP_ARITH_INST_RETIRED               0xC7 PMC
+UMASK_FP_ARITH_INST_RETIRED_SCALAR_DOUBLE      0x01
+UMASK_FP_ARITH_INST_RETIRED_SCALAR_SINGLE      0x02
+UMASK_FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE 0x04
+UMASK_FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE 0x08
+UMASK_FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE 0x10
+UMASK_FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE 0x20
+UMASK_FP_ARITH_INST_RETIRED_SCALAR             0x03
+UMASK_FP_ARITH_INST_RETIRED_PACKED             0x3C
+UMASK_FP_ARITH_INST_RETIRED_DOUBLE             0x15
+UMASK_FP_ARITH_INST_RETIRED_SINGLE             0x2A
+
+EVENT_HLE_RETIRED                    0xC8 PMC
+UMASK_HLE_RETIRED_START              0x01
+UMASK_HLE_RETIRED_COMMIT             0x02
+UMASK_HLE_RETIRED_ABORTED            0x04
+UMASK_HLE_RETIRED_ABORTED_MISC1      0x08
+UMASK_HLE_RETIRED_ABORTED_MISC2      0x10
+UMASK_HLE_RETIRED_ABORTED_MISC3      0x20
+UMASK_HLE_RETIRED_ABORTED_MISC4      0x40
+UMASK_HLE_RETIRED_ABORTED_MISC5      0x80
+
+EVENT_RTM_RETIRED                    0xC9 PMC
+UMASK_RTM_RETIRED_START              0x01
+UMASK_RTM_RETIRED_COMMIT             0x02
+UMASK_RTM_RETIRED_ABORTED            0x04
+UMASK_RTM_RETIRED_ABORTED_MISC1      0x08
+UMASK_RTM_RETIRED_ABORTED_MISC2      0x10
+UMASK_RTM_RETIRED_ABORTED_MISC3      0x20
+UMASK_RTM_RETIRED_ABORTED_MISC4      0x40
+UMASK_RTM_RETIRED_ABORTED_MISC5      0x80
+
+EVENT_FP_ASSIST                      0xCA  PMC
+UMASK_FP_ASSIST_X87_OUTPUT           0x02
+UMASK_FP_ASSIST_X87_INPUT            0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT          0x08
+UMASK_FP_ASSIST_SIMD_INPUT           0x10
+UMASK_FP_ASSIST_ANY                  0x1E
+
+EVENT_ROB_MISC_EVENT_LBR_INSERTS     0xCC  PMC
+UMASK_ROB_MISC_EVENT_LBR_INSERTS     0x20
+
+EVENT_MEM_UOPS_RETIRED                  0xD0    PMC
+UMASK_MEM_UOPS_RETIRED_LOADS_ALL        0x81
+UMASK_MEM_UOPS_RETIRED_STORES_ALL       0x82
+UMASK_MEM_UOPS_RETIRED_LOADS_LOCK       0x21
+UMASK_MEM_UOPS_RETIRED_LOADS_STLB_MISS  0x11
+UMASK_MEM_UOPS_RETIRED_STORES_STLB_MISS 0x12
+UMASK_MEM_UOPS_RETIRED_LOADS_SPLIT      0x41
+UMASK_MEM_UOPS_RETIRED_STORES_SPLIT     0x42
+
+EVENT_MEM_LOAD_UOPS_RETIRED              0xD1   PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT       0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS      0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL       0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT       0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS      0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL       0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT       0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS      0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL       0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB      0x40
+
+EVENT_MEM_LOAD_UOPS_L3_HIT_RETIRED           0xD2   PMC
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_MISS 0x01
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HIT  0x02
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM 0x04
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_NONE 0x08
+
+EVENT_MEM_LOAD_UOPS_L3_MISS_RETIRED            0xD3   PMC
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_LOCAL_DRAM      0x01
+
+EVENT_BACLEARS                0xE6 PMC
+UMASK_BACLEARS_ANY            0x1F
+
+EVENT_L2_TRANS                0xF0  PMC
+UMASK_L2_TRANS_DEMAND_DATA_RD 0x01
+UMASK_L2_TRANS_RFO            0x02
+UMASK_L2_TRANS_CODE_RD        0x04
+UMASK_L2_TRANS_ALL_PF         0x08
+UMASK_L2_TRANS_L1D_WB         0x10
+UMASK_L2_TRANS_L2_FILL        0x20
+UMASK_L2_TRANS_L2_WB          0x40
+UMASK_L2_TRANS_ALL_REQUESTS   0x80
+
+EVENT_L2_LINES_IN             0xF1   PMC
+UMASK_L2_LINES_IN_I           0x01
+UMASK_L2_LINES_IN_S           0x02
+UMASK_L2_LINES_IN_E           0x04
+UMASK_L2_LINES_IN_ALL         0x07
+
+EVENT_L2_LINES_OUT                  0xF2   PMC
+UMASK_L2_LINES_OUT_DEMAND_CLEAN     0x05
+UMASK_L2_LINES_OUT_DEMAND_DIRTY     0x06
+
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1                            0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_CACHE_LOOKUP                          0x34 CBOX
+UMASK_CACHE_LOOKUP_M                        0x01
+UMASK_CACHE_LOOKUP_E                        0x02
+UMASK_CACHE_LOOKUP_S                        0x04
+UMASK_CACHE_LOOKUP_I                        0x08
+UMASK_CACHE_LOOKUP_READ_FILTER              0x10
+UMASK_CACHE_LOOKUP_WRITE_FILTER             0x20
+UMASK_CACHE_LOOKUP_EXTSNP_FILTER            0x40
+UMASK_CACHE_LOOKUP_ANY_REQUEST_FILTER       0x80
+UMASK_CACHE_LOOKUP_READ_M                   0x11
+UMASK_CACHE_LOOKUP_WRITE_M                  0x21
+UMASK_CACHE_LOOKUP_EXTSNP_M                 0x41
+UMASK_CACHE_LOOKUP_ANY_M                    0x81
+UMASK_CACHE_LOOKUP_READ_E                   0x12
+UMASK_CACHE_LOOKUP_WRITE_E                  0x22
+UMASK_CACHE_LOOKUP_EXTSNP_E                 0x42
+UMASK_CACHE_LOOKUP_ANY_E                    0x82
+UMASK_CACHE_LOOKUP_READ_S                   0x14
+UMASK_CACHE_LOOKUP_WRITE_S                  0x24
+UMASK_CACHE_LOOKUP_EXTSNP_S                 0x44
+UMASK_CACHE_LOOKUP_ANY_S                    0x84
+UMASK_CACHE_LOOKUP_READ_ES                  0x16
+UMASK_CACHE_LOOKUP_WRITE_ES                 0x26
+UMASK_CACHE_LOOKUP_EXTSNP_ES                0x46
+UMASK_CACHE_LOOKUP_ANY_ES                   0x86
+UMASK_CACHE_LOOKUP_READ_I                   0x18
+UMASK_CACHE_LOOKUP_WRITE_I                  0x28
+UMASK_CACHE_LOOKUP_EXTSNP_I                 0x48
+UMASK_CACHE_LOOKUP_ANY_I                    0x88
+UMASK_CACHE_LOOKUP_READ_MESI                0x1F
+UMASK_CACHE_LOOKUP_WRITE_MESI               0x2F
+UMASK_CACHE_LOOKUP_EXTSNP_MESI              0x4F
+UMASK_CACHE_LOOKUP_ANY_MESI                 0x8F
+
+EVENT_XSNP_RESPONSE                         0x22 CBOX
+UMASK_XSNP_RESPONSE_MISS_EXTERNAL           0x21
+UMASK_XSNP_RESPONSE_MISS_XCORE              0x41
+UMASK_XSNP_RESPONSE_MISS_EVICTION           0x81
+UMASK_XSNP_RESPONSE_HIT_EXTERNAL            0x24
+UMASK_XSNP_RESPONSE_HIT_XCORE               0x44
+UMASK_XSNP_RESPONSE_HIT_EVICTION            0x84
+UMASK_XSNP_RESPONSE_HITM_EXTERNAL           0x28
+UMASK_XSNP_RESPONSE_HITM_XCORE              0x48
+UMASK_XSNP_RESPONSE_HITM_EVICTION           0x88
+
+EVENT_TRK_OCCUPANCY_ALL                     0x80 UBOX0
+UMASK_TRK_OCCUPANCY_ALL                     0x01
+
+EVENT_TRK_REQUESTS                          0x81 UBOX
+UMASK_TRK_REQUESTS_ALL                      0x01
+UMASK_TRK_REQUESTS_WRITES                   0x20
+
+EVENT_COH_TRK_OCCUPANCY                     0x83 UBOX0
+UMASK_COH_TRK_OCCUPANCY                     0x01
+
+EVENT_COH_TRK_REQUESTS                      0x84 UBOX
+UMASK_COH_TRK_REQUESTS_ALL                  0x01
+
+EVENT_UNCORE_CLOCK                          0x00 UBOXFIX
+UMASK_UNCORE_CLOCK                          0x01
+
diff --git a/src/includes/perfmon_broadwelld_counters.h b/src/includes/perfmon_broadwelld_counters.h
new file mode 100644
index 0000000..37f70ad
--- /dev/null
+++ b/src/includes/perfmon_broadwelld_counters.h
@@ -0,0 +1,252 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfmon_broadwelld_counters.h
+ *
+ *      Description:  Counter Header File of perfmon module for Broadwell D.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+
+#define NUM_COUNTERS_BROADWELLD 141 /* size of broadwelld_counter_map; its entries run PMC0..PMC140 */
+#define NUM_COUNTERS_CORE_BROADWELLD 8 /* equals the number of leading FIXC/PMC/TMP entries in broadwelld_counter_map */
+#define NUM_COUNTERS_UNCORE_BROADWELLD 85 /* NOTE(review): equals the index of the first PCI_UNC_* entry (BBOX0C0 = PMC85) - confirm intended meaning */
+/* Event option masks accepted per counter type. Each list is parenthesized so it expands as a single operand next to &, ==, ~ and similar operators. */
+#define BDW_D_VALID_OPTIONS_FIXED (EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK)
+#define BDW_D_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK| \
+            EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_IN_TRANS_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define BDW_D_VALID_OPTIONS_UBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK)
+#define BDW_D_VALID_OPTIONS_CBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+            EVENT_OPTION_TID_MASK|EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_STATE_MASK|\
+            EVENT_OPTION_MATCH0_MASK)
+#define BDW_D_VALID_OPTIONS_WBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+            EVENT_OPTION_OCCUPANCY_MASK|EVENT_OPTION_OCCUPANCY_FILTER_MASK|EVENT_OPTION_OCCUPANCY_EDGE_MASK|\
+            EVENT_OPTION_OCCUPANCY_INVERT_MASK)
+#define BDW_D_VALID_OPTIONS_BBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define BDW_D_VALID_OPTIONS_MBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define BDW_D_VALID_OPTIONS_IBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define BDW_D_VALID_OPTIONS_PBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+
+static RegisterMap broadwelld_counter_map[NUM_COUNTERS_BROADWELLD] = { /* Counter table, listed in PMCn order (PMC0..PMC140). Fields as the initializers suggest (confirm against RegisterMap): name, index, unit type, control register, counter register, second counter register, PCI device, accepted option mask. */
+    /* Fixed counters FIXC0-FIXC2; the Broadwell D event list binds INSTR_RETIRED_ANY, CPU_CLK_UNHALTED_CORE and CPU_CLK_UNHALTED_REF to them */
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, BDW_D_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, BDW_D_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, BDW_D_VALID_OPTIONS_FIXED},
+    /* General-purpose PMC counters: 4, each 48 bit wide */
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, BDW_D_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, BDW_D_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, BDW_D_VALID_OPTIONS_PMC|EVENT_OPTION_IN_TRANS_ABORT_MASK}, /* only PMC2 additionally accepts EVENT_OPTION_IN_TRANS_ABORT_MASK */
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, BDW_D_VALID_OPTIONS_PMC},
+    /* Temperature Sensor*/
+    {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* RAPL counters */
+    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+    {"UBOX0", PMC12, UBOX, MSR_UNC_V3_U_PMON_CTL0, MSR_UNC_V3_U_PMON_CTR0,  0, 0, BDW_D_VALID_OPTIONS_UBOX}, /* entries using MSR_UNC_V3_* registers start here: UBOX, UBOXFIX, CBOX0-15, WBOX */
+    {"UBOX1", PMC13, UBOX, MSR_UNC_V3_U_PMON_CTL1, MSR_UNC_V3_U_PMON_CTR1,  0, 0, BDW_D_VALID_OPTIONS_UBOX},
+    {"UBOXFIX", PMC14, UBOXFIX, MSR_UNC_V3_U_UCLK_FIXED_CTL, MSR_UNC_V3_U_UCLK_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX0C0", PMC15, CBOX0, MSR_UNC_V3_C0_PMON_CTL0, MSR_UNC_V3_C0_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX0C1", PMC16, CBOX0, MSR_UNC_V3_C0_PMON_CTL1, MSR_UNC_V3_C0_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX0C2", PMC17, CBOX0, MSR_UNC_V3_C0_PMON_CTL2, MSR_UNC_V3_C0_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX0C3", PMC18, CBOX0, MSR_UNC_V3_C0_PMON_CTL3, MSR_UNC_V3_C0_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX1C0", PMC19, CBOX1, MSR_UNC_V3_C1_PMON_CTL0, MSR_UNC_V3_C1_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX1C1", PMC20, CBOX1, MSR_UNC_V3_C1_PMON_CTL1, MSR_UNC_V3_C1_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX1C2", PMC21, CBOX1, MSR_UNC_V3_C1_PMON_CTL2, MSR_UNC_V3_C1_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX1C3", PMC22, CBOX1, MSR_UNC_V3_C1_PMON_CTL3, MSR_UNC_V3_C1_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX2C0", PMC23, CBOX2, MSR_UNC_V3_C2_PMON_CTL0, MSR_UNC_V3_C2_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX2C1", PMC24, CBOX2, MSR_UNC_V3_C2_PMON_CTL1, MSR_UNC_V3_C2_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX2C2", PMC25, CBOX2, MSR_UNC_V3_C2_PMON_CTL2, MSR_UNC_V3_C2_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX2C3", PMC26, CBOX2, MSR_UNC_V3_C2_PMON_CTL3, MSR_UNC_V3_C2_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX3C0", PMC27, CBOX3, MSR_UNC_V3_C3_PMON_CTL0, MSR_UNC_V3_C3_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX3C1", PMC28, CBOX3, MSR_UNC_V3_C3_PMON_CTL1, MSR_UNC_V3_C3_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX3C2", PMC29, CBOX3, MSR_UNC_V3_C3_PMON_CTL2, MSR_UNC_V3_C3_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX3C3", PMC30, CBOX3, MSR_UNC_V3_C3_PMON_CTL3, MSR_UNC_V3_C3_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX4C0", PMC31, CBOX4, MSR_UNC_V3_C4_PMON_CTL0, MSR_UNC_V3_C4_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX4C1", PMC32, CBOX4, MSR_UNC_V3_C4_PMON_CTL1, MSR_UNC_V3_C4_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX4C2", PMC33, CBOX4, MSR_UNC_V3_C4_PMON_CTL2, MSR_UNC_V3_C4_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX4C3", PMC34, CBOX4, MSR_UNC_V3_C4_PMON_CTL3, MSR_UNC_V3_C4_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX5C0", PMC35, CBOX5, MSR_UNC_V3_C5_PMON_CTL0, MSR_UNC_V3_C5_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX5C1", PMC36, CBOX5, MSR_UNC_V3_C5_PMON_CTL1, MSR_UNC_V3_C5_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX5C2", PMC37, CBOX5, MSR_UNC_V3_C5_PMON_CTL2, MSR_UNC_V3_C5_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX5C3", PMC38, CBOX5, MSR_UNC_V3_C5_PMON_CTL3, MSR_UNC_V3_C5_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX6C0", PMC39, CBOX6, MSR_UNC_V3_C6_PMON_CTL0, MSR_UNC_V3_C6_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX6C1", PMC40, CBOX6, MSR_UNC_V3_C6_PMON_CTL1, MSR_UNC_V3_C6_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX6C2", PMC41, CBOX6, MSR_UNC_V3_C6_PMON_CTL2, MSR_UNC_V3_C6_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX6C3", PMC42, CBOX6, MSR_UNC_V3_C6_PMON_CTL3, MSR_UNC_V3_C6_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX7C0", PMC43, CBOX7, MSR_UNC_V3_C7_PMON_CTL0, MSR_UNC_V3_C7_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX7C1", PMC44, CBOX7, MSR_UNC_V3_C7_PMON_CTL1, MSR_UNC_V3_C7_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX7C2", PMC45, CBOX7, MSR_UNC_V3_C7_PMON_CTL2, MSR_UNC_V3_C7_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX7C3", PMC46, CBOX7, MSR_UNC_V3_C7_PMON_CTL3, MSR_UNC_V3_C7_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX8C0", PMC47, CBOX8, MSR_UNC_V3_C8_PMON_CTL0, MSR_UNC_V3_C8_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX8C1", PMC48, CBOX8, MSR_UNC_V3_C8_PMON_CTL1, MSR_UNC_V3_C8_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX8C2", PMC49, CBOX8, MSR_UNC_V3_C8_PMON_CTL2, MSR_UNC_V3_C8_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX8C3", PMC50, CBOX8, MSR_UNC_V3_C8_PMON_CTL3, MSR_UNC_V3_C8_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX9C0", PMC51, CBOX9, MSR_UNC_V3_C9_PMON_CTL0, MSR_UNC_V3_C9_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX9C1", PMC52, CBOX9, MSR_UNC_V3_C9_PMON_CTL1, MSR_UNC_V3_C9_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX9C2", PMC53, CBOX9, MSR_UNC_V3_C9_PMON_CTL2, MSR_UNC_V3_C9_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX9C3", PMC54, CBOX9, MSR_UNC_V3_C9_PMON_CTL3, MSR_UNC_V3_C9_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX10C0", PMC55, CBOX10, MSR_UNC_V3_C10_PMON_CTL0, MSR_UNC_V3_C10_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX10C1", PMC56, CBOX10, MSR_UNC_V3_C10_PMON_CTL1, MSR_UNC_V3_C10_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX10C2", PMC57, CBOX10, MSR_UNC_V3_C10_PMON_CTL2, MSR_UNC_V3_C10_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX10C3", PMC58, CBOX10, MSR_UNC_V3_C10_PMON_CTL3, MSR_UNC_V3_C10_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX11C0", PMC59, CBOX11, MSR_UNC_V3_C11_PMON_CTL0, MSR_UNC_V3_C11_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX11C1", PMC60, CBOX11, MSR_UNC_V3_C11_PMON_CTL1, MSR_UNC_V3_C11_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX11C2", PMC61, CBOX11, MSR_UNC_V3_C11_PMON_CTL2, MSR_UNC_V3_C11_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX11C3", PMC62, CBOX11, MSR_UNC_V3_C11_PMON_CTL3, MSR_UNC_V3_C11_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX12C0", PMC63, CBOX12, MSR_UNC_V3_C12_PMON_CTL0, MSR_UNC_V3_C12_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX12C1", PMC64, CBOX12, MSR_UNC_V3_C12_PMON_CTL1, MSR_UNC_V3_C12_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX12C2", PMC65, CBOX12, MSR_UNC_V3_C12_PMON_CTL2, MSR_UNC_V3_C12_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX12C3", PMC66, CBOX12, MSR_UNC_V3_C12_PMON_CTL3, MSR_UNC_V3_C12_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX13C0", PMC67, CBOX13, MSR_UNC_V3_C13_PMON_CTL0, MSR_UNC_V3_C13_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX13C1", PMC68, CBOX13, MSR_UNC_V3_C13_PMON_CTL1, MSR_UNC_V3_C13_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX13C2", PMC69, CBOX13, MSR_UNC_V3_C13_PMON_CTL2, MSR_UNC_V3_C13_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX13C3", PMC70, CBOX13, MSR_UNC_V3_C13_PMON_CTL3, MSR_UNC_V3_C13_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX14C0", PMC71, CBOX14, MSR_UNC_V3_C14_PMON_CTL0, MSR_UNC_V3_C14_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX14C1", PMC72, CBOX14, MSR_UNC_V3_C14_PMON_CTL1, MSR_UNC_V3_C14_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX14C2", PMC73, CBOX14, MSR_UNC_V3_C14_PMON_CTL2, MSR_UNC_V3_C14_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX14C3", PMC74, CBOX14, MSR_UNC_V3_C14_PMON_CTL3, MSR_UNC_V3_C14_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX15C0", PMC75, CBOX15, MSR_UNC_V3_C15_PMON_CTL0, MSR_UNC_V3_C15_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX15C1", PMC76, CBOX15, MSR_UNC_V3_C15_PMON_CTL1, MSR_UNC_V3_C15_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX15C2", PMC77, CBOX15, MSR_UNC_V3_C15_PMON_CTL2, MSR_UNC_V3_C15_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"CBOX15C3", PMC78, CBOX15, MSR_UNC_V3_C15_PMON_CTL3, MSR_UNC_V3_C15_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_CBOX},
+    {"WBOX0", PMC79, WBOX, MSR_UNC_V3_PCU_PMON_CTL0, MSR_UNC_V3_PCU_PMON_CTR0, 0, 0, BDW_D_VALID_OPTIONS_WBOX},
+    {"WBOX1", PMC80, WBOX, MSR_UNC_V3_PCU_PMON_CTL1, MSR_UNC_V3_PCU_PMON_CTR1, 0, 0, BDW_D_VALID_OPTIONS_WBOX},
+    {"WBOX2", PMC81, WBOX, MSR_UNC_V3_PCU_PMON_CTL2, MSR_UNC_V3_PCU_PMON_CTR2, 0, 0, BDW_D_VALID_OPTIONS_WBOX},
+    {"WBOX3", PMC82, WBOX, MSR_UNC_V3_PCU_PMON_CTL3, MSR_UNC_V3_PCU_PMON_CTR3, 0, 0, BDW_D_VALID_OPTIONS_WBOX},
+    {"WBOX0FIX", PMC83, WBOX0FIX, 0, MSR_UNC_V3_PCU_CC3_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"WBOX1FIX", PMC84, WBOX0FIX, 0, MSR_UNC_V3_PCU_CC6_CTR, 0, 0, EVENT_OPTION_NONE_MASK}, /* NOTE(review): named WBOX1FIX but typed WBOX0FIX like the entry above - confirm this sharing is intended */
+    {"BBOX0C0", PMC85, BBOX0, PCI_UNC_HA_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_0, BDW_D_VALID_OPTIONS_BBOX}, /* entries using PCI_UNC_* registers start here; 6th field holds the _B counter register (0 for IBOX), 7th the PCI device */
+    {"BBOX0C1", PMC86, BBOX0, PCI_UNC_HA_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_0, BDW_D_VALID_OPTIONS_BBOX},
+    {"BBOX0C2", PMC87, BBOX0, PCI_UNC_HA_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_0, BDW_D_VALID_OPTIONS_BBOX},
+    {"BBOX0C3", PMC88, BBOX0, PCI_UNC_HA_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_0, BDW_D_VALID_OPTIONS_BBOX},
+    {"BBOX1C0", PMC89, BBOX1, PCI_UNC_HA_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_1, BDW_D_VALID_OPTIONS_BBOX},
+    {"BBOX1C1", PMC90, BBOX1, PCI_UNC_HA_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_1, BDW_D_VALID_OPTIONS_BBOX},
+    {"BBOX1C2", PMC91, BBOX1, PCI_UNC_HA_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_1, BDW_D_VALID_OPTIONS_BBOX},
+    {"BBOX1C3", PMC92, BBOX1, PCI_UNC_HA_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_1, BDW_D_VALID_OPTIONS_BBOX},
+    {"MBOX0C0", PMC93, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_0, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX0C1", PMC94, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_0, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX0C2", PMC95, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_0, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX0FIX", PMC96, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_INVERT_MASK}, /* NOTE(review): for MBOX0 the FIX entry precedes C3, for MBOX1-7 it follows C3; PMC indices stay consecutive either way */
+    {"MBOX0C3", PMC97, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_0, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX1C0", PMC98, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_1, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX1C1", PMC99, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_1, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX1C2", PMC100, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_1, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX1C3", PMC101, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_1, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX1FIX", PMC102, MBOX1FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_1, EVENT_OPTION_INVERT_MASK},
+    {"MBOX2C0", PMC103, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_2, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX2C1", PMC104, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_2, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX2C2", PMC105, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_2, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX2C3", PMC106, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_2, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX2FIX", PMC107, MBOX2FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_2, EVENT_OPTION_INVERT_MASK},
+    {"MBOX3C0", PMC108, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_3, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX3C1", PMC109, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_3, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX3C2", PMC110, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_3, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX3C3", PMC111, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_3, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX3FIX", PMC112, MBOX3FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_3, EVENT_OPTION_INVERT_MASK},
+    {"MBOX4C0", PMC113, MBOX4, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_0, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX4C1", PMC114, MBOX4, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_0, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX4C2", PMC115, MBOX4, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_0, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX4C3", PMC116, MBOX4, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_0, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX4FIX", PMC117, MBOX4FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_0, EVENT_OPTION_INVERT_MASK},
+    {"MBOX5C0", PMC118, MBOX5, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_1, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX5C1", PMC119, MBOX5, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_1, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX5C2", PMC120, MBOX5, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_1, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX5C3", PMC121, MBOX5, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_1, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX5FIX", PMC122, MBOX5FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_1, EVENT_OPTION_INVERT_MASK},
+    {"MBOX6C0", PMC123, MBOX6, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_2, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX6C1", PMC124, MBOX6, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_2, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX6C2", PMC125, MBOX6, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_2, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX6C3", PMC126, MBOX6, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_2, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX6FIX", PMC127, MBOX6FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_2, EVENT_OPTION_INVERT_MASK},
+    {"MBOX7C0", PMC128, MBOX7, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_3, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX7C1", PMC129, MBOX7, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_3, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX7C2", PMC130, MBOX7, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_3, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX7C3", PMC131, MBOX7, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_3, BDW_D_VALID_OPTIONS_MBOX},
+    {"MBOX7FIX", PMC132, MBOX7FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_3, EVENT_OPTION_INVERT_MASK},
+    {"IBOX0C0", PMC133, IBOX0, PCI_UNC_IRP0_PMON_CTL_0, PCI_UNC_IRP0_PMON_CTR_0, 0, PCI_IRP_DEVICE, BDW_D_VALID_OPTIONS_IBOX},
+    {"IBOX0C1", PMC134, IBOX0, PCI_UNC_IRP0_PMON_CTL_1, PCI_UNC_IRP0_PMON_CTR_1, 0, PCI_IRP_DEVICE, BDW_D_VALID_OPTIONS_IBOX},
+    {"IBOX1C0", PMC135, IBOX1, PCI_UNC_IRP1_PMON_CTL_0, PCI_UNC_IRP1_PMON_CTR_0, 0, PCI_IRP_DEVICE, BDW_D_VALID_OPTIONS_IBOX},
+    {"IBOX1C1", PMC136, IBOX1, PCI_UNC_IRP1_PMON_CTL_1, PCI_UNC_IRP1_PMON_CTR_1, 0, PCI_IRP_DEVICE, BDW_D_VALID_OPTIONS_IBOX},
+    {"PBOX0", PMC137, PBOX, PCI_UNC_R2PCIE_PMON_CTL_0, PCI_UNC_R2PCIE_PMON_CTR_0_A, PCI_UNC_R2PCIE_PMON_CTR_0_B, PCI_R2PCIE_DEVICE, BDW_D_VALID_OPTIONS_PBOX},
+    {"PBOX1", PMC138, PBOX, PCI_UNC_R2PCIE_PMON_CTL_1, PCI_UNC_R2PCIE_PMON_CTR_1_A, PCI_UNC_R2PCIE_PMON_CTR_1_B, PCI_R2PCIE_DEVICE, BDW_D_VALID_OPTIONS_PBOX},
+    {"PBOX2", PMC139, PBOX, PCI_UNC_R2PCIE_PMON_CTL_2, PCI_UNC_R2PCIE_PMON_CTR_2_A, PCI_UNC_R2PCIE_PMON_CTR_2_B, PCI_R2PCIE_DEVICE, BDW_D_VALID_OPTIONS_PBOX},
+    {"PBOX3", PMC140, PBOX, PCI_UNC_R2PCIE_PMON_CTL_3, PCI_UNC_R2PCIE_PMON_CTR_3_A, PCI_UNC_R2PCIE_PMON_CTR_3_B, PCI_R2PCIE_DEVICE, BDW_D_VALID_OPTIONS_PBOX},
+};
+
+static BoxMap broadwelld_box_map[NUM_UNITS] = { /* Per-unit settings, indexed by unit type via designated initializers. Fields presumably (confirm against BoxMap): control register, status register, overflow register, overflow bit offset, is-PCI flag, PCI device, counter width in bits, filter register(s). */
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+    [THERMAL] = {0, 0, 0, 0, 0, 0, 8},
+    [FIXED] =  {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+    [POWER] = {0, 0, 0, 0, 0, 0, 32},
+    [UBOX] = {0, MSR_UNC_V3_U_PMON_BOX_STATUS, MSR_UNC_V3_U_PMON_BOX_STATUS, 1, 0, 0, 48},
+    [UBOXFIX] = {0, MSR_UNC_V3_U_PMON_BOX_STATUS, MSR_UNC_V3_U_PMON_BOX_STATUS, 0, 0, 0, 48},
+    [CBOX0] = {MSR_UNC_V3_C0_PMON_BOX_CTL, MSR_UNC_V3_C0_PMON_BOX_STATUS, MSR_UNC_V3_C0_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C0_PMON_BOX_FILTER0, MSR_UNC_V3_C0_PMON_BOX_FILTER1}, /* CBOX0-15 are the only units that set two filter registers */
+    [CBOX1] = {MSR_UNC_V3_C1_PMON_BOX_CTL, MSR_UNC_V3_C1_PMON_BOX_STATUS, MSR_UNC_V3_C1_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C1_PMON_BOX_FILTER0, MSR_UNC_V3_C1_PMON_BOX_FILTER1},
+    [CBOX2] = {MSR_UNC_V3_C2_PMON_BOX_CTL, MSR_UNC_V3_C2_PMON_BOX_STATUS, MSR_UNC_V3_C2_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C2_PMON_BOX_FILTER0, MSR_UNC_V3_C2_PMON_BOX_FILTER1},
+    [CBOX3] = {MSR_UNC_V3_C3_PMON_BOX_CTL, MSR_UNC_V3_C3_PMON_BOX_STATUS, MSR_UNC_V3_C3_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C3_PMON_BOX_FILTER0, MSR_UNC_V3_C3_PMON_BOX_FILTER1},
+    [CBOX4] = {MSR_UNC_V3_C4_PMON_BOX_CTL, MSR_UNC_V3_C4_PMON_BOX_STATUS, MSR_UNC_V3_C4_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C4_PMON_BOX_FILTER0, MSR_UNC_V3_C4_PMON_BOX_FILTER1},
+    [CBOX5] = {MSR_UNC_V3_C5_PMON_BOX_CTL, MSR_UNC_V3_C5_PMON_BOX_STATUS, MSR_UNC_V3_C5_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C5_PMON_BOX_FILTER0, MSR_UNC_V3_C5_PMON_BOX_FILTER1},
+    [CBOX6] = {MSR_UNC_V3_C6_PMON_BOX_CTL, MSR_UNC_V3_C6_PMON_BOX_STATUS, MSR_UNC_V3_C6_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C6_PMON_BOX_FILTER0, MSR_UNC_V3_C6_PMON_BOX_FILTER1},
+    [CBOX7] = {MSR_UNC_V3_C7_PMON_BOX_CTL, MSR_UNC_V3_C7_PMON_BOX_STATUS, MSR_UNC_V3_C7_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C7_PMON_BOX_FILTER0, MSR_UNC_V3_C7_PMON_BOX_FILTER1},
+    [CBOX8] = {MSR_UNC_V3_C8_PMON_BOX_CTL, MSR_UNC_V3_C8_PMON_BOX_STATUS, MSR_UNC_V3_C8_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C8_PMON_BOX_FILTER0, MSR_UNC_V3_C8_PMON_BOX_FILTER1},
+    [CBOX9] = {MSR_UNC_V3_C9_PMON_BOX_CTL, MSR_UNC_V3_C9_PMON_BOX_STATUS, MSR_UNC_V3_C9_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C9_PMON_BOX_FILTER0, MSR_UNC_V3_C9_PMON_BOX_FILTER1},
+    [CBOX10] = {MSR_UNC_V3_C10_PMON_BOX_CTL, MSR_UNC_V3_C10_PMON_BOX_STATUS, MSR_UNC_V3_C10_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C10_PMON_BOX_FILTER0, MSR_UNC_V3_C10_PMON_BOX_FILTER1},
+    [CBOX11] = {MSR_UNC_V3_C11_PMON_BOX_CTL, MSR_UNC_V3_C11_PMON_BOX_STATUS, MSR_UNC_V3_C11_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C11_PMON_BOX_FILTER0, MSR_UNC_V3_C11_PMON_BOX_FILTER1},
+    [CBOX12] = {MSR_UNC_V3_C12_PMON_BOX_CTL, MSR_UNC_V3_C12_PMON_BOX_STATUS, MSR_UNC_V3_C12_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C12_PMON_BOX_FILTER0, MSR_UNC_V3_C12_PMON_BOX_FILTER1},
+    [CBOX13] = {MSR_UNC_V3_C13_PMON_BOX_CTL, MSR_UNC_V3_C13_PMON_BOX_STATUS, MSR_UNC_V3_C13_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C13_PMON_BOX_FILTER0, MSR_UNC_V3_C13_PMON_BOX_FILTER1},
+    [CBOX14] = {MSR_UNC_V3_C14_PMON_BOX_CTL, MSR_UNC_V3_C14_PMON_BOX_STATUS, MSR_UNC_V3_C14_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C14_PMON_BOX_FILTER0, MSR_UNC_V3_C14_PMON_BOX_FILTER1},
+    [CBOX15] = {MSR_UNC_V3_C15_PMON_BOX_CTL, MSR_UNC_V3_C15_PMON_BOX_STATUS, MSR_UNC_V3_C15_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C15_PMON_BOX_FILTER0, MSR_UNC_V3_C15_PMON_BOX_FILTER1},
+    [WBOX] = {MSR_UNC_V3_PCU_PMON_BOX_CTL, MSR_UNC_V3_PCU_PMON_BOX_STATUS,MSR_UNC_V3_PCU_PMON_BOX_STATUS, 2, 0, 0, 48, MSR_UNC_V3_PCU_PMON_BOX_FILTER},
+    [WBOX0FIX] = {0,0,0,-1,0,0,64},
+    [BBOX0] = {PCI_UNC_HA_PMON_BOX_CTL, PCI_UNC_HA_PMON_BOX_STATUS, PCI_UNC_HA_PMON_BOX_STATUS, 21, 1, PCI_HA_DEVICE_0, 48}, /* PCI-addressed units start here: 5th field is 1 and the 6th names the PCI device */
+    [BBOX1] = {PCI_UNC_HA_PMON_BOX_CTL, PCI_UNC_HA_PMON_BOX_STATUS, PCI_UNC_HA_PMON_BOX_STATUS, -1, 1, PCI_HA_DEVICE_1, 48}, /* NOTE(review): 4th field is -1 here but 21 for BBOX0 - confirm */
+    [MBOX0] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_0, 48},
+    [MBOX1] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+    [MBOX2] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+    [MBOX3] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+    [MBOX4] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, -1, 1, PCI_IMC_DEVICE_1_CH_0, 48}, /* NOTE(review): MBOX4-7 use -1 as 4th field where MBOX0-3 use 23 - confirm */
+    [MBOX5] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, -1, 1, PCI_IMC_DEVICE_1_CH_1, 48},
+    [MBOX6] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, -1, 1, PCI_IMC_DEVICE_1_CH_2, 48},
+    [MBOX7] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, -1, 1, PCI_IMC_DEVICE_1_CH_3, 48},
+    [IBOX0] = {PCI_UNC_IRP_PMON_BOX_CTL, PCI_UNC_IRP_PMON_BOX_STATUS, PCI_UNC_IRP_PMON_BOX_STATUS, 34, 1, PCI_IRP_DEVICE, 48},
+    [IBOX1] = {PCI_UNC_IRP_PMON_BOX_CTL, PCI_UNC_IRP_PMON_BOX_STATUS, PCI_UNC_IRP_PMON_BOX_STATUS, 34, 1, PCI_IRP_DEVICE, 48},
+    [PBOX] = {PCI_UNC_R2PCIE_PMON_BOX_CTL, PCI_UNC_R2PCIE_PMON_BOX_STATUS, PCI_UNC_R2PCIE_PMON_BOX_STATUS, 29, 1, PCI_R2PCIE_DEVICE, 48},
+}; /* unit types without an initializer above (e.g. MBOX0FIX..MBOX7FIX) are zero-initialized */
+
+static PciDevice broadwelld_pci_devices[MAX_NUM_PCI_DEVICES] = { /* PCI devices referenced by the PCI-addressed units, indexed by device identifier. Fields presumably (confirm against PciDevice): device class, "device.function" location string, device name, associated box name, PCI device ID. */
+ [PCI_HA_DEVICE_0] = {HA, "12.1", "PCI_HA_DEVICE_0", "BBOX0", 0x6F30},
+ [PCI_HA_DEVICE_1] = {HA, "12.5", "PCI_HA_DEVICE_1", "BBOX1", 0x6F38},
+ [PCI_IMC_DEVICE_0_CH_0] = {IMC, "14.0", "PCI_IMC_DEVICE_0_CH_0", "MBOX0", 0x6FB4},
+ [PCI_IMC_DEVICE_0_CH_1] = {IMC, "14.1", "PCI_IMC_DEVICE_0_CH_1", "MBOX1", 0x6FB5},
+ [PCI_IMC_DEVICE_0_CH_2] = {IMC, "15.0", "PCI_IMC_DEVICE_0_CH_2", "MBOX2", 0x6FB0},
+ [PCI_IMC_DEVICE_0_CH_3] = {IMC, "15.1", "PCI_IMC_DEVICE_0_CH_3", "MBOX3", 0x6FB1},
+ [PCI_IMC_DEVICE_1_CH_0] = {IMC, "17.0", "PCI_IMC_DEVICE_1_CH_0", "MBOX4", 0x6FD4},
+ [PCI_IMC_DEVICE_1_CH_1] = {IMC, "17.1", "PCI_IMC_DEVICE_1_CH_1", "MBOX5", 0x6FD5},
+ [PCI_IMC_DEVICE_1_CH_2] = {IMC, "18.0", "PCI_IMC_DEVICE_1_CH_2", "MBOX6", 0x6FD0},
+ [PCI_IMC_DEVICE_1_CH_3] = {IMC, "18.1", "PCI_IMC_DEVICE_1_CH_3", "MBOX7", 0x6FD1},
+ [PCI_IRP_DEVICE] = {IRP, "05.6", "PCI_IRP_DEVICE", "IBOX0", 0x6F39}, /* labelled "IBOX0" only, although both IBOX0 and IBOX1 counters reference PCI_IRP_DEVICE */
+ [PCI_R2PCIE_DEVICE] = {R2PCIE, "10.1", "PCI_R2PCIE_DEVICE", "PBOX0", 0x6F34},
+}; /* devices without an initializer above are zero-initialized */
diff --git a/src/includes/perfmon_broadwelld_events.txt b/src/includes/perfmon_broadwelld_events.txt
new file mode 100644
index 0000000..88c5add
--- /dev/null
+++ b/src/includes/perfmon_broadwelld_events.txt
@@ -0,0 +1,1984 @@
+# =======================================================================================
+#
+#      Filename:  perfmon_broadwelld_events.txt
+#
+#      Description:  Event list for Intel Broadwell D
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+#      Project:  likwid
+#
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+#
+#      This program is free software: you can redistribute it and/or modify it under
+#      the terms of the GNU General Public License as published by the Free Software
+#      Foundation, either version 3 of the License, or (at your option) any later
+#      version.
+#
+#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+#      You should have received a copy of the GNU General Public License along with
+#      this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+EVENT_TEMP_CORE          0x00   TMP0
+UMASK_TEMP_CORE          0x00
+
+EVENT_PWR_PKG_ENERGY          0x00   PWR0
+UMASK_PWR_PKG_ENERGY          0x00
+
+EVENT_PWR_PP0_ENERGY          0x00   PWR1
+UMASK_PWR_PP0_ENERGY          0x00
+
+EVENT_PWR_PP1_ENERGY          0x00   PWR2
+UMASK_PWR_PP1_ENERGY          0x00
+
+EVENT_PWR_DRAM_ENERGY          0x00   PWR3
+UMASK_PWR_DRAM_ENERGY          0x00
+
+EVENT_INSTR_RETIRED              0x00   FIXC0
+UMASK_INSTR_RETIRED_ANY          0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE      0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC2
+UMASK_CPU_CLK_UNHALTED_REF       0x00
+
+EVENT_LD_BLOCKS                 0x03  PMC
+UMASK_LD_BLOCKS_STORE_FORWARD   0x02
+UMASK_LD_BLOCKS_NO_SR           0x08
+
+EVENT_MISALIGN_MEM_REF            0x05  PMC
+UMASK_MISALIGN_MEM_REF_LOADS      0x01
+UMASK_MISALIGN_MEM_REF_STORES     0x02
+UMASK_MISALIGN_MEM_REF_ANY        0x03
+
+EVENT_LD_BLOCKS_PARTIAL                 0x07  PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS   0x01
+
+EVENT_DTLB_LOAD_MISSES                       0x08  PMC
+UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK         0x01
+UMASK_DTLB_LOAD_MISSES_STLB_HIT              0x60
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED        0x0E
+UMASK_DTLB_LOAD_MISSES_STLB_HIT_4K           0x20
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED_4K     0x02
+UMASK_DTLB_LOAD_MISSES_WALK_DURATION         0x10
+
+EVENT_INT_MISC                      0x0D  PMC
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RECOVERY_CYCLES      0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT EVENT_OPTION_EDGE=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RECOVERY_COUNT       0x03
+DEFAULT_OPTIONS_INT_MISC_RAT_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RAT_STALL_CYCLES     0x08
+DEFAULT_OPTIONS_INT_MISC_RAT_STALL_COUNT EVENT_OPTION_EDGE=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RAT_STALL_COUNT      0x08
+
+EVENT_UOPS_ISSUED                     0x0E  PMC
+UMASK_UOPS_ISSUED_ANY                 0x01
+UMASK_UOPS_ISSUED_FLAGS_MERGE         0x10
+UMASK_UOPS_ISSUED_SLOW_LEA            0x20
+UMASK_UOPS_ISSUED_SINGLE_MUL          0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_USED_CYCLES         0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES        0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES        0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_ANY            0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_FLAGS_MERGE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_FLAGS_MERGE    0x10
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_SLOW_LEA EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_SLOW_LEA       0x20
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_SINGLE_MUL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_SINGLE_MUL     0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_USED_CYCLES    0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_STALL_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_TOTAL_CYCLES   0x01
+
+EVENT_ARITH_FPU_DIV_ACTIVE       0x14  PMC
+UMASK_ARITH_FPU_DIV_ACTIVE       0x01
+
+EVENT_L2_RQSTS                     0x24   PMC
+UMASK_L2_RQSTS_DEMAND_DATA_RD_MISS 0x21
+UMASK_L2_RQSTS_DEMAND_DATA_RD_HIT  0x41
+UMASK_L2_RQSTS_RFO_MISS            0x22
+UMASK_L2_RQSTS_RFO_HIT             0x42
+UMASK_L2_RQSTS_CODE_RD_MISS        0x24
+UMASK_L2_RQSTS_CODE_RD_HIT         0x44
+UMASK_L2_RQSTS_L2_PF_HIT           0x50
+UMASK_L2_RQSTS_L2_PF_MISS          0x30
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD  0xE1
+UMASK_L2_RQSTS_ALL_DEMAND_MISS     0x27
+UMASK_L2_RQSTS_ALL_RFO             0xE2
+UMASK_L2_RQSTS_ALL_CODE_RD         0xE4
+UMASK_L2_RQSTS_ALL_DEMAND_REFERENCES 0xE7
+UMASK_L2_RQSTS_ALL_PF              0xF8
+UMASK_L2_RQSTS_MISS                0x3F
+UMASK_L2_RQSTS_REFERENCES          0xFF
+
+EVENT_L2_DEMAND_RQST_WB_HIT        0x27   PMC
+UMASK_L2_DEMAND_RQST_WB_HIT        0x50
+
+EVENT_LONGEST_LAT_CACHE            0x2E   PMC
+UMASK_LONGEST_LAT_CACHE_REFERENCE  0x4F
+UMASK_LONGEST_LAT_CACHE_MISS       0x41
+
+EVENT_CPU_CLOCK_UNHALTED           0x3C   PMC
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P  0x00
+UMASK_CPU_CLOCK_UNHALTED_REF_XCLK  0x01
+UMASK_CPU_CLOCK_UNHALTED_ONE_THREAD_ACTIVE  0x02
+
+EVENT_L1D_PEND_MISS                  0x48   PMC2
+UMASK_L1D_PEND_MISS_PENDING          0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_PENDING_CYCLES   EVENT_OPTION_THRESHOLD=0x01
+UMASK_L1D_PEND_MISS_PENDING_CYCLES   0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_OCCURRENCES EVENT_OPTION_EDGE=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_L1D_PEND_MISS_OCCURRENCES      0x01
+
+EVENT_DTLB_STORE_MISSES                    0x49   PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK         0x01
+UMASK_DTLB_STORE_MISSES_STLB_HIT              0x60
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED        0x0E
+UMASK_DTLB_STORE_MISSES_STLB_HIT_4K           0x20
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED_4K     0x02
+UMASK_DTLB_STORE_MISSES_WALK_DURATION         0x10
+
+EVENT_LOAD_HIT_PRE               0x4C    PMC
+UMASK_LOAD_HIT_PRE_HW_PF         0x02
+
+EVENT_EPT_WALK_CYCLES            0x4F PMC
+UMASK_EPT_WALK_CYCLES            0x10
+
+EVENT_L1D                        0x51   PMC
+UMASK_L1D_REPLACEMENT            0x01
+
+EVENT_TX_MEM                                        0x54 PMC
+UMASK_TX_MEM_ABORT_CONFLICT                         0x01
+UMASK_TX_MEM_ABORT_CAPACITY_WRITE                   0x02
+UMASK_TX_MEM_ABORT_HLE_STORE_TO_ELIDED_LOCK         0x04
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_NOT_EMPTY     0x08
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_MISMATCH      0x10
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_UNSUPPORTED_ALIGNMENT 0x20
+UMASK_TX_MEM_HLE_ELISION_BUFFER_FULL                0x40
+
+EVENT_MOVE_ELIMINATION                        0x58   PMC
+UMASK_MOVE_ELIMINATION_INT_NOT_ELIMINATED     0x04
+UMASK_MOVE_ELIMINATION_SIMD_NOT_ELIMINATED    0x08
+UMASK_MOVE_ELIMINATION_INT_ELIMINATED         0x01
+UMASK_MOVE_ELIMINATION_SIMD_ELIMINATED        0x02
+
+EVENT_CPL_CYCLES                   0x5C    PMC
+UMASK_CPL_CYCLES_RING0             0x01
+UMASK_CPL_CYCLES_RING123           0x02
+DEFAULT_OPTIONS_CPL_CYCLES_RING0_TRANS   EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_CPL_CYCLES_RING0_TRANS       0x01
+
+EVENT_TX_EXEC                       0x5D PMC
+UMASK_TX_EXEC_MISC1                 0x01
+UMASK_TX_EXEC_MISC2                 0x02
+UMASK_TX_EXEC_MISC3                 0x04
+UMASK_TX_EXEC_MISC4                 0x08
+UMASK_TX_EXEC_MISC5                 0x10
+
+EVENT_RS_EVENTS                 0x5E    PMC
+UMASK_RS_EVENTS_EMPTY_CYCLES    0x01
+
+EVENT_OFFCORE_REQUESTS_OUTSTANDING                  0x60   PMC
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD EVENT_OPTION_THRESHOLD=0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO       0x04
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD EVENT_OPTION_THRESHOLD=0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD      0x08
+
+EVENT_LOCK_CYCLES                             0x63   PMC
+UMASK_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION 0x01
+UMASK_LOCK_CYCLES_CACHE_LOCK_DURATION         0x02
+
+EVENT_IDQ                              0x79   PMC
+UMASK_IDQ_EMPTY                        0x02
+UMASK_IDQ_MITE_UOPS                    0x04
+DEFAULT_OPTIONS_IDQ_MITE_CYCLES        EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MITE_CYCLES                  0x04
+UMASK_IDQ_DSB_UOPS                     0x08
+DEFAULT_OPTIONS_IDQ_DSB_CYCLES         EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_DSB_CYCLES                   0x08
+UMASK_IDQ_MS_DSB_UOPS                  0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_CYCLES      EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_DSB_CYCLES                0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_OCCUR       EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_IDQ_MS_DSB_OCCUR                 0x10
+UMASK_IDQ_MS_MITE_UOPS                 0x20
+DEFAULT_OPTIONS_IDQ_MS_MITE_CYCLES     EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_MITE_CYCLES               0x20
+UMASK_IDQ_MS_UOPS                      0x30
+DEFAULT_OPTIONS_IDQ_MS_CYCLES          EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_CYCLES                    0x30
+DEFAULT_OPTIONS_IDQ_MS_SWITCHES        EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_IDQ_MS_SWITCHES                  0x30
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS      0x18
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x04
+UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS        0x18
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS     0x24
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x04
+UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS       0x24
+UMASK_IDQ_MITE_ALL_UOPS       0x3C
+
+EVENT_ICACHE                  0x80   PMC
+UMASK_ICACHE_HIT              0x01
+UMASK_ICACHE_MISSES           0x02
+UMASK_ICACHE_ACCESSES         0x03
+
+EVENT_ITLB_MISSES                   0x85      PMC
+UMASK_ITLB_MISSES_CAUSES_A_WALK     0x01
+UMASK_ITLB_MISSES_STLB_HIT          0x60
+UMASK_ITLB_MISSES_WALK_COMPLETED    0x0E
+UMASK_ITLB_MISSES_STLB_HIT_4K       0x20
+UMASK_ITLB_MISSES_WALK_COMPLETED_4K 0x02
+UMASK_ITLB_MISSES_WALK_DURATION     0x10
+
+
+EVENT_ILD_STALL                 0x87      PMC
+UMASK_ILD_STALL_LCP             0x01
+
+EVENT_BR_INST_EXEC                                      0x88   PMC
+UMASK_BR_INST_EXEC_COND_TAKEN                           0x81
+UMASK_BR_INST_EXEC_COND_NON_TAKEN                       0x41
+UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN                     0x82
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
+UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN                    0x88
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
+UMASK_BR_INST_EXEC_ALL_CONDITIONAL                      0xC1
+UMASK_BR_INST_EXEC_ALL_DIRECT_JMP                       0xC2
+UMASK_BR_INST_EXEC_ALL_DIRECT_NEAR_CALL                 0xD0
+UMASK_BR_INST_EXEC_ALL_INDIRECT_JUMP_NON_CALL_RET       0xC4
+UMASK_BR_INST_EXEC_ALL_INDIRECT_NEAR_RETURN             0xC8
+UMASK_BR_INST_EXEC_ALL_BRANCHES                         0xFF
+
+EVENT_BR_MISP_EXEC                                      0x89   PMC
+UMASK_BR_MISP_EXEC_COND_TAKEN                           0x81
+UMASK_BR_MISP_EXEC_COND_NON_TAKEN                       0x41
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
+UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN                    0x88
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
+UMASK_BR_MISP_EXEC_ALL_CONDITIONAL                      0xC1
+UMASK_BR_MISP_EXEC_ALL_INDIRECT_JUMP_NON_CALL_RET       0xC4
+UMASK_BR_MISP_EXEC_ALL_BRANCHES                         0xFF
+
+EVENT_IDQ_UOPS_NOT_DELIVERED                    0x9C   PMC
+UMASK_IDQ_UOPS_NOT_DELIVERED_CORE               0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE EVENT_OPTION_THRESHOLD=0x04
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x03
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x02
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_INVERT=1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK 0x01
+
+EVENT_UOP_DISPATCHES_CANCELLED_SIMD_PRF   0xA0 PMC
+UMASK_UOP_DISPATCHES_CANCELLED_SIMD_PRF   0x03
+
+EVENT_UOPS_EXECUTED_PORT                  0xA1   PMC
+UMASK_UOPS_EXECUTED_PORT_PORT_0           0x01
+UMASK_UOPS_EXECUTED_PORT_PORT_1           0x02
+UMASK_UOPS_EXECUTED_PORT_PORT_2           0x04
+UMASK_UOPS_EXECUTED_PORT_PORT_3           0x08
+UMASK_UOPS_EXECUTED_PORT_PORT_4           0x10
+UMASK_UOPS_EXECUTED_PORT_PORT_5           0x20
+UMASK_UOPS_EXECUTED_PORT_PORT_6           0x40
+UMASK_UOPS_EXECUTED_PORT_PORT_7           0x80
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_0_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_0_CORE      0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_1_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_1_CORE      0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_2_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_2_CORE      0x04
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_3_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_3_CORE      0x08
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_4_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_4_CORE      0x10
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_5_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_5_CORE      0x20
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_6_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_6_CORE      0x40
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_7_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_7_CORE      0x80
+
+EVENT_RESOURCE_STALLS                 0xA2   PMC
+UMASK_RESOURCE_STALLS_ANY             0x01
+UMASK_RESOURCE_STALLS_RS              0x04
+UMASK_RESOURCE_STALLS_SB              0x08
+UMASK_RESOURCE_STALLS_ROB             0x10
+
+EVENT_CYCLE_ACTIVITY_CYCLES_L1D_MISS    0xA3 PMC2
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L1D_MISS    EVENT_OPTION_THRESHOLD=0x08
+UMASK_CYCLE_ACTIVITY_CYCLES_L1D_MISS    0x08
+
+EVENT_CYCLE_ACTIVITY_CYCLES             0xA3 PMC
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L2_MISS    EVENT_OPTION_THRESHOLD=0x01
+UMASK_CYCLE_ACTIVITY_CYCLES_L2_MISS     0x01
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_MEM_ANY    EVENT_OPTION_THRESHOLD=0x02
+UMASK_CYCLE_ACTIVITY_CYCLES_MEM_ANY     0x02
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE EVENT_OPTION_THRESHOLD=0x04
+UMASK_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE  0x04
+
+EVENT_CYCLE_ACTIVITY_STALLS_L1D_MISS    0xA3 PMC2
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L1D_MISS    EVENT_OPTION_THRESHOLD=0x0C
+UMASK_CYCLE_ACTIVITY_STALLS_L1D_MISS    0x0C
+
+EVENT_CYCLE_ACTIVITY_STALLS             0xA3 PMC
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L2_MISS    EVENT_OPTION_THRESHOLD=0x05
+UMASK_CYCLE_ACTIVITY_STALLS_L2_MISS     0x05
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_MEM_ANY    EVENT_OPTION_THRESHOLD=0x06
+UMASK_CYCLE_ACTIVITY_STALLS_MEM_ANY     0x06
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_TOTAL    EVENT_OPTION_THRESHOLD=0x04
+UMASK_CYCLE_ACTIVITY_STALLS_TOTAL       0x04
+
+EVENT_LSD_UOPS                 0xA8   PMC
+UMASK_LSD_UOPS                 0x01
+DEFAULT_OPTIONS_LSD_CYCLES_ACTIVE EVENT_OPTION_THRESHOLD=0x01
+UMASK_LSD_CYCLES_ACTIVE        0x01
+DEFAULT_OPTIONS_LSD_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x04
+UMASK_LSD_CYCLES_4_UOPS        0x01
+
+EVENT_DSB2MITE_SWITCHES_PENALTY_CYCLES 0xAB PMC
+UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES 0x02
+
+EVENT_ITLB                       0xAE   PMC
+UMASK_ITLB_ITLB_FLUSH            0x01
+
+EVENT_OFFCORE_REQUESTS     0xB0   PMC
+UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_DEMAND_RFO       0x04
+UMASK_OFFCORE_REQUESTS_ALL_DATA_RD      0x08
+
+EVENT_UOPS_EXECUTED                       0xB1   PMC
+UMASK_UOPS_EXECUTED_THREAD                0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_USED_CYCLES           0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_STALL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_TOTAL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC 0x01
+UMASK_UOPS_EXECUTED_CORE                       0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_USED_CYCLES           0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES          0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_TOTAL_CYCLES          0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC 0x02
+
+EVENT_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0xB2 PMC
+UMASK_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0x01
+
+EVENT_PAGE_WALKER_LOADS             0xBC  PMC
+UMASK_PAGE_WALKER_LOADS_DTLB_L1     0x11
+UMASK_PAGE_WALKER_LOADS_ITLB_L1     0x21
+UMASK_PAGE_WALKER_LOADS_DTLB_L2     0x12
+UMASK_PAGE_WALKER_LOADS_ITLB_L2     0x22
+UMASK_PAGE_WALKER_LOADS_DTLB_L3     0x14
+UMASK_PAGE_WALKER_LOADS_ITLB_L3     0x24
+UMASK_PAGE_WALKER_LOADS_DTLB_MEMORY 0x18
+
+EVENT_INST_RETIRED_ANY_P            0xC0  PMC
+UMASK_INST_RETIRED_ANY_P            0x00
+UMASK_INST_RETIRED_X87              0x02
+
+EVENT_INST_RETIRED_PREC             0xC0  PMC1
+UMASK_INST_RETIRED_PREC_DIST        0x01
+
+EVENT_OTHER_ASSISTS                  0xC1  PMC
+UMASK_OTHER_ASSISTS_AVX_TO_SSE       0x08
+UMASK_OTHER_ASSISTS_SSE_TO_AVX       0x10
+UMASK_OTHER_ASSISTS_ANY_WB_ASSIST    0x40
+
+EVENT_UOPS_RETIRED                       0xC2  PMC
+UMASK_UOPS_RETIRED_ALL                   0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL              0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS          0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES           0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL              0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_USED_CYCLES      0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_STALL_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES     0x01
+
+EVENT_MACHINE_CLEARS                    0xC3  PMC
+DEFAULT_OPTIONS_MACHINE_CLEARS_COUNT    EVENT_OPTION_THRESHOLD=0x01,EVENT_OPTION_EDGE=1
+UMASK_MACHINE_CLEARS_COUNT              0x01
+UMASK_MACHINE_CLEARS_CYCLES             0x01
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING    0x02
+UMASK_MACHINE_CLEARS_SMC                0x04
+UMASK_MACHINE_CLEARS_MASKMOV            0x20
+
+EVENT_BR_INST_RETIRED               0xC4  PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x00
+UMASK_BR_INST_RETIRED_CONDITIONAL   0x01
+UMASK_BR_INST_RETIRED_NEAR_CALL     0x02
+UMASK_BR_INST_RETIRED_NEAR_RETURN   0x08
+UMASK_BR_INST_RETIRED_NOT_TAKEN     0x10
+UMASK_BR_INST_RETIRED_NEAR_TAKEN    0x20
+UMASK_BR_INST_RETIRED_FAR_BRANCH    0x40
+
+EVENT_BR_MISP_RETIRED                0xC5  PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES   0x00
+UMASK_BR_MISP_RETIRED_CONDITIONAL    0x01
+UMASK_BR_MISP_RETIRED_NEAR_TAKEN     0x20
+
+EVENT_FP_ARITH_INST_RETIRED               0xC7 PMC
+UMASK_FP_ARITH_INST_RETIRED_SCALAR_DOUBLE      0x01
+UMASK_FP_ARITH_INST_RETIRED_SCALAR_SINGLE      0x02
+UMASK_FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE 0x04
+UMASK_FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE 0x08
+UMASK_FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE 0x10
+UMASK_FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE 0x20
+UMASK_FP_ARITH_INST_RETIRED_SCALAR             0x03
+UMASK_FP_ARITH_INST_RETIRED_PACKED             0x3C
+UMASK_FP_ARITH_INST_RETIRED_DOUBLE             0x15
+UMASK_FP_ARITH_INST_RETIRED_SINGLE             0x2A
+
+EVENT_HLE_RETIRED                    0xC8 PMC
+UMASK_HLE_RETIRED_START              0x01
+UMASK_HLE_RETIRED_COMMIT             0x02
+UMASK_HLE_RETIRED_ABORTED            0x04
+UMASK_HLE_RETIRED_ABORTED_MISC1      0x08
+UMASK_HLE_RETIRED_ABORTED_MISC2      0x10
+UMASK_HLE_RETIRED_ABORTED_MISC3      0x20
+UMASK_HLE_RETIRED_ABORTED_MISC4      0x40
+UMASK_HLE_RETIRED_ABORTED_MISC5      0x80
+
+EVENT_RTM_RETIRED                    0xC9 PMC
+UMASK_RTM_RETIRED_START              0x01
+UMASK_RTM_RETIRED_COMMIT             0x02
+UMASK_RTM_RETIRED_ABORTED            0x04
+UMASK_RTM_RETIRED_ABORTED_MISC1      0x08
+UMASK_RTM_RETIRED_ABORTED_MISC2      0x10
+UMASK_RTM_RETIRED_ABORTED_MISC3      0x20
+UMASK_RTM_RETIRED_ABORTED_MISC4      0x40
+UMASK_RTM_RETIRED_ABORTED_MISC5      0x80
+
+EVENT_FP_ASSIST                      0xCA  PMC
+UMASK_FP_ASSIST_X87_OUTPUT           0x02
+UMASK_FP_ASSIST_X87_INPUT            0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT          0x08
+UMASK_FP_ASSIST_SIMD_INPUT           0x10
+UMASK_FP_ASSIST_ANY                  0x1E
+
+EVENT_ROB_MISC_EVENT_LBR_INSERTS     0xCC  PMC
+UMASK_ROB_MISC_EVENT_LBR_INSERTS     0x20
+
+EVENT_MEM_UOPS_RETIRED                  0xD0    PMC
+UMASK_MEM_UOPS_RETIRED_LOADS_ALL        0x81
+UMASK_MEM_UOPS_RETIRED_STORES_ALL       0x82
+UMASK_MEM_UOPS_RETIRED_LOADS_LOCK       0x21
+UMASK_MEM_UOPS_RETIRED_LOADS_STLB_MISS  0x11
+UMASK_MEM_UOPS_RETIRED_STORES_STLB_MISS 0x12
+UMASK_MEM_UOPS_RETIRED_LOADS_SPLIT      0x41
+UMASK_MEM_UOPS_RETIRED_STORES_SPLIT     0x42
+
+EVENT_MEM_LOAD_UOPS_RETIRED              0xD1   PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT       0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS      0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL       0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT       0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS      0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL       0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT       0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS      0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL       0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB      0x40
+
+EVENT_MEM_LOAD_UOPS_L3_HIT_RETIRED           0xD2   PMC
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_MISS 0x01
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HIT  0x02
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM 0x04
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_NONE 0x08
+
+EVENT_MEM_LOAD_UOPS_L3_MISS_RETIRED            0xD3   PMC
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_LOCAL_DRAM 0x01
+
+EVENT_BACLEARS                0xE6 PMC
+UMASK_BACLEARS_ANY            0x1F
+
+EVENT_L2_TRANS                0xF0  PMC
+UMASK_L2_TRANS_DEMAND_DATA_RD 0x01
+UMASK_L2_TRANS_RFO            0x02
+UMASK_L2_TRANS_CODE_RD        0x04
+UMASK_L2_TRANS_ALL_PF         0x08
+UMASK_L2_TRANS_L1D_WB         0x10
+UMASK_L2_TRANS_L2_FILL        0x20
+UMASK_L2_TRANS_L2_WB          0x40
+UMASK_L2_TRANS_ALL_REQUESTS   0x80
+
+EVENT_L2_LINES_IN             0xF1   PMC
+UMASK_L2_LINES_IN_I           0x01
+UMASK_L2_LINES_IN_S           0x02
+UMASK_L2_LINES_IN_E           0x04
+UMASK_L2_LINES_IN_ALL         0x07
+
+EVENT_L2_LINES_OUT                  0xF2   PMC
+UMASK_L2_LINES_OUT_DEMAND_CLEAN     0x05
+UMASK_L2_LINES_OUT_DEMAND_DIRTY     0x06
+
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1                            0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_EVENT_MSG                     0x42 UBOX
+UMASK_EVENT_MSG_DOORBELL_RCVD       0x08
+
+EVENT_PHOLD_CYCLES                  0x45 UBOX
+UMASK_PHOLD_CYCLES_ASSERT_TO_ACK    0x01
+
+EVENT_RACU_REQUESTS                 0x46 UBOX
+UMASK_RACU_REQUESTS                 0x00
+
+EVENT_UNCORE_CLOCK                  0x00 UBOXFIX
+UMASK_UNCORE_CLOCK                  0x00
+
+EVENT_CBOX_CLOCKTICKS               0x00 CBOX
+UMASK_CBOX_CLOCKTICKS               0x00
+
+EVENT_TXR_INSERTS                   0x02 CBOX
+UMASK_TXR_INSERTS_AD_CACHE          0x01
+UMASK_TXR_INSERTS_AK_CACHE          0x02
+UMASK_TXR_INSERTS_BL_CACHE          0x04
+UMASK_TXR_INSERTS_IV_CACHE          0x08
+UMASK_TXR_INSERTS_AD_CORE           0x10
+UMASK_TXR_INSERTS_AK_CORE           0x20
+UMASK_TXR_INSERTS_BL_CORE           0x40
+
+EVENT_TXR_ADS_USED                  0x04 CBOX
+UMASK_TXR_ADS_USED_AD               0x01
+UMASK_TXR_ADS_USED_AK               0x02
+UMASK_TXR_ADS_USED_BL               0x04
+
+EVENT_RING_BOUNCES                  0x05 CBOX
+UMASK_RING_BOUNCES_AD               0x01
+UMASK_RING_BOUNCES_AK               0x02
+UMASK_RING_BOUNCES_BL               0x04
+UMASK_RING_BOUNCES_IV               0x10
+
+EVENT_RING_SRC_THRTL                0x07 CBOX
+UMASK_RING_SRC_THRTL                0x00
+
+EVENT_FAST_ASSERTED                 0x09 CBOX0C0|CBOX0C1|CBOX1C0|CBOX1C1|CBOX2C0|CBOX2C1|CBOX3C0|CBOX3C1|CBOX4C0|CBOX4C1|CBOX5C0|CBOX5C1|CBOX6C0|CBOX6C1|CBOX7C0|CBOX7C1|CBOX8C0|CBOX8C1|CBOX9C0|CBOX9C1|CBOX10C0|CBOX10C1|CBOX11C0|CBOX11C1|CBOX12C0|CBOX12C1|CBOX13C0|CBOX13C1|CBOX14C0|CBOX14C1|CBOX15C0|CBOX15C1
+UMASK_FAST_ASSERTED                 0x00
+
+EVENT_BOUNCE_CONTROL                0x0A CBOX
+UMASK_BOUNCE_CONTROL                0x00
+
+EVENT_RING_AD_USED                  0x1B CBOX
+UMASK_RING_AD_USED_UP_EVEN          0x01
+UMASK_RING_AD_USED_UP_ODD           0x02
+UMASK_RING_AD_USED_UP               0x03
+UMASK_RING_AD_USED_DOWN_EVEN        0x04
+UMASK_RING_AD_USED_DOWN_ODD         0x08
+UMASK_RING_AD_USED_DOWN             0x0C
+UMASK_RING_AD_USED_ANY              0x0F
+
+EVENT_RING_AK_USED                  0x1C CBOX
+UMASK_RING_AK_USED_UP_EVEN          0x01
+UMASK_RING_AK_USED_UP_ODD           0x02
+UMASK_RING_AK_USED_UP               0x03
+UMASK_RING_AK_USED_DOWN_EVEN        0x04
+UMASK_RING_AK_USED_DOWN_ODD         0x08
+UMASK_RING_AK_USED_DOWN             0x0C
+UMASK_RING_AK_USED_ANY              0x0F
+
+EVENT_RING_BL_USED                  0x1D CBOX
+UMASK_RING_BL_USED_UP_EVEN          0x01
+UMASK_RING_BL_USED_UP_ODD           0x02
+UMASK_RING_BL_USED_UP               0x03
+UMASK_RING_BL_USED_DOWN_EVEN        0x04
+UMASK_RING_BL_USED_DOWN_ODD         0x08
+UMASK_RING_BL_USED_DOWN             0x0C
+UMASK_RING_BL_USED_ANY              0x0F
+
+EVENT_RING_IV_USED                  0x1E CBOX
+UMASK_RING_IV_USED_UP               0x03
+UMASK_RING_IV_USED_DN               0x0C
+UMASK_RING_IV_USED_ANY              0x0F
+
+EVENT_COUNTER0_OCCUPANCY            0x1F CBOX
+UMASK_COUNTER0_OCCUPANCY            0x00
+
+EVENT_RXR_OCCUPANCY                 0x11 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX15C0
+UMASK_RXR_OCCUPANCY_IRQ             0x01
+UMASK_RXR_OCCUPANCY_IRQ_REJ         0x02
+UMASK_RXR_OCCUPANCY_IPQ             0x04
+UMASK_RXR_OCCUPANCY_PRQ_REJ         0x20
+
+EVENT_RXR_EXT_STARVED               0x12 CBOX
+UMASK_RXR_EXT_STARVED_IRQ           0x01
+UMASK_RXR_EXT_STARVED_IPQ           0x02
+UMASK_RXR_EXT_STARVED_PRQ           0x04
+UMASK_RXR_EXT_STARVED_ISMQ_BIDS     0x08
+
+EVENT_RXR_INSERTS                   0x13 CBOX
+UMASK_RXR_INSERTS_IRQ               0x01
+UMASK_RXR_INSERTS_IRQ_REJ           0x02
+UMASK_RXR_INSERTS_IPQ               0x04
+UMASK_RXR_INSERTS_PRQ               0x10
+UMASK_RXR_INSERTS_PRQ_REJ           0x20
+
+EVENT_RXR_IPQ_RETRY                 0x31 CBOX
+UMASK_RXR_IPQ_RETRY_ANY             0x01
+UMASK_RXR_IPQ_RETRY_FULL            0x02
+UMASK_RXR_IPQ_RETRY_ADDR_CONFLICT   0x04
+UMASK_RXR_IPQ_RETRY_QPI_CREDITS     0x10
+
+EVENT_RXR_IPQ_RETRY2                0x28 CBOX
+UMASK_RXR_IPQ_RETRY2_AD_SBO         0x01
+OPTIONS_RXR_IPQ_RETRY2_TARGET       EVENT_OPTION_NID_MASK
+UMASK_RXR_IPQ_RETRY2_TARGET         0x40
+
+EVENT_RXR_IRQ_RETRY                 0x32 CBOX
+UMASK_RXR_IRQ_RETRY_ANY             0x01
+UMASK_RXR_IRQ_RETRY_FULL            0x02
+UMASK_RXR_IRQ_RETRY_ADDR_CONFLICT   0x04
+UMASK_RXR_IRQ_RETRY_RTID            0x08
+UMASK_RXR_IRQ_RETRY_QPI_CREDITS     0x10
+UMASK_RXR_IRQ_RETRY_IIO_CREDITS     0x20
+OPTIONS_RXR_IRQ_RETRY_NID           EVENT_OPTION_NID_MASK
+UMASK_RXR_IRQ_RETRY_NID             0x40
+
+EVENT_RXR_IRQ_RETRY2                0x29 CBOX
+UMASK_RXR_IRQ_RETRY2_AD_SBO         0x01
+UMASK_RXR_IRQ_RETRY2_BL_SBO         0x02
+OPTIONS_RXR_IRQ_RETRY2_TARGET       EVENT_OPTION_NID_MASK
+UMASK_RXR_IRQ_RETRY2_TARGET         0x40
+
+EVENT_RXR_ISMQ_RETRY                0x33 CBOX
+UMASK_RXR_ISMQ_RETRY_ANY            0x01
+UMASK_RXR_ISMQ_RETRY_FULL           0x02
+UMASK_RXR_ISMQ_RETRY_RTID           0x08
+UMASK_RXR_ISMQ_RETRY_QPI_CREDITS    0x10
+UMASK_RXR_ISMQ_RETRY_IIO_CREDITS    0x20
+OPTIONS_RXR_ISMQ_RETRY_NID          EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY_NID            0x40
+OPTIONS_RXR_ISMQ_RETRY_WB_CREDITS   EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY_WB_CREDITS     0x80
+
+EVENT_RXR_ISMQ_RETRY2                0x2A CBOX
+UMASK_RXR_ISMQ_RETRY2_AD_SBO         0x01
+UMASK_RXR_ISMQ_RETRY2_BL_SBO         0x02
+OPTIONS_RXR_ISMQ_RETRY2_TARGET       EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY2_TARGET         0x40
+
+EVENT_LLC_LOOKUP                    0x34 CBOX
+OPTIONS_LLC_LOOKUP_DATA_READ        EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_DATA_READ          0x03
+OPTIONS_LLC_LOOKUP_WRITE            EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_WRITE              0x05
+OPTIONS_LLC_LOOKUP_REMOTE_SNOOP     EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_REMOTE_SNOOP       0x09
+OPTIONS_LLC_LOOKUP_ANY              EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_ANY                0x11
+OPTIONS_LLC_LOOKUP_READ             EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_READ               0x21
+OPTIONS_LLC_LOOKUP_NID              EVENT_OPTION_STATE_MASK|EVENT_OPTION_NID_MASK
+UMASK_LLC_LOOKUP_NID                0x41
+
+EVENT_LLC_VICTIMS                   0x37 CBOX
+UMASK_LLC_VICTIMS_M                 0x01
+UMASK_LLC_VICTIMS_E                 0x02
+UMASK_LLC_VICTIMS_S                 0x04
+UMASK_LLC_VICTIMS_F                 0x08
+UMASK_LLC_VICTIMS_MISS              0x10
+OPTIONS_LLC_VICTIMS_NID             EVENT_OPTION_STATE_MASK|EVENT_OPTION_NID_MASK
+UMASK_LLC_VICTIMS_NID               0x40
+
+EVENT_TOR_INSERTS                   0x35 CBOX
+OPTIONS_TOR_INSERTS_OPCODE          EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_OPCODE            0x01
+OPTIONS_TOR_INSERTS_MISS_OPCODE     EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_OPCODE       0x03
+UMASK_TOR_INSERTS_EVICTION          0x04
+UMASK_TOR_INSERTS_ALL               0x08
+UMASK_TOR_INSERTS_WB                0x10
+OPTIONS_TOR_INSERTS_LOCAL_OPCODE    EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_LOCAL_OPCODE      0x21
+OPTIONS_TOR_INSERTS_MISS_LOCAL_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_LOCAL_OPCODE 0x23
+UMASK_TOR_INSERTS_LOCAL             0x28
+UMASK_TOR_INSERTS_MISS_LOCAL        0x2A
+OPTIONS_TOR_INSERTS_NID_OPCODE      EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_NID_OPCODE        0x41
+OPTIONS_TOR_INSERTS_NID_MISS_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_NID_MISS_OPCODE   0x43
+OPTIONS_TOR_INSERTS_NID_EVICTION    EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_EVICTION      0x44
+OPTIONS_TOR_INSERTS_NID_ALL         EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_ALL           0x48
+OPTIONS_TOR_INSERTS_NID_MISS_ALL    EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_MISS_ALL      0x4A
+OPTIONS_TOR_INSERTS_NID_WB          EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_WB            0x50
+OPTIONS_TOR_INSERTS_REMOTE_OPCODE   EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_REMOTE_OPCODE     0x81
+OPTIONS_TOR_INSERTS_MISS_REMOTE_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_REMOTE_OPCODE 0x83
+UMASK_TOR_INSERTS_REMOTE            0x88
+UMASK_TOR_INSERTS_MISS_REMOTE       0x8A
+
+EVENT_TOR_OCCUPANCY                 0x36 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX15C0
+OPTIONS_TOR_OCCUPANCY_OPCODE        EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_OPCODE          0x01
+OPTIONS_TOR_OCCUPANCY_MISS_OPCODE   EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_MISS_OPCODE     0x03
+UMASK_TOR_OCCUPANCY_EVICTION        0x04
+UMASK_TOR_OCCUPANCY_ALL             0x08
+UMASK_TOR_OCCUPANCY_MISS_ALL        0x0A
+UMASK_TOR_OCCUPANCY_WB              0x10
+UMASK_TOR_OCCUPANCY_LOCAL_OPCODE    0x21
+UMASK_TOR_OCCUPANCY_MISS_LOCAL_OPCODE 0x23
+UMASK_TOR_OCCUPANCY_LOCAL           0x28
+UMASK_TOR_OCCUPANCY_MISS_LOCAL      0x2A
+OPTIONS_TOR_OCCUPANCY_NID_OPCODE    EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_NID_OPCODE      0x41
+OPTIONS_TOR_OCCUPANCY_NID_MISS_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_NID_MISS_OPCODE 0x43
+OPTIONS_TOR_OCCUPANCY_NID_EVICTION  EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_EVICTION    0x44
+OPTIONS_TOR_OCCUPANCY_NID_ALL       EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_ALL         0x48
+OPTIONS_TOR_OCCUPANCY_NID_MISS_ALL  EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_MISS_ALL    0x4A
+OPTIONS_TOR_OCCUPANCY_NID_WB        EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_WB          0x50
+OPTIONS_TOR_OCCUPANCY_REMOTE_OPCODE   EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_REMOTE_OPCODE   0x81
+OPTIONS_TOR_OCCUPANCY_MISS_REMOTE_OPCODE   EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_MISS_REMOTE_OPCODE 0x83
+UMASK_TOR_OCCUPANCY_REMOTE          0x88
+UMASK_TOR_OCCUPANCY_MISS_REMOTE     0x8A
+
+EVENT_MISC                          0x39 CBOX
+UMASK_MISC_RSPI_WAS_FSE             0x01
+UMASK_MISC_WC_ALIASING              0x02
+UMASK_MISC_STARTED                  0x04
+UMASK_MISC_RFO_HIT_S                0x08
+UMASK_MISC_CVZERO_PREFETCH_VICTIM   0x10
+UMASK_MISC_CVZERO_PREFETCH_MISS     0x20
+
+EVENT_WBOX_CLOCKTICKS               0x00 WBOX
+UMASK_WBOX_CLOCKTICKS               0x00
+
+EVENT_CORE0_TRANSITION_CYCLES       0x60 WBOX
+UMASK_CORE0_TRANSITION_CYCLES       0x00
+
+EVENT_CORE1_TRANSITION_CYCLES       0x61 WBOX
+UMASK_CORE1_TRANSITION_CYCLES       0x00
+
+EVENT_CORE2_TRANSITION_CYCLES       0x62 WBOX
+UMASK_CORE2_TRANSITION_CYCLES       0x00
+
+EVENT_CORE3_TRANSITION_CYCLES       0x63 WBOX
+UMASK_CORE3_TRANSITION_CYCLES       0x00
+
+EVENT_CORE4_TRANSITION_CYCLES       0x64 WBOX
+UMASK_CORE4_TRANSITION_CYCLES       0x00
+
+EVENT_CORE5_TRANSITION_CYCLES       0x65 WBOX
+UMASK_CORE5_TRANSITION_CYCLES       0x00
+
+EVENT_CORE6_TRANSITION_CYCLES       0x66 WBOX
+UMASK_CORE6_TRANSITION_CYCLES       0x00
+
+EVENT_CORE7_TRANSITION_CYCLES       0x67 WBOX
+UMASK_CORE7_TRANSITION_CYCLES       0x00
+
+EVENT_CORE8_TRANSITION_CYCLES       0x68 WBOX
+UMASK_CORE8_TRANSITION_CYCLES       0x00
+
+EVENT_CORE9_TRANSITION_CYCLES       0x69 WBOX
+UMASK_CORE9_TRANSITION_CYCLES       0x00
+
+EVENT_CORE10_TRANSITION_CYCLES       0x6A WBOX
+UMASK_CORE10_TRANSITION_CYCLES       0x00
+
+EVENT_CORE11_TRANSITION_CYCLES       0x6B WBOX
+UMASK_CORE11_TRANSITION_CYCLES       0x00
+
+EVENT_CORE12_TRANSITION_CYCLES       0x6C WBOX
+UMASK_CORE12_TRANSITION_CYCLES       0x00
+
+EVENT_CORE13_TRANSITION_CYCLES       0x6D WBOX
+UMASK_CORE13_TRANSITION_CYCLES       0x00
+
+EVENT_CORE14_TRANSITION_CYCLES       0x6E WBOX
+UMASK_CORE14_TRANSITION_CYCLES       0x00
+
+EVENT_CORE15_TRANSITION_CYCLES       0x6F WBOX
+UMASK_CORE15_TRANSITION_CYCLES       0x00
+
+EVENT_CORE16_TRANSITION_CYCLES       0x70 WBOX
+UMASK_CORE16_TRANSITION_CYCLES       0x00
+
+EVENT_CORE17_TRANSITION_CYCLES       0x71 WBOX
+UMASK_CORE17_TRANSITION_CYCLES       0x00
+
+EVENT_FIVR_PS_PS0_CYCLES             0x75 WBOX
+UMASK_FIVR_PS_PS0_CYCLES             0x00
+
+EVENT_FIVR_PS_PS1_CYCLES             0x76 WBOX
+UMASK_FIVR_PS_PS1_CYCLES             0x00
+
+EVENT_FIVR_PS_PS2_CYCLES             0x77 WBOX
+UMASK_FIVR_PS_PS2_CYCLES             0x00
+
+EVENT_FIVR_PS_PS3_CYCLES             0x78 WBOX
+UMASK_FIVR_PS_PS3_CYCLES             0x00
+
+EVENT_DEMOTIONS_CORE0                0x30 WBOX
+UMASK_DEMOTIONS_CORE0                0x00
+
+EVENT_DEMOTIONS_CORE1                0x31 WBOX
+UMASK_DEMOTIONS_CORE1                0x00
+
+EVENT_DEMOTIONS_CORE2                0x32 WBOX
+UMASK_DEMOTIONS_CORE2                0x00
+
+EVENT_DEMOTIONS_CORE3                0x33 WBOX
+UMASK_DEMOTIONS_CORE3                0x00
+
+EVENT_DEMOTIONS_CORE4                0x34 WBOX
+UMASK_DEMOTIONS_CORE4                0x00
+
+EVENT_DEMOTIONS_CORE5                0x35 WBOX
+UMASK_DEMOTIONS_CORE5                0x00
+
+EVENT_DEMOTIONS_CORE6                0x36 WBOX
+UMASK_DEMOTIONS_CORE6                0x00
+
+EVENT_DEMOTIONS_CORE7                0x37 WBOX
+UMASK_DEMOTIONS_CORE7                0x00
+
+EVENT_DEMOTIONS_CORE8                0x38 WBOX
+UMASK_DEMOTIONS_CORE8                0x00
+
+EVENT_DEMOTIONS_CORE9                0x39 WBOX
+UMASK_DEMOTIONS_CORE9                0x00
+
+EVENT_DEMOTIONS_CORE10                0x3A WBOX
+UMASK_DEMOTIONS_CORE10                0x00
+
+EVENT_DEMOTIONS_CORE11                0x3B WBOX
+UMASK_DEMOTIONS_CORE11                0x00
+
+EVENT_DEMOTIONS_CORE12                0x3C WBOX
+UMASK_DEMOTIONS_CORE12                0x00
+
+EVENT_DEMOTIONS_CORE13                0x3D WBOX
+UMASK_DEMOTIONS_CORE13                0x00
+
+EVENT_DEMOTIONS_CORE14                0x3E WBOX
+UMASK_DEMOTIONS_CORE14                0x00
+
+EVENT_DEMOTIONS_CORE15                0x3F WBOX
+UMASK_DEMOTIONS_CORE15                0x00
+
+EVENT_DEMOTIONS_CORE16                0x40 WBOX
+UMASK_DEMOTIONS_CORE16                0x00
+
+EVENT_DEMOTIONS_CORE17                0x41 WBOX
+UMASK_DEMOTIONS_CORE17                0x00
+
+EVENT_FREQ_BAND0_CYCLES                 0x0B WBOX
+OPTIONS_FREQ_BAND0_CYCLES               EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND0_CYCLES                 0x00
+
+EVENT_FREQ_BAND1_CYCLES                 0x0C WBOX
+OPTIONS_FREQ_BAND1_CYCLES               EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND1_CYCLES                 0x00
+
+EVENT_FREQ_BAND2_CYCLES                 0x0D WBOX
+OPTIONS_FREQ_BAND2_CYCLES               EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND2_CYCLES                 0x00
+
+EVENT_FREQ_BAND3_CYCLES                 0x0E WBOX
+OPTIONS_FREQ_BAND3_CYCLES               EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND3_CYCLES                 0x00
+
+EVENT_FREQ_MAX_LIMIT_THERMAL_CYCLES     0x04 WBOX
+UMASK_FREQ_MAX_LIMIT_THERMAL_CYCLES     0x00
+
+EVENT_FREQ_MAX_OS_CYCLES                0x06 WBOX
+UMASK_FREQ_MAX_OS_CYCLES                0x00
+
+EVENT_FREQ_MAX_POWER_CYCLES             0x05 WBOX
+UMASK_FREQ_MAX_POWER_CYCLES             0x00
+
+EVENT_FREQ_MIN_IO_P_CYCLES              0x73 WBOX
+UMASK_FREQ_MIN_IO_P_CYCLES              0x00
+
+EVENT_FREQ_TRANS_CYCLES                 0x74 WBOX
+UMASK_FREQ_TRANS_CYCLES                 0x00
+
+EVENT_MEMORY_PHASE_SHEDDING_CYCLES      0x2F WBOX
+UMASK_MEMORY_PHASE_SHEDDING_CYCLES      0x00
+
+EVENT_POWER_STATE_OCCUPANCY             0x80 WBOX
+UMASK_POWER_STATE_OCCUPANCY_CORES_C0    0x40
+UMASK_POWER_STATE_OCCUPANCY_CORES_C3    0x80
+UMASK_POWER_STATE_OCCUPANCY_CORES_C6    0xC0
+
+EVENT_PROCHOT_EXTERNAL_CYCLES           0x0A WBOX
+UMASK_PROCHOT_EXTERNAL_CYCLES           0x00
+
+EVENT_PROCHOT_INTERNAL_CYCLES           0x09 WBOX
+UMASK_PROCHOT_INTERNAL_CYCLES           0x00
+
+EVENT_TOTAL_TRANSITION_CYCLES           0x72 WBOX
+UMASK_TOTAL_TRANSITION_CYCLES           0x00
+
+EVENT_VR_HOT_CYCLES                     0x42 WBOX
+UMASK_VR_HOT_CYCLES                     0x00
+
+EVENT_UFS_BANDWIDTH_MAX_RANGE           0x7E WBOX
+UMASK_UFS_BANDWIDTH_MAX_RANGE           0x00
+
+EVENT_UFS_TRANSITIONS_DOWN              0x7C WBOX
+UMASK_UFS_TRANSITIONS_DOWN              0x00
+
+EVENT_UFS_TRANSITIONS_IO_P_LIMIT        0x7D WBOX
+UMASK_UFS_TRANSITIONS_IO_P_LIMIT        0x00
+
+EVENT_UFS_TRANSITIONS_NO_CHANGE         0x79 WBOX
+UMASK_UFS_TRANSITIONS_NO_CHANGE         0x00
+
+EVENT_UFS_TRANSITIONS_UP_RING           0x7A WBOX
+UMASK_UFS_TRANSITIONS_UP_RING           0x00
+
+EVENT_UFS_TRANSITIONS_UP_STALL          0x7B WBOX
+UMASK_UFS_TRANSITIONS_UP_STALL          0x00
+
+EVENT_CORES_IN_C3                       0x00 WBOX0FIX
+UMASK_CORES_IN_C3                       0x00
+
+EVENT_CORES_IN_C6                       0x00 WBOX1FIX
+UMASK_CORES_IN_C6                       0x00
+
+EVENT_BBOX_CLOCKTICKS                   0x00 BBOX
+UMASK_BBOX_CLOCKTICKS                   0x00
+
+EVENT_ADDR_OPC_MATCH                    0x20 BBOX
+OPTIONS_ADDR_OPC_MATCH_ADDR             EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_ADDR_OPC_MATCH_ADDR               0x01
+OPTIONS_ADDR_OPC_MATCH_OPC              EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_OPC                0x02
+OPTIONS_ADDR_OPC_MATCH_FILT             EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_ADDR_OPC_MATCH_FILT               0x03
+OPTIONS_ADDR_OPC_MATCH_AD               EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_AD                 0x04
+OPTIONS_ADDR_OPC_MATCH_BL               EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_BL                 0x08
+OPTIONS_ADDR_OPC_MATCH_AK               EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_AK                 0x10
+
+EVENT_BT_CYCLES_NE                      0x42 BBOX
+UMASK_BT_CYCLES_NE                      0x00
+
+EVENT_BT_OCCUPANCY                      0x43 BBOX
+UMASK_BT_OCCUPANCY                      0x00
+
+EVENT_BYPASS_IMC                        0x14 BBOX
+UMASK_BYPASS_IMC_TAKEN                  0x01
+UMASK_BYPASS_IMC_NOT_TAKEN              0x02
+
+EVENT_CONFLICT_CYCLES                   0x0B BBOX0C1|BBOX1C1
+UMASK_CONFLICT_CYCLES                   0x00
+
+EVENT_DIRECT2CORE_COUNT                 0x11 BBOX
+UMASK_DIRECT2CORE_COUNT                 0x00
+
+EVENT_DIRECT2CORE_CYCLES_DISABLED       0x12 BBOX
+UMASK_DIRECT2CORE_CYCLES_DISABLED       0x00
+
+EVENT_DIRECT2CORE_TXN_OVERRIDE          0x13 BBOX
+UMASK_DIRECT2CORE_TXN_OVERRIDE          0x00
+
+EVENT_DIRECTORY_LAT_OPT                 0x41 BBOX
+UMASK_DIRECTORY_LAT_OPT                 0x00
+
+EVENT_DIRECTORY_LOOKUP                  0x0C BBOX
+UMASK_DIRECTORY_LOOKUP_SNP              0x01
+UMASK_DIRECTORY_LOOKUP_NO_SNP           0x02
+
+EVENT_DIRECTORY_UPDATE                  0x0D BBOX
+UMASK_DIRECTORY_UPDATE_SET              0x01
+UMASK_DIRECTORY_UPDATE_CLEAR            0x02
+UMASK_DIRECTORY_UPDATE_ANY              0x03
+
+EVENT_HITME_LOOKUP                      0x70 BBOX
+UMASK_HITME_LOOKUP_READ_OR_INVITOE         0x01
+UMASK_HITME_LOOKUP_WBMTOI                  0x02
+UMASK_HITME_LOOKUP_ACKCNFLTWBI             0x04
+UMASK_HITME_LOOKUP_WBMTOE_OR_S             0x08
+UMASK_HITME_LOOKUP_HOM                     0x0F
+UMASK_HITME_LOOKUP_RSPFWDI_REMOTE          0x10
+UMASK_HITME_LOOKUP_RSPFWDI_LOCAL           0x20
+UMASK_HITME_LOOKUP_INVALS                  0x26
+UMASK_HITME_LOOKUP_RSPFWDS                 0x40
+UMASK_HITME_LOOKUP_ALLOCS                  0x70
+UMASK_HITME_LOOKUP_RSP                     0x80
+UMASK_HITME_LOOKUP_ALL                     0xFF
+
+EVENT_HITME_HIT                         0x71 BBOX
+UMASK_HITME_HIT_READ_OR_INVITOE         0x01
+UMASK_HITME_HIT_WBMTOI                  0x02
+UMASK_HITME_HIT_ACKCNFLTWBI             0x04
+UMASK_HITME_HIT_WBMTOE_OR_S             0x08
+UMASK_HITME_HIT_HOM                     0x0F
+UMASK_HITME_HIT_RSPFWDI_REMOTE          0x10
+UMASK_HITME_HIT_RSPFWDI_LOCAL           0x20
+UMASK_HITME_HIT_INVALS                  0x26
+UMASK_HITME_HIT_RSPFWDS                 0x40
+UMASK_HITME_HIT_EVICTS                  0x42
+UMASK_HITME_HIT_ALLOCS                  0x70
+UMASK_HITME_HIT_RSP                     0x80
+UMASK_HITME_HIT_ALL                     0xFF
+
+EVENT_HITME_HIT_PV_BITS_SET             0x72 BBOX
+UMASK_HITME_HIT_PV_BITS_SET_READ_OR_INVITOE 0x01
+UMASK_HITME_HIT_PV_BITS_SET_WBMTOI          0x02
+UMASK_HITME_HIT_PV_BITS_SET_ACKCNFLTWBI     0x04
+UMASK_HITME_HIT_PV_BITS_SET_WBMTOE_OR_S     0x08
+UMASK_HITME_HIT_PV_BITS_SET_HOM             0x0F
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDI_REMOTE  0x10
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDI_LOCAL   0x20
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDS         0x40
+UMASK_HITME_HIT_PV_BITS_SET_RSP             0x80
+UMASK_HITME_HIT_PV_BITS_SET_ALL             0xFF
+
+EVENT_IGR_NO_CREDIT_CYCLES              0x22 BBOX
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI0      0x01
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI1      0x02
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI0      0x04
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI1      0x08
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI2      0x10
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI2      0x20
+
+EVENT_IMC_READS                         0x17 BBOX
+UMASK_IMC_READS_NORMAL                  0x01
+
+EVENT_IMC_RETRY                         0x1E BBOX
+UMASK_IMC_RETRY                         0x00
+
+EVENT_IMC_WRITES                        0x1A BBOX
+UMASK_IMC_WRITES_FULL                   0x01
+UMASK_IMC_WRITES_PARTIAL                0x02
+UMASK_IMC_WRITES_FULL_ISOCH             0x04
+UMASK_IMC_WRITES_PARTIAL_ISOCH          0x08
+UMASK_IMC_WRITES_ALL                    0x0F
+
+EVENT_OSB                               0x53 BBOX
+UMASK_OSB_READS_LOCAL                   0x02
+UMASK_OSB_INVITOE_LOCAL                 0x04
+UMASK_OSB_REMOTE                        0x08
+UMASK_OSB_CANCELLED                     0x10
+UMASK_OSB_READS_LOCAL_USEFUL            0x20
+UMASK_OSB_REMOTE_USEFUL                 0x40
+
+EVENT_OSB_EDR                           0x54 BBOX
+UMASK_OSB_EDR_ALL                       0x01
+UMASK_OSB_EDR_READS_LOCAL_I             0x02
+UMASK_OSB_EDR_READS_REMOTE_I            0x04
+UMASK_OSB_EDR_READS_LOCAL_S             0x08
+UMASK_OSB_EDR_READS_REMOTE_S            0x10
+
+EVENT_REQUESTS                          0x01 BBOX
+UMASK_REQUESTS_READS_LOCAL              0x01
+UMASK_REQUESTS_READS_REMOTE             0x02
+UMASK_REQUESTS_READS                    0x03
+UMASK_REQUESTS_WRITES_LOCAL             0x04
+UMASK_REQUESTS_WRITES_REMOTE            0x08
+UMASK_REQUESTS_WRITES                   0x0C
+UMASK_REQUESTS_INVITOE_LOCAL            0x10
+UMASK_REQUESTS_INVITOE_REMOTE           0x20
+
+EVENT_RING_AD_USED                      0x3E BBOX
+UMASK_RING_AD_USED_CW_EVEN              0x01
+UMASK_RING_AD_USED_CW_ODD               0x02
+UMASK_RING_AD_USED_CW                   0x03
+UMASK_RING_AD_USED_CCW_EVEN             0x04
+UMASK_RING_AD_USED_CCW_ODD              0x08
+UMASK_RING_AD_USED_CCW                  0x0C
+
+EVENT_RING_AK_USED                      0x3F BBOX
+UMASK_RING_AK_USED_CW_EVEN              0x01
+UMASK_RING_AK_USED_CW_ODD               0x02
+UMASK_RING_AK_USED_CW                   0x03
+UMASK_RING_AK_USED_CCW_EVEN             0x04
+UMASK_RING_AK_USED_CCW_ODD              0x08
+UMASK_RING_AK_USED_CCW                  0x0C
+
+EVENT_RING_BL_USED                      0x40 BBOX
+UMASK_RING_BL_USED_CW_EVEN              0x01
+UMASK_RING_BL_USED_CW_ODD               0x02
+UMASK_RING_BL_USED_CW                   0x03
+UMASK_RING_BL_USED_CCW_EVEN             0x04
+UMASK_RING_BL_USED_CCW_ODD              0x08
+UMASK_RING_BL_USED_CCW                  0x0C
+
+EVENT_RPQ_CYCLES_NO_REG_CREDITS         0x15 BBOX
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN0    0x01
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN1    0x02
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN2    0x04
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN3    0x08
+
+EVENT_WPQ_CYCLES_NO_REG_CREDITS         0x18 BBOX
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0    0x01
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN1    0x02
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN2    0x04
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN3    0x08
+
+EVENT_SNOOPS_RSP_AFTER_DATA             0x0A BBOX
+UMASK_SNOOPS_RSP_AFTER_DATA_LOCAL       0x01
+UMASK_SNOOPS_RSP_AFTER_DATA_REMOTE      0x02
+
+EVENT_SNOOP_CYCLES_NE                   0x08 BBOX
+UMASK_SNOOP_CYCLES_NE_LOCAL             0x01
+UMASK_SNOOP_CYCLES_NE_REMOTE            0x02
+UMASK_SNOOP_CYCLES_NE_ALL               0x03
+
+EVENT_SNOOP_OCCUPANCY                   0x09 BBOX
+UMASK_SNOOP_OCCUPANCY_LOCAL             0x01
+UMASK_SNOOP_OCCUPANCY_REMOTE            0x02
+
+EVENT_SNOOP_RESP                        0x21 BBOX
+UMASK_SNOOP_RESP_RSPI                   0x01
+UMASK_SNOOP_RESP_RSPS                   0x02
+UMASK_SNOOP_RESP_RSPIFWD                0x04
+UMASK_SNOOP_RESP_RSPSFWD                0x08
+UMASK_SNOOP_RESP_RSP_WB                 0x10
+UMASK_SNOOP_RESP_RSP_FWD_WB             0x20
+UMASK_SNOOP_RESP_RSPCNFLCT              0x40
+
+EVENT_SNP_RESP_RECV_LOCAL               0x60 BBOX
+UMASK_SNP_RESP_RECV_LOCAL_RSPI          0x01
+UMASK_SNP_RESP_RECV_LOCAL_RSPS          0x02
+UMASK_SNP_RESP_RECV_LOCAL_RSPIFWD       0x04
+UMASK_SNP_RESP_RECV_LOCAL_RSPSFWD       0x08
+UMASK_SNP_RESP_RECV_LOCAL_RSPXWB        0x10
+UMASK_SNP_RESP_RECV_LOCAL_RSPXFWDXWB    0x20
+UMASK_SNP_RESP_RECV_LOCAL_RSPCNFLCT     0x40
+UMASK_SNP_RESP_RECV_LOCAL_OTHER         0x80
+
+EVENT_TAD_REQUESTS_G0                   0x1B BBOX
+UMASK_TAD_REQUESTS_G0_REGION0           0x01
+UMASK_TAD_REQUESTS_G0_REGION1           0x02
+UMASK_TAD_REQUESTS_G0_REGION2           0x04
+UMASK_TAD_REQUESTS_G0_REGION3           0x08
+UMASK_TAD_REQUESTS_G0_REGION4           0x10
+UMASK_TAD_REQUESTS_G0_REGION5           0x20
+UMASK_TAD_REQUESTS_G0_REGION6           0x40
+UMASK_TAD_REQUESTS_G0_REGION7           0x80
+
+EVENT_TAD_REQUESTS_G1                   0x1C BBOX
+UMASK_TAD_REQUESTS_G1_REGION8           0x01
+UMASK_TAD_REQUESTS_G1_REGION9           0x02
+UMASK_TAD_REQUESTS_G1_REGION10          0x04
+UMASK_TAD_REQUESTS_G1_REGION11          0x08
+
+EVENT_TRACKER_CYCLES_FULL               0x02 BBOX
+UMASK_TRACKER_CYCLES_FULL_GP            0x01
+UMASK_TRACKER_CYCLES_FULL_ALL           0x02
+
+EVENT_TRACKER_CYCLES_NE                 0x03 BBOX
+UMASK_TRACKER_CYCLES_NE_LOCAL           0x01
+UMASK_TRACKER_CYCLES_NE_REMOTE          0x02
+UMASK_TRACKER_CYCLES_NE_ALL             0x03
+
+EVENT_TRACKER_OCCUPANCY                 0x04 BBOX
+UMASK_TRACKER_OCCUPANCY_READS_LOCAL     0x04
+UMASK_TRACKER_OCCUPANCY_READS_REMOTE    0x08
+UMASK_TRACKER_OCCUPANCY_WRITES_LOCAL    0x10
+UMASK_TRACKER_OCCUPANCY_WRITES_REMOTE   0x20
+UMASK_TRACKER_OCCUPANCY_INVITOE_LOCAL   0x40
+UMASK_TRACKER_OCCUPANCY_INVITOE_REMOTE  0x80
+
+EVENT_TRACKER_PENDING_OCCUPANCY         0x05 BBOX
+UMASK_TRACKER_PENDING_OCCUPANCY_LOCAL   0x01
+UMASK_TRACKER_PENDING_OCCUPANCY_REMOTE  0x02
+
+EVENT_TXR_AD_CYCLES_FULL                0x2A BBOX
+UMASK_TXR_AD_CYCLES_FULL_SCHED0         0x01
+UMASK_TXR_AD_CYCLES_FULL_SCHED1         0x02
+UMASK_TXR_AD_CYCLES_FULL_ALL            0x03
+
+EVENT_TXR_AK                            0x0E BBOX
+UMASK_TXR_AK                            0x00
+
+EVENT_TXR_AK_CYCLES_FULL                0x32 BBOX
+UMASK_TXR_AK_CYCLES_FULL_SCHED0         0x01
+UMASK_TXR_AK_CYCLES_FULL_SCHED1         0x02
+UMASK_TXR_AK_CYCLES_FULL_ALL            0x03
+
+EVENT_TXR_BL                            0x10 BBOX
+UMASK_TXR_BL_DRS_CACHE                  0x01
+UMASK_TXR_BL_DRS_CORE                   0x02
+UMASK_TXR_BL_DRS_QPI                    0x04
+
+EVENT_TXR_BL_CYCLES_FULL                0x36 BBOX
+UMASK_TXR_BL_CYCLES_FULL_SCHED0         0x01
+UMASK_TXR_BL_CYCLES_FULL_SCHED1         0x02
+UMASK_TXR_BL_CYCLES_FULL_ALL            0x03
+
+EVENT_TXR_BL_OCCUPANCY                  0x34 BBOX
+UMASK_TXR_BL_OCCUPANCY                  0x00
+
+EVENT_TXR_STARVED                       0x6D BBOX
+UMASK_TXR_STARVED_AK                    0x01
+UMASK_TXR_STARVED_BL                    0x02
+
+EVENT_DRAM_CLOCKTICKS                   0x00 MBOX
+UMASK_DRAM_CLOCKTICKS                   0x00
+
+EVENT_ACT_COUNT                         0x01 MBOX
+UMASK_ACT_COUNT_RD                      0x01
+UMASK_ACT_COUNT_WR                      0x02
+UMASK_ACT_COUNT_BYP                     0x08
+
+EVENT_BYP_CMDS                          0xA1 MBOX
+UMASK_BYP_CMDS_ACT                      0x01
+UMASK_BYP_CMDS_CAS                      0x02
+UMASK_BYP_CMDS_PRE                      0x04
+
+EVENT_CAS_COUNT                         0x04 MBOX
+UMASK_CAS_COUNT_RD_REG                  0x01
+UMASK_CAS_COUNT_RD_UNDERFILL            0x02
+UMASK_CAS_COUNT_RD                      0x03
+UMASK_CAS_COUNT_RD_WMM                  0x10
+UMASK_CAS_COUNT_RD_RMM                  0x20
+UMASK_CAS_COUNT_WR_WMM                  0x04
+UMASK_CAS_COUNT_WR_RMM                  0x08
+UMASK_CAS_COUNT_WR                      0x0C
+UMASK_CAS_COUNT_ALL                     0x0F
+
+EVENT_DRAM_PRE_ALL                      0x06 MBOX
+UMASK_DRAM_PRE_ALL                      0x00
+
+EVENT_DRAM_REFRESH                      0x05 MBOX
+UMASK_DRAM_REFRESH_PANIC                0x02
+UMASK_DRAM_REFRESH_HIGH                 0x04
+
+EVENT_ECC_CORRECTABLE_ERRORS            0x09 MBOX
+UMASK_ECC_CORRECTABLE_ERRORS            0x00
+
+EVENT_MAJOR_MODES                       0x07 MBOX
+UMASK_MAJOR_MODES_READ                  0x01
+UMASK_MAJOR_MODES_WRITE                 0x02
+UMASK_MAJOR_MODES_PARTIAL               0x04
+UMASK_MAJOR_MODES_ISOCH                 0x08
+
+EVENT_POWER_CHANNEL_DLLOFF              0x84 MBOX
+UMASK_POWER_CHANNEL_DLLOFF              0x00
+
+EVENT_POWER_CHANNEL_PPD                 0x85 MBOX
+UMASK_POWER_CHANNEL_PPD                 0x00
+
+EVENT_POWER_CKE_CYCLES                  0x83 MBOX
+UMASK_POWER_CKE_CYCLES_RANK0            0x01
+UMASK_POWER_CKE_CYCLES_RANK1            0x02
+UMASK_POWER_CKE_CYCLES_RANK2            0x04
+UMASK_POWER_CKE_CYCLES_RANK3            0x08
+UMASK_POWER_CKE_CYCLES_RANK4            0x10
+UMASK_POWER_CKE_CYCLES_RANK5            0x20
+UMASK_POWER_CKE_CYCLES_RANK6            0x40
+UMASK_POWER_CKE_CYCLES_RANK7            0x80
+
+EVENT_POWER_CRITICAL_THROTTLE_CYCLES    0x86 MBOX
+UMASK_POWER_CRITICAL_THROTTLE_CYCLES    0x00
+
+EVENT_POWER_PCU_THROTTLING              0x42 MBOX
+UMASK_POWER_PCU_THROTTLING              0x00
+
+EVENT_POWER_SELF_REFRESH                0x43 MBOX
+UMASK_POWER_SELF_REFRESH                0x00
+
+EVENT_POWER_THROTTLE_CYCLES             0x41 MBOX
+UMASK_POWER_THROTTLE_CYCLES_RANK0       0x01
+UMASK_POWER_THROTTLE_CYCLES_RANK1       0x02
+UMASK_POWER_THROTTLE_CYCLES_RANK2       0x04
+UMASK_POWER_THROTTLE_CYCLES_RANK3       0x08
+UMASK_POWER_THROTTLE_CYCLES_RANK4       0x10
+UMASK_POWER_THROTTLE_CYCLES_RANK5       0x20
+UMASK_POWER_THROTTLE_CYCLES_RANK6       0x40
+UMASK_POWER_THROTTLE_CYCLES_RANK7       0x80
+
+EVENT_PREEMPTION                        0x08 MBOX
+UMASK_PREEMPTION_RD_PREEMPT_RD          0x01
+UMASK_PREEMPTION_RD_PREEMPT_WR          0x02
+
+EVENT_PRE_COUNT                         0x02 MBOX
+UMASK_PRE_COUNT_PAGE_MISS               0x01
+UMASK_PRE_COUNT_PAGE_CLOSE              0x02
+UMASK_PRE_COUNT_RD                      0x04
+UMASK_PRE_COUNT_WR                      0x08
+UMASK_PRE_COUNT_BYP                     0x10
+
+EVENT_RD_CAS_PRIO                       0xA0 MBOX
+UMASK_RD_CAS_PRIO_LOW                   0x01
+UMASK_RD_CAS_PRIO_MED                   0x02
+UMASK_RD_CAS_PRIO_HIGH                  0x04
+UMASK_RD_CAS_PRIO_PANIC                 0x08
+
+EVENT_RD_CAS_RANK0                      0xB0 MBOX
+UMASK_RD_CAS_RANK0_BANK0                0x00
+UMASK_RD_CAS_RANK0_BANK1                0x01
+UMASK_RD_CAS_RANK0_BANK2                0x02
+UMASK_RD_CAS_RANK0_BANK3                0x03
+UMASK_RD_CAS_RANK0_BANK4                0x04
+UMASK_RD_CAS_RANK0_BANK5                0x05
+UMASK_RD_CAS_RANK0_BANK6                0x06
+UMASK_RD_CAS_RANK0_BANK7                0x07
+UMASK_RD_CAS_RANK0_BANK8                0x08
+UMASK_RD_CAS_RANK0_BANK9                0x09
+UMASK_RD_CAS_RANK0_BANK10               0x0A
+UMASK_RD_CAS_RANK0_BANK11               0x0B
+UMASK_RD_CAS_RANK0_BANK12               0x0C
+UMASK_RD_CAS_RANK0_BANK13               0x0D
+UMASK_RD_CAS_RANK0_BANK14               0x0E
+UMASK_RD_CAS_RANK0_BANK15               0x0F
+UMASK_RD_CAS_RANK0_ALLBANKS             0x10
+UMASK_RD_CAS_RANK0_BANKG0               0x11
+UMASK_RD_CAS_RANK0_BANKG1               0x12
+UMASK_RD_CAS_RANK0_BANKG2               0x13
+UMASK_RD_CAS_RANK0_BANKG3               0x14
+
+EVENT_RD_CAS_RANK1                      0xB1 MBOX
+UMASK_RD_CAS_RANK1_BANK0                0x00
+UMASK_RD_CAS_RANK1_BANK1                0x01
+UMASK_RD_CAS_RANK1_BANK2                0x02
+UMASK_RD_CAS_RANK1_BANK3                0x03
+UMASK_RD_CAS_RANK1_BANK4                0x04
+UMASK_RD_CAS_RANK1_BANK5                0x05
+UMASK_RD_CAS_RANK1_BANK6                0x06
+UMASK_RD_CAS_RANK1_BANK7                0x07
+UMASK_RD_CAS_RANK1_BANK8                0x08
+UMASK_RD_CAS_RANK1_BANK9                0x09
+UMASK_RD_CAS_RANK1_BANK10               0x0A
+UMASK_RD_CAS_RANK1_BANK11               0x0B
+UMASK_RD_CAS_RANK1_BANK12               0x0C
+UMASK_RD_CAS_RANK1_BANK13               0x0D
+UMASK_RD_CAS_RANK1_BANK14               0x0E
+UMASK_RD_CAS_RANK1_BANK15               0x0F
+UMASK_RD_CAS_RANK1_ALLBANKS             0x10
+UMASK_RD_CAS_RANK1_BANKG0               0x11
+UMASK_RD_CAS_RANK1_BANKG1               0x12
+UMASK_RD_CAS_RANK1_BANKG2               0x13
+UMASK_RD_CAS_RANK1_BANKG3               0x14
+
+EVENT_RD_CAS_RANK2                      0xB2 MBOX
+UMASK_RD_CAS_RANK2_BANK0                0x00
+UMASK_RD_CAS_RANK2_BANK1                0x01
+UMASK_RD_CAS_RANK2_BANK2                0x02
+UMASK_RD_CAS_RANK2_BANK3                0x03
+UMASK_RD_CAS_RANK2_BANK4                0x04
+UMASK_RD_CAS_RANK2_BANK5                0x05
+UMASK_RD_CAS_RANK2_BANK6                0x06
+UMASK_RD_CAS_RANK2_BANK7                0x07
+UMASK_RD_CAS_RANK2_BANK8                0x08
+UMASK_RD_CAS_RANK2_BANK9                0x09
+UMASK_RD_CAS_RANK2_BANK10               0x0A
+UMASK_RD_CAS_RANK2_BANK11               0x0B
+UMASK_RD_CAS_RANK2_BANK12               0x0C
+UMASK_RD_CAS_RANK2_BANK13               0x0D
+UMASK_RD_CAS_RANK2_BANK14               0x0E
+UMASK_RD_CAS_RANK2_BANK15               0x0F
+UMASK_RD_CAS_RANK2_ALLBANKS             0x10
+UMASK_RD_CAS_RANK2_BANKG0               0x11
+UMASK_RD_CAS_RANK2_BANKG1               0x12
+UMASK_RD_CAS_RANK2_BANKG2               0x13
+UMASK_RD_CAS_RANK2_BANKG3               0x14
+
+EVENT_RD_CAS_RANK3                      0xB3 MBOX
+UMASK_RD_CAS_RANK3_BANK0                0x00
+UMASK_RD_CAS_RANK3_BANK1                0x01
+UMASK_RD_CAS_RANK3_BANK2                0x02
+UMASK_RD_CAS_RANK3_BANK3                0x03
+UMASK_RD_CAS_RANK3_BANK4                0x04
+UMASK_RD_CAS_RANK3_BANK5                0x05
+UMASK_RD_CAS_RANK3_BANK6                0x06
+UMASK_RD_CAS_RANK3_BANK7                0x07
+UMASK_RD_CAS_RANK3_BANK8                0x08
+UMASK_RD_CAS_RANK3_BANK9                0x09
+UMASK_RD_CAS_RANK3_BANK10               0x0A
+UMASK_RD_CAS_RANK3_BANK11               0x0B
+UMASK_RD_CAS_RANK3_BANK12               0x0C
+UMASK_RD_CAS_RANK3_BANK13               0x0D
+UMASK_RD_CAS_RANK3_BANK14               0x0E
+UMASK_RD_CAS_RANK3_BANK15               0x0F
+UMASK_RD_CAS_RANK3_ALLBANKS             0x10
+UMASK_RD_CAS_RANK3_BANKG0               0x11
+UMASK_RD_CAS_RANK3_BANKG1               0x12
+UMASK_RD_CAS_RANK3_BANKG2               0x13
+UMASK_RD_CAS_RANK3_BANKG3               0x14
+
+EVENT_RD_CAS_RANK4                      0xB4 MBOX
+UMASK_RD_CAS_RANK4_BANK0                0x00
+UMASK_RD_CAS_RANK4_BANK1                0x01
+UMASK_RD_CAS_RANK4_BANK2                0x02
+UMASK_RD_CAS_RANK4_BANK3                0x03
+UMASK_RD_CAS_RANK4_BANK4                0x04
+UMASK_RD_CAS_RANK4_BANK5                0x05
+UMASK_RD_CAS_RANK4_BANK6                0x06
+UMASK_RD_CAS_RANK4_BANK7                0x07
+UMASK_RD_CAS_RANK4_BANK8                0x08
+UMASK_RD_CAS_RANK4_BANK9                0x09
+UMASK_RD_CAS_RANK4_BANK10               0x0A
+UMASK_RD_CAS_RANK4_BANK11               0x0B
+UMASK_RD_CAS_RANK4_BANK12               0x0C
+UMASK_RD_CAS_RANK4_BANK13               0x0D
+UMASK_RD_CAS_RANK4_BANK14               0x0E
+UMASK_RD_CAS_RANK4_BANK15               0x0F
+UMASK_RD_CAS_RANK4_ALLBANKS             0x10
+UMASK_RD_CAS_RANK4_BANKG0               0x11
+UMASK_RD_CAS_RANK4_BANKG1               0x12
+UMASK_RD_CAS_RANK4_BANKG2               0x13
+UMASK_RD_CAS_RANK4_BANKG3               0x14
+
+EVENT_RD_CAS_RANK5                      0xB5 MBOX
+UMASK_RD_CAS_RANK5_BANK0                0x00
+UMASK_RD_CAS_RANK5_BANK1                0x01
+UMASK_RD_CAS_RANK5_BANK2                0x02
+UMASK_RD_CAS_RANK5_BANK3                0x03
+UMASK_RD_CAS_RANK5_BANK4                0x04
+UMASK_RD_CAS_RANK5_BANK5                0x05
+UMASK_RD_CAS_RANK5_BANK6                0x06
+UMASK_RD_CAS_RANK5_BANK7                0x07
+UMASK_RD_CAS_RANK5_BANK8                0x08
+UMASK_RD_CAS_RANK5_BANK9                0x09
+UMASK_RD_CAS_RANK5_BANK10               0x0A
+UMASK_RD_CAS_RANK5_BANK11               0x0B
+UMASK_RD_CAS_RANK5_BANK12               0x0C
+UMASK_RD_CAS_RANK5_BANK13               0x0D
+UMASK_RD_CAS_RANK5_BANK14               0x0E
+UMASK_RD_CAS_RANK5_BANK15               0x0F
+UMASK_RD_CAS_RANK5_ALLBANKS             0x10
+UMASK_RD_CAS_RANK5_BANKG0               0x11
+UMASK_RD_CAS_RANK5_BANKG1               0x12
+UMASK_RD_CAS_RANK5_BANKG2               0x13
+UMASK_RD_CAS_RANK5_BANKG3               0x14
+
+EVENT_RD_CAS_RANK6                      0xB6 MBOX
+UMASK_RD_CAS_RANK6_BANK0                0x00
+UMASK_RD_CAS_RANK6_BANK1                0x01
+UMASK_RD_CAS_RANK6_BANK2                0x02
+UMASK_RD_CAS_RANK6_BANK3                0x03
+UMASK_RD_CAS_RANK6_BANK4                0x04
+UMASK_RD_CAS_RANK6_BANK5                0x05
+UMASK_RD_CAS_RANK6_BANK6                0x06
+UMASK_RD_CAS_RANK6_BANK7                0x07
+UMASK_RD_CAS_RANK6_BANK8                0x08
+UMASK_RD_CAS_RANK6_BANK9                0x09
+UMASK_RD_CAS_RANK6_BANK10               0x0A
+UMASK_RD_CAS_RANK6_BANK11               0x0B
+UMASK_RD_CAS_RANK6_BANK12               0x0C
+UMASK_RD_CAS_RANK6_BANK13               0x0D
+UMASK_RD_CAS_RANK6_BANK14               0x0E
+UMASK_RD_CAS_RANK6_BANK15               0x0F
+UMASK_RD_CAS_RANK6_ALLBANKS             0x10
+UMASK_RD_CAS_RANK6_BANKG0               0x11
+UMASK_RD_CAS_RANK6_BANKG1               0x12
+UMASK_RD_CAS_RANK6_BANKG2               0x13
+UMASK_RD_CAS_RANK6_BANKG3               0x14
+
+EVENT_RD_CAS_RANK7                      0xB7 MBOX
+UMASK_RD_CAS_RANK7_BANK0                0x00
+UMASK_RD_CAS_RANK7_BANK1                0x01
+UMASK_RD_CAS_RANK7_BANK2                0x02
+UMASK_RD_CAS_RANK7_BANK3                0x03
+UMASK_RD_CAS_RANK7_BANK4                0x04
+UMASK_RD_CAS_RANK7_BANK5                0x05
+UMASK_RD_CAS_RANK7_BANK6                0x06
+UMASK_RD_CAS_RANK7_BANK7                0x07
+UMASK_RD_CAS_RANK7_BANK8                0x08
+UMASK_RD_CAS_RANK7_BANK9                0x09
+UMASK_RD_CAS_RANK7_BANK10               0x0A
+UMASK_RD_CAS_RANK7_BANK11               0x0B
+UMASK_RD_CAS_RANK7_BANK12               0x0C
+UMASK_RD_CAS_RANK7_BANK13               0x0D
+UMASK_RD_CAS_RANK7_BANK14               0x0E
+UMASK_RD_CAS_RANK7_BANK15               0x0F
+UMASK_RD_CAS_RANK7_ALLBANKS             0x10
+UMASK_RD_CAS_RANK7_BANKG0               0x11
+UMASK_RD_CAS_RANK7_BANKG1               0x12
+UMASK_RD_CAS_RANK7_BANKG2               0x13
+UMASK_RD_CAS_RANK7_BANKG3               0x14
+
+EVENT_RPQ_CYCLES_NE                     0x11 MBOX
+UMASK_RPQ_CYCLES_NE                     0x00
+
+EVENT_RPQ_INSERTS                       0x10 MBOX
+UMASK_RPQ_INSERTS                       0x00
+
+EVENT_VMSE_MXB_WR_OCCUPANCY             0x91 MBOX
+UMASK_VMSE_MXB_WR_OCCUPANCY             0x00
+
+EVENT_VMSE_WR_PUSH                      0x90 MBOX
+UMASK_VMSE_WR_PUSH_WMM                  0x01
+UMASK_VMSE_WR_PUSH_RMM                  0x02
+
+EVENT_WMM_TO_RMM                        0xC0 MBOX
+UMASK_WMM_TO_RMM_LOW_THRESH             0x01
+UMASK_WMM_TO_RMM_STARVE                 0x02
+UMASK_WMM_TO_RMM_VMSE_RETRY             0x04
+
+# Undocumented event, mentioned in metrics table but not defined
+EVENT_WPQ_INSERTS                       0x20 MBOX
+UMASK_WPQ_INSERTS                       0x00
+
+EVENT_WPQ_CYCLES_FULL                   0x22 MBOX
+UMASK_WPQ_CYCLES_FULL                   0x00
+
+EVENT_WPQ_CYCLES_NE                     0x21 MBOX
+UMASK_WPQ_CYCLES_NE                     0x00
+
+EVENT_WPQ_READ_HIT                      0x23 MBOX
+UMASK_WPQ_READ_HIT                      0x00
+
+EVENT_WPQ_WRITE_HIT                     0x24 MBOX
+UMASK_WPQ_WRITE_HIT                     0x00
+
+EVENT_WR_CAS_RANK0                      0xB8 MBOX
+UMASK_WR_CAS_RANK0_BANK0                0x00
+UMASK_WR_CAS_RANK0_BANK1                0x01
+UMASK_WR_CAS_RANK0_BANK2                0x02
+UMASK_WR_CAS_RANK0_BANK3                0x03
+UMASK_WR_CAS_RANK0_BANK4                0x04
+UMASK_WR_CAS_RANK0_BANK5                0x05
+UMASK_WR_CAS_RANK0_BANK6                0x06
+UMASK_WR_CAS_RANK0_BANK7                0x07
+UMASK_WR_CAS_RANK0_BANK8                0x08
+UMASK_WR_CAS_RANK0_BANK9                0x09
+UMASK_WR_CAS_RANK0_BANK10               0x0A
+UMASK_WR_CAS_RANK0_BANK11               0x0B
+UMASK_WR_CAS_RANK0_BANK12               0x0C
+UMASK_WR_CAS_RANK0_BANK13               0x0D
+UMASK_WR_CAS_RANK0_BANK14               0x0E
+UMASK_WR_CAS_RANK0_BANK15               0x0F
+UMASK_WR_CAS_RANK0_ALLBANKS             0x10
+UMASK_WR_CAS_RANK0_BANKG0               0x11
+UMASK_WR_CAS_RANK0_BANKG1               0x12
+UMASK_WR_CAS_RANK0_BANKG2               0x13
+UMASK_WR_CAS_RANK0_BANKG3               0x14
+
+EVENT_WR_CAS_RANK1                      0xB9 MBOX
+UMASK_WR_CAS_RANK1_BANK0                0x00
+UMASK_WR_CAS_RANK1_BANK1                0x01
+UMASK_WR_CAS_RANK1_BANK2                0x02
+UMASK_WR_CAS_RANK1_BANK3                0x03
+UMASK_WR_CAS_RANK1_BANK4                0x04
+UMASK_WR_CAS_RANK1_BANK5                0x05
+UMASK_WR_CAS_RANK1_BANK6                0x06
+UMASK_WR_CAS_RANK1_BANK7                0x07
+UMASK_WR_CAS_RANK1_BANK8                0x08
+UMASK_WR_CAS_RANK1_BANK9                0x09
+UMASK_WR_CAS_RANK1_BANK10               0x0A
+UMASK_WR_CAS_RANK1_BANK11               0x0B
+UMASK_WR_CAS_RANK1_BANK12               0x0C
+UMASK_WR_CAS_RANK1_BANK13               0x0D
+UMASK_WR_CAS_RANK1_BANK14               0x0E
+UMASK_WR_CAS_RANK1_BANK15               0x0F
+UMASK_WR_CAS_RANK1_ALLBANKS             0x10
+UMASK_WR_CAS_RANK1_BANKG0               0x11
+UMASK_WR_CAS_RANK1_BANKG1               0x12
+UMASK_WR_CAS_RANK1_BANKG2               0x13
+UMASK_WR_CAS_RANK1_BANKG3               0x14
+
+EVENT_WR_CAS_RANK2                      0xBA MBOX
+UMASK_WR_CAS_RANK2_BANK0                0x00
+UMASK_WR_CAS_RANK2_BANK1                0x01
+UMASK_WR_CAS_RANK2_BANK2                0x02
+UMASK_WR_CAS_RANK2_BANK3                0x03
+UMASK_WR_CAS_RANK2_BANK4                0x04
+UMASK_WR_CAS_RANK2_BANK5                0x05
+UMASK_WR_CAS_RANK2_BANK6                0x06
+UMASK_WR_CAS_RANK2_BANK7                0x07
+UMASK_WR_CAS_RANK2_BANK8                0x08
+UMASK_WR_CAS_RANK2_BANK9                0x09
+UMASK_WR_CAS_RANK2_BANK10               0x0A
+UMASK_WR_CAS_RANK2_BANK11               0x0B
+UMASK_WR_CAS_RANK2_BANK12               0x0C
+UMASK_WR_CAS_RANK2_BANK13               0x0D
+UMASK_WR_CAS_RANK2_BANK14               0x0E
+UMASK_WR_CAS_RANK2_BANK15               0x0F
+UMASK_WR_CAS_RANK2_ALLBANKS             0x10
+UMASK_WR_CAS_RANK2_BANKG0               0x11
+UMASK_WR_CAS_RANK2_BANKG1               0x12
+UMASK_WR_CAS_RANK2_BANKG2               0x13
+UMASK_WR_CAS_RANK2_BANKG3               0x14
+
+EVENT_WR_CAS_RANK3                      0xBB MBOX
+UMASK_WR_CAS_RANK3_BANK0                0x00
+UMASK_WR_CAS_RANK3_BANK1                0x01
+UMASK_WR_CAS_RANK3_BANK2                0x02
+UMASK_WR_CAS_RANK3_BANK3                0x03
+UMASK_WR_CAS_RANK3_BANK4                0x04
+UMASK_WR_CAS_RANK3_BANK5                0x05
+UMASK_WR_CAS_RANK3_BANK6                0x06
+UMASK_WR_CAS_RANK3_BANK7                0x07
+UMASK_WR_CAS_RANK3_BANK8                0x08
+UMASK_WR_CAS_RANK3_BANK9                0x09
+UMASK_WR_CAS_RANK3_BANK10               0x0A
+UMASK_WR_CAS_RANK3_BANK11               0x0B
+UMASK_WR_CAS_RANK3_BANK12               0x0C
+UMASK_WR_CAS_RANK3_BANK13               0x0D
+UMASK_WR_CAS_RANK3_BANK14               0x0E
+UMASK_WR_CAS_RANK3_BANK15               0x0F
+UMASK_WR_CAS_RANK3_ALLBANKS             0x10
+UMASK_WR_CAS_RANK3_BANKG0               0x11
+UMASK_WR_CAS_RANK3_BANKG1               0x12
+UMASK_WR_CAS_RANK3_BANKG2               0x13
+UMASK_WR_CAS_RANK3_BANKG3               0x14
+
+EVENT_WR_CAS_RANK4                      0xBC MBOX
+UMASK_WR_CAS_RANK4_BANK0                0x00
+UMASK_WR_CAS_RANK4_BANK1                0x01
+UMASK_WR_CAS_RANK4_BANK2                0x02
+UMASK_WR_CAS_RANK4_BANK3                0x03
+UMASK_WR_CAS_RANK4_BANK4                0x04
+UMASK_WR_CAS_RANK4_BANK5                0x05
+UMASK_WR_CAS_RANK4_BANK6                0x06
+UMASK_WR_CAS_RANK4_BANK7                0x07
+UMASK_WR_CAS_RANK4_BANK8                0x08
+UMASK_WR_CAS_RANK4_BANK9                0x09
+UMASK_WR_CAS_RANK4_BANK10               0x0A
+UMASK_WR_CAS_RANK4_BANK11               0x0B
+UMASK_WR_CAS_RANK4_BANK12               0x0C
+UMASK_WR_CAS_RANK4_BANK13               0x0D
+UMASK_WR_CAS_RANK4_BANK14               0x0E
+UMASK_WR_CAS_RANK4_BANK15               0x0F
+UMASK_WR_CAS_RANK4_ALLBANKS             0x10
+UMASK_WR_CAS_RANK4_BANKG0               0x11
+UMASK_WR_CAS_RANK4_BANKG1               0x12
+UMASK_WR_CAS_RANK4_BANKG2               0x13
+UMASK_WR_CAS_RANK4_BANKG3               0x14
+
+EVENT_WR_CAS_RANK5                      0xBD MBOX
+UMASK_WR_CAS_RANK5_BANK0                0x00
+UMASK_WR_CAS_RANK5_BANK1                0x01
+UMASK_WR_CAS_RANK5_BANK2                0x02
+UMASK_WR_CAS_RANK5_BANK3                0x03
+UMASK_WR_CAS_RANK5_BANK4                0x04
+UMASK_WR_CAS_RANK5_BANK5                0x05
+UMASK_WR_CAS_RANK5_BANK6                0x06
+UMASK_WR_CAS_RANK5_BANK7                0x07
+UMASK_WR_CAS_RANK5_BANK8                0x08
+UMASK_WR_CAS_RANK5_BANK9                0x09
+UMASK_WR_CAS_RANK5_BANK10               0x0A
+UMASK_WR_CAS_RANK5_BANK11               0x0B
+UMASK_WR_CAS_RANK5_BANK12               0x0C
+UMASK_WR_CAS_RANK5_BANK13               0x0D
+UMASK_WR_CAS_RANK5_BANK14               0x0E
+UMASK_WR_CAS_RANK5_BANK15               0x0F
+UMASK_WR_CAS_RANK5_ALLBANKS             0x10
+UMASK_WR_CAS_RANK5_BANKG0               0x11
+UMASK_WR_CAS_RANK5_BANKG1               0x12
+UMASK_WR_CAS_RANK5_BANKG2               0x13
+UMASK_WR_CAS_RANK5_BANKG3               0x14
+
+EVENT_WR_CAS_RANK6                      0xBE MBOX
+UMASK_WR_CAS_RANK6_BANK0                0x00
+UMASK_WR_CAS_RANK6_BANK1                0x01
+UMASK_WR_CAS_RANK6_BANK2                0x02
+UMASK_WR_CAS_RANK6_BANK3                0x03
+UMASK_WR_CAS_RANK6_BANK4                0x04
+UMASK_WR_CAS_RANK6_BANK5                0x05
+UMASK_WR_CAS_RANK6_BANK6                0x06
+UMASK_WR_CAS_RANK6_BANK7                0x07
+UMASK_WR_CAS_RANK6_BANK8                0x08
+UMASK_WR_CAS_RANK6_BANK9                0x09
+UMASK_WR_CAS_RANK6_BANK10               0x0A
+UMASK_WR_CAS_RANK6_BANK11               0x0B
+UMASK_WR_CAS_RANK6_BANK12               0x0C
+UMASK_WR_CAS_RANK6_BANK13               0x0D
+UMASK_WR_CAS_RANK6_BANK14               0x0E
+UMASK_WR_CAS_RANK6_BANK15               0x0F
+UMASK_WR_CAS_RANK6_ALLBANKS             0x10
+UMASK_WR_CAS_RANK6_BANKG0               0x11
+UMASK_WR_CAS_RANK6_BANKG1               0x12
+UMASK_WR_CAS_RANK6_BANKG2               0x13
+UMASK_WR_CAS_RANK6_BANKG3               0x14
+
+EVENT_WR_CAS_RANK7                      0xBF MBOX
+UMASK_WR_CAS_RANK7_BANK0                0x00
+UMASK_WR_CAS_RANK7_BANK1                0x01
+UMASK_WR_CAS_RANK7_BANK2                0x02
+UMASK_WR_CAS_RANK7_BANK3                0x03
+UMASK_WR_CAS_RANK7_BANK4                0x04
+UMASK_WR_CAS_RANK7_BANK5                0x05
+UMASK_WR_CAS_RANK7_BANK6                0x06
+UMASK_WR_CAS_RANK7_BANK7                0x07
+UMASK_WR_CAS_RANK7_BANK8                0x08
+UMASK_WR_CAS_RANK7_BANK9                0x09
+UMASK_WR_CAS_RANK7_BANK10               0x0A
+UMASK_WR_CAS_RANK7_BANK11               0x0B
+UMASK_WR_CAS_RANK7_BANK12               0x0C
+UMASK_WR_CAS_RANK7_BANK13               0x0D
+UMASK_WR_CAS_RANK7_BANK14               0x0E
+UMASK_WR_CAS_RANK7_BANK15               0x0F
+UMASK_WR_CAS_RANK7_ALLBANKS             0x10
+UMASK_WR_CAS_RANK7_BANKG0               0x11
+UMASK_WR_CAS_RANK7_BANKG1               0x12
+UMASK_WR_CAS_RANK7_BANKG2               0x13
+UMASK_WR_CAS_RANK7_BANKG3               0x14
+
+EVENT_PBOX_CLOCKTICKS                   0x01 PBOX
+UMASK_PBOX_CLOCKTICKS                   0x00
+
+EVENT_IIO_CREDIT                        0x2D PBOX0|PBOX1
+UMASK_IIO_CREDIT_PRQ_QPI0               0x01
+UMASK_IIO_CREDIT_PRQ_QPI1               0x02
+UMASK_IIO_CREDIT_ISOCH_QPI0             0x04
+UMASK_IIO_CREDIT_ISOCH_QPI1             0x08
+
+EVENT_RING_AD_USED                      0x07 PBOX
+UMASK_RING_AD_USED_CW_EVEN              0x01
+UMASK_RING_AD_USED_CW_ODD               0x02
+UMASK_RING_AD_USED_CW                   0x03
+UMASK_RING_AD_USED_CCW_EVEN             0x04
+UMASK_RING_AD_USED_CCW_ODD              0x08
+UMASK_RING_AD_USED_CCW                  0x0C
+
+EVENT_RING_AK_BOUNCES                   0x12 PBOX
+UMASK_RING_AK_BOUNCES_UP                0x01
+UMASK_RING_AK_BOUNCES_DN                0x02
+
+EVENT_RING_AK_USED                      0x08 PBOX
+UMASK_RING_AK_USED_CW_EVEN              0x01
+UMASK_RING_AK_USED_CW_ODD               0x02
+UMASK_RING_AK_USED_CW                   0x03
+UMASK_RING_AK_USED_CCW_EVEN             0x04
+UMASK_RING_AK_USED_CCW_ODD              0x08
+UMASK_RING_AK_USED_CCW                  0x0C
+
+EVENT_RING_BL_USED                      0x09 PBOX
+UMASK_RING_BL_USED_CW_EVEN              0x01
+UMASK_RING_BL_USED_CW_ODD               0x02
+UMASK_RING_BL_USED_CW                   0x03
+UMASK_RING_BL_USED_CCW_EVEN             0x04
+UMASK_RING_BL_USED_CCW_ODD              0x08
+UMASK_RING_BL_USED_CCW                  0x0C
+
+EVENT_RING_IV_USED                      0x0A PBOX
+UMASK_RING_IV_USED_CW                   0x03
+UMASK_RING_IV_USED_CCW                  0x0C
+UMASK_RING_IV_USED_ANY                  0x0F
+
+EVENT_RXR_CYCLES_NE                     0x10 PBOX0|PBOX1
+UMASK_RXR_CYCLES_NE_NCB                 0x10
+UMASK_RXR_CYCLES_NE_NCS                 0x20
+
+EVENT_RXR_INSERTS                       0x11 PBOX0|PBOX1
+UMASK_RXR_INSERTS_NCB                   0x10
+UMASK_RXR_INSERTS_NCS                   0x20
+
+EVENT_RXR_OCCUPANCY                     0x13 PBOX0
+UMASK_RXR_OCCUPANCY_DRS                 0x08
+
+EVENT_TXR_CYCLES_FULL                   0x25 PBOX0
+UMASK_TXR_CYCLES_FULL_AD                0x01
+UMASK_TXR_CYCLES_FULL_AK                0x02
+UMASK_TXR_CYCLES_FULL_BL                0x04
+
+EVENT_TXR_CYCLES_NE                     0x23 PBOX0
+UMASK_TXR_CYCLES_NE_AD                  0x01
+UMASK_TXR_CYCLES_NE_AK                  0x02
+UMASK_TXR_CYCLES_NE_BL                  0x04
+
+EVENT_TXR_NACK_CW                       0x26 PBOX0|PBOX1
+UMASK_TXR_NACK_CW_DN_AD                 0x01
+UMASK_TXR_NACK_CW_DN_BL                 0x02
+UMASK_TXR_NACK_CW_DN_AK                 0x04
+UMASK_TXR_NACK_CW_UP_AD                 0x08
+UMASK_TXR_NACK_CW_UP_BL                 0x10
+UMASK_TXR_NACK_CW_UP_AK                 0x20
+
+EVENT_IBOX_CLOCKTICKS                   0x00 IBOX
+UMASK_IBOX_CLOCKTICKS                   0x00
+
+EVENT_CACHE_TOTAL_OCCUPANCY             0x12 IBOX
+UMASK_CACHE_TOTAL_OCCUPANCY_ANY         0x01
+UMASK_CACHE_TOTAL_OCCUPANCY_SOURCE      0x02
+
+EVENT_COHERENT_OPS                      0x13 IBOX
+UMASK_COHERENT_OPS_PCIRDCUR             0x01
+UMASK_COHERENT_OPS_CRD                  0x02
+UMASK_COHERENT_OPS_DRD                  0x04
+UMASK_COHERENT_OPS_RFO                  0x08
+UMASK_COHERENT_OPS_PCITOM               0x10
+UMASK_COHERENT_OPS_PCIDCAHINT           0x20
+UMASK_COHERENT_OPS_WBMTOI               0x40
+UMASK_COHERENT_OPS_CLFLUSH              0x80
+
+EVENT_MISC0                             0x14 IBOX
+UMASK_MISC0_FAST_REQ                    0x01
+UMASK_MISC0_FAST_REJ                    0x02
+UMASK_MISC0_2ND_RD_INSERT               0x04
+UMASK_MISC0_2ND_WR_INSERT               0x08
+UMASK_MISC0_2ND_ATOMIC_INSERT           0x10
+UMASK_MISC0_INSERTS                     0x1C
+UMASK_MISC0_FAST_XFER                   0x20
+UMASK_MISC0_PF_ACK_HINT                 0x40
+UMASK_MISC0_PF_TIMEOUT                  0x80
+
+EVENT_MISC1                             0x15 IBOX
+UMASK_MISC1_SLOW_I                      0x01
+UMASK_MISC1_SLOW_S                      0x02
+UMASK_MISC1_SLOW_E                      0x04
+UMASK_MISC1_SLOW_M                      0x08
+UMASK_MISC1_SLOW                        0x0F
+UMASK_MISC1_LOST_FWD                    0x10
+UMASK_MISC1_SEC_RCVD_INVLD              0x20
+UMASK_MISC1_SEC_RCVD_VLD                0x40
+UMASK_MISC1_DATA_THROTTLE               0x80
+
+EVENT_SNOOP_RESP                        0x17 IBOX
+UMASK_SNOOP_RESP_MISS                   0x01
+UMASK_SNOOP_RESP_HIT_I                  0x02
+UMASK_SNOOP_RESP_HIT_ES                 0x04
+UMASK_SNOOP_RESP_HIT_M                  0x08
+UMASK_SNOOP_RESP_HIT                    0x0E
+UMASK_SNOOP_RESP_SNPCODE                0x10
+UMASK_SNOOP_RESP_SNPDATA                0x20
+UMASK_SNOOP_RESP_SNPINV                 0x40
+
+EVENT_TRANSACTIONS                      0x16 IBOX
+UMASK_TRANSACTIONS_READS                0x01
+UMASK_TRANSACTIONS_WRITES               0x02
+UMASK_TRANSACTIONS_RD_PREF              0x04
+UMASK_TRANSACTIONS_WR_PREF              0x08
+UMASK_TRANSACTIONS_ALL_READS            0x05
+UMASK_TRANSACTIONS_ALL_WRITES           0x0A
+UMASK_TRANSACTIONS_ATOMIC               0x10
+UMASK_TRANSACTIONS_OTHER                0x20
+UMASK_TRANSACTIONS_ORDERINGQ            0x40
+
+EVENT_RXR_AK_INSERTS                    0x0A IBOX
+UMASK_RXR_AK_INSERTS                    0x00
+
+EVENT_RXR_BL_DRS_CYCLES_FULL            0x04 IBOX
+UMASK_RXR_BL_DRS_CYCLES_FULL            0x00
+
+EVENT_RXR_BL_DRS_INSERTS                0x01 IBOX
+UMASK_RXR_BL_DRS_INSERTS                0x00
+
+EVENT_RXR_BL_DRS_OCCUPANCY              0x07 IBOX
+UMASK_RXR_BL_DRS_OCCUPANCY              0x00
+
+EVENT_RXR_BL_NCB_CYCLES_FULL            0x05 IBOX
+UMASK_RXR_BL_NCB_CYCLES_FULL            0x00
+
+EVENT_RXR_BL_NCB_INSERTS                0x02 IBOX
+UMASK_RXR_BL_NCB_INSERTS                0x00
+
+EVENT_RXR_BL_NCB_OCCUPANCY              0x08 IBOX
+UMASK_RXR_BL_NCB_OCCUPANCY              0x00
+
+EVENT_RXR_BL_NCS_CYCLES_FULL            0x06 IBOX
+UMASK_RXR_BL_NCS_CYCLES_FULL            0x00
+
+EVENT_RXR_BL_NCS_INSERTS                0x03 IBOX
+UMASK_RXR_BL_NCS_INSERTS                0x00
+
+EVENT_RXR_BL_NCS_OCCUPANCY              0x09 IBOX
+UMASK_RXR_BL_NCS_OCCUPANCY              0x00
+
+EVENT_TXR_AD_STALL_CREDIT_CYCLES        0x18 IBOX
+UMASK_TXR_AD_STALL_CREDIT_CYCLES        0x00
+
+EVENT_TXR_BL_STALL_CREDIT_CYCLES        0x19 IBOX
+UMASK_TXR_BL_STALL_CREDIT_CYCLES        0x00
+
+EVENT_TXR_DATA_INSERTS_NCB              0x0E IBOX
+UMASK_TXR_DATA_INSERTS_NCB              0x00
+
+EVENT_TXR_DATA_INSERTS_NCS              0x0F IBOX
+UMASK_TXR_DATA_INSERTS_NCS              0x00
+
+EVENT_TXR_REQUEST_OCCUPANCY             0x0D IBOX
+UMASK_TXR_REQUEST_OCCUPANCY             0x00
+
+
diff --git a/src/includes/perfmon_core2.h b/src/includes/perfmon_core2.h
index f737dda..9c4ba1d 100644
--- a/src/includes/perfmon_core2.h
+++ b/src/includes/perfmon_core2.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_core2.h
  *
- *      Description:  Header file of perfmon module for Core 2
+ *      Description:  Header file of perfmon module for Intel Core 2
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,155 +30,307 @@
  */
 
 #include <perfmon_core2_events.h>
-#include <perfmon_core2_groups.h>
 #include <perfmon_core2_counters.h>
+#include <error.h>
+
 
 static int perfmon_numCountersCore2 = NUM_COUNTERS_CORE2;
-static int perfmon_numGroupsCore2 = NUM_GROUPS_CORE2;
 static int perfmon_numArchEventsCore2 = NUM_ARCH_EVENTS_CORE2;
 
-void perfmon_init_core2(PerfmonThread *thread)
+int perfmon_init_core2(int cpu_id) /* Per-CPU init for Core 2: the only register touched here is MSR_PEBS_ENABLE. */
 {
-    uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
-
-    /* Initialize registers */
-    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL)); /* write 0 to MSR_PEBS_ENABLE on cpu_id */
+    return 0; /* normal path; CHECK_MSR_WRITE_ERROR is defined elsewhere and presumably bails out on a failed write -- confirm */
+}
 
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
 
-    /* always initialize fixed counters
-     * FIXED 0: Instructions retired
-     * FIXED 1: Clocks unhalted */
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x22ULL);
+uint32_t core2_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    /* Control bits for one fixed counter: its 4-bit field starts at index*4.
+     * Bit 1 of the field is always set; COUNT_KERNEL also sets bit 0. */
+    const int field_shift = index*4;
+    uint32_t ctrl_bits = (1ULL<<(1+field_shift));
+    for (int opt = 0; opt < event->numberOfOptions; opt++)
+    {
+        if (event->options[opt].type != EVENT_OPTION_COUNT_KERNEL)
+        {
+            continue;    /* every other option is ignored for fixed counters */
+        }
+        ctrl_bits |= (1ULL<<field_shift);
+    }
+    /* cpu_id is not used here; the bits are only computed and returned. */
+    return ctrl_bits;
+}
 
-    /* Preinit of PMC counters */
-    flags |= (1<<16);  /* user mode flag */
-    flags |= (1<<19);  /* pin control flag */
-    flags |= (1<<22);  /* enable flag */
+int core2_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* Build the config value for counter `index` from `event` and write it to the counter's configRegister if it changed. */
+{
+    int j;
+    uint64_t flags = 0x0ULL;
 
-    msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
+    flags = (1ULL<<22)|(1ULL<<16)|(1ULL<<19); /* bit 22: enable, bit 16: user mode, bit 19: pin control */
+    flags |= (event->umask<<8) + event->eventId; /* event id in the low bits, umask shifted up by 8 */
+    if ( event->cfgBits != 0 ) /* set custom cfg and cmask */
+    {
+        flags |= ((event->cmask<<8) + event->cfgBits)<<16; /* cfgBits start at bit 16, cmask at bit 24; bits already set above are kept (OR only) */
+    }
+    if (event->numberOfOptions > 0)
+    {
+        for(j=0;j<event->numberOfOptions;j++)
+        {
+            switch (event->options[j].type) /* each recognised option ORs its bit(s) into flags; unknown options are ignored */
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18); /* bit 18 */
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17); /* bit 17 */
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23); /* bit 23 */
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0xFFULL)<<24); /* low 8 bits of the option value go to bits [31:24] */
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index]) /* skip the MSR write when currentConfig already holds this exact value */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags; /* remember what was written for the next comparison */
+    }
+    return 0; /* normal path; CHECK_MSR_WRITE_ERROR presumably returns early on failure -- confirm */
 }
 
-
-void perfmon_setupCounterThread_core2(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+int perfmon_setupCounterThread_core2( int thread_id, PerfmonEventSet* eventSet) /* Program every PMC/FIXED event of eventSet on the CPU that thread_id maps to. */
 {
-    uint64_t flags;
-    uint64_t reg = core2_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
+    uint64_t fixed_flags = 0x0ULL; /* accumulates the control bits of all FIXED events; written once after the loop */
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ( core2_counter_map[index].type == PMC )
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC))) /* only when the set uses FIXED or PMC registers */
     {
-        flags = (1<<16)|(1<<19)|(1<<22);
-
-        /* Intel with standard 8 bit event mask: [7:0] */
-        flags |= (event->umask<<8) + event->eventId;
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL)); /* write 0 to the global control register before reprogramming */
+    }
 
-        if ( event->cfgBits != 0 ) /* set custom cfg and cmask */
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* register type not in this set's regTypeMask */
         {
-            flags &= ~(0xFFFFU<<16);  /* clear upper 16bits */
-            flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+            continue;
         }
-
-        msr_write(cpu_id, reg , flags);
-
-        if (perfmon_verbose)
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        eventSet->events[i].threadCounter[thread_id].init = TRUE; /* start/stop/read only handle events flagged here */
+        switch (type)
         {
-            printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
-                    cpu_id,
-                    LLU_CAST reg,
-                    LLU_CAST flags);
+            case PMC:
+                core2_pmc_setup(cpu_id, index, event); /* NOTE(review): return value is not checked here */
+                break;
+            case FIXED:
+                fixed_flags |= core2_fixed_setup(cpu_id, index, event); /* collect bits only; no MSR write yet */
+                break;
+            default:
+                break;
         }
     }
-    else if (core2_counter_map[index].type == FIXED)
+    if (fixed_flags > 0x0ULL) /* skipped when no FIXED event contributed bits */
     {
-        fixed_flags |= (0x2 << (index*4));
-        msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags)); /* one combined write for all fixed counters */
     }
+    return 0; /* normal path; the CHECK_* macros presumably return early on failure -- confirm */
 }
 
-void perfmon_startCountersThread_core2(int thread_id)
+int perfmon_startCountersThread_core2(int thread_id, PerfmonEventSet* eventSet) /* Zero the counters of all initialised events for this thread, then enable them with one MSR_PERF_GLOBAL_CTRL write. */
 {
     uint64_t flags = 0ULL;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    for ( int i=0; i<NUM_COUNTERS_CORE2; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) /* only events flagged TRUE by the setup function */
         {
-            msr_write(cpu_id, core2_counter_map[i].counterRegister , 0x0ULL);
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* register type not in this set's regTypeMask */
+            {
+                continue;
+            }
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter = counter_map[index].counterRegister;
+            eventSet->events[i].threadCounter[thread_id].startData = 0; /* reset the software-side values ... */
+            eventSet->events[i].threadCounter[thread_id].counterData = 0;
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL)); /* ... and the hardware counter register */
 
-            if (core2_counter_map[i].type == PMC)
+            if (type == PMC)
             {
-                flags |= (1<<(i-2));  /* enable counter */
+                flags |= (1ULL<<(index - cpuid_info.perf_num_fixed_ctr));  /* enable counter; assumes PMC indices follow the fixed counters in counter_map -- confirm */
             }
-            else if (core2_counter_map[i].type == FIXED)
+            else if (type == FIXED)
             {
-                flags |= (1ULL<<(i+32));  /* enable fixed counter */
+                flags |= (1ULL<<(index + 32));  /* enable fixed counter: their bits start at 32 */
             }
         }
     }
 
-    if (perfmon_verbose)
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED))) /* only when the set uses PMC or FIXED registers */
     {
-        printf("perfmon_start_counters: Write Register 0x%X , Flags: 0x%llX \n",
-                MSR_PERF_GLOBAL_CTRL, LLU_CAST flags);
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, UNFREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags)); /* enable all collected counters at once */
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|flags)); /* bits 63, 62 and the enabled-counter bits; presumably clears stale overflow status -- confirm */
     }
-
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x300000003ULL);
+    return 0; /* normal path; the CHECK_* macros presumably return early on failure -- confirm */
 }
 
-void perfmon_stopCountersThread_core2(int thread_id)
+#define CORE2_CHECK_OVERFLOW(offset) /* offset: bit position of the counter in MSR_PERF_GLOBAL_STATUS / MSR_PERF_GLOBAL_OVF_CTRL */ \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* new reading is below the stored one */ \
+    { \
+        uint64_t ovf_values = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values)); \
+        if (ovf_values & (1ULL<<(offset))) /* offset parenthesized so any argument expression shifts as a whole */ \
+        { \
+            eventSet->events[i].threadCounter[thread_id].overflows++; \
+        } \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<(offset)))); /* written even if the status bit was clear */ \
+    }
+
+int perfmon_stopCountersThread_core2(int thread_id, PerfmonEventSet* eventSet) /* Write 0 to MSR_PERF_GLOBAL_CTRL, then store each initialised event's counter value in counterData. */
 {
-    uint64_t flags;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t counter_result; /* raw value of the counter currently being read; CORE2_CHECK_OVERFLOW reads it by name */
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
     /* stop counters */
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED))) /* only when the set uses PMC or FIXED registers */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
 
     /* read out counter results */
-    for ( int i=0; i<NUM_COUNTERS_CORE2; i++)
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) /* only events flagged TRUE by the setup function */
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* register type not in this set's regTypeMask */
+            {
+                continue;
+            }
+            counter_result = 0x0ULL; /* stays 0 for register types the switch below does not handle */
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter = counter_map[index].counterRegister;
+            switch (type)
+            {
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                    CORE2_CHECK_OVERFLOW(index - cpuid_info.perf_num_fixed_ctr); /* same bit position as the PMC enable bit used at start */
+                    VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_PMC)
+                    break;
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                    CORE2_CHECK_OVERFLOW(index + 32); /* fixed-counter bits start at 32, as for the enable bits at start */
+                    VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_FIXED)
+                    break;
+                default:
+                    break;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth); /* field64 presumably keeps the low regWidth bits -- confirm */
+        }
+    }
+
+    return 0; /* normal path; the CHECK_* macros presumably return early on failure -- confirm */
+}
+
+int perfmon_readCountersThread_core2(int thread_id, PerfmonEventSet* eventSet)
+{
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t counter_result;
+    uint64_t flags;
+    /* Pause counting, read every initialised FIXED/PMC event of this thread, then restore the saved global control value. */
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
     {
-        if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &flags)); /* remember the current global control value */
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, SAFE_PMC_FLAGS)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL)); /* zero global control while the counters are read */
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, RESET_PMC_FLAGS)
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        counter_result = 0x0ULL;
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            perfmon_threadData[thread_id].counters[i].counterData =
-                msr_read(cpu_id, core2_counter_map[i].counterRegister);
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter = counter_map[index].counterRegister;
+            switch (type)
+            {
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                    CORE2_CHECK_OVERFLOW(index - cpuid_info.perf_num_fixed_ctr);
+                    VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_PMC)
+                    break;
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                    CORE2_CHECK_OVERFLOW(index + 32); /* FIXED overflow offset is index + 32, matching the stop and finalize paths */
+                    VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_FIXED)
+                    break;
+                default:
+                    break;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
         }
     }
 
-    /* check overflow status */
-    flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
-    if ( (flags & 0x3) || (flags & (0x3ULL<<32)) )
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
     {
-        printf ("Overflow occured \n");
-        printf ("Status: 0x%llX \n", LLU_CAST flags);
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, RESTORE_PMC_FLAGS)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags)); /* write back the value saved before the read */
     }
+
+    return 0;
 }
 
-void perfmon_readCountersThread_core2(int thread_id)
+
+int perfmon_finalizeCountersThread_core2(int thread_id, PerfmonEventSet* eventSet)
 {
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62); /* bits 62 and 63 are always part of the value written to MSR_PERF_GLOBAL_OVF_CTRL below */
 
-    for ( int i=0; i<NUM_COUNTERS_CORE2; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        uint64_t reg = counter_map[index].configRegister;
+        eventSet->events[i].threadCounter[thread_id].init = FALSE; /* mark the event as no longer initialised for this thread */
+        if (type == PMC)
+        {
+            ovf_values_core |= (1ULL<<(index - cpuid_info.perf_num_fixed_ctr)); /* PMC bit: index minus the number of fixed counters */
+        }
+        else if (type == FIXED)
+        {
+            ovf_values_core |= (1ULL<<(index + 32)); /* FIXED bit: index plus 32 */
+        }
+        if ((reg) && ((type == PMC)||(type == FIXED)))
         {
-            perfmon_threadData[thread_id].counters[i].counterData =
-                msr_read(cpu_id, core2_counter_map[i].counterRegister);
+            VERBOSEPRINTREG(cpu_id, reg, LLU_CAST 0x0ULL, CLEAR_CTRL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL)); /* zero this event's config register */
         }
     }
+    VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST ovf_values_core, CLEAR_GLOBAL_OVF)
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+    VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, CLEAR_GLOBAL_CTRL)
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL)); /* finally zero the global control register */
+    return 0;
 }
-
diff --git a/src/includes/perfmon_core2_counters.h b/src/includes/perfmon_core2_counters.h
index d6c33fb..2dada93 100644
--- a/src/includes/perfmon_core2_counters.h
+++ b/src/includes/perfmon_core2_counters.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_core2_counters.h
  *
- *      Description:  Counter header file of perfmon module for Core 2
+ *      Description:  Counter header file of perfmon module for Intel Core 2
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -31,13 +32,21 @@
 #define NUM_COUNTERS_CORE2 5
 #define NUM_COUNTERS_CORE_CORE2 5
 
-static PerfmonCounterMap core2_counter_map[NUM_COUNTERS_CORE2] = {
+#define CORE2_VALID_OPTIONS_FIXED EVENT_OPTION_COUNT_KERNEL_MASK /* option mask used for the FIXC* rows of core2_counter_map */
+#define CORE2_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK) /* parenthesised so the OR-ed mask stays a single operand wherever it is expanded */
+
+static RegisterMap core2_counter_map[NUM_COUNTERS_CORE2] = { /* one row per counter; the final field of each row is the CORE2_VALID_OPTIONS_* mask for that counter type */
     /* Fixed Counters: instructions retired, cycles unhalted core */
-    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
-    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
-    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, CORE2_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, CORE2_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, CORE2_VALID_OPTIONS_FIXED},
     /* PMC Counters: 2 40bit wide */
-    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
-    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0}
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, CORE2_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, CORE2_VALID_OPTIONS_PMC},
 };
 
+
+static BoxMap core2_box_map[NUM_UNITS] = { /* indexed by register type; only the PMC and FIXED slots are initialised explicitly */
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 40}, /* NOTE(review): last value looks like the counter width in bits - confirm against the BoxMap definition */
+    [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+};
diff --git a/src/includes/perfmon_core2_events.txt b/src/includes/perfmon_core2_events.txt
index 60c6211..ebb2dc5 100644
--- a/src/includes/perfmon_core2_events.txt
+++ b/src/includes/perfmon_core2_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_core2_events.txt
-# 
+#
 #      Description:  Event list for Intel Core 2
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -29,10 +30,10 @@
 EVENT_INSTR_RETIRED              0x00   FIXC0
 UMASK_INSTR_RETIRED_ANY          0x00
 
-EVENT_CPU_CLK_UNHALTED_CORE      0x00   FIXC1
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC1
 UMASK_CPU_CLK_UNHALTED_CORE      0x00
 
-EVENT_CPU_CLK_UNHALTED_REF       0x00   FIXC2
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC2
 UMASK_CPU_CLK_UNHALTED_REF       0x00
 
 EVENT_LOAD_BLOCK                 0x03   PMC
@@ -42,11 +43,16 @@ UMASK_LOAD_BLOCK_OVERLAP_STORE   0x08
 UMASK_LOAD_BLOCK_UNTIL_RETIRE    0x10
 UMASK_LOAD_BLOCK_L1D             0x20
 
-EVENT_STORE_BLOCK                0x04   PMC
+EVENT_SB_DRAIN_CYCLES            0x04   PMC
 UMASK_SB_DRAIN_CYCLES            0x01
+
+EVENT_STORE_BLOCK                0x04   PMC
 UMASK_STORE_BLOCK_ORDER          0x02
 UMASK_STORE_BLOCK_SNOOP          0x08
 
+EVENT_MISALIGN_MEM_REF           0x05   PMC
+UMASK_MISALIGN_MEM_REF           0x00
+
 EVENT_SEGMENT_REG_LOADS          0x06   PMC
 UMASK_SEGMENT_REG_LOADS          0x00
 
@@ -97,6 +103,10 @@ EVENT_L2_ADS                     0x21   PMC
 UMASK_L2_ADS_ALL_CORES           0xC0
 UMASK_L2_ADS_THIS_CORE           0x40
 
+EVENT_L2_DBUS_BUSY               0x22  PMC
+UMASK_L2_DBUS_BUSY_ALL_CORES     0xC0
+UMASK_L2_DBUS_BUSY_THIS_CORE     0x40
+
 EVENT_L2_DBUS_BUSY_RD            0x23  PMC
 UMASK_L2_DBUS_BUSY_RD_ALL_CORES  0xC0
 UMASK_L2_DBUS_BUSY_RD_THIS_CORE  0x40
@@ -266,7 +276,8 @@ UMASK_L2_NO_REQ_ALL_CORES        0xC0
 UMASK_L2_NO_REQ_THIS_CORE        0x40
 
 EVENT_EIST_TRANS               0x3A  PMC
-UMASK_EIST_TRANS               0x00
+UMASK_EIST_TRANS_ANY           0x00
+UMASK_EIST_TRANS_FREQ          0x01
 
 EVENT_THERMAL_TRIP               0x3B  PMC
 UMASK_THERMAL_TRIP               0xC0
@@ -275,6 +286,8 @@ EVENT_CPU_CLK_UNHALTED                   0x3C  PMC
 UMASK_CPU_CLK_UNHALTED_CORE_P            0x00
 UMASK_CPU_CLK_UNHALTED_BUS               0x01
 UMASK_CPU_CLK_UNHALTED_NO_OTHER          0x02
+DEFAULT_OPTIONS_CPU_CLK_UNHALTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x2,EVENT_OPTION_INVERT=0x1
+UMASK_CPU_CLK_UNHALTED_TOTAL_CYCLES    0x00
 
 EVENT_L1D_CACHE_LD               0x40  PMC
 UMASK_L1D_CACHE_LD_MODIFIED     0x08
@@ -298,9 +311,11 @@ UMASK_L1D_CACHE_LOCK_INVALID      0x01
 UMASK_L1D_CACHE_LOCK_MESI         0x0F
 UMASK_L1D_CACHE_LOCK_DURATION     0x10
 
-EVENT_L1D                        0x43  PMC
+EVENT_L1D_ALL                    0x43  PMC
 UMASK_L1D_ALL_REF                0x01
-UMASK_L1D_ALL_CACHE_REF          0x02
+
+EVENT_L1D_CACHE                  0x44  PMC
+UMASK_L1D_CACHE_REF              0x02
 
 EVENT_L1D_REPL                   0x45  PMC
 UMASK_L1D_REPL                   0x0F
@@ -322,6 +337,7 @@ EVENT_SSE_PRE_MISS               0x4B  PMC
 UMASK_SSE_PRE_MISS_NTA           0x00
 UMASK_SSE_PRE_MISS_L1            0x01
 UMASK_SSE_PRE_MISS_L2            0x02
+UMASK_SSE_PRE_MISS_ALL_CACHES    0x03
 
 EVENT_LOAD_HIT_PRE              0x4C  PMC
 UMASK_LOAD_HIT_PRE              0x00
@@ -329,6 +345,9 @@ UMASK_LOAD_HIT_PRE              0x00
 EVENT_L1D_PREFETCH_REQUESTS     0x4E  PMC
 UMASK_L1D_PREFETCH_REQUESTS     0x10
 
+EVENT_L1D_PREFETCH_DCU_MISSES   0x4F  PMC
+UMASK_L1D_PREFETCH_DCU_MISSES   0x00
+
 EVENT_BUS_REQUEST_OUTSTANDING    0x60  PMC
 UMASK_BUS_REQUEST_OUTSTANDING_ALL_CORES_THIS_A    0xC0
 UMASK_BUS_REQUEST_OUTSTANDING_ALL_CORES_ALL_A     0xE0
@@ -425,19 +444,56 @@ UMASK_BUS_TRANS_ANY_ALL_CORES_ALL_A     0xE0
 UMASK_BUS_TRANS_ANY_THIS_CORE_THIS_A    0x40
 UMASK_BUS_TRANS_ANY_THIS_CORE_ALL_A     0x60
 
+EVENT_EXT_SNOOP                         0x77 PMC
+UMASK_EXT_SNOOP_ALL_CORES_CLEAN         0xC1
+UMASK_EXT_SNOOP_ALL_CORES_HIT           0xC2
+UMASK_EXT_SNOOP_ALL_CORES_HITM          0xC8
+UMASK_EXT_SNOOP_THIS_CORE_CLEAN         0x41
+UMASK_EXT_SNOOP_THIS_CORE_HIT           0x42
+UMASK_EXT_SNOOP_THIS_CORE_HITM          0x48
+
+
+EVENT_CMP_SNOOP                         0x78 PMC
+UMASK_CMP_SNOOP_ALL_CORES_CMP2I         0xC2
+UMASK_CMP_SNOOP_ALL_CORES_CMP2S         0xC1
+UMASK_CMP_SNOOP_THIS_CORE_CMP2I         0x42
+UMASK_CMP_SNOOP_THIS_CORE_CMP2S         0x41
+
+EVENT_BUS_HIT_DRV                         0x7A PMC
+UMASK_BUS_HIT_DRV_THIS_AGENT              0x00
+UMASK_BUS_HIT_DRV_ALL_AGENTS              0x20
+
+EVENT_BUS_HITM_DRV                         0x7B PMC
+UMASK_BUS_HITM_DRV_THIS_AGENT              0x00
+UMASK_BUS_HITM_DRV_ALL_AGENTS              0x20
+
+EVENT_BUSQ_EMPTY                         0x7D PMC
+UMASK_BUSQ_EMPTY_ALL_CORES               0xC0
+UMASK_BUSQ_EMPTY_THIS_CORE               0x40
+
+EVENT_BUS_SNOOP_STALLED          0x7E  PMC
+UMASK_BUS_SNOOP_STALLED_ALL_CORES_THIS_AGENT          0xC0
+UMASK_BUS_SNOOP_STALLED_ALL_CORES_ALL_AGENTS          0xE0
+UMASK_BUS_SNOOP_STALLED_THIS_CORE_THIS_AGENT          0x40
+UMASK_BUS_SNOOP_STALLED_THIS_CORE_ALL_AGENTS          0x60
+
+EVENT_BUS_IO_WAIT                         0x7F PMC
+UMASK_BUS_IO_WAIT_ALL_CORES               0xC0
+UMASK_BUS_IO_WAIT_THIS_CORE               0x40
+
 EVENT_L1I_READS                  0x80  PMC
 UMASK_L1I_READS                  0x00
 
 EVENT_L1I_MISSES                0x81  PMC
 UMASK_L1I_MISSES                0x00
 
-EVENT_ITLB                      0x82  PMC
+EVENT_ITLB                       0x82  PMC
 UMASK_ITLB_SMALL_MISS            0x02
 UMASK_ITLB_LARGE_MISS            0x10
-UMASK_ITLB_FLUSH                0x40
+UMASK_ITLB_FLUSH                 0x40
 UMASK_ITLB_MISSES                0x12
 
-EVENT_INST_QUEUE                0x83  PMC
+EVENT_INST_QUEUE                 0x83  PMC
 UMASK_INST_QUEUE_FULL            0x02
 
 EVENT_CYCLES_L1I_MEM_STALLED     0x86  PMC
@@ -491,27 +547,30 @@ UMASK_BR_TKN_BUBBLE_2          0x00
 EVENT_RS_UOPS_DISPATCHED_ALL          0xA0  PMC
 UMASK_RS_UOPS_DISPATCHED_ALL          0x00
 
-EVENT_RS_UOPS_DISPATCHED            0xA1  PMC0
+EVENT_RS_UOPS_DISPATCHED                0xA1  PMC0
 UMASK_RS_UOPS_DISPATCHED_PORT0          0x01
 UMASK_RS_UOPS_DISPATCHED_PORT1          0x02
 UMASK_RS_UOPS_DISPATCHED_PORT2          0x04
 UMASK_RS_UOPS_DISPATCHED_PORT3          0x08
 UMASK_RS_UOPS_DISPATCHED_PORT4          0x10
 UMASK_RS_UOPS_DISPATCHED_PORT5          0x20
+DEFAULT_OPTIONS_RS_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE EVENT_OPTION_EDGE=0x1
+UMASK_RS_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE 0x23
+UMASK_RS_UOPS_DISPATCHED_PORT_DATA_PORTS 0x1C
 
-EVENT_MACRO_INSTS          0xAA  PMC
+EVENT_MACRO_INSTS                  0xAA  PMC
 UMASK_MACRO_INSTS_DECODED          0x01
 UMASK_MACRO_INSTS_CISC_DECODED     0x08
 
-EVENT_ESP                 0xAB  PMC
-UMASK_ESP_SYNCH          0x01
+EVENT_ESP                    0xAB  PMC
+UMASK_ESP_SYNCH              0x01
 UMASK_ESP_ADDITIONS          0x02
 
 EVENT_SIMD_UOPS_EXEC            0xB0  PMC
-UMASK_SIMD_UOPS_EXEC          0x00
+UMASK_SIMD_UOPS_EXEC            0x00
 
 EVENT_SIMD_SAT_UOPS_EXEC            0xB1  PMC
-UMASK_SIMD_SAT_UOPS_EXEC          0x00
+UMASK_SIMD_SAT_UOPS_EXEC            0x00
 
 EVENT_SIMD_UOP_TYPE_EXEC               0xB3  PMC
 UMASK_SIMD_UOP_TYPE_EXEC_MUL           0x01
@@ -531,20 +590,28 @@ EVENT_X87_OPS_RETIRED           0xC1   PMC
 UMASK_X87_OPS_RETIRED_FXCH      0x01
 UMASK_X87_OPS_RETIRED_ANY       0xFE
 
-EVENT_UOPS_RETIRED_ANY           0xC2   PMC
+EVENT_UOPS_RETIRED               0xC2   PMC
 UMASK_UOPS_RETIRED_LD_IND_BR     0x01
 UMASK_UOPS_RETIRED_STD_STA       0x02
 UMASK_UOPS_RETIRED_MACRO_FUSION  0x04
 UMASK_UOPS_RETIRED_FUSED         0x07
 UMASK_UOPS_RETIRED_NON_FUSED     0x08
 UMASK_UOPS_RETIRED_ANY           0x0F
-
-EVENT_MACHINE_NUKES            0xC3      PMC
-UMASK_MACHINE_NUKES_SMC       0x01
-UMASK_MACHINE_NUKES_MEM_ORDER       0x04
-
-EVENT_BR_INST_RETIRED            0xC4   PMC
-UMASK_BR_INST_RETIRED_ANY        0x00
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES     0x0F
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES     0x0F
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x9,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES     0x0F
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_EDGE=1
+UMASK_UOPS_RETIRED_STALL_COUNT     0x0F
+
+EVENT_MACHINE_NUKES              0xC3      PMC
+UMASK_MACHINE_NUKES_SMC          0x01
+UMASK_MACHINE_NUKES_MEM_ORDER    0x04
+
+EVENT_BR_INST_RETIRED                       0xC4   PMC
+UMASK_BR_INST_RETIRED_ANY                   0x00
 UMASK_BR_INST_RETIRED_PRED_NOT_TAKEN        0x01
 UMASK_BR_INST_RETIRED_MISPRED_NOT_TAKEN     0x02
 UMASK_BR_INST_RETIRED_PRED_TAKEN            0x04
@@ -554,11 +621,11 @@ UMASK_BR_INST_RETIRED_TAKEN                 0x0C
 EVENT_BR_INST_RETIRED_MISPRED    0xC5   PMC
 UMASK_BR_INST_RETIRED_MISPRED    0x00
 
-EVENT_CYCLES_INT             0xC6   PMC
-UMASK_CYCLES_INT_MASKED    0x01
+EVENT_CYCLES_INT                   0xC6   PMC
+UMASK_CYCLES_INT_MASKED            0x01
 UMASK_CYCLES_INT_PENDING_MASKED    0x02
 
-EVENT_SIMD_INST_RETIRED          0xC7   PMC
+EVENT_SIMD_INST_RETIRED                     0xC7   PMC
 UMASK_SIMD_INST_RETIRED_PACKED_SINGLE       0x01
 UMASK_SIMD_INST_RETIRED_SCALAR_SINGLE       0x02
 UMASK_SIMD_INST_RETIRED_PACKED_DOUBLE       0x04
@@ -567,10 +634,10 @@ UMASK_SIMD_INST_RETIRED_VECTOR              0x10
 UMASK_SIMD_INST_RETIRED_ANY                 0x1F
 
 EVENT_HW_INT_RCV             0xC8   PMC
-UMASK_HW_INT_RCV    0x00
+UMASK_HW_INT_RCV             0x00
 
 EVENT_ITLB_MISS_RETIRED             0xC9   PMC
-UMASK_ITLB_MISS_RETIRED    0x00
+UMASK_ITLB_MISS_RETIRED             0x00
 
 EVENT_SIMD_COMP_INST_RETIRED     0xCA   PMC
 UMASK_SIMD_COMP_INST_RETIRED_PACKED_SINGLE     0x01
@@ -578,69 +645,69 @@ UMASK_SIMD_COMP_INST_RETIRED_SCALAR_SINGLE     0x02
 UMASK_SIMD_COMP_INST_RETIRED_PACKED_DOUBLE     0x04
 UMASK_SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE     0x08
 
-EVENT_MEM_LOAD_RETIRED           0xCB    PMC0
-UMASK_MEM_LOAD_RETIRED_L1D_MISS       0x01 
-UMASK_MEM_LOAD_RETIRED_L1D_LINE_MISS  0x02 
-UMASK_MEM_LOAD_RETIRED_L2_MISS        0x04 
-UMASK_MEM_LOAD_RETIRED_L2_LINE_MISS   0x08 
-UMASK_MEM_LOAD_RETIRED_DTLB_MISS      0x10 
+EVENT_MEM_LOAD_RETIRED                0xCB    PMC0
+UMASK_MEM_LOAD_RETIRED_L1D_MISS       0x01
+UMASK_MEM_LOAD_RETIRED_L1D_LINE_MISS  0x02
+UMASK_MEM_LOAD_RETIRED_L2_MISS        0x04
+UMASK_MEM_LOAD_RETIRED_L2_LINE_MISS   0x08
+UMASK_MEM_LOAD_RETIRED_DTLB_MISS      0x10
 
 EVENT_FP_MMX_TRANS_TO             0xCC   PMC
-UMASK_FP_MMX_TRANS_TO_MMX    0x01
-UMASK_FP_MMX_TRANS_TO_FP    0x02
+UMASK_FP_MMX_TRANS_TO_MMX         0x01
+UMASK_FP_MMX_TRANS_TO_FP          0x02
 
 EVENT_SIMD_ASSIST             0xCD   PMC
-UMASK_SIMD_ASSIST    0x00
+UMASK_SIMD_ASSIST             0x00
 
 EVENT_SIMD_INSTR_RETIRED             0xCE   PMC
-UMASK_SIMD_INSTR_RETIRED    0x00
+UMASK_SIMD_INSTR_RETIRED             0x00
 
-EVENT_SIMD_SAT_INSTR_RETIRED             0xCF   PMC
+EVENT_SIMD_SAT_INSTR_RETIRED    0xCF   PMC
 UMASK_SIMD_SAT_INSTR_RETIRED    0x00
 
-EVENT_RAT_STALLS             0xD2   PMC
+EVENT_RAT_STALLS                  0xD2   PMC
 UMASK_RAT_STALLS_ROB_READ_PORT    0x01
-UMASK_RAT_STALLS_PARTIAL_CYCLES    0x02
-UMASK_RAT_STALLS_FLAGS    0x04
-UMASK_RAT_STALLS_FPSW    0x08
-UMASK_RAT_STALLS_ANY    0x0F
+UMASK_RAT_STALLS_PARTIAL_CYCLES   0x02
+UMASK_RAT_STALLS_FLAGS            0x04
+UMASK_RAT_STALLS_FPSW             0x08
+UMASK_RAT_STALLS_ANY              0x0F
 
 EVENT_SEG_RENAME_STALLS        0xD4   PMC
-UMASK_SEG_RENAME_STALLS_ES    0x01
-UMASK_SEG_RENAME_STALLS_DS    0x02
-UMASK_SEG_RENAME_STALLS_FS    0x04
-UMASK_SEG_RENAME_STALLS_GS    0x08
-UMASK_SEG_RENAME_STALLS_ANY    0x0F
-
-EVENT_SEG_REG_RENAMES        0xD5   PMC
-UMASK_SEG_RENAME_STALLS_ES    0x01
-UMASK_SEG_RENAME_STALLS_DS    0x02
-UMASK_SEG_RENAME_STALLS_FS    0x04
-UMASK_SEG_RENAME_STALLS_GS    0x08
+UMASK_SEG_RENAME_STALLS_ES     0x01
+UMASK_SEG_RENAME_STALLS_DS     0x02
+UMASK_SEG_RENAME_STALLS_FS     0x04
+UMASK_SEG_RENAME_STALLS_GS     0x08
 UMASK_SEG_RENAME_STALLS_ANY    0x0F
 
-EVENT_RESOURCE_STALLS        0xDC   PMC
-UMASK_RESOURCE_STALLS_ROB_FULL    0x01
-UMASK_RESOURCE_STALLS_RS_FULL    0x02
-UMASK_RESOURCE_STALLS_LD_ST    0x04
-UMASK_RESOURCE_STALLS_FPCW    0x08
-UMASK_RESOURCE_STALLS_BR_MISS_CLEAR    0x10
-UMASK_RESOURCE_STALLS_ANY    0x1F
+EVENT_SEG_REG_RENAMES         0xD5   PMC
+UMASK_SEG_REG_RENAMES_STALLS_ES    0x01
+UMASK_SEG_REG_RENAMES_STALLS_DS    0x02
+UMASK_SEG_REG_RENAMES_STALLS_FS    0x04
+UMASK_SEG_REG_RENAMES_STALLS_GS    0x08
+UMASK_SEG_REG_RENAMES_STALLS_ANY   0x0F
+
+EVENT_RESOURCE_STALLS                 0xDC   PMC
+UMASK_RESOURCE_STALLS_ROB_FULL        0x01
+UMASK_RESOURCE_STALLS_RS_FULL         0x02
+UMASK_RESOURCE_STALLS_LD_ST           0x04
+UMASK_RESOURCE_STALLS_FPCW            0x08
+UMASK_RESOURCE_STALLS_BR_MISS_CLEAR   0x10
+UMASK_RESOURCE_STALLS_ANY             0x1F
 
 EVENT_BR_INST_DECODED        0xE0   PMC
-UMASK_BR_INST_DECODED    0x00
+UMASK_BR_INST_DECODED        0x00
 
 EVENT_BOGUS_BR        0xE4   PMC
-UMASK_BOGUS_BR    0x00
+UMASK_BOGUS_BR        0x00
 
 EVENT_BACLEARS        0xE6   PMC
-UMASK_BACLEARS    0x00
+UMASK_BACLEARS        0x00
 
 EVENT_PREF_RQSTS_UP        0xF0   PMC
-UMASK_PREF_RQSTS_UP    0x00
+UMASK_PREF_RQSTS_UP        0x00
 
 EVENT_PREF_RQSTS_DN        0xF8   PMC
-UMASK_PREF_RQSTS_DN    0x00
+UMASK_PREF_RQSTS_DN        0x00
 
 
 
diff --git a/src/includes/perfmon_haswell.h b/src/includes/perfmon_haswell.h
index 57f12af..23d1b64 100644
--- a/src/includes/perfmon_haswell.h
+++ b/src/includes/perfmon_haswell.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_haswell.h
  *
- *      Description:  Header File of perfmon module for Haswell.
+ *      Description:  Header File of perfmon module for Intel Haswell.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -28,382 +29,1880 @@
  * =======================================================================================
  */
 
+#include <perfmon_haswellEP_events.h>
 #include <perfmon_haswell_events.h>
-#include <perfmon_haswell_groups.h>
+#include <perfmon_haswellEP_counters.h>
 #include <perfmon_haswell_counters.h>
+#include <error.h>
+#include <affinity.h>
+#include <limits.h>
+#include <topology.h>
+#include <access.h>
 
+
+static int perfmon_numCountersHaswellEP = NUM_COUNTERS_HASWELL_EP;
+static int perfmon_numCoreCountersHaswellEP = NUM_COUNTERS_CORE_HASWELL_EP;
+static int perfmon_numArchEventsHaswellEP = NUM_ARCH_EVENTS_HASWELLEP;
 static int perfmon_numCountersHaswell = NUM_COUNTERS_HASWELL;
-static int perfmon_numGroupsHaswell = NUM_GROUPS_HASWELL;
+static int perfmon_numCoreCountersHaswell = NUM_COUNTERS_CORE_HASWELL;
 static int perfmon_numArchEventsHaswell = NUM_ARCH_EVENTS_HASWELL;
 
+int has_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event);
+int hasep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event);
+int (*haswell_cbox_setup)(int, RegisterIndex, PerfmonEvent *);
+
+int perfmon_init_haswell(int cpu_id)
+{
+    int ret;
+    uint64_t data;
+    lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id); /* the cbox setup code later compares socket_lock[...] with cpu_id */
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    ret = HPMwrite(cpu_id, MSR_DEV, MSR_UNC_CBO_0_PERFEVTSEL0, 0x0ULL); /* probe: ret accumulates the return codes of the four uncore accesses */
+    ret += HPMread(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, &data);
+    ret += HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL);
+    ret += HPMread(cpu_id, MSR_DEV, MSR_UNC_CBO_0_PERFEVTSEL0, &data); /* data now holds the read-back of the register zeroed above */
+    if (cpuid_info.model == HASWELL_EP)
+    {
+        haswell_cbox_setup = hasep_cbox_setup;
+    }
+    else if ((ret == 0) && (data == 0x0ULL)) /* every probe access returned 0 and the register read back as zero */
+    {
+        haswell_cbox_setup = has_cbox_setup;
+    }
+    return 0; /* haswell_cbox_setup is not assigned in this function when neither branch matched */
+}
+
+
+uint32_t hasep_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint32_t flags = (1ULL<<(1+(index*4))); /* each index owns a 4-bit field; bit 1 of that field is always set */
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_COUNT_KERNEL:
+                flags |= (1ULL<<(index*4)); /* bit 0 of the field */
+                break;
+            case EVENT_OPTION_ANYTHREAD:
+                flags |= (1ULL<<(2+(index*4))); /* bit 2 of the field */
+                break;
+            default:
+                break;
+        }
+    }
+    return flags; /* only computes the bits; no MSR is written in this function and cpu_id is not used */
+}
 
-#define OFFSET_PMC 3
 
-void perfmon_init_haswell(PerfmonThread *thread)
+int hasep_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
 {
+    int j;
     uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
-
-    /* Initialize registers */
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC0, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC1, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC2, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC3, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
+    uint64_t offcore_flags = 0x0ULL;
+    uint64_t latency_flags = 0x0ULL; /* NOTE(review): never read or written again in this function */
 
-    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
-    if (cpuid_info.model != HASWELL_EX && cpuid_info.supportUncore)
+    flags = (1ULL<<22)|(1ULL<<16); /* bits 22 and 16 are set for every event */
+    /* Intel with standard 8 bit event mask: [7:0] */
+    flags |= (event->umask<<8) + event->eventId;
+
+    /* set custom cfg and cmask */
+    if ((event->cfgBits != 0) &&
+        (event->eventId != 0xB7) &&
+        (event->eventId != 0xBB))
+    {
+        flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+    }
+
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_ANYTHREAD:
+                    flags |= (1ULL<<21);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24; /* low 8 bits of the option value, placed at bit 24 */
+                    break;
+                case EVENT_OPTION_IN_TRANS:
+                    flags |= (1ULL<<32);
+                    break;
+                case EVENT_OPTION_IN_TRANS_ABORT:
+                    flags |= (1ULL<<33);
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    offcore_flags |= (event->options[j].value & 0x8FFFULL); /* MATCH0/MATCH1 feed offcore_flags, not the config flags */
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    offcore_flags |= (event->options[j].value << 16);
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    /* Event ids 0xB7 and 0xBB additionally write offcore_flags to MSR_OFFCORE_RESP0 and MSR_OFFCORE_RESP1 respectively. */
+    if (event->eventId == 0xB7)
+    {
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask); /* replaces any value collected from the MATCH options */
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
+    }
+    else if (event->eventId == 0xBB)
+    {
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, offcore_flags));
+    }
+    if (flags != currentConfig[cpu_id][index]) /* skip the MSR write when the cached configuration already matches */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister , flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int has_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint64_t flags = 0x0ULL;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* do nothing unless socket_lock for this CPU's node holds cpu_id */
+    {
+        return 0;
+    }
+    flags = (1ULL<<22)|(1ULL<<20); /* bits 22 and 20 are set for every event */
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type) /* option types other than the three below are ignored */
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0x1FULL) << 24; /* only the low 5 bits of the value are used here */
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index]) /* skip the MSR write when the cached configuration already matches */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_CBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int hasep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{   /* Set up one CBOX counter: build its control value from event, apply option-selected filters, write via MSR_DEV. */
+    int j;
+    uint64_t flags = 0x0ULL;
+    uint64_t filter_flags;
+    uint32_t filter0 = box_map[counter_map[index].type].filterRegister1;
+    uint32_t filter1 = box_map[counter_map[index].type].filterRegister2;
+    int set_state_all = 0;   /* 1 => write the default state filter after the option loop */
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)   /* CPUs not holding the socket_lock entry return 0 without touching a register */
+    {
+        return 0;
+    }
+
+    flags = (1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;   /* eventId in the low bits, umask shifted up by 8 */
+    if (event->eventId == 0x34)   /* event 0x34 gets the default state filter unless a STATE option is given */
+    {
+        set_state_all = 1;
+    }
+
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            filter_flags = 0x0ULL;
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;   /* low 8 bits of the value, placed at bit 24 */
+                    break;
+                case EVENT_OPTION_OPCODE:   /* read-modify-write of filter1: bits already set in the register are kept */
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, filter1, &filter_flags));
+                    filter_flags |= (0x3<<27);
+                    filter_flags |= (extractBitField(event->options[j].value,5,0) << 20);
+                    VERBOSEPRINTREG(cpu_id, filter1, filter_flags, SETUP_CBOX_FILTER_OPCODE);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter1, filter_flags));
+                    break;
+                case EVENT_OPTION_NID:   /* also ORed into filter1 */
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, filter1, &filter_flags));
+                    filter_flags |= (extractBitField(event->options[j].value,16,0));
+                    VERBOSEPRINTREG(cpu_id, filter1, filter_flags, SETUP_CBOX_FILTER_NID);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter1, filter_flags));
+                    break;
+                case EVENT_OPTION_STATE:   /* ORed into filter0 at bit 17 */
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, filter0, &filter_flags));
+                    filter_flags |= (extractBitField(event->options[j].value,6,0) << 17);
+                    VERBOSEPRINTREG(cpu_id, filter0, filter_flags, SETUP_CBOX_FILTER_STATE);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter0, filter_flags));
+                    set_state_all = 0;   /* an explicit state filter suppresses the default one */
+                    break;
+                case EVENT_OPTION_TID:   /* ORed into filter0; additionally sets bit 19 of the control value */
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, filter0, &filter_flags));
+                    filter_flags |= (extractBitField(event->options[j].value,6,0));
+                    VERBOSEPRINTREG(cpu_id, filter0, filter_flags, SETUP_CBOX_FILTER_TID);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter0, filter_flags));
+                    flags |= (1ULL<<19);
+                    break;
+                default:   /* other option types are ignored */
+                    break;
+            }
+        }
+    }
+
+    if (set_state_all)   /* default state filter: OR 0x1F at bit 17 into filter0 */
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, filter0, &filter_flags));
+        filter_flags |= (0x1F << 17);
+        VERBOSEPRINTREG(cpu_id, filter0, filter_flags, SETUP_CBOX_DEF_FILTER_STATE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter0, filter_flags));
+    }
+    if (flags != currentConfig[cpu_id][index])   /* skip the write when the cached value is already identical */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_CBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int hasep_ubox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{   /* Set up one UBOX counter: build its control value from event and its options, write it via MSR_DEV. */
+    int j;
+    uint64_t flags = 0x0ULL;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)   /* CPUs not holding the socket_lock entry return 0 without writing */
+    {
+        return 0;
+    }
+
+    flags = (1ULL<<22)|(1ULL<<20);
+    flags |= (event->umask<<8) + event->eventId;   /* eventId in the low bits, umask shifted up by 8 */
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0x1FULL) << 24;   /* only the low 5 bits of the value are used here */
+                    break;
+                default:   /* other option types are ignored */
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])   /* skip the write when the cached value is already identical */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_UBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int hasep_wbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{   /* Set up one WBOX counter: build its control value, write or clear the box filter register, write via MSR_DEV. */
+    int j;
+    uint64_t flags = 0x0ULL;
+    int clean_filter_reg = 1;   /* stays 1 unless an OCCUPANCY_FILTER option writes the filter register */
+    uint64_t filter = box_map[counter_map[index].type].filterRegister1;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)   /* CPUs not holding the socket_lock entry return 0 without writing */
+    {
+        return 0;
+    }
+
+    flags = (1ULL<<22)|(1ULL<<20);
+    flags |= event->eventId;
+    if ((event->umask > 0x00) && (event->umask <= 0x3))   /* umask values 1..3 are placed at bit 14 */
+    {
+        flags |= (event->umask << 14);
+    }
+    else if (event->umask == 0xFF)
+    {
+        flags = (1ULL<<21);   /* NOTE(review): plain assignment discards bits 22/20 and eventId set above - confirm '|=' was not intended */
+    }
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0x1FULL) << 24;   /* only the low 5 bits of the value are used here */
+                    break;
+                case EVENT_OPTION_OCCUPANCY:
+                    flags |= ((event->options[j].value & 0x3ULL)<<14);   /* same bit position as the umask 1..3 case above */
+                    break;
+                case EVENT_OPTION_OCCUPANCY_FILTER:   /* low 32 bits of the value go straight into the filter register */
+                    clean_filter_reg = 0;
+                    VERBOSEPRINTREG(cpu_id, filter, (event->options[j].value & 0xFFFFFFFFULL), SETUP_WBOX_FILTER);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter, (event->options[j].value & 0xFFFFFFFFULL)));
+                    break;
+                case EVENT_OPTION_OCCUPANCY_EDGE:
+                    flags |= (1ULL<<31);
+                    break;
+                case EVENT_OPTION_OCCUPANCY_INVERT:
+                    flags |= (1ULL<<30);
+                    break;
+                default:   /* other option types are ignored */
+                    break;
+            }
+        }
+    }
+
+    if (clean_filter_reg)   /* no filter option given: zero the filter register */
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter, 0x0ULL));
+    }
+    if (flags != currentConfig[cpu_id][index])   /* skip the write when the cached value is already identical */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_WBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+
+int hasep_bbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{   /* Set up one BBOX counter on its PCI device: opcode/address match registers first, then the control register. */
+    int j;
+    uint64_t flags = 0x0ULL;
+    uint64_t filter = 0x0ULL;
+    int opcode_flag = 0;   /* 1 once an OPCODE option wrote the opcode-match register */
+    int match_flag = 0;    /* 1 once a MATCH0 option wrote both address-match registers */
+    PciDeviceIndex dev = counter_map[index].device;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)   /* CPUs not holding the socket_lock entry return 0 without writing */
+    {
+        return 0;
+    }
+    if (!HPMcheck(dev, cpu_id))   /* device failed HPMcheck for this CPU */
+    {
+        return -ENODEV;
+    }
+
+    flags |= (1ULL<<20);
+    flags |= (event->umask<<8) + event->eventId;   /* eventId in the low bits, umask shifted up by 8 */
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;
+                    break;
+                case EVENT_OPTION_OPCODE:
+                    VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH,
+                                        (event->options[j].value & 0x3FULL), SETUP_BBOX_OPCODE);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH,
+                                        (event->options[j].value & 0x3FULL)));
+                    opcode_flag = 1;
+                    break;
+                case EVENT_OPTION_MATCH0:   /* one 64-bit option value is split over ADDRMATCH0 (low part) and ADDRMATCH1 (bits 32 and up) */
+                    filter = ((event->options[j].value & 0xFFFFFFC0ULL));
+                    VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, filter, SETUP_ADDR0_FILTER);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, filter));   /* PCI check macro, like every other write to dev */
+                    filter = (((event->options[j].value>>32) & 0x3FFFULL));
+                    VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, filter, SETUP_ADDR1_FILTER);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, filter));
+                    match_flag = 1;
+                    break;
+                default:   /* other option types are ignored */
+                    break;
+            }
+        }
+    }
+
+    if (!opcode_flag)   /* no OPCODE option: zero the opcode-match register */
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH, 0x0ULL, CLEAR_BBOX_OPCODE);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH, 0x0ULL));
+    }
+    if (!match_flag)   /* no MATCH0 option: zero both address-match registers */
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, 0x0ULL, CLEAR_BBOX_MATCH0);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, 0x0ULL));
+        VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, 0x0ULL, CLEAR_BBOX_MATCH1);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, 0x0ULL));
+    }
+    if ((flags|(1ULL<<22)) != currentConfig[cpu_id][index])   /* the cache stores the value including bit 22 */
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_BBOX);
+        CHECK_PCI_WRITE_ERROR(HPMwrite( cpu_id, dev, counter_map[index].configRegister, flags));
+        /* Intel notes the registers must be written twice to hold, once without enable and again with enable.
+         * Not mentioned for the BBOX but we do it to be sure.
+         */
+        flags |= (1ULL<<22);
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_BBOX_TWICE);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int hasep_sbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{   /* Set up one SBOX counter: build its control value and write it twice (without, then with bit 22) to counter_map[index].device. */
+    int j;
+    uint64_t flags = 0x0ULL;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)   /* CPUs not holding the socket_lock entry return 0 without writing */
+    {
+        return 0;
+    }
+    if (!HPMcheck(counter_map[index].device, cpu_id))   /* device failed HPMcheck for this CPU */
+    {
+        return -ENODEV;
+    }
+
+    flags |= (event->umask<<8) + event->eventId;   /* eventId in the low bits, umask shifted up by 8 */
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_TID:   /* only sets bit 19; the option value itself is not used */
+                    flags |= (1ULL<<19);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0xFFULL)<<24);
+                    break;
+
+                default:   /* other option types are ignored */
+                    break;
+            }
+        }
+    }
+    if ((flags|(1ULL<<22)) != currentConfig[cpu_id][index])   /* the cache stores the value including bit 22 */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_SBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, counter_map[index].device, counter_map[index].configRegister, flags));
+        flags |= (1ULL<<22);   /* second write repeats the value with bit 22 added */
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_SBOX_TWICE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, counter_map[index].device, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+
+int hasep_mbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{   /* Set up one MBOX counter on its PCI device: build the control value and write it twice (without, then with bit 22). */
+    int j;
+    uint64_t flags = 0x0ULL;
+    PciDeviceIndex dev = counter_map[index].device;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)   /* CPUs not holding the socket_lock entry return 0 without writing */
+    {
+        return 0;
+    }
+    if (!HPMcheck(dev, cpu_id))   /* device failed HPMcheck for this CPU */
+    {
+        return -ENODEV;
+    }
+
+    flags = (1ULL<<20);
+    flags |= (event->umask<<8) + event->eventId;   /* eventId in the low bits, umask shifted up by 8 */
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;
+                    break;
+                default:   /* other option types are ignored */
+                    break;
+            }
+        }
+    }
+    if ((flags|(1ULL<<22)) != currentConfig[cpu_id][index])   /* the cache stores the value including bit 22 */
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_MBOX);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+        /* Intel notes the registers must be written twice to hold, once without enable and again with enable */
+        flags |= (1ULL<<22);
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_MBOX_TWICE);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int hasep_ibox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{   /* Set up one IBOX counter on its PCI device: build the control value and write it twice (without, then with bit 22). */
+    int j;
+    uint64_t flags = 0x0ULL;
+    PciDeviceIndex dev = counter_map[index].device;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)   /* CPUs not holding the socket_lock entry return 0 without writing */
+    {
+        return 0;
+    }
+    if (!HPMcheck(counter_map[index].device, cpu_id))   /* same device as dev above */
+    {
+        return -ENODEV;
+    }
+
+    flags = (1ULL<<20);
+    flags |= (event->umask<<8) + event->eventId;   /* eventId in the low bits, umask shifted up by 8 */
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;
+                    break;
+                default:   /* other option types are ignored */
+                    break;
+            }
+        }
+    }
+    if ((flags|(1ULL<<22)) != currentConfig[cpu_id][index])   /* the cache stores the value including bit 22 */
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_IBOX);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+        /* Intel notes the registers must be written twice to hold, once without enable and again with enable */
+        flags |= (1ULL<<22);
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_IBOX_TWICE);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+
+int hasep_pbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{   /* Set up one PBOX counter on its PCI device: build the control value and write it twice (without, then with bit 22). */
+    int j;
+    uint64_t flags = 0x0ULL;
+    PciDeviceIndex dev = counter_map[index].device;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)   /* CPUs not holding the socket_lock entry return 0 without writing */
+    {
+        return 0;
+    }
+    if (!HPMcheck(counter_map[index].device, cpu_id))   /* same device as dev above */
+    {
+        return -ENODEV;
+    }
+
+    flags = (1ULL<<20);
+    flags |= (event->umask<<8) + event->eventId;   /* eventId in the low bits, umask shifted up by 8 */
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;
+                    break;
+                default:   /* other option types are ignored */
+                    break;
+            }
+        }
+    }
+    if ((flags|(1ULL<<22)) != currentConfig[cpu_id][index])   /* the cache stores the value including bit 22 */
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_PBOX);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+        /* Intel notes the registers must be written twice to hold, once without enable and again with enable */
+        flags |= (1ULL<<22);
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_PBOX_TWICE);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int hasep_rbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{   /* Set up one RBOX counter on its PCI device: build the control value and write it twice (without, then with bit 22). */
+    int j;
+    uint64_t flags = 0x0ULL;
+    PciDeviceIndex dev = counter_map[index].device;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)   /* CPUs not holding the socket_lock entry return 0 without writing */
+    {
+        return 0;
+    }
+    if (!HPMcheck(counter_map[index].device, cpu_id))   /* same device as dev above */
+    {
+        return -ENODEV;
+    }
+
+    flags = (1ULL<<20);
+    flags |= (event->umask<<8) + event->eventId;   /* eventId in the low bits, umask shifted up by 8 */
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;
+                    break;
+                default:   /* other option types are ignored */
+                    break;
+            }
+        }
+    }
+    if ((flags|(1ULL<<22)) != currentConfig[cpu_id][index])   /* the cache stores the value including bit 22 */
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_RBOX);   /* label names this box, not PBOX */
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+        /* Intel notes the registers must be written twice to hold, once without enable and again with enable */
+        flags |= (1ULL<<22);
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_RBOX_TWICE);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int hasep_qbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event, PciDeviceIndex filterdev)
+{   /* Set up one QBOX counter on counter_map[index].device; MATCH/MASK options program registers of the separate filterdev. */
+    int j;
+    uint64_t flags = 0x0ULL;
+    uint64_t filterreg;
+    uint64_t filterval = 0x0ULL;
+    PciDeviceIndex dev = counter_map[index].device;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)   /* CPUs not holding the socket_lock entry return 0 without writing */
+    {
+        return 0;
+    }
+    if (!HPMcheck(counter_map[index].device, cpu_id))   /* same device as dev above */
+    {
+        return -ENODEV;
+    }
+
+    flags = (1ULL<<20);
+    flags |= (event->umask<<8) + event->eventId;   /* eventId in the low bits, umask shifted up by 8 */
+    if (event->cfgBits == 0x01)
     {
-        msr_write(cpu_id, MSR_UNC_CBO_0_PERFEVTSEL0, 0xAA);
-        flags = msr_read(cpu_id, MSR_UNC_CBO_0_PERFEVTSEL0);
-        if (flags != 0xAA)
+        flags |= (1ULL<<21);
+    }
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
         {
-            fprintf(stdout, "The current system does not support Uncore MSRs, deactivating Uncore support\n");
-            cpuid_info.supportUncore = 0;
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;
+                    break;
+                case EVENT_OPTION_MATCH0:   /* MATCH0..3 / MASK0..3: each writes one masked value to one filterdev register */
+                    if (HPMcheck(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_RX_MATCH_0;
+                        filterval = event->options[j].value & 0x8003FFF8ULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_RX_MATCH0);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));   /* write to the device checked and logged above, not dev */
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    if (HPMcheck(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_RX_MATCH_1;
+                        filterval = event->options[j].value & 0x000F000FULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_RX_MATCH1);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MATCH2:
+                    if (HPMcheck(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_TX_MATCH_0;
+                        filterval = event->options[j].value & 0x8003FFF8ULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_TX_MATCH0);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MATCH3:
+                    if (HPMcheck(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_TX_MATCH_1;
+                        filterval = event->options[j].value & 0x000F000FULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_TX_MATCH1);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MASK0:
+                    if (HPMcheck(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_RX_MASK_0;
+                        filterval = event->options[j].value & 0x8003FFF8ULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_RX_MASK0);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MASK1:
+                    if (HPMcheck(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_RX_MASK_1;
+                        filterval = event->options[j].value & 0x000F000FULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_RX_MASK1);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MASK2:
+                    if (HPMcheck(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_TX_MASK_0;
+                        filterval = event->options[j].value & 0x8003FFF8ULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_TX_MASK0);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MASK3:
+                    if (HPMcheck(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_TX_MASK_1;   /* second TX mask register; MASK2 already owns TX_MASK_0 */
+                        filterval = event->options[j].value & 0x000F000FULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_TX_MASK1);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                default:   /* other option types are ignored */
+                    break;
+            }
         }
     }
-
-
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) && (cpuid_info.supportUncore))
+    if ((flags|(1ULL<<22)) != currentConfig[cpu_id][index])   /* the cache stores the value including bit 22 */
     {
-        flags = 0x0ULL;
-        flags = (1ULL<<22)|(1ULL<<20);
-        msr_write(cpu_id, MSR_UNC_CBO_0_PERFEVTSEL0, flags);
-        msr_write(cpu_id, MSR_UNC_CBO_0_PERFEVTSEL1, flags);
-        msr_write(cpu_id, MSR_UNC_CBO_1_PERFEVTSEL0, flags);
-        msr_write(cpu_id, MSR_UNC_CBO_1_PERFEVTSEL1, flags);
-        msr_write(cpu_id, MSR_UNC_CBO_2_PERFEVTSEL0, flags);
-        msr_write(cpu_id, MSR_UNC_CBO_2_PERFEVTSEL1, flags);
-        msr_write(cpu_id, MSR_UNC_CBO_3_PERFEVTSEL0, flags);
-        msr_write(cpu_id, MSR_UNC_CBO_3_PERFEVTSEL1, flags);
-
-        msr_write(cpu_id, MSR_UNC_ARB_PERFEVTSEL0, flags);
-        msr_write(cpu_id, MSR_UNC_ARB_PERFEVTSEL1, flags);
-
-        msr_write(cpu_id, MSR_UNC_PERF_FIXED_CTRL, flags);
-
-        msr_write(cpu_id, MSR_UNC_CBO_0_CTR0, 0x0ULL);
-        msr_write(cpu_id, MSR_UNC_CBO_0_CTR1, 0x0ULL);
-        msr_write(cpu_id, MSR_UNC_CBO_1_CTR0, 0x0ULL);
-        msr_write(cpu_id, MSR_UNC_CBO_1_CTR1, 0x0ULL);
-        msr_write(cpu_id, MSR_UNC_CBO_2_CTR0, 0x0ULL);
-        msr_write(cpu_id, MSR_UNC_CBO_2_CTR1, 0x0ULL);
-        msr_write(cpu_id, MSR_UNC_CBO_3_CTR0, 0x0ULL);
-        msr_write(cpu_id, MSR_UNC_CBO_3_CTR1, 0x0ULL);
-
-        msr_write(cpu_id, MSR_UNC_ARB_CTR0, 0x0ULL);
-        msr_write(cpu_id, MSR_UNC_ARB_CTR1, 0x0ULL);
-
-        msr_write(cpu_id, MSR_UNC_PERF_FIXED_CTR, 0x0ULL);
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_QBOX);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+        /* Intel notes the registers must be written twice to hold, once without enable and again with enable */
+        flags |= (1ULL<<22);
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_QBOX_TWICE);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
     }
+    return 0;
 }
 
-#define HAS_SETUP_BOX \
-    if (haveLock) \
+#define HASEP_FREEZE_UNCORE /* Stop uncore counting; expects haveLock, eventSet and cpu_id in the expanding scope. */ \
+    if (haveLock && eventSet->regTypeMask & ~(0xFULL) && cpuid_info.model == HASWELL_EP) \
+    { \
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST (1ULL<<31), FREEZE_UNCORE); \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, (1ULL<<31))); \
+    } \
+    else if (haveLock && eventSet->regTypeMask & ~(0xFULL)) /* any other model: clear bit 29, the bit HASEP_UNFREEZE_UNCORE sets */ \
+    { \
+        uint64_t data = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, &data)); \
+        if (data & (1ULL<<29)) /* write back only when bit 29 is currently set */ \
+        { \
+            data &= ~(1ULL<<29); \
+            VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, data, FREEZE_UNCORE); \
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, data)); \
+        } \
+    } \
+
+#define HASEP_UNFREEZE_UNCORE /* Restart uncore counting; expects haveLock, eventSet and cpu_id in the expanding scope. */ \
+    if (haveLock && eventSet->regTypeMask & ~(0xFULL) && cpuid_info.model == HASWELL_EP) \
+    { \
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST (1ULL<<29), UNFREEZE_UNCORE); \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, (1ULL<<29))); \
+    } \
+    else if (haveLock && eventSet->regTypeMask & ~(0xFULL)) /* any other model: read-modify-write that sets bit 29 */ \
+    { \
+        uint64_t data = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, &data)); \
+        data |= (1ULL<<29); \
+        VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, data, UNFREEZE_UNCORE); \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, data)); \
+    }
+
+#define HASEP_UNFREEZE_UNCORE_AND_RESET_CTR /* Zero the counter registers of all uncore events in eventSet, then unfreeze. */ \
+    if (haveLock && (eventSet->regTypeMask & ~(0xFULL))) \
+    { \
+        for (int i=0;i < eventSet->numberOfEvents;i++) \
+        { \
+            RegisterIndex index = eventSet->events[i].index; \
+            RegisterType type = counter_map[index].type; \
+            if ((type < UNCORE) || (type == WBOX0FIX)) /* skip types below UNCORE and the WBOX0FIX counter */ \
+            { \
+                continue; \
+            } \
+            PciDeviceIndex dev = counter_map[index].device; \
+            if (HPMcheck(dev, cpu_id)) { \
+                int err = 0; \
+                VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR_MANUAL); \
+                err = HPMwrite(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL); \
+                if (err != 0) \
+                { \
+                    eventSet->events[i].type = NOTYPE; /* events[] is indexed by i; index addresses counter_map */ \
+                } \
+                else if (counter_map[index].counterRegister2 != 0x0) /* a second counter register exists: zero it too */ \
+                { \
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL, CLEAR_CTR_MANUAL); \
+                    err = HPMwrite(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL); \
+                    if (err != 0) \
+                    { \
+                        eventSet->events[i].type = NOTYPE; \
+                    } \
+                } \
+            } \
+        } \
+        HASEP_UNFREEZE_UNCORE; \
+    }
+
+#define HASEP_FREEZE_UNCORE_AND_RESET_CTL /* Freeze, then zero the config and filter registers of all uncore events in eventSet. */ \
+    if (haveLock && (eventSet->regTypeMask & ~(REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)|REG_TYPE_MASK(THERMAL)|REG_TYPE_MASK(POWER)))) \
     { \
-        flags = (1ULL<<22)|(1ULL<<20); \
-        flags |= (event->umask<<8) + event->eventId; \
-        if (event->cfgBits != 0) /* set custom cfg and cmask */ \
+        HASEP_FREEZE_UNCORE; \
+        for (int i=0;i < eventSet->numberOfEvents;i++) \
         { \
-            flags &= ~(0xFFFFU<<16);  /* clear upper 16bits */ \
-            flags |= ((event->cmask<<8) + event->cfgBits)<<16; \
+            RegisterIndex index = eventSet->events[i].index; \
+            RegisterType type = counter_map[index].type; \
+            if ((type < UNCORE) || (type == WBOX0FIX)) /* skip types below UNCORE and the WBOX0FIX counter */ \
+            { \
+                continue; \
+            } \
+            PciDeviceIndex dev = counter_map[index].device; \
+            if (HPMcheck(dev, cpu_id)) { \
+                VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, 0x0ULL, CLEAR_CTL_MANUAL); \
+                CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, 0x0ULL)); \
+                if ((type >= SBOX0) && (type <= SBOX3)) { /* SBOX config registers get the zero write a second time */ \
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, 0x0ULL)); \
+                } \
+                if (box_map[type].filterRegister1 != 0x0) \
+                { \
+                    VERBOSEPRINTPCIREG(cpu_id, dev, box_map[type].filterRegister1, 0x0ULL, CLEAR_FILTER); \
+                    HPMwrite(cpu_id, dev, box_map[type].filterRegister1, 0x0ULL); /* return value not checked */ \
+                } \
+                if (box_map[type].filterRegister2 != 0x0) \
+                { \
+                    VERBOSEPRINTPCIREG(cpu_id, dev, box_map[type].filterRegister2, 0x0ULL, CLEAR_FILTER); \
+                    HPMwrite(cpu_id, dev, box_map[type].filterRegister2, 0x0ULL); /* return value not checked */ \
+                } \
+            } \
         } \
-        msr_write(cpu_id, reg , flags); \
     }
 
-void perfmon_setupCounterThread_haswell(
+
+
+
+int perfmon_setupCounterThread_haswell(
         int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+        PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
-    uint64_t flags = 0x0ULL;
-    uint32_t uflags;
-    uint64_t reg = haswell_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
-    uint64_t orig_fixed_flags = fixed_flags;
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
+    uint64_t flags;
+    uint64_t fixed_flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    switch (haswell_counter_map[index].type)
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0xC00000070000000F));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    }
+
+    HASEP_FREEZE_UNCORE;
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        case PMC:
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        uint64_t reg = counter_map[index].configRegister;
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
+        flags = 0x0ULL;
+        switch (type)
+        {
+            case PMC:
+                hasep_pmc_setup(cpu_id, index, event);
+                break;
 
-            flags = (1<<22)|(1<<16);
+            case FIXED:
+                fixed_flags |= hasep_fixed_setup(cpu_id, index, event);
+                break;
 
-            /* Intel with standard 8 bit event mask: [7:0] */
-            flags |= (event->umask<<8) + event->eventId;
+            case POWER:
+                break;
 
-            if (event->cfgBits != 0) /* set custom cfg and cmask */
-            {
-                flags &= ~(0xFFFFU<<16);  /* clear upper 16bits */
-                flags |= ((event->cmask<<8) + event->cfgBits)<<16;
-            }
+            case CBOX0:
+            case CBOX1:
+            case CBOX2:
+            case CBOX3:
+            case CBOX4:
+            case CBOX5:
+            case CBOX6:
+            case CBOX7:
+            case CBOX8:
+            case CBOX9:
+            case CBOX10:
+            case CBOX11:
+            case CBOX12:
+            case CBOX13:
+            case CBOX14:
+            case CBOX15:
+            case CBOX16:
+            case CBOX17:
+                haswell_cbox_setup(cpu_id, index, event);
+                break;
+
+            case UBOX:
+                hasep_ubox_setup(cpu_id, index, event);
+                break;
+            case UBOXFIX:
+                if (haveLock)
+                {
+                    flags = (1ULL<<22)|(1ULL<<20);
+                    VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_UBOXFIX);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+                }
+                break;
+
+            case SBOX0:
+            case SBOX1:
+            case SBOX2:
+            case SBOX3:
+                hasep_sbox_setup(cpu_id, index, event);
+                break;
+
+            case BBOX0:
+            case BBOX1:
+                hasep_bbox_setup(cpu_id, index, event);
+                break;
+
+            case WBOX:
+                hasep_wbox_setup(cpu_id, index, event);
+                break;
+            case WBOX0FIX:
+                break;
+
+            case MBOX0:
+            case MBOX1:
+            case MBOX2:
+            case MBOX3:
+            case MBOX4:
+            case MBOX5:
+            case MBOX6:
+            case MBOX7:
+                hasep_mbox_setup(cpu_id, index, event);
+                break;
+             case MBOX0FIX:
+             case MBOX1FIX:
+             case MBOX2FIX:
+             case MBOX3FIX:
+             case MBOX4FIX:
+             case MBOX5FIX:
+             case MBOX6FIX:
+             case MBOX7FIX:
+                 if (haveLock && HPMcheck(counter_map[index].device, cpu_id))
+                 {
+                     CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, counter_map[index].device, reg, ((1ULL<<20)|(1ULL<<22))))
+                 }
+                 break;
+
+            case PBOX:
+                hasep_pbox_setup(cpu_id, index, event);
+                break;
+
+            case RBOX0:
+            case RBOX1:
+                hasep_rbox_setup(cpu_id, index, event);
+                break;
+
+            case QBOX0:
+                hasep_qbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_0);
+                break;
+            case QBOX1:
+                hasep_qbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_1);
+                break;
+
+            case IBOX0:
+            case IBOX1:
+                hasep_ibox_setup(cpu_id, index, event);
+                break;
+
+            default:
+                break;
+        }
+    }
+    if (fixed_flags > 0x0ULL)
+    {
+        // Erratum HSW143
+        //VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED_WORKAROUND)
+        //CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, (1ULL<<32)));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
+    }
+    return 0;
+}
+
+int perfmon_startCountersThread_haswell(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    uint64_t flags = 0x0ULL;
+    uint64_t tmp = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
 
-            if (perfmon_verbose)
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                printf("[%d] perfmon_setup_counter PMC: Write Register 0x%llX , Flags: 0x%llX \n",
-                        cpu_id,
-                        LLU_CAST reg,
-                        LLU_CAST flags);
+                continue;
             }
-            msr_write(cpu_id, reg , flags);
-            break;
-
-        case FIXED:
-            fixed_flags |= (0x2 << (index*4));
-            break;
-
-        case POWER:
-            break;
-
-        case CBOX0:
-        case CBOX1:
-        case CBOX2:
-        case CBOX3:
-        case UBOX:
-            if (cpuid_info.supportUncore)
+            tmp = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            PciDeviceIndex dev = counter_map[index].device;
+            eventSet->events[i].threadCounter[thread_id].startData = 0;
+            eventSet->events[i].threadCounter[thread_id].counterData = 0;
+            switch (type)
             {
-                HAS_SETUP_BOX;
+                case PMC:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+                    flags |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));  /* enable counter */
+                    VERBOSEPRINTREG(cpu_id, counter1, 0x0ULL, START_PMC);
+                    break;
+
+                case FIXED:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+                    flags |= (1ULL<<(index+32));  /* enable fixed counter */
+                    VERBOSEPRINTREG(cpu_id, counter1, 0x0ULL, START_FIXED);
+                    break;
+
+                case POWER:
+                    if (haveLock)
+                    {
+                        tmp = 0x0ULL;
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1,(uint32_t*)&tmp));
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST tmp, START_POWER)
+                        eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
+                    }
+                    break;
+                case WBOX0FIX:
+                    if (haveLock)
+                    {
+                        tmp = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, counter1, &tmp));
+                        VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST tmp, START_WBOXFIX);
+                        eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
+                    }
+                    break;
+                case QBOX0FIX:
+                case QBOX1FIX:
+                    if (haveLock && HPMcheck(dev, cpu_id))
+                    {
+                        if (eventSet->events[i].event.eventId != 0x00)
+                        {
+                            CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &tmp));
+                            VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST tmp, START_QBOXFIX);
+                            eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
+                        }
+                    }
+                    break;
+
+                default:
+                    break;
             }
-            break;
+            eventSet->events[i].threadCounter[thread_id].counterData = eventSet->events[i].threadCounter[thread_id].startData;
+        }
+    }
 
-        default:
-            /* should never be reached */
-            break;
+    HASEP_UNFREEZE_UNCORE_AND_RESET_CTR;
+    
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST (1ULL<<63)|(1ULL<<62)|flags, CLEAR_PMC_AND_FIXED_OVERFLOW)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|flags));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, UNFREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
+    }
+
+    return 0;
+}
+
+int has_uncore_read(int cpu_id, RegisterIndex index, PerfmonEvent *event,
+                     uint64_t* cur_result, int* overflows, int flags,
+                     int global_offset, int box_offset)
+{
+    uint64_t result = 0x0ULL;
+    uint64_t tmp = 0x0ULL;
+    RegisterType type = counter_map[index].type;
+    PciDeviceIndex dev = counter_map[index].device;
+    uint64_t counter1 = counter_map[index].counterRegister;
+    uint64_t counter2 = counter_map[index].counterRegister2;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+
+    CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &result));
+    VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST result, READ_REG_1);
+    if (flags & FREEZE_FLAG_CLEAR_CTR)
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST 0x0U, CLEAR_PCI_REG_1);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0U));
+    }
+    if (counter2 != 0x0)
+    {
+        result <<= 32;
+        CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter2, &tmp));
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter2, LLU_CAST tmp, READ_REG_2);
+        result += tmp;
+        if (flags & FREEZE_FLAG_CLEAR_CTR)
+        {
+            VERBOSEPRINTPCIREG(cpu_id, dev, counter2, LLU_CAST 0x0U, CLEAR_PCI_REG_2);
+            CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter2, 0x0U));
+        }
     }
-    if (fixed_flags != orig_fixed_flags)
+    result = field64(result, 0, box_map[type].regWidth);
+
+    if (result < *cur_result)
     {
-        msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+        uint64_t ovf_values = 0x0ULL;
+        int global_offset = box_map[type].ovflOffset;
+        int test_local = 0;
+        if (global_offset != -1)
+        {
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV,
+                                           MSR_UNC_V3_U_PMON_GLOBAL_STATUS,
+                                           &ovf_values));
+            VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, LLU_CAST ovf_values, READ_GLOBAL_OVFL);
+            if (ovf_values & (1<<global_offset))
+            {
+                VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, LLU_CAST (1<<global_offset), CLEAR_GLOBAL_OVFL);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
+                                                 MSR_UNC_V3_U_PMON_GLOBAL_STATUS,
+                                                 (1<<global_offset)));
+                test_local = 1;
+            }
+        }
+        else
+        {
+            test_local = 1;
+        }
+
+        if (test_local)
+        {
+            ovf_values = 0x0ULL;
+            CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev,
+                                              box_map[type].statusRegister,
+                                              &ovf_values));
+            VERBOSEPRINTPCIREG(cpu_id, dev, box_map[type].statusRegister, LLU_CAST ovf_values, READ_BOX_OVFL);
+            if (ovf_values & (1<<box_offset))
+            {
+                (*overflows)++;
+                VERBOSEPRINTPCIREG(cpu_id, dev, box_map[type].statusRegister, LLU_CAST (1<<box_offset), RESET_BOX_OVFL);
+                CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev,
+                                                    box_map[type].statusRegister,
+                                                    (1<<box_offset)));
+            }
+        }
     }
+    *cur_result = result;
+    return 0;
 }
 
-void perfmon_startCountersThread_haswell(int thread_id)
+#define HASEP_CHECK_CORE_OVERFLOW(offset) \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+    { \
+        uint64_t ovf_values = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values)); \
+        if (ovf_values & (1ULL<<offset)) \
+        { \
+            eventSet->events[i].threadCounter[thread_id].overflows++; \
+        } \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<offset))); \
+    }
+
+
+#define HASEP_CHECK_LOCAL_OVERFLOW \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+    { \
+        uint64_t ovf_values = 0x0ULL; \
+        uint64_t offset = getCounterTypeOffset(eventSet->events[i].index); \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, box_map[eventSet->events[i].type].statusRegister, &ovf_values)); \
+        if (ovf_values & (1ULL<<offset)) \
+        { \
+            eventSet->events[i].threadCounter[thread_id].overflows++; \
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[eventSet->events[i].type].statusRegister, (1ULL<<offset))); \
+        } \
+    }
+
+int perfmon_stopCountersThread_haswell(int thread_id, PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
-    uint64_t flags = 0x0ULL;
-    uint32_t uflags = 0x10000UL; /* Clear freeze bit */
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    int start_uncore = 0;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+
+    HASEP_FREEZE_UNCORE;
 
-    for ( int i=0; i<perfmon_numCountersHaswell; i++ )
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            switch (haswell_counter_map[i].type)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            PerfmonEvent *event = &(eventSet->events[i].event);
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
+            int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+            int ovf_offset = box_map[type].ovflOffset;
+            switch (type)
             {
                 case PMC:
-                    msr_write(cpu_id, haswell_counter_map[i].counterRegister, 0x0ULL);
-                    flags |= (1<<(i-OFFSET_PMC));  /* enable counter */
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    HASEP_CHECK_CORE_OVERFLOW(index-cpuid_info.perf_num_fixed_ctr);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_PMC)
+                    *current = field64(counter_result, 0, box_map[type].regWidth);
                     break;
 
                 case FIXED:
-                    msr_write(cpu_id, haswell_counter_map[i].counterRegister, 0x0ULL);
-                    flags |= (1ULL<<(i+32));  /* enable fixed counter */
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    HASEP_CHECK_CORE_OVERFLOW(index+32);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_FIXED)
+                    *current = field64(counter_result, 0, box_map[type].regWidth);
                     break;
 
                 case POWER:
-                    if(haveLock)
+                    if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
                     {
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                            power_read(cpu_id, haswell_counter_map[i].counterRegister);
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_POWER)
+                        if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                        {
+                            VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, OVERFLOW_POWER)
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                        }
+                        *current = field64(counter_result, 0, box_map[type].regWidth);
                     }
                     break;
 
+                case THERMAL:
+                    CHECK_TEMP_READ_ERROR(thermal_read(cpu_id,(uint32_t*)&counter_result));
+                    *current = field64(counter_result, 0, box_map[type].regWidth);
+                    break;
+
+                case PBOX:
+                case IBOX0:
+                case RBOX0:
+                case RBOX1:
+                case QBOX0:
+                case QBOX1:
+                case WBOX:
+                case SBOX0:
+                case SBOX1:
+                case SBOX2:
+                case SBOX3:
+                case UBOX:
+                case UBOXFIX:
                 case CBOX0:
                 case CBOX1:
                 case CBOX2:
                 case CBOX3:
-                case UBOX:
-                    start_uncore = 1;
+                case CBOX4:
+                case CBOX5:
+                case CBOX6:
+                case CBOX7:
+                case CBOX8:
+                case CBOX9:
+                case CBOX10:
+                case CBOX11:
+                case CBOX12:
+                case CBOX13:
+                case CBOX14:
+                case CBOX15:
+                case CBOX16:
+                case CBOX17:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, ovf_offset, getCounterTypeOffset(index));
+                    break;
+
+                case WBOX0FIX:
+                    if (haveLock)
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                        if (counter_result < *current)
+                        {
+                            (*overflows)++;
+                        }
+                        *current = counter_result;
+                    }
+                    break;
+
+                case BBOX0:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, ovf_offset, getCounterTypeOffset(index));
+                    break;
+                case BBOX1:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, ovf_offset, getCounterTypeOffset(index));
+                    break;
+
+                case MBOX0:
+                case MBOX1:
+                case MBOX2:
+                case MBOX3:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, ovf_offset, getCounterTypeOffset(index)+1);
+                    break;
+
+                case MBOX4:
+                case MBOX5:
+                case MBOX6:
+                case MBOX7:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, ovf_offset, getCounterTypeOffset(index)+1);
+                    break;
+
+                case MBOX0FIX:
+                case MBOX1FIX:
+                case MBOX2FIX:
+                case MBOX3FIX:
+                case MBOX4FIX:
+                case MBOX5FIX:
+                case MBOX6FIX:
+                case MBOX7FIX:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, ovf_offset, 0);
+                    break;
+
+                case IBOX1:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, ovf_offset, getCounterTypeOffset(index)+2);
+                    break;
+
+                case QBOX0FIX:
+                case QBOX1FIX:
+                    if (eventSet->events[i].event.eventId == 0x00)
+                    {
+                        HPMread(cpu_id, dev, counter1, &counter_result);
+                        switch(extractBitField(counter_result, 3, 0))
+                        {
+                            case 0x2:
+                                counter_result = 5.6E9;
+                                break;
+                            case 0x3:
+                                counter_result = 6.4E9;
+                                break;
+                            case 0x4:
+                                counter_result = 7.2E9;
+                                break;
+                            case 0x5:
+                                counter_result = 8.0E9;
+                                break;
+                            case 0x6:
+                                counter_result = 8.8E9;
+                                break;
+                            case 0x7:
+                                counter_result = 9.6E9;
+                                break;
+                            default:
+                                counter_result = 0;
+                                break;
+                        }
+                        
+                    }
+                    else if ((eventSet->events[i].event.eventId == 0x01) ||
+                             (eventSet->events[i].event.eventId == 0x02))
+                    {
+                        HPMread(cpu_id, dev, counter1, &counter_result);
+                        VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, STOP_QBOXFIX);
+                        counter_result = field64(counter_result, 0, box_map[type].regWidth);
+                    }
+                    eventSet->events[i].threadCounter[thread_id].counterData = counter_result;
                     break;
 
                 default:
-                    /* should never be reached */
                     break;
             }
         }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
 
-    if (haveLock && start_uncore && cpuid_info.supportUncore)
-    {
-        msr_write(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, (1ULL<<29));
-    }
 
-    if (perfmon_verbose)
-    {
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_PERF_GLOBAL_CTRL, LLU_CAST flags);
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_UNCORE_PERF_GLOBAL_CTRL, LLU_CAST uflags);
-    }
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
+    return 0;
 }
 
-void perfmon_stopCountersThread_haswell(int thread_id)
+
+int perfmon_readCountersThread_haswell(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t flags;
-    uint64_t tmp;
-    uint32_t uflags = 0x10100UL; /* Set freeze bit */
-    uint64_t counter_result = 0x0ULL;
+    uint64_t flags = 0x0ULL;
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    if (haveLock && cpuid_info.supportUncore)
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
     {
-        msr_write(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, 0x0ULL);
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &flags));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, SAFE_PMC_FLAGS)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, RESET_PMC_FLAGS)
     }
 
-    for ( int i=0; i < perfmon_numCountersHaswell; i++ ) 
+    HASEP_FREEZE_UNCORE;
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE) 
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            switch (haswell_counter_map[i].type)
+            counter_result= 0x0ULL;
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            RegisterIndex index = eventSet->events[i].index;
+            PerfmonEvent *event = &(eventSet->events[i].event);
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
+            int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+            int ovf_offset = box_map[type].ovflOffset;
+            switch (type)
             {
                 case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    HASEP_CHECK_CORE_OVERFLOW(index-cpuid_info.perf_num_fixed_ctr);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
+                    *current = field64(counter_result, 0, box_map[type].regWidth);
+                    break;
+
 
                 case FIXED:
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, haswell_counter_map[i].counterRegister);
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    HASEP_CHECK_CORE_OVERFLOW(index+32);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
+                    *current = field64(counter_result, 0, box_map[type].regWidth);
                     break;
 
                 case POWER:
-                    if(haveLock)
+                    if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
                     {
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                            power_info.energyUnit *
-                            ( power_read(cpu_id, haswell_counter_map[i].counterRegister) -
-                              perfmon_threadData[thread_id].counters[i].counterData);
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_POWER)
+                        if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                        {
+                            VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST eventSet->events[i].threadCounter[thread_id].startData, OVERFLOW_POWER_START)
+                            VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, OVERFLOW_POWER_STOP)
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                        }
+                        *current = field64(counter_result, 0, box_map[type].regWidth);
                     }
                     break;
 
                 case THERMAL:
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                             thermal_read(cpu_id);
+                    CHECK_TEMP_READ_ERROR(thermal_read(cpu_id,(uint32_t*)&counter_result));
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_TEMP)
+                    *current = field64(counter_result, 0, box_map[type].regWidth);
+                    break;
+
+                case WBOX0FIX:
+                    if (haveLock)
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_WBOXFIX)
+                        if (counter_result < *current)
+                        {
+                            (*overflows)++;
+                        }
+                        *current = counter_result;
+                    }
+                    break;
+
+                case BBOX0:
+                case BBOX1:
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_BBOX)
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, ovf_offset, getCounterTypeOffset(index));
+                    break;
+
+                case MBOX0:
+                case MBOX1:
+                case MBOX2:
+                case MBOX3:
+                case MBOX4:
+                case MBOX5:
+                case MBOX6:
+                case MBOX7:
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_MBOX)
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, ovf_offset, getCounterTypeOffset(index)+1);
+                    break;
+
+                case MBOX0FIX:
+                case MBOX1FIX:
+                case MBOX2FIX:
+                case MBOX3FIX:
+                case MBOX4FIX:
+                case MBOX5FIX:
+                case MBOX6FIX:
+                case MBOX7FIX:
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_MBOXFIX)
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, ovf_offset, 0);
+                    break;
+
+                case IBOX1:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, ovf_offset, getCounterTypeOffset(index)+2);
                     break;
 
+                case PBOX:
+                case IBOX0:
+                case RBOX0:
+                case RBOX1:
+                case QBOX0:
+                case QBOX1:
+                case WBOX:
+                case SBOX0:
+                case SBOX1:
+                case SBOX2:
+                case SBOX3:
+                case UBOX:
+                case UBOXFIX:
                 case CBOX0:
                 case CBOX1:
                 case CBOX2:
                 case CBOX3:
-                case UBOX:
-                    if(haveLock && cpuid_info.supportUncore)
+                case CBOX4:
+                case CBOX5:
+                case CBOX6:
+                case CBOX7:
+                case CBOX8:
+                case CBOX9:
+                case CBOX10:
+                case CBOX11:
+                case CBOX12:
+                case CBOX13:
+                case CBOX14:
+                case CBOX15:
+                case CBOX16:
+                case CBOX17:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, ovf_offset, getCounterTypeOffset(index));
+                    break;
+
+                case QBOX0FIX:
+                case QBOX1FIX:
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_QBOXFIX)
+                    if (eventSet->events[i].event.eventId == 0x00)
+                    {
+                        HPMread(cpu_id, dev, counter1, &counter_result);
+                        switch(extractBitField(counter_result, 3, 0))
+                        {
+                            case 0x2:
+                                counter_result = 5.6E9;
+                                break;
+                            case 0x3:
+                                counter_result = 6.4E9;
+                                break;
+                            case 0x4:
+                                counter_result = 7.2E9;
+                                break;
+                            case 0x5:
+                                counter_result = 8.0E9;
+                                break;
+                            case 0x6:
+                                counter_result = 8.8E9;
+                                break;
+                            case 0x7:
+                                counter_result = 9.6E9;
+                                break;
+                            default:
+                                counter_result = 0;
+                                break;
+                        }
+                        
+                    }
+                    else if ((eventSet->events[i].event.eventId == 0x01) ||
+                             (eventSet->events[i].event.eventId == 0x02))
                     {
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                            msr_read(cpu_id, haswell_counter_map[i].counterRegister);
+                        HPMread(cpu_id, dev, counter1, &counter_result);
+                        counter_result = field64(counter_result, 0, box_map[type].regWidth);
                     }
+                    eventSet->events[i].threadCounter[thread_id].counterData = counter_result;
                     break;
 
                 default:
-                    /* should never be reached */
                     break;
             }
         }
     }
 
-    flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
-    //    printf ("Status: 0x%llX \n", LLU_CAST flags);
-    if ( (flags & 0x3) || (flags & (0x3ULL<<32)) ) 
+    HASEP_UNFREEZE_UNCORE;
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
     {
-        printf ("Overflow occured \n");
+        // Erratum HSW143
+        //VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, RESTORE_PMC_FLAGS_WORKAROUND)
+        //CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, (1ULL<<32)));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, RESTORE_PMC_FLAGS)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
     }
+
+    return 0;
 }
 
-void perfmon_readCountersThread_haswell(int thread_id)
+int perfmon_finalizeCountersThread_haswell(int thread_id, PerfmonEventSet* eventSet) /* Zeroes the config and filter registers of the set's events for this thread's CPU, then clears overflow bits and the global enable registers; returns 0 at the end. */
 {
-    uint64_t counter_result = 0x0ULL;
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    uint64_t core_flags = 0x0ULL;
-    uint64_t uncore_flags = 0x0ULL;
+    int haveTileLock = 0;
+    int clearPBS = 0; /* not referenced again in this function */
+    uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62); /* bits 63 and 62 are always part of the value written to MSR_PERF_GLOBAL_OVF_CTRL below; presumably the CondChgd/OvfBuf clear bits - confirm against the Intel SDM */
+    uint64_t ovf_values_uncore = 0x0ULL; /* stays zero: the only code that would set bits is commented out below */
+    int cpu_id = groupSet->threads[thread_id].processorId; /* groupSet is not a parameter; it is defined outside this function */
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) /* the socket_lock slot looked up for this CPU contains this CPU's id */
     {
         haveLock = 1;
     }
-
-    core_flags = msr_read(cpu_id, MSR_PERF_GLOBAL_CTRL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    if (cpuid_info.supportUncore)
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id) /* same test against the tile_lock table */
     {
-        uncore_flags = msr_read(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL);
-        msr_write(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, 0x0ULL);
+        haveTileLock = 1;
     }
-
-    for ( int i=0; i<perfmon_numCountersHaswell; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++) /* one pass over every event of the set */
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip events whose register type bit is not set in regTypeMask */
         {
-            if ((haswell_counter_map[i].type == PMC) ||
-                    (haswell_counter_map[i].type == FIXED))
-            {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, haswell_counter_map[i].counterRegister);
-            }
-            else
-            {
-                if(haveLock)
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PciDeviceIndex dev = counter_map[index].device;
+        uint64_t reg = counter_map[index].configRegister; /* config register of this counter; tested for non-zero before it is written below */
+        switch (type)
+        {
+            case PMC:
+                ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr)); /* overflow-clear bit position is the counter index minus the number of fixed counters */
+                if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7)) /* event 0xB7: also zero MSR_OFFCORE_RESP0, only when the tile lock test passed */
                 {
-                    switch (haswell_counter_map[i].type)
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL)); /* CHECK_MSR_WRITE_ERROR is defined elsewhere; presumably it returns early with an error on a failed write - confirm */
+                }
+                else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB)) /* event 0xBB: same handling for MSR_OFFCORE_RESP1 */
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+                }
+                break;
+            case FIXED:
+                ovf_values_core |= (1ULL<<(index+32)); /* fixed counters use bit positions starting at 32 */
+                break;
+            default:
+                /*if (counter_map[index].type > UNCORE)
+                {
+                    if (box_map[counter_map[index].type].ovflOffset >= 0)
                     {
-                        case POWER:
-                            perfmon_threadData[thread_id].counters[i].counterData =
-                                power_info.energyUnit *
-                                power_read(cpu_id, haswell_counter_map[i].counterRegister);
-                            break;
-
-                        case CBOX0:
-                        case CBOX1:
-                        case CBOX2:
-                        case CBOX3:
-                        case UBOX:
-                            if(haveLock)
-                            {
-                                perfmon_threadData[thread_id].counters[i].counterData =
-                                    msr_read(cpu_id, haswell_counter_map[i].counterRegister);
-                            }
-                            break;
-                        default:
-                            /* should never be reached */
-                            break;
+                        ovf_values_uncore |= (1ULL<<box_map[counter_map[index].type].ovflOffset);
                     }
-                }
+                }*/
+                break;
+        }
+        if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UNCORE) && (haveLock)))) /* needs a non-zero config register; PMC/FIXED always qualify, types >= UNCORE only when haveLock is set */
+        {
+            ovf_values_uncore = 0x0ULL; /* reset to zero again on every qualifying event */
+            VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL)); /* zero the event's config register */
+            if ((type >= SBOX0) && (type <= SBOX3)) /* SBOX0..SBOX3: the identical zero write is issued a second time; the reason is not visible in this function */
+            {
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+            }
+            if (box_map[type].filterRegister1)
+            {
+                VERBOSEPRINTPCIREG(cpu_id, dev, box_map[type].filterRegister1, 0x0ULL, CLEAR_FILTER);
+                HPMwrite(cpu_id, dev, box_map[type].filterRegister1, 0x0ULL); /* return value ignored, unlike the config register write */
+            }
+            if (box_map[type].filterRegister2)
+            {
+                VERBOSEPRINTPCIREG(cpu_id, dev, box_map[type].filterRegister2, 0x0ULL, CLEAR_FILTER);
+                HPMwrite(cpu_id, dev, box_map[type].filterRegister2, 0x0ULL); /* return value ignored here as well */
             }
         }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE; /* runs for every event that passed the regTypeMask check, whether or not registers were written */
     }
-    if (cpuid_info.supportUncore && uncore_flags > 0x0ULL)
+    if (haveLock && eventSet->regTypeMask & ~(0xFULL)) /* '&' binds tighter than '&&': haveLock && (regTypeMask & ~0xF), i.e. any type bit above the lowest four */
     {
-        msr_write(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, uncore_flags);
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, LLU_CAST ovf_values_uncore, CLEAR_UNCORE_OVF)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, ovf_values_uncore)); /* ovf_values_uncore is always zero at this point */
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST 0x0ULL, CLEAR_UNCORE_CTRL)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, 0x0ULL));
     }
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, core_flags);
-}
 
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC))) /* only when the set contains FIXED or PMC events */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST ovf_values_core, CLEAR_GLOBAL_OVF)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core)); /* write the bits collected in the loop plus bits 62 and 63 */
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, CLEAR_GLOBAL_CTRL)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL)); /* zero the core global control register */
+    }
+    return 0;
+}
diff --git a/src/includes/perfmon_haswellEP_counters.h b/src/includes/perfmon_haswellEP_counters.h
new file mode 100644
index 0000000..0c93c91
--- /dev/null
+++ b/src/includes/perfmon_haswellEP_counters.h
@@ -0,0 +1,330 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfmon_haswellEP_counters.h
+ *
+ *      Description:  Counter Header File of perfmon module for Intel Haswell EP/EN/EX.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#define NUM_COUNTERS_HASWELL_EP 187 /* array length of haswellEP_counter_map */
+#define NUM_COUNTERS_CORE_HASWELL_EP 8
+#define NUM_COUNTERS_UNCORE_HASWELL_EP 111
+
+#define HAS_EP_VALID_OPTIONS_FIXED (EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK) /* each HAS_EP_VALID_OPTIONS_* is an OR of EVENT_OPTION_* bits, used as the last field of the RegisterMap entries */
+#define HAS_EP_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK| \
+            EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_IN_TRANS_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define HAS_EP_VALID_OPTIONS_CBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_TID_MASK)
+#define HAS_EP_VALID_OPTIONS_UBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define HAS_EP_VALID_OPTIONS_SBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_TID_MASK)
+#define HAS_EP_VALID_OPTIONS_BBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define HAS_EP_VALID_OPTIONS_WBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+            EVENT_OPTION_OCCUPANCY_MASK|EVENT_OPTION_OCCUPANCY_FILTER_MASK|EVENT_OPTION_OCCUPANCY_EDGE_MASK|\
+            EVENT_OPTION_OCCUPANCY_INVERT_MASK)
+#define HAS_EP_VALID_OPTIONS_MBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define HAS_EP_VALID_OPTIONS_IBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define HAS_EP_VALID_OPTIONS_PBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define HAS_EP_VALID_OPTIONS_RBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define HAS_EP_VALID_OPTIONS_QBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+
+static RegisterMap haswellEP_counter_map[NUM_COUNTERS_HASWELL_EP] = {
+    /* Fixed Counters: instructions retired, cycles unhalted core */
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_FIXED},
+    /* PMC Counters: 4 48bit wide */
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, HAS_EP_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, HAS_EP_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, HAS_EP_VALID_OPTIONS_PMC|EVENT_OPTION_IN_TRANS_ABORT_MASK},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, HAS_EP_VALID_OPTIONS_PMC},
+    /* Temperature Sensor*/
+    {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* RAPL counters */
+    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX0C0", PMC12, CBOX0, MSR_UNC_V3_C0_PMON_CTL0, MSR_UNC_V3_C0_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX0C1", PMC13, CBOX0, MSR_UNC_V3_C0_PMON_CTL1, MSR_UNC_V3_C0_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX0C2", PMC14, CBOX0, MSR_UNC_V3_C0_PMON_CTL2, MSR_UNC_V3_C0_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX0C3", PMC15, CBOX0, MSR_UNC_V3_C0_PMON_CTL3, MSR_UNC_V3_C0_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX1C0", PMC16, CBOX1, MSR_UNC_V3_C1_PMON_CTL0, MSR_UNC_V3_C1_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX1C1", PMC17, CBOX1, MSR_UNC_V3_C1_PMON_CTL1, MSR_UNC_V3_C1_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX1C2", PMC18, CBOX1, MSR_UNC_V3_C1_PMON_CTL2, MSR_UNC_V3_C1_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX1C3", PMC19, CBOX1, MSR_UNC_V3_C1_PMON_CTL3, MSR_UNC_V3_C1_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX2C0", PMC20, CBOX2, MSR_UNC_V3_C2_PMON_CTL0, MSR_UNC_V3_C2_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX2C1", PMC21, CBOX2, MSR_UNC_V3_C2_PMON_CTL1, MSR_UNC_V3_C2_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX2C2", PMC22, CBOX2, MSR_UNC_V3_C2_PMON_CTL2, MSR_UNC_V3_C2_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX2C3", PMC23, CBOX2, MSR_UNC_V3_C2_PMON_CTL3, MSR_UNC_V3_C2_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX3C0", PMC24, CBOX3, MSR_UNC_V3_C3_PMON_CTL0, MSR_UNC_V3_C3_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX3C1", PMC25, CBOX3, MSR_UNC_V3_C3_PMON_CTL1, MSR_UNC_V3_C3_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX3C2", PMC26, CBOX3, MSR_UNC_V3_C3_PMON_CTL2, MSR_UNC_V3_C3_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX3C3", PMC27, CBOX3, MSR_UNC_V3_C3_PMON_CTL3, MSR_UNC_V3_C3_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX4C0", PMC28, CBOX4, MSR_UNC_V3_C4_PMON_CTL0, MSR_UNC_V3_C4_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX4C1", PMC29, CBOX4, MSR_UNC_V3_C4_PMON_CTL1, MSR_UNC_V3_C4_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX4C2", PMC30, CBOX4, MSR_UNC_V3_C4_PMON_CTL2, MSR_UNC_V3_C4_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX4C3", PMC31, CBOX4, MSR_UNC_V3_C4_PMON_CTL3, MSR_UNC_V3_C4_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX5C0", PMC32, CBOX5, MSR_UNC_V3_C5_PMON_CTL0, MSR_UNC_V3_C5_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX5C1", PMC33, CBOX5, MSR_UNC_V3_C5_PMON_CTL1, MSR_UNC_V3_C5_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX5C2", PMC34, CBOX5, MSR_UNC_V3_C5_PMON_CTL2, MSR_UNC_V3_C5_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX5C3", PMC35, CBOX5, MSR_UNC_V3_C5_PMON_CTL3, MSR_UNC_V3_C5_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX6C0", PMC36, CBOX6, MSR_UNC_V3_C6_PMON_CTL0, MSR_UNC_V3_C6_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX6C1", PMC37, CBOX6, MSR_UNC_V3_C6_PMON_CTL1, MSR_UNC_V3_C6_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX6C2", PMC38, CBOX6, MSR_UNC_V3_C6_PMON_CTL2, MSR_UNC_V3_C6_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX6C3", PMC39, CBOX6, MSR_UNC_V3_C6_PMON_CTL3, MSR_UNC_V3_C6_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX7C0", PMC40, CBOX7, MSR_UNC_V3_C7_PMON_CTL0, MSR_UNC_V3_C7_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX7C1", PMC41, CBOX7, MSR_UNC_V3_C7_PMON_CTL1, MSR_UNC_V3_C7_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX7C2", PMC42, CBOX7, MSR_UNC_V3_C7_PMON_CTL2, MSR_UNC_V3_C7_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX7C3", PMC43, CBOX7, MSR_UNC_V3_C7_PMON_CTL3, MSR_UNC_V3_C7_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX8C0", PMC44, CBOX8, MSR_UNC_V3_C8_PMON_CTL0, MSR_UNC_V3_C8_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX8C1", PMC45, CBOX8, MSR_UNC_V3_C8_PMON_CTL1, MSR_UNC_V3_C8_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX8C2", PMC46, CBOX8, MSR_UNC_V3_C8_PMON_CTL2, MSR_UNC_V3_C8_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX8C3", PMC47, CBOX8, MSR_UNC_V3_C8_PMON_CTL3, MSR_UNC_V3_C8_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX9C0", PMC48, CBOX9, MSR_UNC_V3_C9_PMON_CTL0, MSR_UNC_V3_C9_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX9C1", PMC49, CBOX9, MSR_UNC_V3_C9_PMON_CTL1, MSR_UNC_V3_C9_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX9C2", PMC50, CBOX9, MSR_UNC_V3_C9_PMON_CTL2, MSR_UNC_V3_C9_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX9C3", PMC51, CBOX9, MSR_UNC_V3_C9_PMON_CTL3, MSR_UNC_V3_C9_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX10C0", PMC52, CBOX10, MSR_UNC_V3_C10_PMON_CTL0, MSR_UNC_V3_C10_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX10C1", PMC53, CBOX10, MSR_UNC_V3_C10_PMON_CTL1, MSR_UNC_V3_C10_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX10C2", PMC54, CBOX10, MSR_UNC_V3_C10_PMON_CTL2, MSR_UNC_V3_C10_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX10C3", PMC55, CBOX10, MSR_UNC_V3_C10_PMON_CTL3, MSR_UNC_V3_C10_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX11C0", PMC56, CBOX11, MSR_UNC_V3_C11_PMON_CTL0, MSR_UNC_V3_C11_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX11C1", PMC57, CBOX11, MSR_UNC_V3_C11_PMON_CTL1, MSR_UNC_V3_C11_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX11C2", PMC58, CBOX11, MSR_UNC_V3_C11_PMON_CTL2, MSR_UNC_V3_C11_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX11C3", PMC59, CBOX11, MSR_UNC_V3_C11_PMON_CTL3, MSR_UNC_V3_C11_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX12C0", PMC60, CBOX12, MSR_UNC_V3_C12_PMON_CTL0, MSR_UNC_V3_C12_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX12C1", PMC61, CBOX12, MSR_UNC_V3_C12_PMON_CTL1, MSR_UNC_V3_C12_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX12C2", PMC62, CBOX12, MSR_UNC_V3_C12_PMON_CTL2, MSR_UNC_V3_C12_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX12C3", PMC63, CBOX12, MSR_UNC_V3_C12_PMON_CTL3, MSR_UNC_V3_C12_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX13C0", PMC64, CBOX13, MSR_UNC_V3_C13_PMON_CTL0, MSR_UNC_V3_C13_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX13C1", PMC65, CBOX13, MSR_UNC_V3_C13_PMON_CTL1, MSR_UNC_V3_C13_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX13C2", PMC66, CBOX13, MSR_UNC_V3_C13_PMON_CTL2, MSR_UNC_V3_C13_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX13C3", PMC67, CBOX13, MSR_UNC_V3_C13_PMON_CTL3, MSR_UNC_V3_C13_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX14C0", PMC68, CBOX14, MSR_UNC_V3_C14_PMON_CTL0, MSR_UNC_V3_C14_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX14C1", PMC69, CBOX14, MSR_UNC_V3_C14_PMON_CTL1, MSR_UNC_V3_C14_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX14C2", PMC70, CBOX14, MSR_UNC_V3_C14_PMON_CTL2, MSR_UNC_V3_C14_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX14C3", PMC71, CBOX14, MSR_UNC_V3_C14_PMON_CTL3, MSR_UNC_V3_C14_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX15C0", PMC72, CBOX15, MSR_UNC_V3_C15_PMON_CTL0, MSR_UNC_V3_C15_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX15C1", PMC73, CBOX15, MSR_UNC_V3_C15_PMON_CTL1, MSR_UNC_V3_C15_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX15C2", PMC74, CBOX15, MSR_UNC_V3_C15_PMON_CTL2, MSR_UNC_V3_C15_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX15C3", PMC75, CBOX15, MSR_UNC_V3_C15_PMON_CTL3, MSR_UNC_V3_C15_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX16C0", PMC76, CBOX16, MSR_UNC_V3_C16_PMON_CTL0, MSR_UNC_V3_C16_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX16C1", PMC77, CBOX16, MSR_UNC_V3_C16_PMON_CTL1, MSR_UNC_V3_C16_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX16C2", PMC78, CBOX16, MSR_UNC_V3_C16_PMON_CTL2, MSR_UNC_V3_C16_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX16C3", PMC79, CBOX16, MSR_UNC_V3_C16_PMON_CTL3, MSR_UNC_V3_C16_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX17C0", PMC80, CBOX17, MSR_UNC_V3_C17_PMON_CTL0, MSR_UNC_V3_C17_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX17C1", PMC81, CBOX17, MSR_UNC_V3_C17_PMON_CTL1, MSR_UNC_V3_C17_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX17C2", PMC82, CBOX17, MSR_UNC_V3_C17_PMON_CTL2, MSR_UNC_V3_C17_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX17C3", PMC83, CBOX17, MSR_UNC_V3_C17_PMON_CTL3, MSR_UNC_V3_C17_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"UBOX0", PMC84, UBOX, MSR_UNC_V3_U_PMON_CTL0, MSR_UNC_V3_U_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_UBOX},
+    {"UBOX1", PMC85, UBOX, MSR_UNC_V3_U_PMON_CTL1, MSR_UNC_V3_U_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_UBOX},
+    {"UBOXFIX", PMC86, UBOXFIX, MSR_UNC_V3_U_UCLK_FIXED_CTL, MSR_UNC_V3_U_UCLK_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"SBOX0C0", PMC87, SBOX0, MSR_UNC_V3_S0_PMON_CTL_0, MSR_UNC_V3_S0_PMON_CTR_0, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX0C1", PMC88, SBOX0, MSR_UNC_V3_S0_PMON_CTL_1, MSR_UNC_V3_S0_PMON_CTR_1, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX0C2", PMC89, SBOX0, MSR_UNC_V3_S0_PMON_CTL_2, MSR_UNC_V3_S0_PMON_CTR_2, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX0C3", PMC90, SBOX0, MSR_UNC_V3_S0_PMON_CTL_3, MSR_UNC_V3_S0_PMON_CTR_3, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX1C0", PMC91, SBOX1, MSR_UNC_V3_S1_PMON_CTL_0, MSR_UNC_V3_S1_PMON_CTR_0, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX1C1", PMC92, SBOX1, MSR_UNC_V3_S1_PMON_CTL_1, MSR_UNC_V3_S1_PMON_CTR_1, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX1C2", PMC93, SBOX1, MSR_UNC_V3_S1_PMON_CTL_2, MSR_UNC_V3_S1_PMON_CTR_2, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX1C3", PMC94, SBOX1, MSR_UNC_V3_S1_PMON_CTL_3, MSR_UNC_V3_S1_PMON_CTR_3, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX2C0", PMC95, SBOX2, MSR_UNC_V3_S2_PMON_CTL_0, MSR_UNC_V3_S2_PMON_CTR_0, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX2C1", PMC96, SBOX2, MSR_UNC_V3_S2_PMON_CTL_1, MSR_UNC_V3_S2_PMON_CTR_1, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX2C2", PMC97, SBOX2, MSR_UNC_V3_S2_PMON_CTL_2, MSR_UNC_V3_S2_PMON_CTR_2, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX2C3", PMC98, SBOX2, MSR_UNC_V3_S2_PMON_CTL_3, MSR_UNC_V3_S2_PMON_CTR_3, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX3C0", PMC99, SBOX3, MSR_UNC_V3_S3_PMON_CTL_0, MSR_UNC_V3_S3_PMON_CTR_0, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX3C1", PMC100, SBOX3, MSR_UNC_V3_S3_PMON_CTL_1, MSR_UNC_V3_S3_PMON_CTR_1, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX3C2", PMC101, SBOX3, MSR_UNC_V3_S3_PMON_CTL_2, MSR_UNC_V3_S3_PMON_CTR_2, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX3C3", PMC102, SBOX3, MSR_UNC_V3_S3_PMON_CTL_3, MSR_UNC_V3_S3_PMON_CTR_3, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"WBOX0", PMC103, WBOX, MSR_UNC_V3_PCU_PMON_CTL0, MSR_UNC_V3_PCU_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_WBOX},
+    {"WBOX1", PMC104, WBOX, MSR_UNC_V3_PCU_PMON_CTL1, MSR_UNC_V3_PCU_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_WBOX},
+    {"WBOX2", PMC105, WBOX, MSR_UNC_V3_PCU_PMON_CTL2, MSR_UNC_V3_PCU_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_WBOX},
+    {"WBOX3", PMC106, WBOX, MSR_UNC_V3_PCU_PMON_CTL3, MSR_UNC_V3_PCU_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_WBOX},
+    {"WBOX0FIX", PMC107, WBOX0FIX, 0, MSR_UNC_V3_PCU_CC6_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"WBOX1FIX", PMC108, WBOX0FIX, 0, MSR_UNC_V3_PCU_CC3_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"WBOX2FIX", PMC109, WBOX0FIX, 0, MSR_UNC_V3_PCU_PC2_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"WBOX3FIX", PMC110, WBOX0FIX, 0, MSR_UNC_V3_PCU_PC3_CTR , 0, 0, EVENT_OPTION_NONE_MASK},
+    {"BBOX0C0", PMC111, BBOX0, PCI_UNC_HA_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_0, HAS_EP_VALID_OPTIONS_BBOX},
+    {"BBOX0C1", PMC112, BBOX0, PCI_UNC_HA_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_0, HAS_EP_VALID_OPTIONS_BBOX},
+    {"BBOX0C2", PMC113, BBOX0, PCI_UNC_HA_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_0, HAS_EP_VALID_OPTIONS_BBOX},
+    {"BBOX0C3", PMC114, BBOX0, PCI_UNC_HA_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_0, HAS_EP_VALID_OPTIONS_BBOX},
+    {"BBOX1C0", PMC115, BBOX1, PCI_UNC_HA_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_1, HAS_EP_VALID_OPTIONS_BBOX},
+    {"BBOX1C1", PMC116, BBOX1, PCI_UNC_HA_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_1, HAS_EP_VALID_OPTIONS_BBOX},
+    {"BBOX1C2", PMC117, BBOX1, PCI_UNC_HA_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_1, HAS_EP_VALID_OPTIONS_BBOX},
+    {"BBOX1C3", PMC118, BBOX1, PCI_UNC_HA_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_1, HAS_EP_VALID_OPTIONS_BBOX},
+    {"MBOX0C0", PMC119, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX0C1", PMC120, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX0C2", PMC121, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX0C3", PMC122, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX0FIX", PMC123, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+    {"MBOX1C0", PMC124, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX1C1", PMC125, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX1C2", PMC126, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX1C3", PMC127, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX0FIX", PMC128, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+    {"MBOX2C0", PMC129, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX2C1", PMC130, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX2C2", PMC131, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX2C3", PMC132, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX0FIX", PMC133, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+    {"MBOX3C0", PMC134, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX3C1", PMC135, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX3C2", PMC136, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX3C3", PMC137, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX0FIX", PMC138, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+    {"MBOX4C0", PMC139, MBOX4, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX4C1", PMC140, MBOX4, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX4C2", PMC141, MBOX4, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX4C3", PMC142, MBOX4, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX0FIX", PMC43, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+    {"MBOX5C0", PMC144, MBOX5, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX5C1", PMC145, MBOX5, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX5C2", PMC146, MBOX5, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX5C3", PMC147, MBOX5, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX0FIX", PMC148, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+    {"MBOX6C0", PMC149, MBOX6, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX6C1", PMC150, MBOX6, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX6C2", PMC151, MBOX6, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX6C3", PMC152, MBOX6, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX0FIX", PMC153, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+    {"MBOX7C0", PMC154, MBOX7, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX7C1", PMC155, MBOX7, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX7C2", PMC156, MBOX7, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX7C3", PMC157, MBOX7, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX0FIX", PMC158, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+    {"IBOX0C0", PMC159, IBOX0, PCI_UNC_IRP0_PMON_CTL_0, PCI_UNC_IRP0_PMON_CTR_0, 0, PCI_IRP_DEVICE, HAS_EP_VALID_OPTIONS_IBOX},
+    {"IBOX0C1", PMC160, IBOX0, PCI_UNC_IRP0_PMON_CTL_1, PCI_UNC_IRP0_PMON_CTR_1, 0, PCI_IRP_DEVICE, HAS_EP_VALID_OPTIONS_IBOX},
+    {"IBOX1C0", PMC161, IBOX1, PCI_UNC_IRP1_PMON_CTL_0, PCI_UNC_IRP1_PMON_CTR_0, 0, PCI_IRP_DEVICE, HAS_EP_VALID_OPTIONS_IBOX},
+    {"IBOX1C1", PMC162, IBOX1, PCI_UNC_IRP1_PMON_CTL_1, PCI_UNC_IRP1_PMON_CTR_1, 0, PCI_IRP_DEVICE, HAS_EP_VALID_OPTIONS_IBOX},
+    {"PBOX0", PMC163, PBOX, PCI_UNC_R2PCIE_PMON_CTL_0, PCI_UNC_R2PCIE_PMON_CTR_0_A, PCI_UNC_R2PCIE_PMON_CTR_0_B, PCI_R2PCIE_DEVICE, HAS_EP_VALID_OPTIONS_PBOX},
+    {"PBOX1", PMC164, PBOX, PCI_UNC_R2PCIE_PMON_CTL_1, PCI_UNC_R2PCIE_PMON_CTR_1_A, PCI_UNC_R2PCIE_PMON_CTR_1_B, PCI_R2PCIE_DEVICE, HAS_EP_VALID_OPTIONS_PBOX},
+    {"PBOX2", PMC165, PBOX, PCI_UNC_R2PCIE_PMON_CTL_2, PCI_UNC_R2PCIE_PMON_CTR_2_A, PCI_UNC_R2PCIE_PMON_CTR_2_B, PCI_R2PCIE_DEVICE, HAS_EP_VALID_OPTIONS_PBOX},
+    {"PBOX3", PMC166, PBOX, PCI_UNC_R2PCIE_PMON_CTL_3, PCI_UNC_R2PCIE_PMON_CTR_3_A, PCI_UNC_R2PCIE_PMON_CTR_3_B, PCI_R2PCIE_DEVICE, HAS_EP_VALID_OPTIONS_PBOX},
+    {"RBOX0C0", PMC167, RBOX0, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_0, HAS_EP_VALID_OPTIONS_RBOX},
+    {"RBOX0C1", PMC168, RBOX0, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_0, HAS_EP_VALID_OPTIONS_RBOX},
+    {"RBOX0C2", PMC169, RBOX0, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_0, HAS_EP_VALID_OPTIONS_RBOX},
+    {"RBOX1C0", PMC170, RBOX1, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_1, HAS_EP_VALID_OPTIONS_RBOX},
+    {"RBOX1C1", PMC171, RBOX1, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_1, HAS_EP_VALID_OPTIONS_RBOX},
+    {"RBOX1C2", PMC172, RBOX1, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_1, HAS_EP_VALID_OPTIONS_RBOX},
+    {"QBOX0C0", PMC173, QBOX0, PCI_UNC_V3_QPI_PMON_CTL_0, PCI_UNC_V3_QPI_PMON_CTR_0_A, PCI_UNC_V3_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_0, HAS_EP_VALID_OPTIONS_QBOX},
+    {"QBOX0C1", PMC174, QBOX0, PCI_UNC_V3_QPI_PMON_CTL_1, PCI_UNC_V3_QPI_PMON_CTR_1_A, PCI_UNC_V3_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_0, HAS_EP_VALID_OPTIONS_QBOX},
+    {"QBOX0C2", PMC175, QBOX0, PCI_UNC_V3_QPI_PMON_CTL_2, PCI_UNC_V3_QPI_PMON_CTR_2_A, PCI_UNC_V3_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_0, HAS_EP_VALID_OPTIONS_QBOX},
+    {"QBOX0C3", PMC176, QBOX0, PCI_UNC_V3_QPI_PMON_CTL_3, PCI_UNC_V3_QPI_PMON_CTR_3_A, PCI_UNC_V3_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_0, HAS_EP_VALID_OPTIONS_QBOX},
+    {"QBOX1C0", PMC177, QBOX1, PCI_UNC_V3_QPI_PMON_CTL_0, PCI_UNC_V3_QPI_PMON_CTR_0_A, PCI_UNC_V3_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_1, HAS_EP_VALID_OPTIONS_QBOX},
+    {"QBOX1C1", PMC178, QBOX1, PCI_UNC_V3_QPI_PMON_CTL_1, PCI_UNC_V3_QPI_PMON_CTR_1_A, PCI_UNC_V3_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_1, HAS_EP_VALID_OPTIONS_QBOX},
+    {"QBOX1C2", PMC179, QBOX1, PCI_UNC_V3_QPI_PMON_CTL_2, PCI_UNC_V3_QPI_PMON_CTR_2_A, PCI_UNC_V3_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_1, HAS_EP_VALID_OPTIONS_QBOX},
+    {"QBOX1C3", PMC180, QBOX1, PCI_UNC_V3_QPI_PMON_CTL_3, PCI_UNC_V3_QPI_PMON_CTR_3_A, PCI_UNC_V3_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_1, HAS_EP_VALID_OPTIONS_QBOX},
+    {"QBOX0FIX0", PMC181, QBOX0FIX, 0x0, PCI_UNC_V3_QPI_RATE_STATUS, 0x0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+    {"QBOX0FIX1", PMC182, QBOX0FIX, 0x0, PCI_UNC_V3_QPI_LINK_IDLE, 0x0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+    {"QBOX0FIX2", PMC183, QBOX0FIX, 0x0, PCI_UNC_V3_QPI_LINK_LLR, 0x0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+    {"QBOX1FIX0", PMC184, QBOX1FIX, 0x0, PCI_UNC_V3_QPI_RATE_STATUS, 0x0, PCI_QPI_MISC_DEVICE_PORT_1, EVENT_OPTION_NONE_MASK},
+    {"QBOX1FIX1", PMC185, QBOX1FIX, 0x0, PCI_UNC_V3_QPI_LINK_IDLE, 0x0, PCI_QPI_MISC_DEVICE_PORT_1, EVENT_OPTION_NONE_MASK},
+    {"QBOX1FIX2", PMC186, QBOX1FIX, 0x0, PCI_UNC_V3_QPI_LINK_LLR, 0x0, PCI_QPI_MISC_DEVICE_PORT_1, EVENT_OPTION_NONE_MASK},
+};
+
+
+static BoxMap haswellEP_box_map[NUM_UNITS] = { /* Per-unit box description table for Haswell-EP, one designated entry per unit ID. NOTE(review): positional fields presumably follow the BoxMap declaration (control reg, status reg, overflow reg, overflow bit offset, PCI flag, PCI device, counter width, optional filter regs) -- confirm against the struct definition. */
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+    [THERMAL] = {0,0,0,-1,0,0,8},
+    [FIXED] =  {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+    [POWER] = {0,0,0,-1,0,0,32},
+    [CBOX0] = {MSR_UNC_V3_C0_PMON_BOX_CTL, MSR_UNC_V3_C0_PMON_BOX_STATUS, MSR_UNC_V3_C0_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C0_PMON_BOX_FILTER0, MSR_UNC_V3_C0_PMON_BOX_FILTER1}, /* CBOX entries are the only ones that supply two trailing filter registers */
+    [CBOX1] = {MSR_UNC_V3_C1_PMON_BOX_CTL, MSR_UNC_V3_C1_PMON_BOX_STATUS, MSR_UNC_V3_C1_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C1_PMON_BOX_FILTER0, MSR_UNC_V3_C1_PMON_BOX_FILTER1},
+    [CBOX2] = {MSR_UNC_V3_C2_PMON_BOX_CTL, MSR_UNC_V3_C2_PMON_BOX_STATUS, MSR_UNC_V3_C2_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C2_PMON_BOX_FILTER0, MSR_UNC_V3_C2_PMON_BOX_FILTER1},
+    [CBOX3] = {MSR_UNC_V3_C3_PMON_BOX_CTL, MSR_UNC_V3_C3_PMON_BOX_STATUS, MSR_UNC_V3_C3_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C3_PMON_BOX_FILTER0, MSR_UNC_V3_C3_PMON_BOX_FILTER1},
+    [CBOX4] = {MSR_UNC_V3_C4_PMON_BOX_CTL, MSR_UNC_V3_C4_PMON_BOX_STATUS, MSR_UNC_V3_C4_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C4_PMON_BOX_FILTER0, MSR_UNC_V3_C4_PMON_BOX_FILTER1},
+    [CBOX5] = {MSR_UNC_V3_C5_PMON_BOX_CTL, MSR_UNC_V3_C5_PMON_BOX_STATUS, MSR_UNC_V3_C5_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C5_PMON_BOX_FILTER0, MSR_UNC_V3_C5_PMON_BOX_FILTER1},
+    [CBOX6] = {MSR_UNC_V3_C6_PMON_BOX_CTL, MSR_UNC_V3_C6_PMON_BOX_STATUS, MSR_UNC_V3_C6_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C6_PMON_BOX_FILTER0, MSR_UNC_V3_C6_PMON_BOX_FILTER1},
+    [CBOX7] = {MSR_UNC_V3_C7_PMON_BOX_CTL, MSR_UNC_V3_C7_PMON_BOX_STATUS, MSR_UNC_V3_C7_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C7_PMON_BOX_FILTER0, MSR_UNC_V3_C7_PMON_BOX_FILTER1},
+    [CBOX8] = {MSR_UNC_V3_C8_PMON_BOX_CTL, MSR_UNC_V3_C8_PMON_BOX_STATUS, MSR_UNC_V3_C8_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C8_PMON_BOX_FILTER0, MSR_UNC_V3_C8_PMON_BOX_FILTER1},
+    [CBOX9] = {MSR_UNC_V3_C9_PMON_BOX_CTL, MSR_UNC_V3_C9_PMON_BOX_STATUS, MSR_UNC_V3_C9_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C9_PMON_BOX_FILTER0, MSR_UNC_V3_C9_PMON_BOX_FILTER1},
+    [CBOX10] = {MSR_UNC_V3_C10_PMON_BOX_CTL, MSR_UNC_V3_C10_PMON_BOX_STATUS, MSR_UNC_V3_C10_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C10_PMON_BOX_FILTER0, MSR_UNC_V3_C10_PMON_BOX_FILTER1},
+    [CBOX11] = {MSR_UNC_V3_C11_PMON_BOX_CTL, MSR_UNC_V3_C11_PMON_BOX_STATUS, MSR_UNC_V3_C11_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C11_PMON_BOX_FILTER0, MSR_UNC_V3_C11_PMON_BOX_FILTER1},
+    [CBOX12] = {MSR_UNC_V3_C12_PMON_BOX_CTL, MSR_UNC_V3_C12_PMON_BOX_STATUS, MSR_UNC_V3_C12_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C12_PMON_BOX_FILTER0, MSR_UNC_V3_C12_PMON_BOX_FILTER1},
+    [CBOX13] = {MSR_UNC_V3_C13_PMON_BOX_CTL, MSR_UNC_V3_C13_PMON_BOX_STATUS, MSR_UNC_V3_C13_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C13_PMON_BOX_FILTER0, MSR_UNC_V3_C13_PMON_BOX_FILTER1},
+    [CBOX14] = {MSR_UNC_V3_C14_PMON_BOX_CTL, MSR_UNC_V3_C14_PMON_BOX_STATUS, MSR_UNC_V3_C14_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C14_PMON_BOX_FILTER0, MSR_UNC_V3_C14_PMON_BOX_FILTER1},
+    [CBOX15] = {MSR_UNC_V3_C15_PMON_BOX_CTL, MSR_UNC_V3_C15_PMON_BOX_STATUS, MSR_UNC_V3_C15_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C15_PMON_BOX_FILTER0, MSR_UNC_V3_C15_PMON_BOX_FILTER1},
+    [CBOX16] = {MSR_UNC_V3_C16_PMON_BOX_CTL, MSR_UNC_V3_C16_PMON_BOX_STATUS, MSR_UNC_V3_C16_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C16_PMON_BOX_FILTER0, MSR_UNC_V3_C16_PMON_BOX_FILTER1},
+    [CBOX17] = {MSR_UNC_V3_C17_PMON_BOX_CTL, MSR_UNC_V3_C17_PMON_BOX_STATUS, MSR_UNC_V3_C17_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C17_PMON_BOX_FILTER0, MSR_UNC_V3_C17_PMON_BOX_FILTER1},
+    [UBOX] = {0, MSR_UNC_V3_U_PMON_BOX_STATUS, MSR_UNC_V3_U_PMON_BOX_STATUS, 1, 0, 0, 48}, /* first field is 0: no box control register is given for UBOX/UBOXFIX */
+    [UBOXFIX] = {0, MSR_UNC_V3_U_PMON_BOX_STATUS, MSR_UNC_V3_U_PMON_BOX_STATUS, 0, 0, 0, 48},
+    [SBOX0] = {MSR_UNC_V3_S0_PMON_BOX_CTL, MSR_UNC_V3_S0_PMON_BOX_STATUS, MSR_UNC_V3_S0_PMON_BOX_STATUS, -1, 0, 0, 48},
+    [SBOX1] = {MSR_UNC_V3_S1_PMON_BOX_CTL, MSR_UNC_V3_S1_PMON_BOX_STATUS, MSR_UNC_V3_S1_PMON_BOX_STATUS, -1, 0, 0, 48},
+    [SBOX2] = {MSR_UNC_V3_S2_PMON_BOX_CTL, MSR_UNC_V3_S2_PMON_BOX_STATUS, MSR_UNC_V3_S2_PMON_BOX_STATUS, -1, 0, 0, 48},
+    [SBOX3] = {MSR_UNC_V3_S3_PMON_BOX_CTL, MSR_UNC_V3_S3_PMON_BOX_STATUS, MSR_UNC_V3_S3_PMON_BOX_STATUS, -1, 0, 0, 48},
+    [WBOX] = {MSR_UNC_V3_PCU_PMON_BOX_CTL, MSR_UNC_V3_PCU_PMON_BOX_STATUS,MSR_UNC_V3_PCU_PMON_BOX_STATUS, 2, 0, 0, 48, MSR_UNC_V3_PCU_PMON_BOX_FILTER}, /* single trailing filter register; the second one is left zero-initialized */
+    [WBOX0FIX] = {0,0,0,-1,0,0,64},
+    [BBOX0] = {PCI_UNC_HA_PMON_BOX_CTL, PCI_UNC_HA_PMON_BOX_STATUS, PCI_UNC_HA_PMON_BOX_STATUS, 21, 1, PCI_HA_DEVICE_0, 48}, /* from here on the fifth field is 1 and the sixth names a PCI device */
+    [BBOX1] = {PCI_UNC_HA_PMON_BOX_CTL, PCI_UNC_HA_PMON_BOX_STATUS, PCI_UNC_HA_PMON_BOX_STATUS, 22, 1, PCI_HA_DEVICE_1, 48},
+    [MBOX0] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_0, 48},
+    [MBOX0FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 23, 1, PCI_IMC_DEVICE_0_CH_0, 48}, /* MBOXnFIX entries reuse the box control register of MBOXn but give no status/overflow register */
+    [MBOX1] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+    [MBOX1FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 23, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+    [MBOX2] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+    [MBOX2FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 23, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+    [MBOX3] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+    [MBOX3FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 23, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+    [MBOX4] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 24, 1, PCI_IMC_DEVICE_1_CH_0, 48},
+    [MBOX4FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 23, 1, PCI_IMC_DEVICE_1_CH_0, 48}, /* NOTE(review): fourth field is 23 for MBOX4FIX..MBOX7FIX but 24 for MBOX4..MBOX7 on the same devices -- confirm whether these should be 24 */
+    [MBOX5] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 24, 1, PCI_IMC_DEVICE_1_CH_1, 48},
+    [MBOX5FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 23, 1, PCI_IMC_DEVICE_1_CH_1, 48},
+    [MBOX6] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 24, 1, PCI_IMC_DEVICE_1_CH_2, 48},
+    [MBOX6FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 23, 1, PCI_IMC_DEVICE_1_CH_2, 48},
+    [MBOX7] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 24, 1, PCI_IMC_DEVICE_1_CH_3, 48},
+    [MBOX7FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 23, 1, PCI_IMC_DEVICE_1_CH_3, 48},
+    [IBOX0] = {PCI_UNC_IRP_PMON_BOX_CTL, PCI_UNC_IRP_PMON_BOX_STATUS, PCI_UNC_IRP_PMON_BOX_STATUS, 34, 1, PCI_IRP_DEVICE, 48}, /* IBOX0 and IBOX1 use identical box registers and the same PCI device */
+    [IBOX1] = {PCI_UNC_IRP_PMON_BOX_CTL, PCI_UNC_IRP_PMON_BOX_STATUS, PCI_UNC_IRP_PMON_BOX_STATUS, 34, 1, PCI_IRP_DEVICE, 48},
+    [PBOX] = {PCI_UNC_R2PCIE_PMON_BOX_CTL, PCI_UNC_R2PCIE_PMON_BOX_STATUS, PCI_UNC_R2PCIE_PMON_BOX_STATUS, 29, 1, PCI_R2PCIE_DEVICE, 48},
+    [RBOX0] = {PCI_UNC_R3QPI_PMON_BOX_CTL, PCI_UNC_R3QPI_PMON_BOX_STATUS, PCI_UNC_R3QPI_PMON_BOX_STATUS, 27, 1, PCI_R3QPI_DEVICE_LINK_0, 44}, /* RBOX0/RBOX1 give 44 as the seventh field instead of the 48 used by most entries */
+    [RBOX1] = {PCI_UNC_R3QPI_PMON_BOX_CTL, PCI_UNC_R3QPI_PMON_BOX_STATUS, PCI_UNC_R3QPI_PMON_BOX_STATUS, 28, 1, PCI_R3QPI_DEVICE_LINK_1, 44},
+    [QBOX0] = {PCI_UNC_V3_QPI_PMON_BOX_CTL, PCI_UNC_V3_QPI_PMON_BOX_STATUS, PCI_UNC_V3_QPI_PMON_BOX_STATUS, 25, 1, PCI_QPI_DEVICE_PORT_0, 48},
+    [QBOX1] = {PCI_UNC_V3_QPI_PMON_BOX_CTL, PCI_UNC_V3_QPI_PMON_BOX_STATUS, PCI_UNC_V3_QPI_PMON_BOX_STATUS, 26, 1, PCI_QPI_DEVICE_PORT_1, 48},
+    [QBOX0FIX] = {0x0, 0x0, 0x0, -1, 1, PCI_QPI_MISC_DEVICE_PORT_0, 32}, /* QBOXnFIX entries have no box registers and point at the QPI MISC devices rather than the QPI PMON devices */
+    [QBOX1FIX] = {0x0, 0x0, 0x0, -1, 1, PCI_QPI_MISC_DEVICE_PORT_1, 32},
+};
+
+static PciDevice haswellEP_pci_devices[MAX_NUM_PCI_DEVICES] = { /* Haswell-EP uncore PCI device table, one designated entry per device index. NOTE(review): positional fields presumably are {device type, "device.function" path, device name, unit name, PCI device id} -- confirm against the PciDevice declaration. */
+ [MSR_DEV] = {NODEVTYPE, "", "MSR", ""}, /* placeholder entry for MSR-based units: empty path and unit name, no device id */
+ [PCI_R3QPI_DEVICE_LINK_0] = {R3QPI, "0b.1", "PCI_R3QPI_DEVICE_LINK_0", "RBOX0", 0x2F36},
+ [PCI_R3QPI_DEVICE_LINK_1] = {R3QPI, "0b.2", "PCI_R3QPI_DEVICE_LINK_1", "RBOX1", 0x2F37},
+ [PCI_R2PCIE_DEVICE] = {R2PCIE, "10.1", "PCI_R2PCIE_DEVICE", "PBOX0", 0x2F34},
+ [PCI_IMC_DEVICE_0_CH_0] = {IMC, "14.0", "PCI_IMC_DEVICE_0_CH_0", "MBOX0", 0x2FB4},
+ [PCI_IMC_DEVICE_0_CH_1] = {IMC, "14.1", "PCI_IMC_DEVICE_0_CH_1", "MBOX1", 0x2FB5},
+ [PCI_IMC_DEVICE_0_CH_2] = {IMC, "15.0", "PCI_IMC_DEVICE_0_CH_2", "MBOX2", 0x2FB0},
+ [PCI_IMC_DEVICE_0_CH_3] = {IMC, "15.1", "PCI_IMC_DEVICE_0_CH_3", "MBOX3", 0x2FB1},
+ [PCI_HA_DEVICE_0] = {HA, "12.1", "PCI_HA_DEVICE_0", "BBOX0", 0x2F30},
+ [PCI_HA_DEVICE_1] = {HA, "12.5", "PCI_HA_DEVICE_1", "BBOX1", 0x2F38},
+ [PCI_IMC_DEVICE_1_CH_0] = {IMC, "17.0", "PCI_IMC_DEVICE_1_CH_0", "MBOX4", 0x2FD4},
+ [PCI_IMC_DEVICE_1_CH_1] = {IMC, "17.1", "PCI_IMC_DEVICE_1_CH_1", "MBOX5", 0x2FD5},
+ [PCI_IMC_DEVICE_1_CH_2] = {IMC, "18.0", "PCI_IMC_DEVICE_1_CH_2", "MBOX6", 0x2FD0},
+ [PCI_IMC_DEVICE_1_CH_3] = {IMC, "18.1", "PCI_IMC_DEVICE_1_CH_3", "MBOX7", 0x2FD1},
+ [PCI_IRP_DEVICE] = {IRP, "05.6", "PCI_IRP_DEVICE", "IBOX", 0x2F39},
+ [PCI_QPI_DEVICE_PORT_0] = {QPI, "08.2", "PCI_QPI_DEVICE_PORT_0", "QBOX0", 0x2F32},
+ [PCI_QPI_DEVICE_PORT_1] = {QPI, "09.2", "PCI_QPI_DEVICE_PORT_1", "QBOX1", 0x2F33},
+ [PCI_QPI_MASK_DEVICE_PORT_0] = {QPI, "08.6", "PCI_QPI_MASK_DEVICE_PORT_0", NULL, 0x2F86}, /* mask devices carry no unit name (NULL) */
+ [PCI_QPI_MASK_DEVICE_PORT_1] = {QPI, "09.6", "PCI_QPI_MASK_DEVICE_PORT_1", NULL, 0x2F96},
+ [PCI_QPI_MISC_DEVICE_PORT_0] = {QPI, "08.0", "PCI_QPI_MISC_DEVICE_PORT_0", "QBOX0FIX", 0x2F80},
+ [PCI_QPI_MISC_DEVICE_PORT_1] = {QPI, "09.0", "PCI_QPI_MISC_DEVICE_PORT_1", "QBOX1FIX", 0x2F90}, /* port 1 sits on device 09 like its 09.2/09.6 siblings; this entry used to duplicate port 0 (08.0, 0x2F80), making QBOX1FIX address port 0 */
+};
+
diff --git a/src/includes/perfmon_haswellEP_events.txt b/src/includes/perfmon_haswellEP_events.txt
new file mode 100644
index 0000000..fb078a1
--- /dev/null
+++ b/src/includes/perfmon_haswellEP_events.txt
@@ -0,0 +1,2616 @@
+# =======================================================================================
+#
+#      Filename:  perfmon_haswellEP_events.txt
+#
+#      Description:  Event list for Intel Haswell EP/EN/EX
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
+#      Project:  likwid
+#
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+#
+#      This program is free software: you can redistribute it and/or modify it under
+#      the terms of the GNU General Public License as published by the Free Software
+#      Foundation, either version 3 of the License, or (at your option) any later
+#      version.
+#
+#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+#      You should have received a copy of the GNU General Public License along with
+#      this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+EVENT_TEMP_CORE          0x00   TMP0
+UMASK_TEMP_CORE          0x00
+
+EVENT_PWR_PKG_ENERGY          0x00   PWR0
+UMASK_PWR_PKG_ENERGY          0x00
+
+EVENT_PWR_PP0_ENERGY          0x00   PWR1
+UMASK_PWR_PP0_ENERGY          0x00
+
+EVENT_PWR_PP1_ENERGY          0x00   PWR2
+UMASK_PWR_PP1_ENERGY          0x00
+
+EVENT_PWR_DRAM_ENERGY          0x00   PWR3
+UMASK_PWR_DRAM_ENERGY          0x00
+
+EVENT_INSTR_RETIRED              0x00   FIXC0
+UMASK_INSTR_RETIRED_ANY          0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE      0x00
+DEFAULT_OPTIONS_CPU_CLK_UNHALTED_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_CPU_CLK_UNHALTED_ANY       0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC2
+UMASK_CPU_CLK_UNHALTED_REF       0x00
+
+EVENT_LD_BLOCKS                 0x03  PMC
+UMASK_LD_BLOCKS_STORE_FORWARD   0x02
+UMASK_LD_BLOCKS_NO_SR           0x08
+
+EVENT_MISALIGN_MEM_REF            0x05  PMC
+UMASK_MISALIGN_MEM_REF_LOADS      0x01
+UMASK_MISALIGN_MEM_REF_STORES     0x02
+UMASK_MISALIGN_MEM_REF_ANY        0x03
+
+EVENT_LD_BLOCKS_PARTIAL                 0x07  PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS   0x01
+
+EVENT_DTLB_LOAD_MISSES                       0x08  PMC
+UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK         0x01
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED_4K     0x02
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED_LARGE  0x04
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED        0x0E
+UMASK_DTLB_LOAD_MISSES_WALK_DURATION         0x10
+UMASK_DTLB_LOAD_MISSES_STLB_HIT_4K           0x20
+UMASK_DTLB_LOAD_MISSES_STLB_HIT_2M           0x40
+UMASK_DTLB_LOAD_MISSES_STLB_HIT              0x60
+UMASK_DTLB_LOAD_MISSES_PDE_CACHE_MISS        0x80
+
+EVENT_INT_MISC                  0x0D  PMC
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_INT_MISC_RECOVERY_CYCLES  0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=0x1
+UMASK_INT_MISC_RECOVERY_COUNT  0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES_ANY EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_INT_MISC_RECOVERY_CYCLES_ANY  0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT_ANY EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_EDGE=0x1
+UMASK_INT_MISC_RECOVERY_COUNT_ANY  0x03
+
+EVENT_UOPS_ISSUED                0x0E  PMC
+UMASK_UOPS_ISSUED_ANY            0x01
+UMASK_UOPS_ISSUED_FLAGS_MERGE    0x10
+UMASK_UOPS_ISSUED_SLOW_LEA       0x20
+UMASK_UOPS_ISSUED_SINGLE_MUL     0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_USED_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_USED_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_STALL_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_TOTAL_CYCLES   0x01
+
+EVENT_ARITH_DIVIDER_UOPS            0x14 PMC
+UMASK_ARITH_DIVIDER_CYCLES          0x01
+UMASK_ARITH_DIVIDER_UOPS            0x02
+
+EVENT_L2_RQSTS                          0x24   PMC
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD_MISS  0x21
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD_HIT   0x41
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD       0xE1
+UMASK_L2_RQSTS_RFO_HIT                  0x42
+UMASK_L2_RQSTS_RFO_MISS                 0x22
+UMASK_L2_RQSTS_ALL_RFO                  0xE2
+UMASK_L2_RQSTS_CODE_RD_HIT              0x44
+UMASK_L2_RQSTS_CODE_RD_MISS             0x24
+UMASK_L2_RQSTS_ALL_DEMAND_MISS          0x27
+UMASK_L2_RQSTS_ALL_DEMAND_REFERENCES    0xE7
+UMASK_L2_RQSTS_ALL_CODE_RD              0xE4
+UMASK_L2_RQSTS_L2_PF_HIT                0x50
+UMASK_L2_RQSTS_L2_PF_MISS               0x30
+UMASK_L2_RQSTS_ALL_PF                   0xF8
+UMASK_L2_RQSTS_MISS                     0x3F
+UMASK_L2_RQSTS_REFERENCES               0xFF
+
+EVENT_L2_DEMAND_RQST_WB_HIT            0x27   PMC
+UMASK_L2_DEMAND_RQST_WB_HIT            0x50
+
+EVENT_LONGEST_LAT_CACHE               0x2E   PMC
+UMASK_LONGEST_LAT_CACHE_REFERENCE     0x4F
+UMASK_LONGEST_LAT_CACHE_MISS          0x41
+
+EVENT_CPU_CLOCK_UNHALTED         0x3C   PMC
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P  0x00
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_THREAD_P_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P_ANY  0x00
+UMASK_CPU_CLOCK_UNHALTED_REF_XCLK     0x01
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_REF_XCLK_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_CPU_CLOCK_UNHALTED_REF_XCLK_ANY     0x01
+UMASK_CPU_CLOCK_THREAD_UNHALTED_ONE_THREAD_ACTIVE 0x02
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x2,EVENT_OPTION_INVERT=0x1
+UMASK_CPU_CLOCK_UNHALTED_TOTAL_CYCLES   0x00
+
+EVENT_L1D_PEND_MISS              0x48   PMC2
+UMASK_L1D_PEND_MISS_PENDING      0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_PENDING_CYCLES_ANY EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_THRESHOLD=0x1
+UMASK_L1D_PEND_MISS_PENDING_CYCLES_ANY 0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_PENDING_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_L1D_PEND_MISS_PENDING_CYCLES 0x01
+
+EVENT_L1D_PEND_MISS_REQUEST_FB_FULL 0x48 PMC
+UMASK_L1D_PEND_MISS_REQUEST_FB_FULL 0x02
+DEFAULT_OPTIONS_L1D_PEND_MISS_FB_FULL EVENT_OPTION_THRESHOLD=0x1
+UMASK_L1D_PEND_MISS_FB_FULL         0x02
+
+
+EVENT_DTLB_STORE_MISSES                         0x49   PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK           0x01
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED_4K       0x02
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED_LARGE    0x04
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED          0x0E
+UMASK_DTLB_STORE_MISSES_WALK_DURATION           0x10
+UMASK_DTLB_STORE_MISSES_STLB_HIT_4K             0x20
+UMASK_DTLB_STORE_MISSES_STLB_HIT_2M             0x40
+UMASK_DTLB_STORE_MISSES_STLB_HIT                0x60
+UMASK_DTLB_STORE_MISSES_PDE_CACHE_MISS          0x80
+
+EVENT_LOAD_HIT_PRE                     0x4C    PMC
+UMASK_LOAD_HIT_PRE_SW_PF               0x01
+UMASK_LOAD_HIT_PRE_HW_PF               0x02
+
+EVENT_EPT_WALK_CYCLES            0x4F PMC
+UMASK_EPT_WALK_CYCLES            0x10
+
+EVENT_L1D                         0x51   PMC
+UMASK_L1D_REPLACEMENT             0x01
+UMASK_L1D_ALLOCATED_IN_M          0x02
+UMASK_L1D_M_EVICT                 0x04
+UMASK_L1D_ALL_M_REPLACEMENT       0x08
+
+EVENT_TX_MEM                                        0x54 PMC
+UMASK_TX_MEM_ABORT_CONFLICT                         0x01
+UMASK_TX_MEM_ABORT_CAPACITY_WRITE                   0x02
+UMASK_TX_MEM_ABORT_HLE_STORE_TO_ELIDED_LOCK         0x04
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_NOT_EMPTY     0x08
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_MISMATCH      0x10
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_UNSUPPORTED_ALIGNMENT 0x20
+UMASK_TX_MEM_HLE_ELISION_BUFFER_FULL                0x40
+
+EVENT_MOVE_ELIMINATION                        0x58   PMC
+UMASK_MOVE_ELIMINATION_INT_NOT_ELIMINATED     0x04
+UMASK_MOVE_ELIMINATION_SIMD_NOT_ELIMINATED    0x08
+UMASK_MOVE_ELIMINATION_INT_ELIMINATED         0x01
+UMASK_MOVE_ELIMINATION_SIMD_ELIMINATED        0x02
+
+EVENT_CPL_CYCLES                   0x5C    PMC
+UMASK_CPL_CYCLES_RING0             0x01
+UMASK_CPL_CYCLES_RING123           0x02
+DEFAULT_OPTIONS_CPL_CYCLES_RING0_TRANS EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_CPL_CYCLES_RING0_TRANS       0x01
+
+EVENT_TX_EXEC                       0x5D PMC
+UMASK_TX_EXEC_MISC1                 0x01
+UMASK_TX_EXEC_MISC2                 0x02
+UMASK_TX_EXEC_MISC3                 0x04
+UMASK_TX_EXEC_MISC4                 0x08
+UMASK_TX_EXEC_MISC5                 0x10
+
+EVENT_RS_EVENTS                 0x5E    PMC
+UMASK_RS_EVENTS_EMPTY_CYCLES    0x01
+DEFAULT_OPTIONS_RS_EVENTS_EMPTY_END EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_EDGE=1
+UMASK_RS_EVENTS_EMPTY_END       0x01
+
+EVENT_OFFCORE_REQUESTS_OUTSTANDING                  0x60   PMC
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO       0x04
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD      0x08
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DEMAND_DATA_RD EVENT_OPTION_THRESHOLD=0x1
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DEMAND_DATA_RD 0x01
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DATA_RD EVENT_OPTION_THRESHOLD=0x1
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DATA_RD 0x08
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD_C6 EVENT_OPTION_THRESHOLD=0x6
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD_C6 0x01
+
+EVENT_LOCK_CYCLES                               0x63   PMC
+UMASK_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION   0x01
+DEFAULT_OPTIONS_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_COUNT EVENT_OPTION_EDGE=0x1
+UMASK_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_COUNT   0x01
+UMASK_LOCK_CYCLES_CACHE_LOCK_DURATION           0x02
+DEFAULT_OPTIONS_LOCK_CYCLES_CACHE_LOCK_COUNT EVENT_OPTION_EDGE=0x1
+UMASK_LOCK_CYCLES_CACHE_LOCK_COUNT           0x02
+
+EVENT_IDQ                               0x79   PMC
+UMASK_IDQ_EMPTY                         0x02
+UMASK_IDQ_MITE_UOPS                     0x04
+DEFAULT_OPTIONS_IDQ_MITE_CYCLES         EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MITE_CYCLES                   0x04
+UMASK_IDQ_DSB_UOPS                      0x08
+DEFAULT_OPTIONS_IDQ_DSB_CYCLES          EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_DSB_CYCLES                    0x08
+UMASK_IDQ_MS_DSB_UOPS                   0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_CYCLES       EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MS_DSB_CYCLES                 0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_OCCUR        EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_IDQ_MS_DSB_OCCUR                  0x10
+UMASK_IDQ_MS_MITE_UOPS                  0x20
+DEFAULT_OPTIONS_IDQ_MS_MITE_CYCLES      EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MS_MITE_CYCLES                0x20
+UMASK_IDQ_MS_UOPS                       0x30
+DEFAULT_OPTIONS_IDQ_MS_CYCLES           EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MS_CYCLES                     0x30
+DEFAULT_OPTIONS_IDQ_MS_SWITCHES         EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_IDQ_MS_SWITCHES                   0x30
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS       0x18
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS         0x18
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_ANY_UOPS  EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS      0x24
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS        0x24
+UMASK_IDQ_MITE_ALL_UOPS                 0x3C
+
+EVENT_ICACHE                    0x80   PMC
+UMASK_ICACHE_HIT                0x01
+UMASK_ICACHE_MISSES             0x02
+UMASK_ICACHE_ACCESSES           0x03
+UMASK_ICACHE_IFETCH_STALL       0x04
+
+EVENT_ITLB_MISSES                       0x85      PMC
+UMASK_ITLB_MISSES_CAUSES_A_WALK         0x01
+UMASK_ITLB_MISSES_WALK_COMPLETED_4K     0x02
+UMASK_ITLB_MISSES_WALK_COMPLETED_LARGE  0x04
+UMASK_ITLB_MISSES_WALK_COMPLETED        0x0E
+UMASK_ITLB_MISSES_WALK_DURATION         0x10
+UMASK_ITLB_MISSES_STLB_HIT_4K           0x20
+UMASK_ITLB_MISSES_STLB_HIT_2M           0x40
+UMASK_ITLB_MISSES_STLB_HIT              0x60
+
+EVENT_ILD_STALL                 0x87      PMC
+UMASK_ILD_STALL_LCP             0x01
+UMASK_ILD_STALL_IQ_FULL         0x04
+
+EVENT_BR_INST_EXEC                                      0x88   PMC
+UMASK_BR_INST_EXEC_COND_TAKEN                           0x81
+UMASK_BR_INST_EXEC_COND_NON_TAKEN                       0x41
+UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN                     0x82
+UMASK_BR_INST_EXEC_DIRECT_JMP_NON_TAKEN                 0x42
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN  0x44
+UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN                    0x88
+UMASK_BR_INST_EXEC_RETURN_NEAR_NON_TAKEN                0x48
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_NON_TAKEN           0x50
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN         0x60
+UMASK_BR_INST_EXEC_ALL_CONDITIONAL                      0xC1
+UMASK_BR_INST_EXEC_ALL_DIRECT_JMP                       0xC2
+UMASK_BR_INST_EXEC_ALL_DIRECT_NEAR_CALL                 0xD0
+UMASK_BR_INST_EXEC_ALL_INDIRECT_JUMP_NON_CALL_RET       0xC4
+UMASK_BR_INST_EXEC_ALL_INDIRECT_NEAR_RETURN             0xC8
+UMASK_BR_INST_EXEC_ALL_BRANCHES                         0xFF
+
+EVENT_BR_MISP_EXEC                                      0x89   PMC
+UMASK_BR_MISP_EXEC_COND_TAKEN                           0x81
+UMASK_BR_MISP_EXEC_COND_NON_TAKEN                       0x41
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN  0x44
+UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN                    0x88
+UMASK_BR_MISP_EXEC_RETURN_NEAR_NON_TAKEN                0x48
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_NON_TAKEN           0x50
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN         0x60
+UMASK_BR_MISP_EXEC_ALL_CONDITIONAL                      0xC1
+UMASK_BR_MISP_EXEC_ALL_INDIRECT_JUMP_NON_CALL_RET       0xC4
+UMASK_BR_MISP_EXEC_ALL_BRANCHES                         0xFF
+
+EVENT_IDQ_UOPS_NOT_DELIVERED                    0x9C   PMC
+UMASK_IDQ_UOPS_NOT_DELIVERED_CORE               0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x3
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x2
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK 0x01
+
+EVENT_UOPS_EXECUTED_PORT                  0xA1   PMC
+UMASK_UOPS_EXECUTED_PORT_PORT_0           0x01
+UMASK_UOPS_EXECUTED_PORT_PORT_1           0x02
+UMASK_UOPS_EXECUTED_PORT_PORT_2           0x04
+UMASK_UOPS_EXECUTED_PORT_PORT_3           0x08
+UMASK_UOPS_EXECUTED_PORT_PORT_4           0x10
+UMASK_UOPS_EXECUTED_PORT_PORT_5           0x20
+UMASK_UOPS_EXECUTED_PORT_PORT_6           0x40
+UMASK_UOPS_EXECUTED_PORT_PORT_7           0x80
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_0_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_0_CORE      0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_1_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_1_CORE      0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_2_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_2_CORE      0x04
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_3_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_3_CORE      0x08
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_4_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_4_CORE      0x10
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_5_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_5_CORE      0x20
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_6_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_6_CORE      0x40
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_7_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_7_CORE      0x80
+UMASK_UOPS_EXECUTED_PORT_ARITH_PORTS      0x63
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_ARITH_PORTS_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_ARITH_PORTS_CORE 0x63
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_DATA_PORTS    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_DATA_PORTS       0x9C
+
+EVENT_RESOURCE_STALLS                 0xA2   PMC
+UMASK_RESOURCE_STALLS_ANY             0x01
+UMASK_RESOURCE_STALLS_RS              0x04
+UMASK_RESOURCE_STALLS_SB              0x08
+UMASK_RESOURCE_STALLS_ROB             0x10
+
+EVENT_CYCLE_ACTIVITY                 0xA3   PMC
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L2_PENDING  EVENT_OPTION_THRESHOLD=0x1
+UMASK_CYCLE_ACTIVITY_CYCLES_L2_PENDING            0x01
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_MEM_ANY     EVENT_OPTION_THRESHOLD=0x2
+UMASK_CYCLE_ACTIVITY_CYCLES_MEM_ANY               0x02
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE  EVENT_OPTION_THRESHOLD=0x4
+UMASK_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE            0x04
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L2_PENDING  EVENT_OPTION_THRESHOLD=0x5
+UMASK_CYCLE_ACTIVITY_STALLS_L2_PENDING            0x05
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_MEM_ANY     EVENT_OPTION_THRESHOLD=0x6
+UMASK_CYCLE_ACTIVITY_STALLS_MEM_ANY               0x06
+
+EVENT_CYCLE_ACTIVITY_CYCLES                 0xA3   PMC2
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L1D_PENDING EVENT_OPTION_THRESHOLD=0x8
+UMASK_CYCLE_ACTIVITY_CYCLES_L1D_PENDING     0x08
+
+EVENT_CYCLE_ACTIVITY_STALLS                 0xA3   PMC2
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L1D_PENDING EVENT_OPTION_THRESHOLD=0xC
+UMASK_CYCLE_ACTIVITY_STALLS_L1D_PENDING     0x0C
+
+EVENT_LSD_UOPS                  0xA8   PMC
+UMASK_LSD_UOPS                  0x01
+DEFAULT_OPTIONS_LSD_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_LSD_CYCLES_4_UOPS         0x01
+DEFAULT_OPTIONS_LSD_CYCLES_ACTIVE EVENT_OPTION_THRESHOLD=0x1
+UMASK_LSD_CYCLES_ACTIVE         0x01
+
+EVENT_DSB2MITE_SWITCHES                0xAB PMC
+UMASK_DSB2MITE_SWITCHES_COUNT          0x01
+UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES 0x02
+
+EVENT_ITLB                          0xAE   PMC
+UMASK_ITLB_ITLB_FLUSH               0x01
+
+EVENT_OFFCORE_REQUESTS                  0xB0   PMC
+UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_DEMAND_RFO       0x04
+UMASK_OFFCORE_REQUESTS_ALL_DATA_RD      0x08
+
+EVENT_UOPS_EXECUTED                       0xB1   PMC
+UMASK_UOPS_EXECUTED_THREAD                0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_USED_CYCLES           0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_STALL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_TOTAL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC 0x01
+UMASK_UOPS_EXECUTED_CORE                  0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_USED_CYCLES           0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES          0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_TOTAL_CYCLES          0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC 0x02
+
+EVENT_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0xB2 PMC
+UMASK_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0x01
+
+EVENT_PAGE_WALKER_LOADS             0xBC  PMC
+UMASK_PAGE_WALKER_LOADS_DTLB_L1     0x11
+UMASK_PAGE_WALKER_LOADS_ITLB_L1     0x21
+UMASK_PAGE_WALKER_LOADS_DTLB_L2     0x12
+UMASK_PAGE_WALKER_LOADS_ITLB_L2     0x22
+UMASK_PAGE_WALKER_LOADS_DTLB_L3     0x14
+UMASK_PAGE_WALKER_LOADS_ITLB_L3     0x24
+UMASK_PAGE_WALKER_LOADS_DTLB_MEMORY     0x18
+UMASK_PAGE_WALKER_LOADS_ITLB_MEMORY     0x28
+UMASK_PAGE_WALKER_LOADS_EPT_DTLB_L1 0x41
+UMASK_PAGE_WALKER_LOADS_EPT_ITLB_L1 0x81
+UMASK_PAGE_WALKER_LOADS_EPT_DTLB_L2 0x42
+UMASK_PAGE_WALKER_LOADS_EPT_ITLB_L2 0x82
+UMASK_PAGE_WALKER_LOADS_EPT_DTLB_L3 0x44
+UMASK_PAGE_WALKER_LOADS_EPT_ITLB_L3 0x84
+UMASK_PAGE_WALKER_LOADS_EPT_DTLB_MEMORY 0x48
+UMASK_PAGE_WALKER_LOADS_EPT_ITLB_MEMORY 0x88
+
+EVENT_TLB_FLUSH                 0xBD  PMC
+UMASK_TLB_FLUSH_DTLB_THREAD     0x01
+UMASK_TLB_FLUSH_STLB_ANY        0x20
+
+EVENT_INST_RETIRED_PREC             0xC0  PMC1
+UMASK_INST_RETIRED_PREC_DIST        0x01
+
+EVENT_INST_RETIRED_ANY              0xC0  PMC
+UMASK_INST_RETIRED_ANY_P            0x00
+
+EVENT_OTHER_ASSISTS                  0xC1  PMC
+UMASK_OTHER_ASSISTS_AVX_TO_SSE            0x08
+UMASK_OTHER_ASSISTS_SSE_TO_AVX            0x10
+UMASK_OTHER_ASSISTS_ANY_WB_ASSIST         0x40
+
+EVENT_UOPS_RETIRED                  0xC2  PMC
+UMASK_UOPS_RETIRED_ALL              0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL         0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS     0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_USED_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_STALL_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES     0x01
+
+EVENT_MACHINE_CLEARS                    0xC3  PMC
+UMASK_MACHINE_CLEARS_CYCLES             0x01
+DEFAULT_OPTIONS_MACHINE_CLEARS_COUNT    EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_MACHINE_CLEARS_COUNT              0x01
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING    0x02
+UMASK_MACHINE_CLEARS_SMC                0x04
+UMASK_MACHINE_CLEARS_MASKMOV            0x20
+
+EVENT_BR_INST_RETIRED               0xC4  PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x00
+UMASK_BR_INST_RETIRED_CONDITIONAL   0x01
+UMASK_BR_INST_RETIRED_NEAR_CALL     0x02
+UMASK_BR_INST_RETIRED_ALL_BRANCHES_PEBS 0x04
+UMASK_BR_INST_RETIRED_NEAR_RETURN   0x08
+UMASK_BR_INST_RETIRED_NOT_TAKEN     0x10
+UMASK_BR_INST_RETIRED_NEAR_TAKEN    0x20
+UMASK_BR_INST_RETIRED_FAR_BRANCH    0x40
+
+EVENT_BR_MISP_RETIRED                0xC5  PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES   0x00
+UMASK_BR_MISP_RETIRED_CONDITIONAL    0x01
+UMASK_BR_MISP_RETIRED_NEAR_NOT_TAKEN 0x10
+UMASK_BR_MISP_RETIRED_NEAR_TAKEN     0x20
+
+EVENT_AVX_INSTS                     0xC6 PMC
+UMASK_AVX_INSTS_LOADS               0x01
+UMASK_AVX_INSTS_STORES              0x02
+UMASK_AVX_INSTS_CALC                0x04
+UMASK_AVX_INSTS_ALL                 0x07
+
+EVENT_HLE_RETIRED                    0xC8 PMC
+UMASK_HLE_RETIRED_START              0x01
+UMASK_HLE_RETIRED_COMMIT             0x02
+UMASK_HLE_RETIRED_ABORTED            0x04
+UMASK_HLE_RETIRED_ABORTED_MISC1      0x08
+UMASK_HLE_RETIRED_ABORTED_MISC2      0x10
+UMASK_HLE_RETIRED_ABORTED_MISC3      0x20
+UMASK_HLE_RETIRED_ABORTED_MISC4      0x40
+UMASK_HLE_RETIRED_ABORTED_MISC5      0x80
+
+EVENT_RTM_RETIRED                    0xC9 PMC
+UMASK_RTM_RETIRED_START              0x01
+UMASK_RTM_RETIRED_COMMIT             0x02
+UMASK_RTM_RETIRED_ABORTED            0x04
+UMASK_RTM_RETIRED_ABORTED_MISC1      0x08
+UMASK_RTM_RETIRED_ABORTED_MISC2      0x10
+UMASK_RTM_RETIRED_ABORTED_MISC3      0x20
+UMASK_RTM_RETIRED_ABORTED_MISC4      0x40
+UMASK_RTM_RETIRED_ABORTED_MISC5      0x80
+
+
+EVENT_FP_ASSIST                   0xCA  PMC
+UMASK_FP_ASSIST_X87_OUTPUT        0x02
+UMASK_FP_ASSIST_X87_INPUT         0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT       0x08
+UMASK_FP_ASSIST_SIMD_INPUT        0x10
+DEFAULT_OPTIONS_FP_ASSIST_ANY     EVENT_OPTION_THRESHOLD=0x1
+UMASK_FP_ASSIST_ANY               0x1E
+
+EVENT_ROB_MISC_EVENT_LBR_INSERTS               0xCC  PMC
+UMASK_ROB_MISC_EVENT_LBR_INSERTS               0x20
+
+EVENT_MEM_UOPS_RETIRED                         0xD0    PMC
+UMASK_MEM_UOPS_RETIRED_LOADS                   0x81
+UMASK_MEM_UOPS_RETIRED_STORES                  0x82
+UMASK_MEM_UOPS_RETIRED_ALL                     0x83
+UMASK_MEM_UOPS_RETIRED_LOADS_STLB_MISS         0x11
+UMASK_MEM_UOPS_RETIRED_STORES_STLB_MISS        0x12
+UMASK_MEM_UOPS_RETIRED_LOADS_LOCK              0x21
+UMASK_MEM_UOPS_RETIRED_STORES_LOCK             0x22
+UMASK_MEM_UOPS_RETIRED_LOADS_SPLIT             0x41
+UMASK_MEM_UOPS_RETIRED_STORES_SPLIT            0x42
+
+EVENT_MEM_LOAD_UOPS_RETIRED              0xD1   PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT       0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS      0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL       0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT       0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS      0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL       0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT       0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS      0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL       0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB      0x40
+UMASK_MEM_LOAD_UOPS_RETIRED_ALL_MISS     0x38
+UMASK_MEM_LOAD_UOPS_RETIRED_ALL_HIT      0x07
+UMASK_MEM_LOAD_UOPS_RETIRED_ALL_ALL      0x3F
+
+EVENT_MEM_LOAD_UOPS_L3_HIT_RETIRED                   0xD2   PMC
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_MISS         0x01
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HIT          0x02
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM         0x04
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_NONE         0x08
+
+EVENT_MEM_LOAD_UOPS_L3_MISS_RETIRED                 0xD3   PMC
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_LOCAL_DRAM      0x01
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_DRAM     0x04
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM     0x10
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_FWD      0x20
+
+
+EVENT_BACLEARS               0xE6   PMC
+UMASK_BACLEARS_ANY           0x1F
+
+EVENT_L2_TRANS                0xF0  PMC
+UMASK_L2_TRANS_DEMAND_DATA_RD 0x01
+UMASK_L2_TRANS_RFO            0x02
+UMASK_L2_TRANS_CODE_RD        0x04
+UMASK_L2_TRANS_ALL_PF         0x08
+UMASK_L2_TRANS_L1D_WB         0x10
+UMASK_L2_TRANS_L2_FILL        0x20
+UMASK_L2_TRANS_L2_WB          0x40
+UMASK_L2_TRANS_ALL_REQUESTS   0x80
+
+EVENT_L2_LINES_IN                   0xF1   PMC
+UMASK_L2_LINES_IN_I                 0x01
+UMASK_L2_LINES_IN_S                 0x02
+UMASK_L2_LINES_IN_E                 0x04
+UMASK_L2_LINES_IN_ALL               0x07
+
+EVENT_L2_LINES_OUT                  0xF2   PMC
+UMASK_L2_LINES_OUT_DEMAND_CLEAN     0x05
+UMASK_L2_LINES_OUT_DEMAND_DIRTY     0x06
+
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM               EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM       EVENT_OPTION_MATCH0=0x8FFF,EVENT_OPTION_MATCH1=0x60040
+UMASK_OFFCORE_RESPONSE_0_LOCAL_DRAM                 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM              EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM      EVENT_OPTION_MATCH0=0x8FFF,EVENT_OPTION_MATCH1=0x67F80
+UMASK_OFFCORE_RESPONSE_0_REMOTE_DRAM                0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1                            0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM               EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM       EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x60040
+UMASK_OFFCORE_RESPONSE_1_LOCAL_DRAM                 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM              EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM      EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x67F80
+UMASK_OFFCORE_RESPONSE_1_REMOTE_DRAM                0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_CBOX_CLOCKTICKS               0x00 CBOX
+UMASK_CBOX_CLOCKTICKS               0x00
+
+EVENT_TXR_INSERTS                   0x02 CBOX
+UMASK_TXR_INSERTS_AD_CACHE          0x01
+UMASK_TXR_INSERTS_AK_CACHE          0x02
+UMASK_TXR_INSERTS_BL_CACHE          0x04
+UMASK_TXR_INSERTS_IV_CACHE          0x08
+UMASK_TXR_INSERTS_AD_CORE           0x10
+UMASK_TXR_INSERTS_AK_CORE           0x20
+UMASK_TXR_INSERTS_BL_CORE           0x40
+
+EVENT_TXR_ADS_USED                  0x04 CBOX
+UMASK_TXR_ADS_USED_AD               0x01
+UMASK_TXR_ADS_USED_AK               0x02
+UMASK_TXR_ADS_USED_BL               0x04
+
+EVENT_RING_BOUNCES                  0x05 CBOX
+UMASK_RING_BOUNCES_AD               0x01
+UMASK_RING_BOUNCES_AK               0x02
+UMASK_RING_BOUNCES_BL               0x04
+UMASK_RING_BOUNCES_IV               0x08
+
+EVENT_RING_SRC_THRTL                0x07 CBOX
+UMASK_RING_SRC_THRTL                0x00
+
+EVENT_FAST_ASSERTED                 0x09 CBOX0C0|CBOX0C1|CBOX1C0|CBOX1C1|CBOX2C0|CBOX2C1|CBOX3C0|CBOX3C1|CBOX4C0|CBOX4C1|CBOX5C0|CBOX5C1|CBOX6C0|CBOX6C1|CBOX7C0|CBOX7C1|CBOX8C0|CBOX8C1|CBOX9C0|CBOX9C1|CBOX10C0|CBOX10C1|CBOX11C0|CBOX11C1|CBOX12C0|CBOX12C1|CBOX13C0|CBOX13C1|CBOX14C0|CBOX14C1|CBOX15C0|CBOX15C1|CBOX16C0|CBOX16C1|CBOX17C0|CBOX17C1
+UMASK_FAST_ASSERTED                 0x00
+
+EVENT_BOUNCE_CONTROL                0xA CBOX
+UMASK_BOUNCE_CONTROL                0x00
+
+EVENT_RXR_OCCUPANCY                 0x11 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX15C0|CBOX16C0|CBOX17C0
+UMASK_RXR_OCCUPANCY_IRQ             0x01
+UMASK_RXR_OCCUPANCY_IRQ_REJ         0x02
+UMASK_RXR_OCCUPANCY_IPQ             0x04
+UMASK_RXR_OCCUPANCY_PRQ_REJ         0x20
+UMASK_RXR_OCCUPANCY_IRQ_IPQ         0x05
+UMASK_RXR_OCCUPANCY_IRQ_PRQ_REJ     0x21
+UMASK_RXR_OCCUPANCY_IPQ_PRQ_REJ     0x24
+
+EVENT_RXR_EXT_STARVED               0x12 CBOX
+UMASK_RXR_EXT_STARVED_IRQ           0x01
+UMASK_RXR_EXT_STARVED_IPQ           0x02
+UMASK_RXR_EXT_STARVED_PRQ           0x04
+UMASK_RXR_EXT_STARVED_ISMQ_BIDS     0x08
+
+EVENT_RXR_INSERTS                   0x13 CBOX
+UMASK_RXR_INSERTS_IRQ               0x01
+UMASK_RXR_INSERTS_IRQ_REJ           0x02
+UMASK_RXR_INSERTS_IPQ               0x04
+UMASK_RXR_INSERTS_PRQ               0x10
+UMASK_RXR_INSERTS_PRQ_REJ           0x20
+
+EVENT_RING_AD_USED                  0x1B CBOX
+UMASK_RING_AD_USED_UP_EVEN          0x01
+UMASK_RING_AD_USED_UP_ODD           0x02
+UMASK_RING_AD_USED_UP               0x03
+UMASK_RING_AD_USED_DOWN_EVEN        0x04
+UMASK_RING_AD_USED_DOWN_ODD         0x08
+UMASK_RING_AD_USED_DOWN             0x0C
+UMASK_RING_AD_USED_ANY              0x0F
+
+EVENT_RING_AK_USED                  0x1C CBOX
+UMASK_RING_AK_USED_UP_EVEN          0x01
+UMASK_RING_AK_USED_UP_ODD           0x02
+UMASK_RING_AK_USED_UP               0x03
+UMASK_RING_AK_USED_DOWN_EVEN        0x04
+UMASK_RING_AK_USED_DOWN_ODD         0x08
+UMASK_RING_AK_USED_DOWN             0x0C
+UMASK_RING_AK_USED_ANY              0x0F
+
+EVENT_RING_BL_USED                  0x1D CBOX
+UMASK_RING_BL_USED_UP_EVEN          0x01
+UMASK_RING_BL_USED_UP_ODD           0x02
+UMASK_RING_BL_USED_UP               0x03
+UMASK_RING_BL_USED_DOWN_EVEN        0x04
+UMASK_RING_BL_USED_DOWN_ODD         0x08
+UMASK_RING_BL_USED_DOWN             0x0C
+UMASK_RING_BL_USED_ANY              0x0F
+
+EVENT_RING_IV_USED                  0x1E CBOX
+UMASK_RING_IV_USED_UP               0x03
+UMASK_RING_IV_USED_DN               0x0C
+UMASK_RING_IV_USED_ANY              0x0F
+
+
+EVENT_COUNTER0_OCCUPANCY            0x1F CBOX
+UMASK_COUNTER0_OCCUPANCY            0x00
+
+EVENT_RXR_IPQ_RETRY2                0x28 CBOX
+UMASK_RXR_IPQ_RETRY2_AD_SBO         0x01
+OPTIONS_RXR_IPQ_RETRY2_TARGET       EVENT_OPTION_NID_MASK
+UMASK_RXR_IPQ_RETRY2_TARGET         0x40
+
+EVENT_RXR_IRQ_RETRY2                0x29 CBOX
+UMASK_RXR_IRQ_RETRY2_AD_SBO         0x01
+UMASK_RXR_IRQ_RETRY2_BL_SBO         0x02
+OPTIONS_RXR_IRQ_RETRY2_TARGET       EVENT_OPTION_NID_MASK
+UMASK_RXR_IRQ_RETRY2_TARGET         0x40
+
+EVENT_RXR_ISMQ_RETRY2               0x2A CBOX
+UMASK_RXR_ISMQ_RETRY2_AD_SBO         0x01
+UMASK_RXR_ISMQ_RETRY2_BL_SBO         0x02
+OPTIONS_RXR_ISMQ_RETRY2_TARGET       EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY2_TARGET         0x40
+
+EVENT_RXR_IPQ_RETRY                 0x31 CBOX
+UMASK_RXR_IPQ_RETRY_ANY             0x01
+UMASK_RXR_IPQ_RETRY_FULL            0x02
+UMASK_RXR_IPQ_RETRY_ADDR_CONFLICT   0x04
+UMASK_RXR_IPQ_RETRY_QPI_CREDITS     0x10
+
+EVENT_RXR_IRQ_RETRY                 0x32 CBOX
+UMASK_RXR_IRQ_RETRY_ANY             0x01
+UMASK_RXR_IRQ_RETRY_FULL            0x02
+UMASK_RXR_IRQ_RETRY_ADDR_CONFLICT   0x04
+UMASK_RXR_IRQ_RETRY_RTID            0x08
+UMASK_RXR_IRQ_RETRY_QPI_CREDITS     0x10
+UMASK_RXR_IRQ_RETRY_IIO_CREDITS     0x20
+OPTIONS_RXR_IRQ_RETRY_NID           EVENT_OPTION_NID_MASK
+UMASK_RXR_IRQ_RETRY_NID             0x40
+
+EVENT_RXR_ISMQ_RETRY                0x33 CBOX
+UMASK_RXR_ISMQ_RETRY_ANY            0x01
+UMASK_RXR_ISMQ_RETRY_FULL           0x02
+UMASK_RXR_ISMQ_RETRY_RTID           0x08
+UMASK_RXR_ISMQ_RETRY_QPI_CREDITS    0x10
+UMASK_RXR_ISMQ_RETRY_IIO_CREDITS    0x20
+OPTIONS_RXR_ISMQ_RETRY_NID          EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY_NID            0x40
+OPTIONS_RXR_ISMQ_RETRY_WB_CREDITS   EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY_WB_CREDITS     0x80
+
+EVENT_LLC_LOOKUP                    0x34 CBOX
+OPTIONS_LLC_LOOKUP_DATA_READ        EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_DATA_READ          0x03
+OPTIONS_LLC_LOOKUP_WRITE            EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_WRITE              0x05
+OPTIONS_LLC_LOOKUP_REMOTE_SNOOP     EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_REMOTE_SNOOP       0x09
+OPTIONS_LLC_LOOKUP_ANY              EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_ANY                0x11
+OPTIONS_LLC_LOOKUP_READ             EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_READ               0x21
+OPTIONS_LLC_LOOKUP_NID              EVENT_OPTION_STATE_MASK|EVENT_OPTION_NID_MASK
+UMASK_LLC_LOOKUP_NID                0x41
+
+EVENT_TOR_INSERTS                   0x35 CBOX
+UMASK_TOR_INSERTS_ALL               0x08
+UMASK_TOR_INSERTS_WB                0x10
+OPTIONS_TOR_INSERTS_LOCAL_OPCODE    EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_LOCAL_OPCODE      0x21
+OPTIONS_TOR_INSERTS_MISS_LOCAL_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_LOCAL_OPCODE 0x23
+UMASK_TOR_INSERTS_LOCAL             0x28
+UMASK_TOR_INSERTS_MISS_LOCAL        0x2A
+OPTIONS_TOR_INSERTS_NID_OPCODE      EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_NID_OPCODE        0x41
+OPTIONS_TOR_INSERTS_NID_MISS_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_NID_MISS_OPCODE   0x43
+OPTIONS_TOR_INSERTS_NID_EVICION     EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_EVICION       0x44
+OPTIONS_TOR_INSERTS_NID_ALL         EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_ALL           0x48
+OPTIONS_TOR_INSERTS_NID_MISS_ALL    EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_MISS_ALL      0x4A
+OPTIONS_TOR_INSERTS_NID_WB          EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_WB            0x50
+OPTIONS_TOR_INSERTS_REMOTE_OPCODE   EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_REMOTE_OPCODE     0x81
+OPTIONS_TOR_INSERTS_MISS_REMOTE_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_REMOTE_OPCODE 0x83
+UMASK_TOR_INSERTS_REMOTE            0x88
+UMASK_TOR_INSERTS_MISS_REMOTE       0x8A
+
+EVENT_TOR_OCCUPANCY                 0x36 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX15C0|CBOX16C0|CBOX17C0
+OPTIONS_TOR_OCCUPANCY_OPCODE        EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_OPCODE          0x01
+OPTIONS_TOR_OCCUPANCY_MISS_OPCODE   EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_MISS_OPCODE     0x03
+UMASK_TOR_OCCUPANCY_EVICTION        0x04
+UMASK_TOR_OCCUPANCY_ALL             0x08
+UMASK_TOR_OCCUPANCY_MISS_ALL        0x0A
+UMASK_TOR_OCCUPANCY_WB              0x10
+UMASK_TOR_OCCUPANCY_LOCAL_OPCODE    0x21
+UMASK_TOR_OCCUPANCY_MISS_LOCAL_OPCODE 0x23
+UMASK_TOR_OCCUPANCY_LOCAL           0x28
+UMASK_TOR_OCCUPANCY_MISS_LOCAL      0x2A
+OPTIONS_TOR_OCCUPANCY_NID_OPCODE    EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_NID_OPCODE      0x41
+OPTIONS_TOR_OCCUPANCY_NID_MISS_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_NID_MISS_OPCODE 0x43
+OPTIONS_TOR_OCCUPANCY_NID_EVICTION  EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_EVICTION    0x44
+OPTIONS_TOR_OCCUPANCY_NID_ALL       EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_ALL         0x48
+OPTIONS_TOR_OCCUPANCY_NID_MISS_ALL  EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_MISS_ALL    0x4A
+OPTIONS_TOR_OCCUPANCY_NID_WB        EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_WB          0x50
+OPTIONS_TOR_OCCUPANCY_REMOTE_OPCODE   EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_REMOTE_OPCODE   0x81
+OPTIONS_TOR_OCCUPANCY_MISS_REMOTE_OPCODE   EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_MISS_REMOTE_OPCODE 0x83
+UMASK_TOR_OCCUPANCY_REMOTE          0x88
+UMASK_TOR_OCCUPANCY_MISS_REMOTE     0x8A
+
+EVENT_LLC_VICTIMS                   0x37 CBOX
+UMASK_LLC_VICTIMS_M                 0x01
+UMASK_LLC_VICTIMS_E                 0x02
+UMASK_LLC_VICTIMS_I                 0x04
+UMASK_LLC_VICTIMS_F                 0x08
+UMASK_LLC_VICTIMS_MEIF              0x0F
+UMASK_LLC_VICTIMS_MISS              0x10
+OPTIONS_LLC_VICTIMS_NID             EVENT_OPTION_NID_MASK
+UMASK_LLC_VICTIMS_NID               0x40
+
+EVENT_MISC                          0x39 CBOX
+UMASK_MISC_RSPI_WAS_FSE             0x01
+UMASK_MISC_WC_ALIASING              0x02
+UMASK_MISC_STARTED                  0x04
+UMASK_MISC_RFO_HIT_S                0x08
+UMASK_MISC_CVZERO_PREFETCH_VICTIM   0x10
+UMASK_MISC_CVZERO_PREFETCH_MISS     0x20
+
+EVENT_SBO_CREDITS_ACQUIRED          0x3D CBOX
+UMASK_SBO_CREDITS_ACQUIRED_AD       0x01
+UMASK_SBO_CREDITS_ACQUIRED_BL       0x02
+
+EVENT_SBO_CREDIT_OCCUPANCY          0x3E CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX15C0|CBOX16C0|CBOX17C0
+UMASK_SBO_CREDIT_OCCUPANCY_AD       0x01
+UMASK_SBO_CREDIT_OCCUPANCY_BL       0x02
+
+EVENT_EVENT_MSG                     0x42 UBOX
+UMASK_EVENT_MSG_DOORBELL_RCVD       0x08
+
+EVENT_PHOLD_CYCLES                  0x45 UBOX
+UMASK_PHOLD_CYCLES_ASSERT_TO_ACK    0x01
+
+EVENT_RACU_REQUESTS                 0x46 UBOX
+UMASK_RACU_REQUESTS                 0x00
+
+EVENT_UNCORE_CLOCK                  0x00 UBOXFIX
+UMASK_UNCORE_CLOCK                  0x00
+
+EVENT_SBOX_CLOCKTICKS               0x00 SBOX
+UMASK_SBOX_CLOCKTICKS               0x00
+
+EVENT_TXR_OCCUPANCY                 0x01 SBOX
+UMASK_TXR_OCCUPANCY_AD_CRD          0x01
+UMASK_TXR_OCCUPANCY_AD_BNC          0x02
+UMASK_TXR_OCCUPANCY_BL_CRD          0x04
+UMASK_TXR_OCCUPANCY_BL_BNC          0x08
+UMASK_TXR_OCCUPANCY_AK              0x10
+UMASK_TXR_OCCUPANCY_IV              0x20
+
+EVENT_TXR_INSERTS                   0x02 SBOX
+UMASK_TXR_INSERTS_AD_CRD            0x01
+UMASK_TXR_INSERTS_AD_BNC            0x02
+UMASK_TXR_INSERTS_BL_CRD            0x04
+UMASK_TXR_INSERTS_BL_BNC            0x08
+UMASK_TXR_INSERTS_AK                0x10
+UMASK_TXR_INSERTS_IV                0x20
+
+EVENT_TXR_ADS_USED                  0x04 SBOX
+UMASK_TXR_ADS_USED_AD               0x01
+UMASK_TXR_ADS_USED_AK               0x02
+UMASK_TXR_ADS_USED_BL               0x04
+
+EVENT_RING_BOUNCES                  0x05 SBOX
+UMASK_RING_BOUNCES_AD_CACHE         0x01
+UMASK_RING_BOUNCES_AK_CORE          0x02
+UMASK_RING_BOUNCES_BL_CORE          0x04
+UMASK_RING_BOUNCES_IV_CORE          0x08
+
+EVENT_FAST_ASSERTED                 0x09 SBOX
+UMASK_FAST_ASSERTED                 0x00
+
+EVENT_BOUNCE_CONTROL                0x0A SBOX
+UMASK_BOUNCE_CONTROL                0x00
+
+EVENT_RXR_OCCUPANCY                 0x11 SBOX
+UMASK_RXR_OCCUPANCY_AD_CRD          0x01
+UMASK_RXR_OCCUPANCY_AD_BNC          0x02
+UMASK_RXR_OCCUPANCY_BL_CRD          0x04
+UMASK_RXR_OCCUPANCY_BL_BNC          0x08
+UMASK_RXR_OCCUPANCY_AK              0x10
+UMASK_RXR_OCCUPANCY_IV              0x20
+
+EVENT_RXR_BYPASS                    0x12 SBOX
+UMASK_RXR_BYPASS_AD_CRD             0x01
+UMASK_RXR_BYPASS_AD_BNC             0x02
+UMASK_RXR_BYPASS_BL_CRD             0x04
+UMASK_RXR_BYPASS_BL_BNC             0x08
+UMASK_RXR_BYPASS_AK                 0x10
+UMASK_RXR_BYPASS_IV                 0x20
+
+EVENT_RXR_INSERTS                   0x13 SBOX
+UMASK_RXR_INSERTS_AD_CRD            0x01
+UMASK_RXR_INSERTS_AD_BNC            0x02
+UMASK_RXR_INSERTS_BL_CRD            0x04
+UMASK_RXR_INSERTS_BL_BNC            0x08
+UMASK_RXR_INSERTS_AK                0x10
+UMASK_RXR_INSERTS_IV                0x20
+
+EVENT_RING_AD_USED                  0x1B SBOX
+UMASK_RING_AD_USED_ANY              0x0F
+UMASK_RING_AD_USED_UP_EVEN          0x01
+UMASK_RING_AD_USED_UP_ODD           0x02
+UMASK_RING_AD_USED_UP               0x03
+UMASK_RING_AD_USED_DOWN_EVEN        0x04
+UMASK_RING_AD_USED_DOWN_ODD         0x08
+UMASK_RING_AD_USED_DOWN             0x0C
+
+EVENT_RING_AK_USED                  0x1C SBOX
+UMASK_RING_AK_USED_ANY              0x0F
+UMASK_RING_AK_USED_UP_EVEN          0x01
+UMASK_RING_AK_USED_UP_ODD           0x02
+UMASK_RING_AK_USED_UP               0x03
+UMASK_RING_AK_USED_DOWN_EVEN        0x04
+UMASK_RING_AK_USED_DOWN_ODD         0x08
+UMASK_RING_AK_USED_DOWN             0x0C
+
+EVENT_RING_BL_USED                  0x1D SBOX
+UMASK_RING_BL_USED_ANY              0x0F
+UMASK_RING_BL_USED_UP_EVEN          0x01
+UMASK_RING_BL_USED_UP_ODD           0x02
+UMASK_RING_BL_USED_UP               0x03
+UMASK_RING_BL_USED_DOWN_EVEN        0x04
+UMASK_RING_BL_USED_DOWN_ODD         0x08
+UMASK_RING_BL_USED_DOWN             0x0C
+
+EVENT_RING_IV_USED                  0x1E SBOX
+UMASK_RING_IV_USED_ANY              0x0F
+UMASK_RING_IV_USED_UP               0x03
+UMASK_RING_IV_USED_DOWN             0x0C
+
+EVENT_WBOX_CLOCKTICKS               0x00 WBOX
+UMASK_WBOX_CLOCKTICKS               0x00
+
+EVENT_CORE0_TRANSITION_CYCLES       0x60 WBOX
+UMASK_CORE0_TRANSITION_CYCLES       0x00
+
+EVENT_CORE1_TRANSITION_CYCLES       0x61 WBOX
+UMASK_CORE1_TRANSITION_CYCLES       0x00
+
+EVENT_CORE2_TRANSITION_CYCLES       0x62 WBOX
+UMASK_CORE2_TRANSITION_CYCLES       0x00
+
+EVENT_CORE3_TRANSITION_CYCLES       0x63 WBOX
+UMASK_CORE3_TRANSITION_CYCLES       0x00
+
+EVENT_CORE4_TRANSITION_CYCLES       0x64 WBOX
+UMASK_CORE4_TRANSITION_CYCLES       0x00
+
+EVENT_CORE5_TRANSITION_CYCLES       0x65 WBOX
+UMASK_CORE5_TRANSITION_CYCLES       0x00
+
+EVENT_CORE6_TRANSITION_CYCLES       0x66 WBOX
+UMASK_CORE6_TRANSITION_CYCLES       0x00
+
+EVENT_CORE7_TRANSITION_CYCLES       0x67 WBOX
+UMASK_CORE7_TRANSITION_CYCLES       0x00
+
+EVENT_CORE8_TRANSITION_CYCLES       0x68 WBOX
+UMASK_CORE8_TRANSITION_CYCLES       0x00
+
+EVENT_CORE9_TRANSITION_CYCLES       0x69 WBOX
+UMASK_CORE9_TRANSITION_CYCLES       0x00
+
+EVENT_CORE10_TRANSITION_CYCLES       0x6A WBOX
+UMASK_CORE10_TRANSITION_CYCLES       0x00
+
+EVENT_CORE11_TRANSITION_CYCLES       0x6B WBOX
+UMASK_CORE11_TRANSITION_CYCLES       0x00
+
+EVENT_CORE12_TRANSITION_CYCLES       0x6C WBOX
+UMASK_CORE12_TRANSITION_CYCLES       0x00
+
+EVENT_CORE13_TRANSITION_CYCLES       0x6D WBOX
+UMASK_CORE13_TRANSITION_CYCLES       0x00
+
+EVENT_CORE14_TRANSITION_CYCLES       0x6E WBOX
+UMASK_CORE14_TRANSITION_CYCLES       0x00
+
+EVENT_CORE15_TRANSITION_CYCLES       0x6F WBOX
+UMASK_CORE15_TRANSITION_CYCLES       0x00
+
+EVENT_CORE16_TRANSITION_CYCLES       0x70 WBOX
+UMASK_CORE16_TRANSITION_CYCLES       0x00
+
+EVENT_CORE17_TRANSITION_CYCLES       0x71 WBOX
+UMASK_CORE17_TRANSITION_CYCLES       0x00
+
+EVENT_DEMOTIONS_CORE0                0x30 WBOX
+UMASK_DEMOTIONS_CORE0                0x00
+
+EVENT_DEMOTIONS_CORE1                0x31 WBOX
+UMASK_DEMOTIONS_CORE1                0x00
+
+EVENT_DEMOTIONS_CORE2                0x32 WBOX
+UMASK_DEMOTIONS_CORE2                0x00
+
+EVENT_DEMOTIONS_CORE3                0x33 WBOX
+UMASK_DEMOTIONS_CORE3                0x00
+
+EVENT_DEMOTIONS_CORE4                0x34 WBOX
+UMASK_DEMOTIONS_CORE4                0x00
+
+EVENT_DEMOTIONS_CORE5                0x35 WBOX
+UMASK_DEMOTIONS_CORE5                0x00
+
+EVENT_DEMOTIONS_CORE6                0x36 WBOX
+UMASK_DEMOTIONS_CORE6                0x00
+
+EVENT_DEMOTIONS_CORE7                0x37 WBOX
+UMASK_DEMOTIONS_CORE7                0x00
+
+EVENT_DEMOTIONS_CORE8                0x38 WBOX
+UMASK_DEMOTIONS_CORE8                0x00
+
+EVENT_DEMOTIONS_CORE9                0x39 WBOX
+UMASK_DEMOTIONS_CORE9                0x00
+
+EVENT_DEMOTIONS_CORE10                0x3A WBOX
+UMASK_DEMOTIONS_CORE10                0x00
+
+EVENT_DEMOTIONS_CORE11                0x3B WBOX
+UMASK_DEMOTIONS_CORE11                0x00
+
+EVENT_DEMOTIONS_CORE12                0x3C WBOX
+UMASK_DEMOTIONS_CORE12                0x00
+
+EVENT_DEMOTIONS_CORE13                0x3D WBOX
+UMASK_DEMOTIONS_CORE13                0x00
+
+EVENT_DEMOTIONS_CORE14                0x3E WBOX
+UMASK_DEMOTIONS_CORE14                0x00
+
+EVENT_DEMOTIONS_CORE15                0x3F WBOX
+UMASK_DEMOTIONS_CORE15                0x00
+
+EVENT_DEMOTIONS_CORE16                0x40 WBOX
+UMASK_DEMOTIONS_CORE16                0x00
+
+EVENT_DEMOTIONS_CORE17                0x41 WBOX
+UMASK_DEMOTIONS_CORE17                0x00
+
+EVENT_FREQ_BAND0_CYCLES                 0x0B WBOX
+OPTIONS_FREQ_BAND0_CYCLES               EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND0_CYCLES                 0x00
+
+EVENT_FREQ_BAND1_CYCLES                 0x0C WBOX
+OPTIONS_FREQ_BAND1_CYCLES               EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND1_CYCLES                 0x00
+
+EVENT_FREQ_BAND2_CYCLES                 0x0D WBOX
+OPTIONS_FREQ_BAND2_CYCLES               EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND2_CYCLES                 0x00
+
+EVENT_FREQ_BAND3_CYCLES                 0x0E WBOX
+OPTIONS_FREQ_BAND3_CYCLES               EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND3_CYCLES                 0x00
+
+EVENT_FREQ_MAX_LIMIT_THERMAL_CYCLES     0x04 WBOX
+UMASK_FREQ_MAX_LIMIT_THERMAL_CYCLES     0x00
+
+EVENT_FREQ_MAX_OS_CYCLES                0x06 WBOX
+UMASK_FREQ_MAX_OS_CYCLES                0x00
+
+EVENT_FREQ_MAX_POWER_CYCLES             0x05 WBOX
+UMASK_FREQ_MAX_POWER_CYCLES             0x00
+
+EVENT_FREQ_MIN_IO_P_CYCLES              0x73 WBOX
+UMASK_FREQ_MIN_IO_P_CYCLES              0x00
+
+EVENT_FREQ_TRANS_CYCLES                 0x74 WBOX
+UMASK_FREQ_TRANS_CYCLES                 0x00
+
+EVENT_MEMORY_PHASE_SHEDDING_CYCLES      0x2F WBOX
+UMASK_MEMORY_PHASE_SHEDDING_CYCLES      0x00
+
+EVENT_PKG_RESIDENCY_C0_CYCLES           0x2A WBOX
+UMASK_PKG_RESIDENCY_C0_CYCLES           0x00
+
+EVENT_PKG_RESIDENCY_C1E_CYCLES          0x4E WBOX
+UMASK_PKG_RESIDENCY_C1E_CYCLES          0x00
+
+EVENT_PKG_RESIDENCY_C2E_CYCLES          0x2B WBOX
+UMASK_PKG_RESIDENCY_C2E_CYCLES          0x00
+
+EVENT_PKG_RESIDENCY_C3_CYCLES           0x2C WBOX
+UMASK_PKG_RESIDENCY_C3_CYCLES           0x00
+
+EVENT_PKG_RESIDENCY_C6_CYCLES           0x2D WBOX
+UMASK_PKG_RESIDENCY_C6_CYCLES           0x00
+
+EVENT_PKG_RESIDENCY_C7_CYCLES           0x2E WBOX
+UMASK_PKG_RESIDENCY_C7_CYCLES           0x00
+
+EVENT_POWER_STATE_OCCUPANCY             0x80 WBOX
+UMASK_POWER_STATE_OCCUPANCY_CORES_C0    0x40
+UMASK_POWER_STATE_OCCUPANCY_CORES_C3    0x80
+UMASK_POWER_STATE_OCCUPANCY_CORES_C6    0xC0
+
+EVENT_PROCHOT_EXTERNAL_CYCLES           0x0A WBOX
+UMASK_PROCHOT_EXTERNAL_CYCLES           0x00
+
+EVENT_PROCHOT_INTERNAL_CYCLES           0x09 WBOX
+UMASK_PROCHOT_INTERNAL_CYCLES           0x00
+
+EVENT_TOTAL_TRANSITION_CYCLES           0x72 WBOX
+UMASK_TOTAL_TRANSITION_CYCLES           0x00
+
+EVENT_UFS_TRANSITIONS_RING_GV           0x79 WBOX
+UMASK_UFS_TRANSITIONS_RING_GV           0x00
+
+EVENT_VR_HOT_CYCLES                     0x42 WBOX
+UMASK_VR_HOT_CYCLES                     0x00
+
+EVENT_CORE_CORE_C6_RESIDENCY             0x00 WBOX0FIX
+UMASK_CORE_CORE_C6_RESIDENCY             0x00
+
+EVENT_CORE_CORE_C3_RESIDENCY             0x00 WBOX1FIX
+UMASK_CORE_CORE_C3_RESIDENCY             0x00
+
+EVENT_CORE_PKG_C2_RESIDENCY              0x00 WBOX2FIX
+UMASK_CORE_PKG_C2_RESIDENCY              0x00
+
+EVENT_CORE_PKG_C3_RESIDENCY              0x00 WBOX3FIX
+UMASK_CORE_PKG_C3_RESIDENCY              0x00
+
+EVENT_BBOX_CLOCKTICKS                   0x00 BBOX
+UMASK_BBOX_CLOCKTICKS                   0x00
+
+EVENT_ADDR_OPC_MATCH                    0x20 BBOX
+OPTIONS_ADDR_OPC_MATCH_ADDR             EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_ADDR_OPC_MATCH_ADDR               0x01
+OPTIONS_ADDR_OPC_MATCH_OPC              EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_OPC                0x02
+OPTIONS_ADDR_OPC_MATCH_FILT             EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_ADDR_OPC_MATCH_FILT               0x03
+OPTIONS_ADDR_OPC_MATCH_AD               EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_AD                 0x04
+OPTIONS_ADDR_OPC_MATCH_BL               EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_BL                 0x08
+OPTIONS_ADDR_OPC_MATCH_AK               EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_AK                 0x10
+
+EVENT_BT_CYCLES_NE                      0x42 BBOX
+UMASK_BT_CYCLES_NE                      0x00
+
+EVENT_BT_OCCUPANCY                      0x43 BBOX
+UMASK_BT_OCCUPANCY                      0x00
+
+EVENT_BYPASS_IMC                        0x14 BBOX
+UMASK_BYPASS_IMC_TAKEN                  0x01
+UMASK_BYPASS_IMC_NOT_TAKEN              0x02
+
+EVENT_CONFLICT_CYCLES                   0x0B BBOX0C1|BBOX1C1
+UMASK_CONFLICT_CYCLES                   0x00
+
+EVENT_DIRECT2CORE_COUNT                 0x11 BBOX
+UMASK_DIRECT2CORE_COUNT                 0x00
+
+EVENT_DIRECT2CORE_CYCLES_DISABLED       0x12 BBOX
+UMASK_DIRECT2CORE_CYCLES_DISABLED       0x00
+
+EVENT_DIRECT2CORE_TXN_OVERRIDE          0x13 BBOX
+UMASK_DIRECT2CORE_TXN_OVERRIDE          0x00
+
+EVENT_DIRECTORY_LAT_OPT                 0x41 BBOX
+UMASK_DIRECTORY_LAT_OPT                 0x00
+
+EVENT_DIRECTORY_LOOKUP                  0x0C BBOX
+UMASK_DIRECTORY_LOOKUP_SNP              0x01
+UMASK_DIRECTORY_LOOKUP_NO_SNP           0x02
+
+EVENT_DIRECTORY_UPDATE                  0x0D BBOX
+UMASK_DIRECTORY_UPDATE_SET              0x01
+UMASK_DIRECTORY_UPDATE_CLEAR            0x02
+UMASK_DIRECTORY_UPDATE_ANY              0x03
+
+EVENT_HITME_LOOKUP                      0x70 BBOX
+UMASK_HITME_LOOKUP_READ_OR_INVITOE         0x01
+UMASK_HITME_LOOKUP_WBMTOI                  0x02
+UMASK_HITME_LOOKUP_ACKCNFLTWBI             0x04
+UMASK_HITME_LOOKUP_WBMTOE_OR_S             0x08
+UMASK_HITME_LOOKUP_HOM                     0x0F
+UMASK_HITME_LOOKUP_RSPFWDI_REMOTE          0x10
+UMASK_HITME_LOOKUP_RSPFWDI_LOCAL           0x20
+UMASK_HITME_LOOKUP_INVALS                  0x26
+UMASK_HITME_LOOKUP_RSPFWDS                 0x40
+UMASK_HITME_LOOKUP_EVICTS                  0x42
+UMASK_HITME_LOOKUP_ALLOCS                  0x70
+UMASK_HITME_LOOKUP_RSP                     0x80
+UMASK_HITME_LOOKUP_ALL                     0xFF
+
+EVENT_HITME_HIT                         0x71 BBOX
+UMASK_HITME_HIT_READ_OR_INVITOE         0x01
+UMASK_HITME_HIT_WBMTOI                  0x02
+UMASK_HITME_HIT_ACKCNFLTWBI             0x04
+UMASK_HITME_HIT_WBMTOE_OR_S             0x08
+UMASK_HITME_HIT_HOM                     0x0F
+UMASK_HITME_HIT_RSPFWDI_REMOTE          0x10
+UMASK_HITME_HIT_RSPFWDI_LOCAL           0x20
+UMASK_HITME_HIT_INVALS                  0x26
+UMASK_HITME_HIT_RSPFWDS                 0x40
+UMASK_HITME_HIT_EVICTS                  0x42
+UMASK_HITME_HIT_ALLOCS                  0x70
+UMASK_HITME_HIT_RSP                     0x80
+UMASK_HITME_HIT_ALL                     0xFF
+
+EVENT_HITME_HIT_PV_BITS_SET             0x72 BBOX
+UMASK_HITME_HIT_PV_BITS_SET_READ_OR_INVITOE 0x01
+UMASK_HITME_HIT_PV_BITS_SET_WBMTOI          0x02
+UMASK_HITME_HIT_PV_BITS_SET_ACKCNFLTWBI     0x04
+UMASK_HITME_HIT_PV_BITS_SET_WBMTOE_OR_S     0x08
+UMASK_HITME_HIT_PV_BITS_SET_HOM             0x0F
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDI_REMOTE  0x10
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDI_LOCAL   0x20
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDS         0x40
+UMASK_HITME_HIT_PV_BITS_SET_RSP             0x80
+UMASK_HITME_HIT_PV_BITS_SET_ALL             0xFF
+
+EVENT_IGR_NO_CREDIT_CYCLES              0x22 BBOX
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI0      0x01
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI1      0x02
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI0      0x04
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI1      0x08
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI2      0x10
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI2      0x20
+
+EVENT_IMC_READS                         0x17 BBOX
+UMASK_IMC_READS_NORMAL                  0x01
+
+EVENT_IMC_RETRY                         0x1E BBOX
+UMASK_IMC_RETRY                         0x00
+
+EVENT_IMC_WRITES                        0x1A BBOX
+UMASK_IMC_WRITES_FULL                   0x01
+UMASK_IMC_WRITES_PARTIAL                0x02
+UMASK_IMC_WRITES_FULL_ISOCH             0x04
+UMASK_IMC_WRITES_PARTIAL_ISOCH          0x08
+UMASK_IMC_WRITES_ALL                    0x0F
+
+EVENT_OSB                               0x53 BBOX
+UMASK_OSB_READS_LOCAL                   0x02
+UMASK_OSB_INVITOE_LOCAL                 0x04
+UMASK_OSB_REMOTE                        0x08
+UMASK_OSB_CANCELLED                     0x10
+UMASK_OSB_READS_LOCAL_USEFUL            0x20
+UMASK_OSB_REMOTE_USEFUL                 0x40
+
+EVENT_OSB_EDR                           0x54 BBOX
+UMASK_OSB_EDR_ALL                       0x01
+UMASK_OSB_EDR_READS_LOCAL_I             0x02
+UMASK_OSB_EDR_READS_REMOTE_I            0x04
+UMASK_OSB_EDR_READS_LOCAL_S             0x08
+UMASK_OSB_EDR_READS_REMOTE_S            0x10
+
+EVENT_REQUESTS                          0x01 BBOX
+UMASK_REQUESTS_READS_LOCAL              0x01
+UMASK_REQUESTS_READS_REMOTE             0x02
+UMASK_REQUESTS_READS                    0x03
+UMASK_REQUESTS_WRITES_LOCAL             0x04
+UMASK_REQUESTS_WRITES_REMOTE            0x08
+UMASK_REQUESTS_WRITES                   0x0C
+UMASK_REQUESTS_INVITOE_LOCAL            0x10
+UMASK_REQUESTS_INVITOE_REMOTE           0x20
+
+EVENT_RING_AD_USED                      0x3E BBOX
+UMASK_RING_AD_USED_CW_EVEN              0x01
+UMASK_RING_AD_USED_CW_ODD               0x02
+UMASK_RING_AD_USED_CW                   0x03
+UMASK_RING_AD_USED_CCW_EVEN             0x04
+UMASK_RING_AD_USED_CCW_ODD              0x08
+UMASK_RING_AD_USED_CCW                  0x0C
+
+EVENT_RING_AK_USED                      0x3F BBOX
+UMASK_RING_AK_USED_CW_EVEN              0x01
+UMASK_RING_AK_USED_CW_ODD               0x02
+UMASK_RING_AK_USED_CW                   0x03
+UMASK_RING_AK_USED_CCW_EVEN             0x04
+UMASK_RING_AK_USED_CCW_ODD              0x08
+UMASK_RING_AK_USED_CCW                  0x0C
+
+EVENT_RING_BL_USED                      0x40 BBOX
+UMASK_RING_BL_USED_CW_EVEN              0x01
+UMASK_RING_BL_USED_CW_ODD               0x02
+UMASK_RING_BL_USED_CW                   0x03
+UMASK_RING_BL_USED_CCW_EVEN             0x04
+UMASK_RING_BL_USED_CCW_ODD              0x08
+UMASK_RING_BL_USED_CCW                  0x0C
+
+EVENT_RPQ_CYCLES_NO_REG_CREDITS         0x15 BBOX
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN0    0x01
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN1    0x02
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN2    0x04
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN3    0x08
+
+EVENT_WPQ_CYCLES_NO_REG_CREDITS         0x18 BBOX
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0    0x01
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN1    0x02
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN2    0x04
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN3    0x08
+
+EVENT_SBO0_CREDITS_ACQUIRED             0x68 BBOX
+UMASK_SBO0_CREDITS_ACQUIRED_AD          0x01
+UMASK_SBO0_CREDITS_ACQUIRED_BL          0x02
+
+EVENT_SBO1_CREDITS_ACQUIRED             0x69 BBOX
+UMASK_SBO1_CREDITS_ACQUIRED_AD          0x01
+UMASK_SBO1_CREDITS_ACQUIRED_BL          0x02
+
+EVENT_SBO0_CREDITS_OCCUPANCY            0x6A BBOX
+UMASK_SBO0_CREDITS_OCCUPANCY_AD         0x01
+UMASK_SBO0_CREDITS_OCCUPANCY_BL         0x02
+
+EVENT_SBO1_CREDITS_OCCUPANCY            0x6B BBOX
+UMASK_SBO1_CREDITS_OCCUPANCY_AD         0x01
+UMASK_SBO1_CREDITS_OCCUPANCY_BL         0x02
+
+EVENT_SNOOPS_RSP_AFTER_DATA             0x0A BBOX
+UMASK_SNOOPS_RSP_AFTER_DATA_LOCAL       0x01
+UMASK_SNOOPS_RSP_AFTER_DATA_REMOTE      0x02
+
+EVENT_SNOOP_CYCLES_NE                   0x08 BBOX
+UMASK_SNOOP_CYCLES_NE_LOCAL             0x01
+UMASK_SNOOP_CYCLES_NE_REMOTE            0x02
+UMASK_SNOOP_CYCLES_NE_ALL               0x03
+
+EVENT_SNOOP_OCCUPANCY                   0x09 BBOX
+UMASK_SNOOP_OCCUPANCY_LOCAL             0x01
+UMASK_SNOOP_OCCUPANCY_REMOTE            0x02
+
+EVENT_SNOOP_RESP                        0x21 BBOX
+UMASK_SNOOP_RESP_RSPI                   0x01
+UMASK_SNOOP_RESP_RSPS                   0x02
+UMASK_SNOOP_RESP_RSPIFWD                0x04
+UMASK_SNOOP_RESP_RSPSFWD                0x08
+UMASK_SNOOP_RESP_RSP_WB                 0x10
+UMASK_SNOOP_RESP_RSP_FWD_WB             0x20
+UMASK_SNOOP_RESP_RSPCNFLCT              0x40
+
+EVENT_SNP_RESP_RECV_LOCAL               0x60 BBOX
+UMASK_SNP_RESP_RECV_LOCAL_RSPI          0x01
+UMASK_SNP_RESP_RECV_LOCAL_RSPS          0x02
+UMASK_SNP_RESP_RECV_LOCAL_RSPIFWD       0x04
+UMASK_SNP_RESP_RECV_LOCAL_RSPSFWD       0x08
+UMASK_SNP_RESP_RECV_LOCAL_RSPXWB        0x10
+UMASK_SNP_RESP_RECV_LOCAL_RSPXFWDXWB    0x20
+UMASK_SNP_RESP_RECV_LOCAL_RSPCNFLCT     0x40
+UMASK_SNP_RESP_RECV_LOCAL_OTHER         0x80
+
+EVENT_STALL_NO_SBO_CREDIT               0x6C BBOX
+UMASK_STALL_NO_SBO_CREDIT_SBO0_AD       0x01
+UMASK_STALL_NO_SBO_CREDIT_SBO1_AD       0x02
+UMASK_STALL_NO_SBO_CREDIT_SBO0_BL       0x04
+UMASK_STALL_NO_SBO_CREDIT_SBO1_BL       0x08
+
+EVENT_TAD_REQUESTS_G0                   0x1B BBOX
+UMASK_TAD_REQUESTS_G0_REGION0           0x01
+UMASK_TAD_REQUESTS_G0_REGION1           0x02
+UMASK_TAD_REQUESTS_G0_REGION2           0x04
+UMASK_TAD_REQUESTS_G0_REGION3           0x08
+UMASK_TAD_REQUESTS_G0_REGION4           0x10
+UMASK_TAD_REQUESTS_G0_REGION5           0x20
+UMASK_TAD_REQUESTS_G0_REGION6           0x40
+UMASK_TAD_REQUESTS_G0_REGION7           0x80
+
+EVENT_TAD_REQUESTS_G1                   0x1C BBOX
+UMASK_TAD_REQUESTS_G1_REGION8           0x01
+UMASK_TAD_REQUESTS_G1_REGION9           0x02
+UMASK_TAD_REQUESTS_G1_REGION10          0x04
+UMASK_TAD_REQUESTS_G1_REGION11          0x08
+
+EVENT_TRACKER_CYCLES_FULL               0x02 BBOX
+UMASK_TRACKER_CYCLES_FULL_GP            0x01
+UMASK_TRACKER_CYCLES_FULL_ALL           0x02
+
+EVENT_TRACKER_CYCLES_NE                 0x03 BBOX
+UMASK_TRACKER_CYCLES_NE_LOCAL           0x01
+UMASK_TRACKER_CYCLES_NE_REMOTE          0x02
+UMASK_TRACKER_CYCLES_NE_ALL             0x03
+
+EVENT_TRACKER_OCCUPANCY                 0x04 BBOX
+UMASK_TRACKER_OCCUPANCY_READS_LOCAL     0x04
+UMASK_TRACKER_OCCUPANCY_READS_REMOTE    0x08
+UMASK_TRACKER_OCCUPANCY_WRITES_LOCAL    0x10
+UMASK_TRACKER_OCCUPANCY_WRITES_REMOTE   0x20
+UMASK_TRACKER_OCCUPANCY_INVITOE_LOCAL   0x40
+UMASK_TRACKER_OCCUPANCY_INVITOE_REMOTE  0x80
+
+EVENT_TRACKER_PENDING_OCCUPANCY         0x05 BBOX
+UMASK_TRACKER_PENDING_OCCUPANCY_LOCAL   0x01
+UMASK_TRACKER_PENDING_OCCUPANCY_REMOTE  0x02
+
+EVENT_TXR_AD_CYCLES_FULL                0x2A BBOX
+UMASK_TXR_AD_CYCLES_FULL_SCHED0         0x01
+UMASK_TXR_AD_CYCLES_FULL_SCHED1         0x02
+UMASK_TXR_AD_CYCLES_FULL_ALL            0x03
+
+EVENT_TXR_AK                            0x0E BBOX
+UMASK_TXR_AK                            0x00
+
+EVENT_TXR_AK_CYCLES_FULL                0x32 BBOX
+UMASK_TXR_AK_CYCLES_FULL_SCHED0         0x01
+UMASK_TXR_AK_CYCLES_FULL_SCHED1         0x02
+UMASK_TXR_AK_CYCLES_FULL_ALL            0x03
+
+EVENT_TXR_BL                            0x10 BBOX
+UMASK_TXR_BL_DRS_CACHE                  0x01
+UMASK_TXR_BL_DRS_CORE                   0x02
+UMASK_TXR_BL_DRS_QPI                    0x04
+
+EVENT_TXR_BL_CYCLES_FULL                0x36 BBOX
+UMASK_TXR_BL_CYCLES_FULL_SCHED0         0x01
+UMASK_TXR_BL_CYCLES_FULL_SCHED1         0x02
+UMASK_TXR_BL_CYCLES_FULL_ALL            0x03
+
+EVENT_TXR_BL_OCCUPANCY                  0x34 BBOX
+UMASK_TXR_BL_OCCUPANCY                  0x00
+
+EVENT_TXR_STARVED                       0x6D BBOX
+UMASK_TXR_STARVED_AK                    0x01
+UMASK_TXR_STARVED_BL                    0x02
+
+EVENT_DRAM_CLOCKTICKS                   0x00 MBOX
+UMASK_DRAM_CLOCKTICKS                   0x00
+
+EVENT_ACT_COUNT                         0x01 MBOX
+UMASK_ACT_COUNT_RD                      0x01
+UMASK_ACT_COUNT_WR                      0x02
+UMASK_ACT_COUNT_BYP                     0x08
+
+EVENT_BYP_CMDS                          0xA1 MBOX
+UMASK_BYP_CMDS_ACT                      0x01
+UMASK_BYP_CMDS_CAS                      0x02
+UMASK_BYP_CMDS_PRE                      0x04
+
+EVENT_CAS_COUNT                         0x04 MBOX
+UMASK_CAS_COUNT_RD_REG                  0x01
+UMASK_CAS_COUNT_RD_UNDERFILL            0x02
+UMASK_CAS_COUNT_RD                      0x03
+UMASK_CAS_COUNT_RD_WMM                  0x10
+UMASK_CAS_COUNT_RD_RMM                  0x20
+UMASK_CAS_COUNT_WR_WMM                  0x04
+UMASK_CAS_COUNT_WR_RMM                  0x08
+UMASK_CAS_COUNT_WR                      0x0C
+UMASK_CAS_COUNT_ALL                     0x0F
+
+EVENT_DRAM_PRE_ALL                      0x06 MBOX
+UMASK_DRAM_PRE_ALL                      0x00
+
+EVENT_DRAM_REFRESH                      0x05 MBOX
+UMASK_DRAM_REFRESH_PANIC                0x02
+UMASK_DRAM_REFRESH_HIGH                 0x04
+
+EVENT_ECC_CORRECTABLE_ERRORS            0x09 MBOX
+UMASK_ECC_CORRECTABLE_ERRORS            0x00
+
+EVENT_MAJOR_MODES                       0x07 MBOX
+UMASK_MAJOR_MODES_READ                  0x01
+UMASK_MAJOR_MODES_WRITE                 0x02
+UMASK_MAJOR_MODES_PARTIAL               0x04
+UMASK_MAJOR_MODES_ISOCH                 0x08
+
+EVENT_POWER_CHANNEL_DLLOFF              0x84 MBOX
+UMASK_POWER_CHANNEL_DLLOFF              0x00
+
+EVENT_POWER_CHANNEL_PPD                 0x85 MBOX
+UMASK_POWER_CHANNEL_PPD                 0x00
+
+EVENT_POWER_CKE_CYCLES                  0x83 MBOX
+UMASK_POWER_CKE_CYCLES_RANK0            0x01
+UMASK_POWER_CKE_CYCLES_RANK1            0x02
+UMASK_POWER_CKE_CYCLES_RANK2            0x04
+UMASK_POWER_CKE_CYCLES_RANK3            0x08
+UMASK_POWER_CKE_CYCLES_RANK4            0x10
+UMASK_POWER_CKE_CYCLES_RANK5            0x20
+UMASK_POWER_CKE_CYCLES_RANK6            0x40
+UMASK_POWER_CKE_CYCLES_RANK7            0x80
+
+EVENT_POWER_CRITICAL_THROTTLE_CYCLES    0x86 MBOX
+UMASK_POWER_CRITICAL_THROTTLE_CYCLES    0x00
+
+EVENT_POWER_PCU_THROTTLING              0x42 MBOX
+UMASK_POWER_PCU_THROTTLING              0x00
+
+EVENT_POWER_SELF_REFRESH                0x43 MBOX
+UMASK_POWER_SELF_REFRESH                0x00
+
+EVENT_POWER_THROTTLE_CYCLES             0x41 MBOX
+UMASK_POWER_THROTTLE_CYCLES_RANK0       0x01
+UMASK_POWER_THROTTLE_CYCLES_RANK1       0x02
+UMASK_POWER_THROTTLE_CYCLES_RANK2       0x04
+UMASK_POWER_THROTTLE_CYCLES_RANK3       0x08
+UMASK_POWER_THROTTLE_CYCLES_RANK4       0x10
+UMASK_POWER_THROTTLE_CYCLES_RANK5       0x20
+UMASK_POWER_THROTTLE_CYCLES_RANK6       0x40
+UMASK_POWER_THROTTLE_CYCLES_RANK7       0x80
+
+EVENT_PREEMPTION                        0x08 MBOX
+UMASK_PREEMPTION_RD_PREEMPT_RD          0x01
+UMASK_PREEMPTION_RD_PREEMPT_WR          0x02
+
+EVENT_PRE_COUNT                         0x02 MBOX
+UMASK_PRE_COUNT_PAGE_MISS               0x01
+UMASK_PRE_COUNT_PAGE_CLOSE              0x02
+UMASK_PRE_COUNT_RD                      0x04
+UMASK_PRE_COUNT_WR                      0x08
+UMASK_PRE_COUNT_BYP                     0x10
+
+EVENT_RD_CAS_PRIO                       0xA0 MBOX
+UMASK_RD_CAS_PRIO_LOW                   0x01
+UMASK_RD_CAS_PRIO_MED                   0x02
+UMASK_RD_CAS_PRIO_HIGH                  0x04
+UMASK_RD_CAS_PRIO_PANIC                 0x08
+
+EVENT_RD_CAS_RANK0                      0xB0 MBOX
+UMASK_RD_CAS_RANK0_BANK0                0x00
+UMASK_RD_CAS_RANK0_BANK1                0x01
+UMASK_RD_CAS_RANK0_BANK2                0x02
+UMASK_RD_CAS_RANK0_BANK3                0x03
+UMASK_RD_CAS_RANK0_BANK4                0x04
+UMASK_RD_CAS_RANK0_BANK5                0x05
+UMASK_RD_CAS_RANK0_BANK6                0x06
+UMASK_RD_CAS_RANK0_BANK7                0x07
+UMASK_RD_CAS_RANK0_BANK8                0x08
+UMASK_RD_CAS_RANK0_BANK9                0x09
+UMASK_RD_CAS_RANK0_BANK10               0x0A
+UMASK_RD_CAS_RANK0_BANK11               0x0B
+UMASK_RD_CAS_RANK0_BANK12               0x0C
+UMASK_RD_CAS_RANK0_BANK13               0x0D
+UMASK_RD_CAS_RANK0_BANK14               0x0E
+UMASK_RD_CAS_RANK0_BANK15               0x0F
+UMASK_RD_CAS_RANK0_ALLBANKS             0x10
+UMASK_RD_CAS_RANK0_BANKG0               0x11
+UMASK_RD_CAS_RANK0_BANKG1               0x12
+UMASK_RD_CAS_RANK0_BANKG2               0x13
+UMASK_RD_CAS_RANK0_BANKG3               0x14
+
+EVENT_RD_CAS_RANK1                      0xB1 MBOX
+UMASK_RD_CAS_RANK1_BANK0                0x00
+UMASK_RD_CAS_RANK1_BANK1                0x01
+UMASK_RD_CAS_RANK1_BANK2                0x02
+UMASK_RD_CAS_RANK1_BANK3                0x03
+UMASK_RD_CAS_RANK1_BANK4                0x04
+UMASK_RD_CAS_RANK1_BANK5                0x05
+UMASK_RD_CAS_RANK1_BANK6                0x06
+UMASK_RD_CAS_RANK1_BANK7                0x07
+UMASK_RD_CAS_RANK1_BANK8                0x08
+UMASK_RD_CAS_RANK1_BANK9                0x09
+UMASK_RD_CAS_RANK1_BANK10               0x0A
+UMASK_RD_CAS_RANK1_BANK11               0x0B
+UMASK_RD_CAS_RANK1_BANK12               0x0C
+UMASK_RD_CAS_RANK1_BANK13               0x0D
+UMASK_RD_CAS_RANK1_BANK14               0x0E
+UMASK_RD_CAS_RANK1_BANK15               0x0F
+UMASK_RD_CAS_RANK1_ALLBANKS             0x10
+UMASK_RD_CAS_RANK1_BANKG0               0x11
+UMASK_RD_CAS_RANK1_BANKG1               0x12
+UMASK_RD_CAS_RANK1_BANKG2               0x13
+UMASK_RD_CAS_RANK1_BANKG3               0x14
+
+EVENT_RD_CAS_RANK2                      0xB2 MBOX
+UMASK_RD_CAS_RANK2_BANK0                0x00
+UMASK_RD_CAS_RANK2_BANK1                0x01
+UMASK_RD_CAS_RANK2_BANK2                0x02
+UMASK_RD_CAS_RANK2_BANK3                0x03
+UMASK_RD_CAS_RANK2_BANK4                0x04
+UMASK_RD_CAS_RANK2_BANK5                0x05
+UMASK_RD_CAS_RANK2_BANK6                0x06
+UMASK_RD_CAS_RANK2_BANK7                0x07
+UMASK_RD_CAS_RANK2_BANK8                0x08
+UMASK_RD_CAS_RANK2_BANK9                0x09
+UMASK_RD_CAS_RANK2_BANK10               0x0A
+UMASK_RD_CAS_RANK2_BANK11               0x0B
+UMASK_RD_CAS_RANK2_BANK12               0x0C
+UMASK_RD_CAS_RANK2_BANK13               0x0D
+UMASK_RD_CAS_RANK2_BANK14               0x0E
+UMASK_RD_CAS_RANK2_BANK15               0x0F
+UMASK_RD_CAS_RANK2_ALLBANKS             0x10
+UMASK_RD_CAS_RANK2_BANKG0               0x11
+UMASK_RD_CAS_RANK2_BANKG1               0x12
+UMASK_RD_CAS_RANK2_BANKG2               0x13
+UMASK_RD_CAS_RANK2_BANKG3               0x14
+
+EVENT_RD_CAS_RANK3                      0xB3 MBOX
+UMASK_RD_CAS_RANK3_BANK0                0x00
+UMASK_RD_CAS_RANK3_BANK1                0x01
+UMASK_RD_CAS_RANK3_BANK2                0x02
+UMASK_RD_CAS_RANK3_BANK3                0x03
+UMASK_RD_CAS_RANK3_BANK4                0x04
+UMASK_RD_CAS_RANK3_BANK5                0x05
+UMASK_RD_CAS_RANK3_BANK6                0x06
+UMASK_RD_CAS_RANK3_BANK7                0x07
+UMASK_RD_CAS_RANK3_BANK8                0x08
+UMASK_RD_CAS_RANK3_BANK9                0x09
+UMASK_RD_CAS_RANK3_BANK10               0x0A
+UMASK_RD_CAS_RANK3_BANK11               0x0B
+UMASK_RD_CAS_RANK3_BANK12               0x0C
+UMASK_RD_CAS_RANK3_BANK13               0x0D
+UMASK_RD_CAS_RANK3_BANK14               0x0E
+UMASK_RD_CAS_RANK3_BANK15               0x0F
+UMASK_RD_CAS_RANK3_ALLBANKS             0x10
+UMASK_RD_CAS_RANK3_BANKG0               0x11
+UMASK_RD_CAS_RANK3_BANKG1               0x12
+UMASK_RD_CAS_RANK3_BANKG2               0x13
+UMASK_RD_CAS_RANK3_BANKG3               0x14
+
+EVENT_RD_CAS_RANK4                      0xB4 MBOX
+UMASK_RD_CAS_RANK4_BANK0                0x00
+UMASK_RD_CAS_RANK4_BANK1                0x01
+UMASK_RD_CAS_RANK4_BANK2                0x02
+UMASK_RD_CAS_RANK4_BANK3                0x03
+UMASK_RD_CAS_RANK4_BANK4                0x04
+UMASK_RD_CAS_RANK4_BANK5                0x05
+UMASK_RD_CAS_RANK4_BANK6                0x06
+UMASK_RD_CAS_RANK4_BANK7                0x07
+UMASK_RD_CAS_RANK4_BANK8                0x08
+UMASK_RD_CAS_RANK4_BANK9                0x09
+UMASK_RD_CAS_RANK4_BANK10               0x0A
+UMASK_RD_CAS_RANK4_BANK11               0x0B
+UMASK_RD_CAS_RANK4_BANK12               0x0C
+UMASK_RD_CAS_RANK4_BANK13               0x0D
+UMASK_RD_CAS_RANK4_BANK14               0x0E
+UMASK_RD_CAS_RANK4_BANK15               0x0F
+UMASK_RD_CAS_RANK4_ALLBANKS             0x10
+UMASK_RD_CAS_RANK4_BANKG0               0x11
+UMASK_RD_CAS_RANK4_BANKG1               0x12
+UMASK_RD_CAS_RANK4_BANKG2               0x13
+UMASK_RD_CAS_RANK4_BANKG3               0x14
+
+EVENT_RD_CAS_RANK5                      0xB5 MBOX
+UMASK_RD_CAS_RANK5_BANK0                0x00
+UMASK_RD_CAS_RANK5_BANK1                0x01
+UMASK_RD_CAS_RANK5_BANK2                0x02
+UMASK_RD_CAS_RANK5_BANK3                0x03
+UMASK_RD_CAS_RANK5_BANK4                0x04
+UMASK_RD_CAS_RANK5_BANK5                0x05
+UMASK_RD_CAS_RANK5_BANK6                0x06
+UMASK_RD_CAS_RANK5_BANK7                0x07
+UMASK_RD_CAS_RANK5_BANK8                0x08
+UMASK_RD_CAS_RANK5_BANK9                0x09
+UMASK_RD_CAS_RANK5_BANK10               0x0A
+UMASK_RD_CAS_RANK5_BANK11               0x0B
+UMASK_RD_CAS_RANK5_BANK12               0x0C
+UMASK_RD_CAS_RANK5_BANK13               0x0D
+UMASK_RD_CAS_RANK5_BANK14               0x0E
+UMASK_RD_CAS_RANK5_BANK15               0x0F
+UMASK_RD_CAS_RANK5_ALLBANKS             0x10
+UMASK_RD_CAS_RANK5_BANKG0               0x11
+UMASK_RD_CAS_RANK5_BANKG1               0x12
+UMASK_RD_CAS_RANK5_BANKG2               0x13
+UMASK_RD_CAS_RANK5_BANKG3               0x14
+
+EVENT_RD_CAS_RANK6                      0xB6 MBOX
+UMASK_RD_CAS_RANK6_BANK0                0x00
+UMASK_RD_CAS_RANK6_BANK1                0x01
+UMASK_RD_CAS_RANK6_BANK2                0x02
+UMASK_RD_CAS_RANK6_BANK3                0x03
+UMASK_RD_CAS_RANK6_BANK4                0x04
+UMASK_RD_CAS_RANK6_BANK5                0x05
+UMASK_RD_CAS_RANK6_BANK6                0x06
+UMASK_RD_CAS_RANK6_BANK7                0x07
+UMASK_RD_CAS_RANK6_BANK8                0x08
+UMASK_RD_CAS_RANK6_BANK9                0x09
+UMASK_RD_CAS_RANK6_BANK10               0x0A
+UMASK_RD_CAS_RANK6_BANK11               0x0B
+UMASK_RD_CAS_RANK6_BANK12               0x0C
+UMASK_RD_CAS_RANK6_BANK13               0x0D
+UMASK_RD_CAS_RANK6_BANK14               0x0E
+UMASK_RD_CAS_RANK6_BANK15               0x0F
+UMASK_RD_CAS_RANK6_ALLBANKS             0x10
+UMASK_RD_CAS_RANK6_BANKG0               0x11
+UMASK_RD_CAS_RANK6_BANKG1               0x12
+UMASK_RD_CAS_RANK6_BANKG2               0x13
+UMASK_RD_CAS_RANK6_BANKG3               0x14
+
+EVENT_RD_CAS_RANK7                      0xB7 MBOX
+UMASK_RD_CAS_RANK7_BANK0                0x00
+UMASK_RD_CAS_RANK7_BANK1                0x01
+UMASK_RD_CAS_RANK7_BANK2                0x02
+UMASK_RD_CAS_RANK7_BANK3                0x03
+UMASK_RD_CAS_RANK7_BANK4                0x04
+UMASK_RD_CAS_RANK7_BANK5                0x05
+UMASK_RD_CAS_RANK7_BANK6                0x06
+UMASK_RD_CAS_RANK7_BANK7                0x07
+UMASK_RD_CAS_RANK7_BANK8                0x08
+UMASK_RD_CAS_RANK7_BANK9                0x09
+UMASK_RD_CAS_RANK7_BANK10               0x0A
+UMASK_RD_CAS_RANK7_BANK11               0x0B
+UMASK_RD_CAS_RANK7_BANK12               0x0C
+UMASK_RD_CAS_RANK7_BANK13               0x0D
+UMASK_RD_CAS_RANK7_BANK14               0x0E
+UMASK_RD_CAS_RANK7_BANK15               0x0F
+UMASK_RD_CAS_RANK7_ALLBANKS             0x10
+UMASK_RD_CAS_RANK7_BANKG0               0x11
+UMASK_RD_CAS_RANK7_BANKG1               0x12
+UMASK_RD_CAS_RANK7_BANKG2               0x13
+UMASK_RD_CAS_RANK7_BANKG3               0x14
+
+EVENT_RPQ_CYCLES_NE                     0x11 MBOX
+UMASK_RPQ_CYCLES_NE                     0x00
+
+EVENT_RPQ_INSERTS                       0x10 MBOX
+UMASK_RPQ_INSERTS                       0x00
+
+EVENT_VMSE_MXB_WR_OCCUPANCY             0x91 MBOX
+UMASK_VMSE_MXB_WR_OCCUPANCY             0x00
+
+EVENT_VMSE_WR_PUSH                      0x90 MBOX
+UMASK_VMSE_WR_PUSH_WMM                  0x01
+UMASK_VMSE_WR_PUSH_RMM                  0x02
+
+EVENT_WMM_TO_RMM                        0xC0 MBOX
+UMASK_WMM_TO_RMM_LOW_THRESH             0x01
+UMASK_WMM_TO_RMM_STARVE                 0x02
+UMASK_WMM_TO_RMM_VMSE_RETRY             0x04
+
+# Undocumented event, mentioned in metrics table but not defined
+EVENT_WPQ_INSERTS                       0x20 MBOX
+UMASK_WPQ_INSERTS                       0x00
+
+EVENT_WPQ_CYCLES_FULL                   0x22 MBOX
+UMASK_WPQ_CYCLES_FULL                   0x00
+
+EVENT_WPQ_CYCLES_NE                     0x21 MBOX
+UMASK_WPQ_CYCLES_NE                     0x00
+
+EVENT_WPQ_READ_HIT                      0x23 MBOX
+UMASK_WPQ_READ_HIT                      0x00
+
+EVENT_WPQ_WRITE_HIT                     0x24 MBOX
+UMASK_WPQ_WRITE_HIT                     0x00
+
+EVENT_WRONG_MM                          0xC1 MBOX
+UMASK_WRONG_MM                          0x00
+
+EVENT_WR_CAS_RANK0                      0xB8 MBOX
+UMASK_WR_CAS_RANK0_BANK0                0x00
+UMASK_WR_CAS_RANK0_BANK1                0x01
+UMASK_WR_CAS_RANK0_BANK2                0x02
+UMASK_WR_CAS_RANK0_BANK3                0x03
+UMASK_WR_CAS_RANK0_BANK4                0x04
+UMASK_WR_CAS_RANK0_BANK5                0x05
+UMASK_WR_CAS_RANK0_BANK6                0x06
+UMASK_WR_CAS_RANK0_BANK7                0x07
+UMASK_WR_CAS_RANK0_BANK8                0x08
+UMASK_WR_CAS_RANK0_BANK9                0x09
+UMASK_WR_CAS_RANK0_BANK10               0x0A
+UMASK_WR_CAS_RANK0_BANK11               0x0B
+UMASK_WR_CAS_RANK0_BANK12               0x0C
+UMASK_WR_CAS_RANK0_BANK13               0x0D
+UMASK_WR_CAS_RANK0_BANK14               0x0E
+UMASK_WR_CAS_RANK0_BANK15               0x0F
+UMASK_WR_CAS_RANK0_ALLBANKS             0x10
+UMASK_WR_CAS_RANK0_BANKG0               0x11
+UMASK_WR_CAS_RANK0_BANKG1               0x12
+UMASK_WR_CAS_RANK0_BANKG2               0x13
+UMASK_WR_CAS_RANK0_BANKG3               0x14
+
+EVENT_WR_CAS_RANK1                      0xB9 MBOX
+UMASK_WR_CAS_RANK1_BANK0                0x00
+UMASK_WR_CAS_RANK1_BANK1                0x01
+UMASK_WR_CAS_RANK1_BANK2                0x02
+UMASK_WR_CAS_RANK1_BANK3                0x03
+UMASK_WR_CAS_RANK1_BANK4                0x04
+UMASK_WR_CAS_RANK1_BANK5                0x05
+UMASK_WR_CAS_RANK1_BANK6                0x06
+UMASK_WR_CAS_RANK1_BANK7                0x07
+UMASK_WR_CAS_RANK1_BANK8                0x08
+UMASK_WR_CAS_RANK1_BANK9                0x09
+UMASK_WR_CAS_RANK1_BANK10               0x0A
+UMASK_WR_CAS_RANK1_BANK11               0x0B
+UMASK_WR_CAS_RANK1_BANK12               0x0C
+UMASK_WR_CAS_RANK1_BANK13               0x0D
+UMASK_WR_CAS_RANK1_BANK14               0x0E
+UMASK_WR_CAS_RANK1_BANK15               0x0F
+UMASK_WR_CAS_RANK1_ALLBANKS             0x10
+UMASK_WR_CAS_RANK1_BANKG0               0x11
+UMASK_WR_CAS_RANK1_BANKG1               0x12
+UMASK_WR_CAS_RANK1_BANKG2               0x13
+UMASK_WR_CAS_RANK1_BANKG3               0x14
+
+EVENT_WR_CAS_RANK2                      0xBA MBOX
+UMASK_WR_CAS_RANK2_BANK0                0x00
+UMASK_WR_CAS_RANK2_BANK1                0x01
+UMASK_WR_CAS_RANK2_BANK2                0x02
+UMASK_WR_CAS_RANK2_BANK3                0x03
+UMASK_WR_CAS_RANK2_BANK4                0x04
+UMASK_WR_CAS_RANK2_BANK5                0x05
+UMASK_WR_CAS_RANK2_BANK6                0x06
+UMASK_WR_CAS_RANK2_BANK7                0x07
+UMASK_WR_CAS_RANK2_BANK8                0x08
+UMASK_WR_CAS_RANK2_BANK9                0x09
+UMASK_WR_CAS_RANK2_BANK10               0x0A
+UMASK_WR_CAS_RANK2_BANK11               0x0B
+UMASK_WR_CAS_RANK2_BANK12               0x0C
+UMASK_WR_CAS_RANK2_BANK13               0x0D
+UMASK_WR_CAS_RANK2_BANK14               0x0E
+UMASK_WR_CAS_RANK2_BANK15               0x0F
+UMASK_WR_CAS_RANK2_ALLBANKS             0x10
+UMASK_WR_CAS_RANK2_BANKG0               0x11
+UMASK_WR_CAS_RANK2_BANKG1               0x12
+UMASK_WR_CAS_RANK2_BANKG2               0x13
+UMASK_WR_CAS_RANK2_BANKG3               0x14
+
+EVENT_WR_CAS_RANK3                      0xBB MBOX
+UMASK_WR_CAS_RANK3_BANK0                0x00
+UMASK_WR_CAS_RANK3_BANK1                0x01
+UMASK_WR_CAS_RANK3_BANK2                0x02
+UMASK_WR_CAS_RANK3_BANK3                0x03
+UMASK_WR_CAS_RANK3_BANK4                0x04
+UMASK_WR_CAS_RANK3_BANK5                0x05
+UMASK_WR_CAS_RANK3_BANK6                0x06
+UMASK_WR_CAS_RANK3_BANK7                0x07
+UMASK_WR_CAS_RANK3_BANK8                0x08
+UMASK_WR_CAS_RANK3_BANK9                0x09
+UMASK_WR_CAS_RANK3_BANK10               0x0A
+UMASK_WR_CAS_RANK3_BANK11               0x0B
+UMASK_WR_CAS_RANK3_BANK12               0x0C
+UMASK_WR_CAS_RANK3_BANK13               0x0D
+UMASK_WR_CAS_RANK3_BANK14               0x0E
+UMASK_WR_CAS_RANK3_BANK15               0x0F
+UMASK_WR_CAS_RANK3_ALLBANKS             0x10
+UMASK_WR_CAS_RANK3_BANKG0               0x11
+UMASK_WR_CAS_RANK3_BANKG1               0x12
+UMASK_WR_CAS_RANK3_BANKG2               0x13
+UMASK_WR_CAS_RANK3_BANKG3               0x14
+
+EVENT_WR_CAS_RANK4                      0xBC MBOX
+UMASK_WR_CAS_RANK4_BANK0                0x00
+UMASK_WR_CAS_RANK4_BANK1                0x01
+UMASK_WR_CAS_RANK4_BANK2                0x02
+UMASK_WR_CAS_RANK4_BANK3                0x03
+UMASK_WR_CAS_RANK4_BANK4                0x04
+UMASK_WR_CAS_RANK4_BANK5                0x05
+UMASK_WR_CAS_RANK4_BANK6                0x06
+UMASK_WR_CAS_RANK4_BANK7                0x07
+UMASK_WR_CAS_RANK4_BANK8                0x08
+UMASK_WR_CAS_RANK4_BANK9                0x09
+UMASK_WR_CAS_RANK4_BANK10               0x0A
+UMASK_WR_CAS_RANK4_BANK11               0x0B
+UMASK_WR_CAS_RANK4_BANK12               0x0C
+UMASK_WR_CAS_RANK4_BANK13               0x0D
+UMASK_WR_CAS_RANK4_BANK14               0x0E
+UMASK_WR_CAS_RANK4_BANK15               0x0F
+UMASK_WR_CAS_RANK4_ALLBANKS             0x10
+UMASK_WR_CAS_RANK4_BANKG0               0x11
+UMASK_WR_CAS_RANK4_BANKG1               0x12
+UMASK_WR_CAS_RANK4_BANKG2               0x13
+UMASK_WR_CAS_RANK4_BANKG3               0x14
+
+EVENT_WR_CAS_RANK5                      0xBD MBOX
+UMASK_WR_CAS_RANK5_BANK0                0x00
+UMASK_WR_CAS_RANK5_BANK1                0x01
+UMASK_WR_CAS_RANK5_BANK2                0x02
+UMASK_WR_CAS_RANK5_BANK3                0x03
+UMASK_WR_CAS_RANK5_BANK4                0x04
+UMASK_WR_CAS_RANK5_BANK5                0x05
+UMASK_WR_CAS_RANK5_BANK6                0x06
+UMASK_WR_CAS_RANK5_BANK7                0x07
+UMASK_WR_CAS_RANK5_BANK8                0x08
+UMASK_WR_CAS_RANK5_BANK9                0x09
+UMASK_WR_CAS_RANK5_BANK10               0x0A
+UMASK_WR_CAS_RANK5_BANK11               0x0B
+UMASK_WR_CAS_RANK5_BANK12               0x0C
+UMASK_WR_CAS_RANK5_BANK13               0x0D
+UMASK_WR_CAS_RANK5_BANK14               0x0E
+UMASK_WR_CAS_RANK5_BANK15               0x0F
+UMASK_WR_CAS_RANK5_ALLBANKS             0x10
+UMASK_WR_CAS_RANK5_BANKG0               0x11
+UMASK_WR_CAS_RANK5_BANKG1               0x12
+UMASK_WR_CAS_RANK5_BANKG2               0x13
+UMASK_WR_CAS_RANK5_BANKG3               0x14
+
+EVENT_WR_CAS_RANK6                      0xBE MBOX
+UMASK_WR_CAS_RANK6_BANK0                0x00
+UMASK_WR_CAS_RANK6_BANK1                0x01
+UMASK_WR_CAS_RANK6_BANK2                0x02
+UMASK_WR_CAS_RANK6_BANK3                0x03
+UMASK_WR_CAS_RANK6_BANK4                0x04
+UMASK_WR_CAS_RANK6_BANK5                0x05
+UMASK_WR_CAS_RANK6_BANK6                0x06
+UMASK_WR_CAS_RANK6_BANK7                0x07
+UMASK_WR_CAS_RANK6_BANK8                0x08
+UMASK_WR_CAS_RANK6_BANK9                0x09
+UMASK_WR_CAS_RANK6_BANK10               0x0A
+UMASK_WR_CAS_RANK6_BANK11               0x0B
+UMASK_WR_CAS_RANK6_BANK12               0x0C
+UMASK_WR_CAS_RANK6_BANK13               0x0D
+UMASK_WR_CAS_RANK6_BANK14               0x0E
+UMASK_WR_CAS_RANK6_BANK15               0x0F
+UMASK_WR_CAS_RANK6_ALLBANKS             0x10
+UMASK_WR_CAS_RANK6_BANKG0               0x11
+UMASK_WR_CAS_RANK6_BANKG1               0x12
+UMASK_WR_CAS_RANK6_BANKG2               0x13
+UMASK_WR_CAS_RANK6_BANKG3               0x14
+
+EVENT_WR_CAS_RANK7                      0xBF MBOX
+UMASK_WR_CAS_RANK7_BANK0                0x00
+UMASK_WR_CAS_RANK7_BANK1                0x01
+UMASK_WR_CAS_RANK7_BANK2                0x02
+UMASK_WR_CAS_RANK7_BANK3                0x03
+UMASK_WR_CAS_RANK7_BANK4                0x04
+UMASK_WR_CAS_RANK7_BANK5                0x05
+UMASK_WR_CAS_RANK7_BANK6                0x06
+UMASK_WR_CAS_RANK7_BANK7                0x07
+UMASK_WR_CAS_RANK7_BANK8                0x08
+UMASK_WR_CAS_RANK7_BANK9                0x09
+UMASK_WR_CAS_RANK7_BANK10               0x0A
+UMASK_WR_CAS_RANK7_BANK11               0x0B
+UMASK_WR_CAS_RANK7_BANK12               0x0C
+UMASK_WR_CAS_RANK7_BANK13               0x0D
+UMASK_WR_CAS_RANK7_BANK14               0x0E
+UMASK_WR_CAS_RANK7_BANK15               0x0F
+UMASK_WR_CAS_RANK7_ALLBANKS             0x10
+UMASK_WR_CAS_RANK7_BANKG0               0x11
+UMASK_WR_CAS_RANK7_BANKG1               0x12
+UMASK_WR_CAS_RANK7_BANKG2               0x13
+UMASK_WR_CAS_RANK7_BANKG3               0x14
+
+EVENT_PBOX_CLOCKTICKS                   0x01 PBOX
+UMASK_PBOX_CLOCKTICKS                   0x00
+
+EVENT_IIO_CREDIT                        0x2D PBOX
+UMASK_IIO_CREDIT_PRQ_QPI0               0x01
+UMASK_IIO_CREDIT_PRQ_QPI1               0x02
+UMASK_IIO_CREDIT_ISOCH_QPI0             0x04
+UMASK_IIO_CREDIT_ISOCH_QPI1             0x08
+
+EVENT_RING_AD_USED                      0x07 PBOX
+UMASK_RING_AD_USED_CW_EVEN              0x01
+UMASK_RING_AD_USED_CW_ODD               0x02
+UMASK_RING_AD_USED_CW                   0x03
+UMASK_RING_AD_USED_CCW_EVEN             0x04
+UMASK_RING_AD_USED_CCW_ODD              0x08
+UMASK_RING_AD_USED_CCW                  0x0C
+UMASK_RING_AD_USED_ANY                  0x0F
+
+EVENT_RING_AK_BOUNCES                   0x12 PBOX
+UMASK_RING_AK_BOUNCES_UP                0x01
+UMASK_RING_AK_BOUNCES_DN                0x02
+
+EVENT_RING_AK_USED                      0x08 PBOX
+UMASK_RING_AK_USED_CW_EVEN              0x01
+UMASK_RING_AK_USED_CW_ODD               0x02
+UMASK_RING_AK_USED_CW                   0x03
+UMASK_RING_AK_USED_CCW_EVEN             0x04
+UMASK_RING_AK_USED_CCW_ODD              0x08
+UMASK_RING_AK_USED_CCW                  0x0C
+UMASK_RING_AK_USED_ANY                  0x0F
+
+EVENT_RING_BL_USED                      0x09 PBOX
+UMASK_RING_BL_USED_CW_EVEN              0x01
+UMASK_RING_BL_USED_CW_ODD               0x02
+UMASK_RING_BL_USED_CW                   0x03
+UMASK_RING_BL_USED_CCW_EVEN             0x04
+UMASK_RING_BL_USED_CCW_ODD              0x08
+UMASK_RING_BL_USED_CCW                  0x0C
+UMASK_RING_BL_USED_ANY                  0x0F
+
+EVENT_RING_IV_USED                      0x0A PBOX
+UMASK_RING_IV_USED_CW                   0x03
+UMASK_RING_IV_USED_CCW                  0x0C
+UMASK_RING_IV_USED_ANY                  0x0F
+
+EVENT_RXR_CYCLES_NE                     0x10 PBOX
+UMASK_RXR_CYCLES_NE_NCB                 0x10
+UMASK_RXR_CYCLES_NE_NCS                 0x20
+
+EVENT_RXR_INSERTS                       0x11 PBOX
+UMASK_RXR_INSERTS_NCB                   0x10
+UMASK_RXR_INSERTS_NCS                   0x20
+
+EVENT_RXR_OCCUPANCY                     0x13 PBOX
+UMASK_RXR_OCCUPANCY_DRS                 0x08
+
+EVENT_SBO0_CREDITS_ACQUIRED             0x28 PBOX
+UMASK_SBO0_CREDITS_ACQUIRED_AD          0x01
+UMASK_SBO0_CREDITS_ACQUIRED_BL          0x02
+
+EVENT_STALL_NO_SBO_CREDIT               0x2C PBOX
+UMASK_STALL_NO_SBO_CREDIT_SBO0_AD       0x01
+UMASK_STALL_NO_SBO_CREDIT_SBO1_AD       0x02
+UMASK_STALL_NO_SBO_CREDIT_SBO0_BL       0x04
+UMASK_STALL_NO_SBO_CREDIT_SBO1_BL       0x08
+
+EVENT_TXR_NACK_CW                       0x26 PBOX
+UMASK_TXR_NACK_CW_DN_AD                 0x01
+UMASK_TXR_NACK_CW_DN_BL                 0x02
+UMASK_TXR_NACK_CW_DN_AK                 0x04
+UMASK_TXR_NACK_CW_UP_AD                 0x08
+UMASK_TXR_NACK_CW_UP_BL                 0x10
+UMASK_TXR_NACK_CW_UP_AK                 0x20
+UMASK_TXR_NACK_CW_AD                    0x09
+UMASK_TXR_NACK_CW_BL                    0x12
+UMASK_TXR_NACK_CW_AK                    0x24
+
+EVENT_CACHE_TOTAL_OCCUPANCY             0x12 IBOX
+UMASK_CACHE_TOTAL_OCCUPANCY_ANY         0x01
+UMASK_CACHE_TOTAL_OCCUPANCY_SOURCE      0x02
+
+EVENT_COHERENT_OPS                      0x13 IBOX
+UMASK_COHERENT_OPS_PCIRDCUR             0x01
+UMASK_COHERENT_OPS_CRD                  0x02
+UMASK_COHERENT_OPS_DRD                  0x04
+UMASK_COHERENT_OPS_RFO                  0x08
+UMASK_COHERENT_OPS_PCITOM               0x10
+UMASK_COHERENT_OPS_PCIDCAHINT           0x20
+UMASK_COHERENT_OPS_WBMTOI               0x40
+UMASK_COHERENT_OPS_CLFLUSH              0x80
+
+EVENT_MISC0                             0x14 IBOX
+UMASK_MISC0_FAST_REQ                    0x01
+UMASK_MISC0_FAST_REJ                    0x02
+UMASK_MISC0_2ND_RD_INSERT               0x04
+UMASK_MISC0_2ND_WR_INSERT               0x08
+UMASK_MISC0_2ND_ATOMIC_INSERT           0x10
+UMASK_MISC0_FAST_XFER                   0x20
+UMASK_MISC0_PF_ACK_HINT                 0x40
+UMASK_MISC0_PF_TIMEOUT                  0x80
+
+EVENT_MISC1                             0x15 IBOX
+UMASK_MISC1_SLOW_I                      0x01
+UMASK_MISC1_SLOW_S                      0x02
+UMASK_MISC1_SLOW_E                      0x04
+UMASK_MISC1_SLOW_M                      0x08
+UMASK_MISC1_LOST_FWD                    0x10
+UMASK_MISC1_SEC_RCVD_INVLD              0x20
+UMASK_MISC1_SEC_RCVD_VLD                0x40
+UMASK_MISC1_DATA_THROTTLE               0x80
+
+EVENT_SNOOP_RESP                        0x17 IBOX
+UMASK_SNOOP_RESP_MISS                   0x01
+UMASK_SNOOP_RESP_HIT_I                  0x02
+UMASK_SNOOP_RESP_HIT_ES                 0x04
+UMASK_SNOOP_RESP_HIT_M                  0x08
+UMASK_SNOOP_RESP_SNPCODE                0x10
+UMASK_SNOOP_RESP_SNPDATA                0x20
+UMASK_SNOOP_RESP_SNPINV                 0x40
+
+EVENT_TRANSACTIONS                      0x16 IBOX
+UMASK_TRANSACTIONS_READS                0x01
+UMASK_TRANSACTIONS_WRITES               0x02
+UMASK_TRANSACTIONS_RD_PREF              0x04
+UMASK_TRANSACTIONS_WR_PREF              0x08
+UMASK_TRANSACTIONS_ATOMIC               0x10
+UMASK_TRANSACTIONS_OTHER                0x20
+UMASK_TRANSACTIONS_ORDERINGQ            0x40
+
+EVENT_RXR_AK_INSERTS                    0x0A IBOX
+UMASK_RXR_AK_INSERTS                    0x00
+
+EVENT_RXR_BL_DRS_CYCLES_FULL            0x04 IBOX
+UMASK_RXR_BL_DRS_CYCLES_FULL            0x00
+
+EVENT_RXR_BL_DRS_INSERTS                0x01 IBOX
+UMASK_RXR_BL_DRS_INSERTS                0x00
+
+EVENT_RXR_BL_DRS_OCCUPANCY              0x07 IBOX
+UMASK_RXR_BL_DRS_OCCUPANCY              0x00
+
+EVENT_RXR_BL_NCB_CYCLES_FULL            0x05 IBOX
+UMASK_RXR_BL_NCB_CYCLES_FULL            0x00
+
+EVENT_RXR_BL_NCB_INSERTS                0x02 IBOX
+UMASK_RXR_BL_NCB_INSERTS                0x00
+
+EVENT_RXR_BL_NCB_OCCUPANCY              0x08 IBOX
+UMASK_RXR_BL_NCB_OCCUPANCY              0x00
+
+EVENT_RXR_BL_NCS_CYCLES_FULL            0x06 IBOX
+UMASK_RXR_BL_NCS_CYCLES_FULL            0x00
+
+EVENT_RXR_BL_NCS_INSERTS                0x03 IBOX
+UMASK_RXR_BL_NCS_INSERTS                0x00
+
+EVENT_RXR_BL_NCS_OCCUPANCY              0x09 IBOX
+UMASK_RXR_BL_NCS_OCCUPANCY              0x00
+
+EVENT_TXR_AD_STALL_CREDIT_CYCLES        0x18 IBOX
+UMASK_TXR_AD_STALL_CREDIT_CYCLES        0x00
+
+EVENT_TXR_BL_STALL_CREDIT_CYCLES        0x19 IBOX
+UMASK_TXR_BL_STALL_CREDIT_CYCLES        0x00
+
+EVENT_TXR_DATA_INSERTS_NCB              0x0E IBOX
+UMASK_TXR_DATA_INSERTS_NCB              0x00
+
+EVENT_TXR_DATA_INSERTS_NCS              0x0F IBOX
+UMASK_TXR_DATA_INSERTS_NCS              0x00
+
+EVENT_TXR_REQUEST_OCCUPANCY             0x0D IBOX
+UMASK_TXR_REQUEST_OCCUPANCY             0x00
+
+EVENT_RBOX_CLOCKTICKS                   0x01 RBOX
+UMASK_RBOX_CLOCKTICKS                   0x00
+
+EVENT_C_LO_AD_CREDITS_EMPTY             0x22 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO0        0x01
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO1        0x02
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO2        0x04
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO3        0x08
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO4        0x10
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO5        0x20
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO6        0x40
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO7        0x80
+
+EVENT_C_HI_AD_CREDITS_EMPTY             0x1F RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO8        0x01
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO9        0x02
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO10       0x04
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO11       0x08
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO12       0x10
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO13       0x20
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO14_16    0x40
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO15_17    0x80
+
+EVENT_HA_R2_BL_CREDITS_EMPTY_LO         0x2D RBOX0C0|RBOX1C0
+UMASK_HA_R2_BL_CREDITS_EMPTY_LO_HA0     0x01
+UMASK_HA_R2_BL_CREDITS_EMPTY_LO_HA1     0x02
+UMASK_HA_R2_BL_CREDITS_EMPTY_LO_R2_NCB  0x04
+UMASK_HA_R2_BL_CREDITS_EMPTY_LO_R2_NCS  0x08
+
+EVENT_HA_R2_BL_CREDITS_EMPTY_HI         0x2D RBOX0C1|RBOX1C1
+UMASK_HA_R2_BL_CREDITS_EMPTY_HI_HA0     0x01
+UMASK_HA_R2_BL_CREDITS_EMPTY_HI_HA1     0x02
+UMASK_HA_R2_BL_CREDITS_EMPTY_HI_R2_NCB  0x04
+UMASK_HA_R2_BL_CREDITS_EMPTY_HI_R2_NCS  0x08
+
+EVENT_QPI0_AD_CREDITS_EMPTY             0x20 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI0_AD_CREDITS_EMPTY_VNA         0x01
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_HOM     0x02
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_SNP     0x04
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_NDR     0x08
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_HOM     0x10
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_SNP     0x20
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_NDR     0x40
+
+EVENT_QPI1_AD_CREDITS_EMPTY             0x2E RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI1_AD_CREDITS_EMPTY_VNA         0x01
+UMASK_QPI1_AD_CREDITS_EMPTY_VN0_HOM     0x02
+UMASK_QPI1_AD_CREDITS_EMPTY_VN0_SNP     0x04
+UMASK_QPI1_AD_CREDITS_EMPTY_VN0_NDR     0x08
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_HOM     0x10
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_SNP     0x20
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_NDR     0x40
+
+EVENT_QPI0_BL_CREDITS_EMPTY             0x21 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI0_BL_CREDITS_EMPTY_VNA         0x01
+UMASK_QPI0_BL_CREDITS_EMPTY_VN0_HOM     0x02
+UMASK_QPI0_BL_CREDITS_EMPTY_VN0_SNP     0x04
+UMASK_QPI0_BL_CREDITS_EMPTY_VN0_NDR     0x08
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_HOM     0x10
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_SNP     0x20
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_NDR     0x40
+
+EVENT_QPI1_BL_CREDITS_EMPTY             0x2F RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI1_BL_CREDITS_EMPTY_VNA         0x01
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_HOM     0x02
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_SNP     0x04
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_NDR     0x08
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_HOM     0x10
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_SNP     0x20
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_NDR     0x40
+
+EVENT_RING_AD_USED                      0x07 RBOX
+UMASK_RING_AD_USED_CW_EVEN              0x01
+UMASK_RING_AD_USED_CW_ODD               0x02
+UMASK_RING_AD_USED_CW                   0x03
+UMASK_RING_AD_USED_CCW_EVEN             0x04
+UMASK_RING_AD_USED_CCW_ODD              0x08
+UMASK_RING_AD_USED_CCW                  0x0C
+UMASK_RING_AD_USED_ANY                  0x0F
+
+EVENT_RING_AK_USED                      0x08 RBOX
+UMASK_RING_AK_USED_CW_EVEN              0x01
+UMASK_RING_AK_USED_CW_ODD               0x02
+UMASK_RING_AK_USED_CW                   0x03
+UMASK_RING_AK_USED_CCW_EVEN             0x04
+UMASK_RING_AK_USED_CCW_ODD              0x08
+UMASK_RING_AK_USED_CCW                  0x0C
+UMASK_RING_AK_USED_ANY                  0x0F
+
+EVENT_RING_BL_USED                      0x09 RBOX
+UMASK_RING_BL_USED_CW_EVEN              0x01
+UMASK_RING_BL_USED_CW_ODD               0x02
+UMASK_RING_BL_USED_CW                   0x03
+UMASK_RING_BL_USED_CCW_EVEN             0x04
+UMASK_RING_BL_USED_CCW_ODD              0x08
+UMASK_RING_BL_USED_CCW                  0x0C
+UMASK_RING_BL_USED_ANY                  0x0F
+
+EVENT_RING_IV_USED                      0x0A RBOX
+UMASK_RING_IV_USED_CW                   0x03
+UMASK_RING_IV_USED_CCW                  0x0C
+UMASK_RING_IV_USED_ANY                  0x0F
+
+EVENT_RING_SINK_STARVED                 0x0E RBOX
+UMASK_RING_SINK_STARVED_AK              0x02
+
+EVENT_RXR_CYCLES_NE                     0x10 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_CYCLES_NE_HOM                 0x01
+UMASK_RXR_CYCLES_NE_SNP                 0x02
+UMASK_RXR_CYCLES_NE_NDR                 0x04
+
+EVENT_RXR_CYCLES_NE_VN1                 0x14 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_CYCLES_NE_VN1_HOM             0x01
+UMASK_RXR_CYCLES_NE_VN1_SNP             0x02
+UMASK_RXR_CYCLES_NE_VN1_NDR             0x04
+UMASK_RXR_CYCLES_NE_VN1_DRS             0x08
+UMASK_RXR_CYCLES_NE_VN1_NCB             0x10
+UMASK_RXR_CYCLES_NE_VN1_NCS             0x20
+
+EVENT_RXR_INSERTS                       0x11 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_INSERTS_HOM                   0x01
+UMASK_RXR_INSERTS_SNP                   0x02
+UMASK_RXR_INSERTS_NDR                   0x04
+UMASK_RXR_INSERTS_DRS                   0x08
+UMASK_RXR_INSERTS_NCB                   0x10
+UMASK_RXR_INSERTS_NCS                   0x20
+
+EVENT_RXR_INSERTS_VN1                   0x15 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_INSERTS_VN1_HOM               0x01
+UMASK_RXR_INSERTS_VN1_SNP               0x02
+UMASK_RXR_INSERTS_VN1_NDR               0x04
+UMASK_RXR_INSERTS_VN1_DRS               0x08
+UMASK_RXR_INSERTS_VN1_NCB               0x10
+UMASK_RXR_INSERTS_VN1_NCS               0x20
+
+EVENT_RXR_OCCUPANCY_VN1                 0x13 RBOX0C0|RBOX1C0
+UMASK_RXR_OCCUPANCY_VN1_HOM             0x01
+UMASK_RXR_OCCUPANCY_VN1_SNP             0x02
+UMASK_RXR_OCCUPANCY_VN1_NDR             0x04
+UMASK_RXR_OCCUPANCY_VN1_DRS             0x08
+UMASK_RXR_OCCUPANCY_VN1_NCB             0x10
+UMASK_RXR_OCCUPANCY_VN1_NCS             0x20
+
+EVENT_TXR_NACK                          0x26 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_NACK_DN_AD                    0x01
+UMASK_TXR_NACK_DN_BL                    0x02
+UMASK_TXR_NACK_DN_AK                    0x04
+UMASK_TXR_NACK_UP_AD                    0x08
+UMASK_TXR_NACK_UP_BL                    0x10
+UMASK_TXR_NACK_UP_AK                    0x20
+
+EVENT_SBO0_CREDITS_ACQUIRED             0x28 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_SBO0_CREDITS_ACQUIRED_AD          0x01
+UMASK_SBO0_CREDITS_ACQUIRED_BL          0x02
+
+EVENT_SBO1_CREDITS_ACQUIRED             0x29 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_SBO1_CREDITS_ACQUIRED_AD          0x01
+UMASK_SBO1_CREDITS_ACQUIRED_BL          0x02
+
+EVENT_STALL_NO_SBO_CREDIT               0x2C RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_STALL_NO_SBO_CREDIT_SBO0_AD       0x01
+UMASK_STALL_NO_SBO_CREDIT_SBO1_AD       0x02
+UMASK_STALL_NO_SBO_CREDIT_SBO0_BL       0x04
+UMASK_STALL_NO_SBO_CREDIT_SBO1_BL       0x08
+
+EVENT_VN0_CREDITS_USED                  0x36 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_USED_HOM              0x01
+UMASK_VN0_CREDITS_USED_SNP              0x02
+UMASK_VN0_CREDITS_USED_NDR              0x04
+UMASK_VN0_CREDITS_USED_DRS              0x08
+UMASK_VN0_CREDITS_USED_NCB              0x10
+UMASK_VN0_CREDITS_USED_NCS              0x20
+
+EVENT_VN0_CREDITS_REJECT                0x37 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_REJECT_HOM            0x01
+UMASK_VN0_CREDITS_REJECT_SNP            0x02
+UMASK_VN0_CREDITS_REJECT_NDR            0x04
+UMASK_VN0_CREDITS_REJECT_DRS            0x08
+UMASK_VN0_CREDITS_REJECT_NCB            0x10
+UMASK_VN0_CREDITS_REJECT_NCS            0x20
+
+EVENT_VN1_CREDITS_USED                  0x38 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN1_CREDITS_USED_HOM              0x01
+UMASK_VN1_CREDITS_USED_SNP              0x02
+UMASK_VN1_CREDITS_USED_NDR              0x04
+UMASK_VN1_CREDITS_USED_DRS              0x08
+UMASK_VN1_CREDITS_USED_NCB              0x10
+UMASK_VN1_CREDITS_USED_NCS              0x20
+
+EVENT_VN1_CREDITS_REJECT                0x39 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN1_CREDITS_REJECT_HOM            0x01
+UMASK_VN1_CREDITS_REJECT_SNP            0x02
+UMASK_VN1_CREDITS_REJECT_NDR            0x04
+UMASK_VN1_CREDITS_REJECT_DRS            0x08
+UMASK_VN1_CREDITS_REJECT_NCB            0x10
+UMASK_VN1_CREDITS_REJECT_NCS            0x20
+
+EVENT_VNA_CREDITS_ACQUIRED              0x33 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_ACQUIRED_AD           0x01
+UMASK_VNA_CREDITS_ACQUIRED_BL           0x04
+
+EVENT_VNA_CREDITS_REJECT                0x34 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_REJECT_HOM            0x01
+UMASK_VNA_CREDITS_REJECT_SNP            0x02
+UMASK_VNA_CREDITS_REJECT_NDR            0x04
+UMASK_VNA_CREDITS_REJECT_DRS            0x08
+UMASK_VNA_CREDITS_REJECT_NCB            0x10
+UMASK_VNA_CREDITS_REJECT_NCS            0x20
+
+EVENT_QBOX_CLOCKTICKS                   0x14 QBOX
+UMASK_QBOX_CLOCKTICKS                   0x00
+
+EVENT_CTO_COUNT                         0x38 QBOX
+OPTIONS_CTO_COUNT                       EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MASK2_MASK|EVENT_OPTION_MASK3_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK|EVENT_OPTION_MATCH2_MASK|EVENT_OPTION_MATCH3_MASK
+UMASK_CTO_COUNT                         0x00 0x01 0x00
+
+EVENT_DIRECT2CORE                       0x13 QBOX
+UMASK_DIRECT2CORE_SUCCESS_RBT_HIT       0x01
+UMASK_DIRECT2CORE_FAILURE_CREDITS       0x02
+UMASK_DIRECT2CORE_FAILURE_RBT_HIT       0x04
+UMASK_DIRECT2CORE_FAILURE_CREDITS_RBT   0x08
+UMASK_DIRECT2CORE_FAILURE_MISS          0x10
+UMASK_DIRECT2CORE_FAILURE_CREDITS_MISS  0x20
+UMASK_DIRECT2CORE_FAILURE_RBT_MISS      0x40
+UMASK_DIRECT2CORE_FAILURE_CREDITS_RBT_MISS 0x80
+
+EVENT_L1_POWER_CYCLES                   0x12 QBOX
+UMASK_L1_POWER_CYCLES                   0x00
+
+EVENT_RXL0P_POWER_CYCLES                0x10 QBOX
+UMASK_RXL0P_POWER_CYCLES                0x00
+
+EVENT_RXL0_POWER_CYCLES                 0x0F QBOX
+UMASK_RXL0_POWER_CYCLES                 0x00
+
+EVENT_TXL0P_POWER_CYCLES                0x0D QBOX
+UMASK_TXL0P_POWER_CYCLES                0x00
+
+EVENT_TXL0_POWER_CYCLES                 0x0C QBOX
+UMASK_TXL0_POWER_CYCLES                 0x00
+
+EVENT_RXL_BYPASSED                      0x09 QBOX
+UMASK_RXL_BYPASSED                      0x00
+
+EVENT_TXL_BYPASSED                      0x05 QBOX
+UMASK_TXL_BYPASSED                      0x00
+
+EVENT_RXL_CREDITS_CONSUMED_VN0          0x1E QBOX
+UMASK_RXL_CREDITS_CONSUMED_VN0_DRS      0x01 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCB      0x02 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCS      0x04 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN0_HOM      0x08 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN0_SNP      0x10 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN0_NDR      0x20 0x01 0x00
+
+EVENT_RXL_CREDITS_CONSUMED_VN1          0x39 QBOX
+UMASK_RXL_CREDITS_CONSUMED_VN1_DRS      0x01 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN1_NCB      0x02 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN1_NCS      0x04 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN1_HOM      0x08 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN1_SNP      0x10 0x01 0x00
+UMASK_RXL_CREDITS_CONSUMED_VN1_NDR      0x20 0x01 0x00
+
+EVENT_RXL_CREDITS_CONSUMED_VNA          0x1D QBOX
+UMASK_RXL_CREDITS_CONSUMED_VNA          0x00 0x01 0x00
+
+EVENT_RXL_CYCLES_NE                     0x0A QBOX
+UMASK_RXL_CYCLES_NE                     0x00
+
+EVENT_TXL_CYCLES_NE                     0x06 QBOX
+UMASK_TXL_CYCLES_NE                     0x00
+
+EVENT_RXL_FLITS_G1                      0x02 QBOX
+UMASK_RXL_FLITS_G1_SNP                  0x01 0x01 0x00
+UMASK_RXL_FLITS_G1_HOM_REQ              0x02 0x01 0x00
+UMASK_RXL_FLITS_G1_HOM_NONREQ           0x04 0x01 0x00
+UMASK_RXL_FLITS_G1_HOM                  0x06 0x01 0x00
+UMASK_RXL_FLITS_G1_DRS_DATA             0x08 0x01 0x00
+UMASK_RXL_FLITS_G1_DRS_NONDATA          0x10 0x01 0x00
+UMASK_RXL_FLITS_G1_DRS                  0x18 0x01 0x00
+
+EVENT_RXL_FLITS_G2                      0x03 QBOX
+UMASK_RXL_FLITS_G2_NDR_AD               0x01 0x01 0x00
+UMASK_RXL_FLITS_G2_NDR_AK               0x02 0x01 0x00
+UMASK_RXL_FLITS_G2_NCB_DATA             0x04 0x01 0x00
+UMASK_RXL_FLITS_G2_NCB_NONDATA          0x08 0x01 0x00
+UMASK_RXL_FLITS_G2_NCB                  0x0C 0x01 0x00
+UMASK_RXL_FLITS_G2_NCS                  0x10 0x01 0x00
+
+EVENT_RXL_FLITS_G0                      0x01 QBOX
+UMASK_RXL_FLITS_G0_IDLE                 0x01
+UMASK_RXL_FLITS_G0_DATA                 0x02
+UMASK_RXL_FLITS_G0_NON_DATA             0x04
+
+EVENT_TXL_FLITS_G0                      0x00 QBOX
+UMASK_TXL_FLITS_G0_IDLE                 0x01
+UMASK_TXL_FLITS_G0_DATA                 0x02
+UMASK_TXL_FLITS_G0_NON_DATA             0x04
+
+EVENT_TXL_FLITS_G1                      0x00 QBOX
+UMASK_TXL_FLITS_G1_SNP                  0x01 0x01 0x00
+UMASK_TXL_FLITS_G1_HOM_REQ              0x02 0x01 0x00
+UMASK_TXL_FLITS_G1_HOM_NONREQ           0x04 0x01 0x00
+UMASK_TXL_FLITS_G1_HOM                  0x06 0x01 0x00
+UMASK_TXL_FLITS_G1_DRS_DATA             0x08 0x01 0x00
+UMASK_TXL_FLITS_G1_DRS_NONDATA          0x10 0x01 0x00
+UMASK_TXL_FLITS_G1_DRS                  0x18 0x01 0x00
+
+EVENT_TXL_FLITS_G2                      0x01 QBOX
+UMASK_TXL_FLITS_G2_NDR_AD               0x01 0x01 0x00
+UMASK_TXL_FLITS_G2_NDR_AK               0x02 0x01 0x00
+UMASK_TXL_FLITS_G2_NCB_DATA             0x04 0x01 0x00
+UMASK_TXL_FLITS_G2_NCB_NONDATA          0x08 0x01 0x00
+UMASK_TXL_FLITS_G2_NCB                  0x0C 0x01 0x00
+UMASK_TXL_FLITS_G2_NCS                  0x10 0x01 0x00
+
+EVENT_RXL_INSERTS                       0x08 QBOX
+UMASK_RXL_INSERTS                       0x00
+
+EVENT_TXL_INSERTS                       0x04 QBOX
+UMASK_TXL_INSERTS                       0x00
+
+EVENT_RXL_INSERTS_DRS                   0x09 QBOX
+UMASK_RXL_INSERTS_DRS_VN0               0x01 0x01 0x00
+UMASK_RXL_INSERTS_DRS_VN1               0x02 0x01 0x00
+
+EVENT_RXL_INSERTS_HOM                   0x0C QBOX
+UMASK_RXL_INSERTS_HOM_VN0               0x01 0x01 0x00
+UMASK_RXL_INSERTS_HOM_VN1               0x02 0x01 0x00
+
+EVENT_RXL_INSERTS_NCB                   0x0A QBOX
+UMASK_RXL_INSERTS_NCB_VN0               0x01 0x01 0x00
+UMASK_RXL_INSERTS_NCB_VN1               0x02 0x01 0x00
+
+EVENT_RXL_INSERTS_NCS                   0x0B QBOX
+UMASK_RXL_INSERTS_NCS_VN0               0x01 0x01 0x00
+UMASK_RXL_INSERTS_NCS_VN1               0x02 0x01 0x00
+
+EVENT_RXL_INSERTS_NDR                   0x0E QBOX
+UMASK_RXL_INSERTS_NDR_VN0               0x01 0x01 0x00
+UMASK_RXL_INSERTS_NDR_VN1               0x02 0x01 0x00
+
+EVENT_RXL_INSERTS_SNP                   0x0D QBOX
+UMASK_RXL_INSERTS_SNP_VN0               0x01 0x01 0x00
+UMASK_RXL_INSERTS_SNP_VN1               0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY                     0x0B QBOX
+UMASK_RXL_OCCUPANCY                     0x00
+
+EVENT_TXL_OCCUPANCY                     0x07 QBOX
+UMASK_TXL_OCCUPANCY                     0x00
+
+EVENT_RXL_OCCUPANCY_DRS                 0x15 QBOX
+UMASK_RXL_OCCUPANCY_DRS_VN0             0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_DRS_VN1             0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY_HOM                 0x18 QBOX
+UMASK_RXL_OCCUPANCY_HOM_VN0             0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_HOM_VN1             0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY_NCB                 0x16 QBOX
+UMASK_RXL_OCCUPANCY_NCB_VN0             0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_NCB_VN1             0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY_NCS                 0x17 QBOX
+UMASK_RXL_OCCUPANCY_NCS_VN0             0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_NCS_VN1             0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY_NDR                 0x1A QBOX
+UMASK_RXL_OCCUPANCY_NDR_VN0             0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_NDR_VN1             0x02 0x01 0x00
+
+EVENT_RXL_OCCUPANCY_SNP                 0x19 QBOX
+UMASK_RXL_OCCUPANCY_SNP_VN0             0x01 0x01 0x00
+UMASK_RXL_OCCUPANCY_SNP_VN1             0x02 0x01 0x00
+
+EVENT_TXR_AD_HOM_CREDIT_ACQUIRED        0x26 QBOX
+UMASK_TXR_AD_HOM_CREDIT_ACQUIRED_VN0    0x01 0x01 0x00
+UMASK_TXR_AD_HOM_CREDIT_ACQUIRED_VN1    0x02 0x01 0x00
+
+EVENT_TXR_AD_HOM_CREDIT_OCCUPANCY       0x22 QBOX
+UMASK_TXR_AD_HOM_CREDIT_OCCUPANCY_VN0   0x01 0x01 0x00
+UMASK_TXR_AD_HOM_CREDIT_OCCUPANCY_VN1   0x02 0x01 0x00
+
+EVENT_TXR_AD_NDR_CREDIT_ACQUIRED        0x28 QBOX
+UMASK_TXR_AD_NDR_CREDIT_ACQUIRED_VN0    0x01 0x01 0x00
+UMASK_TXR_AD_NDR_CREDIT_ACQUIRED_VN1    0x02 0x01 0x00
+
+EVENT_TXR_AD_NDR_CREDIT_OCCUPANCY       0x24 QBOX
+UMASK_TXR_AD_NDR_CREDIT_OCCUPANCY_VN0   0x01 0x01 0x00
+UMASK_TXR_AD_NDR_CREDIT_OCCUPANCY_VN1   0x02 0x01 0x00
+
+EVENT_TXR_AD_SNP_CREDIT_ACQUIRED        0x27 QBOX
+UMASK_TXR_AD_SNP_CREDIT_ACQUIRED_VN0    0x01 0x01 0x00
+UMASK_TXR_AD_SNP_CREDIT_ACQUIRED_VN1    0x02 0x01 0x00
+
+EVENT_TXR_AD_SNP_CREDIT_OCCUPANCY       0x23 QBOX
+UMASK_TXR_AD_SNP_CREDIT_OCCUPANCY_VN0   0x01 0x01 0x00
+UMASK_TXR_AD_SNP_CREDIT_OCCUPANCY_VN1   0x02 0x01 0x00
+
+EVENT_TXR_AK_NDR_CREDIT_ACQUIRED        0x29 QBOX
+UMASK_TXR_AK_NDR_CREDIT_ACQUIRED        0x00 0x01 0x00
+
+EVENT_TXR_AK_NDR_CREDIT_OCCUPANCY       0x25 QBOX
+UMASK_TXR_AK_NDR_CREDIT_OCCUPANCY       0x00 0x01 0x00
+
+EVENT_TXR_BL_DRS_CREDIT_ACQUIRED        0x2A QBOX
+UMASK_TXR_BL_DRS_CREDIT_ACQUIRED_VN0    0x01 0x01 0x00
+UMASK_TXR_BL_DRS_CREDIT_ACQUIRED_VN1    0x02 0x01 0x00
+UMASK_TXR_BL_DRS_CREDIT_ACQUIRED_VN_SHR 0x04 0x01 0x00
+
+EVENT_TXR_BL_DRS_CREDIT_OCCUPANCY       0x1F QBOX
+UMASK_TXR_BL_DRS_CREDIT_OCCUPANCY_VN0   0x01 0x01 0x00
+UMASK_TXR_BL_DRS_CREDIT_OCCUPANCY_VN1   0x02 0x01 0x00
+UMASK_TXR_BL_DRS_CREDIT_OCCUPANCY_VN_SHR 0x04 0x01 0x00
+
+EVENT_TXR_BL_NCB_CREDIT_ACQUIRED        0x2B QBOX
+UMASK_TXR_BL_NCB_CREDIT_ACQUIRED_VN0    0x01 0x01 0x00
+UMASK_TXR_BL_NCB_CREDIT_ACQUIRED_VN1    0x02 0x01 0x00
+
+EVENT_TXR_BL_NCB_CREDIT_OCCUPANCY       0x20 QBOX
+UMASK_TXR_BL_NCB_CREDIT_OCCUPANCY_VN0   0x01 0x01 0x00
+UMASK_TXR_BL_NCB_CREDIT_OCCUPANCY_VN1   0x02 0x01 0x00
+
+EVENT_TXR_BL_NCS_CREDIT_ACQUIRED        0x2C QBOX
+UMASK_TXR_BL_NCS_CREDIT_ACQUIRED_VN0    0x01 0x01 0x00
+UMASK_TXR_BL_NCS_CREDIT_ACQUIRED_VN1    0x02 0x01 0x00
+
+EVENT_TXR_BL_NCS_CREDIT_OCCUPANCY       0x21 QBOX
+UMASK_TXR_BL_NCS_CREDIT_OCCUPANCY_VN0   0x01 0x01 0x00
+UMASK_TXR_BL_NCS_CREDIT_OCCUPANCY_VN1   0x02 0x01 0x00
+
+EVENT_VNA_CREDIT_RETURNS                0x1C QBOX
+UMASK_VNA_CREDIT_RETURNS                0x00 0x01 0x00
+
+EVENT_VNA_CREDIT_RETURN_OCCUPANCY       0x1B QBOX
+UMASK_VNA_CREDIT_RETURN_OCCUPANCY       0x00 0x01 0x00
+
+EVENT_QPI_RATE                          0x00 QBOX0FIX0|QBOX1FIX0
+UMASK_QPI_RATE                          0x00
+
+EVENT_QPI_RX_IDLE                       0x01 QBOX0FIX1|QBOX1FIX1
+UMASK_QPI_RX_IDLE                       0x00
+
+EVENT_QPI_RX_LLR                        0x02 QBOX0FIX2|QBOX1FIX2
+UMASK_QPI_RX_LLR                        0x00
diff --git a/src/includes/perfmon_haswell_counters.h b/src/includes/perfmon_haswell_counters.h
index 3dc7247..4964994 100644
--- a/src/includes/perfmon_haswell_counters.h
+++ b/src/includes/perfmon_haswell_counters.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_haswell_counters.h
  *
- *      Description:  Counter Header File of perfmon module for Haswell.
+ *      Description:  Counter Header File of perfmon module for Intel Haswell.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -28,26 +29,56 @@
  * =======================================================================================
  */
 
-#define NUM_COUNTERS_HASWELL 12
-#define NUM_COUNTERS_UNCORE_HASWELL 4
+#define NUM_COUNTERS_HASWELL 23
 #define NUM_COUNTERS_CORE_HASWELL 8
+#define NUM_COUNTERS_UNCORE_HASWELL 15
 
-static PerfmonCounterMap haswell_counter_map[NUM_COUNTERS_HASWELL] = {
+#define HAS_VALID_OPTIONS_FIXED (EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK) /* option mask of the FIXC0-FIXC2 entries; parenthesized so it expands as one operand */
+#define HAS_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK| \
+            EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_IN_TRANS_MASK|EVENT_OPTION_THRESHOLD_MASK) /* option mask of the PMC0-PMC3 entries; PMC2 additionally ORs in EVENT_OPTION_IN_TRANS_ABORT_MASK */
+#define HAS_VALID_OPTIONS_CBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK) /* option mask of the CBOX0C0-CBOX3C1 entries */
+#define HAS_VALID_OPTIONS_UBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK) /* option mask of the UBOX0/UBOX1 entries */
+
+static RegisterMap haswell_counter_map[NUM_COUNTERS_HASWELL] = {
     /* Fixed Counters: instructions retired, cycles unhalted core */
-    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
-    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
-    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, HAS_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, HAS_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, HAS_VALID_OPTIONS_FIXED},
     /* PMC Counters: 4 48bit wide */
-    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
-    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0},
-    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0},
-    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0},
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, HAS_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, HAS_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, HAS_VALID_OPTIONS_PMC|EVENT_OPTION_IN_TRANS_ABORT_MASK},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, HAS_VALID_OPTIONS_PMC},
     /* Temperature Sensor*/
-    {"TMP0", PMC7, THERMAL, 0, 0, 0, 0},
+    {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
     /* RAPL counters */
-    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
-    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0},
-    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS,  0, 0},
-    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS,  0, 0},
+    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX0C0", PMC12, CBOX0, MSR_UNC_CBO_0_PERFEVTSEL0, MSR_UNC_CBO_0_CTR0, 0, 0, HAS_VALID_OPTIONS_CBOX},
+    {"CBOX0C1", PMC13, CBOX0, MSR_UNC_CBO_0_PERFEVTSEL1, MSR_UNC_CBO_0_CTR1, 0, 0, HAS_VALID_OPTIONS_CBOX},
+    {"CBOX1C0", PMC14, CBOX1, MSR_UNC_CBO_1_PERFEVTSEL0, MSR_UNC_CBO_1_CTR0, 0, 0, HAS_VALID_OPTIONS_CBOX},
+    {"CBOX1C1", PMC15, CBOX1, MSR_UNC_CBO_1_PERFEVTSEL1, MSR_UNC_CBO_1_CTR1, 0, 0, HAS_VALID_OPTIONS_CBOX},
+    {"CBOX2C0", PMC16, CBOX2, MSR_UNC_CBO_2_PERFEVTSEL0, MSR_UNC_CBO_2_CTR0, 0, 0, HAS_VALID_OPTIONS_CBOX},
+    {"CBOX2C1", PMC17, CBOX2, MSR_UNC_CBO_2_PERFEVTSEL1, MSR_UNC_CBO_2_CTR1, 0, 0, HAS_VALID_OPTIONS_CBOX},
+    {"CBOX3C0", PMC18, CBOX3, MSR_UNC_CBO_3_PERFEVTSEL0, MSR_UNC_CBO_3_CTR0, 0, 0, HAS_VALID_OPTIONS_CBOX},
+    {"CBOX3C1", PMC19, CBOX3, MSR_UNC_CBO_3_PERFEVTSEL1, MSR_UNC_CBO_3_CTR1, 0, 0, HAS_VALID_OPTIONS_CBOX},
+    {"UBOX0", PMC20, UBOX, MSR_UNC_ARB_PERFEVTSEL0, MSR_UNC_ARB_CTR0, 0, 0, HAS_VALID_OPTIONS_UBOX},
+    {"UBOX1", PMC21, UBOX, MSR_UNC_ARB_PERFEVTSEL1, MSR_UNC_ARB_CTR1, 0, 0, HAS_VALID_OPTIONS_UBOX},
+    {"UBOXFIX", PMC22, UBOXFIX, MSR_UNC_PERF_FIXED_CTRL, MSR_UNC_PERF_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
 };
 
+
+static BoxMap haswell_box_map[NUM_UNITS] = {
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+    [THERMAL] = {0, 0, 0, 0, 0, 0, 8},
+    [FIXED] =  {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+    [POWER] = {0, 0, 0, 0, 0, 0, 32},
+    [CBOX0] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+    [CBOX1] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+    [CBOX2] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+    [CBOX3] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+    [UBOX] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 1, 0, MSR_DEV, 44},
+    [UBOXFIX] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 0, 0, MSR_DEV, 44},
+};
diff --git a/src/includes/perfmon_haswell_events.txt b/src/includes/perfmon_haswell_events.txt
index f958a3a..bc5a37d 100644
--- a/src/includes/perfmon_haswell_events.txt
+++ b/src/includes/perfmon_haswell_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_haswell_events.txt
-# 
-#      Description:  Event list for Intel Ivy Bridge
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Description:  Event list for Intel Haswell
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -46,6 +47,8 @@ UMASK_INSTR_RETIRED_ANY          0x00
 
 EVENT_CPU_CLK_UNHALTED           0x00   FIXC1
 UMASK_CPU_CLK_UNHALTED_CORE      0x00
+DEFAULT_OPTIONS_CPU_CLK_UNHALTED_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_CPU_CLK_UNHALTED_ANY       0x00
 
 EVENT_CPU_CLK_UNHALTED           0x00   FIXC2
 UMASK_CPU_CLK_UNHALTED_REF       0x00
@@ -54,15 +57,15 @@ EVENT_LD_BLOCKS                 0x03  PMC
 UMASK_LD_BLOCKS_STORE_FORWARD   0x02
 UMASK_LD_BLOCKS_NO_SR           0x08
 
-EVENT_MISALIGN_MEM_REF           0x05  PMC
+EVENT_MISALIGN_MEM_REF            0x05  PMC
 UMASK_MISALIGN_MEM_REF_LOADS      0x01
 UMASK_MISALIGN_MEM_REF_STORES     0x02
 UMASK_MISALIGN_MEM_REF_ANY        0x03
 
-EVENT_LD_BLOCKS_PARTIAL      0x07  PMC
+EVENT_LD_BLOCKS_PARTIAL                 0x07  PMC
 UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS   0x01
 
-EVENT_DTLB_LOAD_MISSES                 0x08  PMC
+EVENT_DTLB_LOAD_MISSES                       0x08  PMC
 UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK         0x01
 UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED_4K     0x02
 UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED_LARGE  0x04
@@ -73,125 +76,211 @@ UMASK_DTLB_LOAD_MISSES_STLB_HIT_2M           0x40
 UMASK_DTLB_LOAD_MISSES_STLB_HIT              0x60
 UMASK_DTLB_LOAD_MISSES_PDE_CACHE_MISS        0x80
 
-EVENT_INT_MISC            0x0D  PMC
-UMASK_INT_MISC_RECOVERY_CYCLES  0x03 0x01
+EVENT_INT_MISC                  0x0D  PMC
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_INT_MISC_RECOVERY_CYCLES  0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=0x1
+UMASK_INT_MISC_RECOVERY_COUNT  0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES_ANY EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_INT_MISC_RECOVERY_CYCLES_ANY  0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT_ANY EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_EDGE=0x1
+UMASK_INT_MISC_RECOVERY_COUNT_ANY  0x03
 
 EVENT_UOPS_ISSUED                0x0E  PMC
 UMASK_UOPS_ISSUED_ANY            0x01
 UMASK_UOPS_ISSUED_FLAGS_MERGE    0x10
 UMASK_UOPS_ISSUED_SLOW_LEA       0x20
 UMASK_UOPS_ISSUED_SINGLE_MUL     0x40
-
-EVENT_L2_RQSTS                   0x24   PMC
-UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD_MISS 0x21
-UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD_HIT 0x41
-UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD     0xE1
-UMASK_L2_RQSTS_RFO_HIT           0x42
-UMASK_L2_RQSTS_RFO_MISS          0x22
-UMASK_L2_RQSTS_ALL_RFO           0xE2
-UMASK_L2_RQSTS_CODE_RD_HIT        0x44
-UMASK_L2_RQSTS_CODE_RD_MISS       0x24
-UMASK_L2_RQSTS_ALL_DEMAND_MISS   0x27
-UMASK_L2_RQSTS_ALL_DEMAND_REFERENCES   0xE7
-UMASK_L2_RQSTS_ALL_CODE_RD   0xE4
-UMASK_L2_RQSTS_L2_PF_HIT      0x50
-UMASK_L2_RQSTS_L2_PF_MISS     0x30
-UMASK_L2_RQSTS_ALL_PF        0xF8
-UMASK_L2_RQSTS_MISS              0x3F
-UMASK_L2_RQSTS_REFERENCES        0xFF
+DEFAULT_OPTIONS_UOPS_ISSUED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_USED_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_USED_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_STALL_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_TOTAL_CYCLES   0x01
+
+EVENT_ARITH_DIVIDER_UOPS            0x14 PMC
+UMASK_ARITH_DIVIDER_CYCLES          0x01
+UMASK_ARITH_DIVIDER_UOPS            0x02
+
+EVENT_L2_RQSTS                          0x24   PMC
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD_MISS  0x21
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD_HIT   0x41
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD       0xE1
+UMASK_L2_RQSTS_RFO_HIT                  0x42
+UMASK_L2_RQSTS_RFO_MISS                 0x22
+UMASK_L2_RQSTS_ALL_RFO                  0xE2
+UMASK_L2_RQSTS_CODE_RD_HIT              0x44
+UMASK_L2_RQSTS_CODE_RD_MISS             0x24
+UMASK_L2_RQSTS_ALL_DEMAND_MISS          0x27
+UMASK_L2_RQSTS_ALL_DEMAND_REFERENCES    0xE7
+UMASK_L2_RQSTS_ALL_CODE_RD              0xE4
+UMASK_L2_RQSTS_L2_PF_HIT                0x50
+UMASK_L2_RQSTS_L2_PF_MISS               0x30
+UMASK_L2_RQSTS_ALL_PF                   0xF8
+UMASK_L2_RQSTS_MISS                     0x3F
+UMASK_L2_RQSTS_REFERENCES               0xFF
 
 EVENT_L2_DEMAND_RQST_WB_HIT            0x27   PMC
-UMASK_L2_DEMAND_RQST_WB_HIT       0x50
+UMASK_L2_DEMAND_RQST_WB_HIT            0x50
 
-EVENT_LONGEST_LAT_CACHE_REFERENCE               0x2E   PMC
+EVENT_LONGEST_LAT_CACHE               0x2E   PMC
 UMASK_LONGEST_LAT_CACHE_REFERENCE     0x4F
 UMASK_LONGEST_LAT_CACHE_MISS          0x41
 
 EVENT_CPU_CLOCK_UNHALTED         0x3C   PMC
 UMASK_CPU_CLOCK_UNHALTED_THREAD_P  0x00
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_THREAD_P_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P_ANY  0x00
 UMASK_CPU_CLOCK_UNHALTED_REF_XCLK     0x01
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_REF_XCLK_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_CPU_CLOCK_UNHALTED_REF_XCLK_ANY     0x01
+UMASK_CPU_CLOCK_THREAD_UNHALTED_ONE_THREAD_ACTIVE 0x02
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x2,EVENT_OPTION_INVERT=0x1
+UMASK_CPU_CLOCK_UNHALTED_TOTAL_CYCLES   0x00
 
-EVENT_L1D_PEND_MISS              0x48   PMC1
+EVENT_L1D_PEND_MISS              0x48   PMC2
 UMASK_L1D_PEND_MISS_PENDING      0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_PENDING_CYCLES_ANY EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_THRESHOLD=0x1
+UMASK_L1D_PEND_MISS_PENDING_CYCLES_ANY 0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_PENDING_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_L1D_PEND_MISS_PENDING_CYCLES 0x01
 
-EVENT_DTLB_STORE_MISSES                0x49   PMC
-UMASK_DTLB_STORE_MISSES_MISS_CAUSES_A_WALK   0x01
+EVENT_L1D_PEND_MISS_REQUEST_FB_FULL 0x48 PMC
+UMASK_L1D_PEND_MISS_REQUEST_FB_FULL 0x02
+DEFAULT_OPTIONS_L1D_PEND_MISS_FB_FULL EVENT_OPTION_THRESHOLD=0x1
+UMASK_L1D_PEND_MISS_FB_FULL         0x02
+
+
+EVENT_DTLB_STORE_MISSES                         0x49   PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK           0x01
 UMASK_DTLB_STORE_MISSES_WALK_COMPLETED_4K       0x02
 UMASK_DTLB_STORE_MISSES_WALK_COMPLETED_LARGE    0x04
 UMASK_DTLB_STORE_MISSES_WALK_COMPLETED          0x0E
-UMASK_DTLB_STORE_MISSES_WALK_DURATION       0x10
+UMASK_DTLB_STORE_MISSES_WALK_DURATION           0x10
 UMASK_DTLB_STORE_MISSES_STLB_HIT_4K             0x20
-UMASK_DTLB_STORE_MISSES_STLB_HIT_LARGE             0x40
-UMASK_DTLB_STORE_MISSES_STLB_HIT              0x60
-UMASK_DTLB_STORE_MISSES_PDE_CACHE_MISS              0x80
+UMASK_DTLB_STORE_MISSES_STLB_HIT_2M             0x40
+UMASK_DTLB_STORE_MISSES_STLB_HIT                0x60
+UMASK_DTLB_STORE_MISSES_PDE_CACHE_MISS          0x80
 
-EVENT_LOAD_HIT_PRE               0x4C    PMC
+EVENT_LOAD_HIT_PRE                     0x4C    PMC
 UMASK_LOAD_HIT_PRE_SW_PF               0x01
 UMASK_LOAD_HIT_PRE_HW_PF               0x02
 
-EVENT_L1D                        0x51   PMC
+EVENT_EPT_WALK_CYCLES            0x4F PMC
+UMASK_EPT_WALK_CYCLES            0x10
+
+EVENT_L1D                         0x51   PMC
 UMASK_L1D_REPLACEMENT             0x01
 UMASK_L1D_ALLOCATED_IN_M          0x02
 UMASK_L1D_M_EVICT                 0x04
 UMASK_L1D_ALL_M_REPLACEMENT       0x08
 
+EVENT_TX_MEM                                        0x54 PMC
+UMASK_TX_MEM_ABORT_CONFLICT                         0x01
+UMASK_TX_MEM_ABORT_CAPACITY_WRITE                   0x02
+UMASK_TX_MEM_ABORT_HLE_STORE_TO_ELIDED_LOCK         0x04
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_NOT_EMPTY     0x08
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_MISMATCH      0x10
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_UNSUPPORTED_ALIGNMENT 0x20
+UMASK_TX_MEM_HLE_ELISION_BUFFER_FULL                0x40
+
 EVENT_MOVE_ELIMINATION                        0x58   PMC
 UMASK_MOVE_ELIMINATION_INT_NOT_ELIMINATED     0x04
 UMASK_MOVE_ELIMINATION_SIMD_NOT_ELIMINATED    0x08
 UMASK_MOVE_ELIMINATION_INT_ELIMINATED         0x01
 UMASK_MOVE_ELIMINATION_SIMD_ELIMINATED        0x02
 
-EVENT_CPL_CYCLES               0x5C    PMC
+EVENT_CPL_CYCLES                   0x5C    PMC
 UMASK_CPL_CYCLES_RING0             0x01
-UMASK_CPL_CYCLES_RING123             0x02
-
-EVENT_RS_EVENTS               0x5E    PMC
+UMASK_CPL_CYCLES_RING123           0x02
+DEFAULT_OPTIONS_CPL_CYCLES_RING0_TRANS EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_CPL_CYCLES_RING0_TRANS       0x01
+
+EVENT_TX_EXEC                       0x5D PMC
+UMASK_TX_EXEC_MISC1                 0x01
+UMASK_TX_EXEC_MISC2                 0x02
+UMASK_TX_EXEC_MISC3                 0x04
+UMASK_TX_EXEC_MISC4                 0x08
+UMASK_TX_EXEC_MISC5                 0x10
+
+EVENT_RS_EVENTS                 0x5E    PMC
 UMASK_RS_EVENTS_EMPTY_CYCLES    0x01
+DEFAULT_OPTIONS_RS_EVENTS_EMPTY_END EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_EDGE=1
+UMASK_RS_EVENTS_EMPTY_END       0x01
 
-EVENT_OFFCORE_REQUESTS_OUTSTANDING          0x60   PMC
+# Erratum HSW62: May be unreliable in SMT mode
+EVENT_OFFCORE_REQUESTS_OUTSTANDING                  0x60   PMC
 UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD   0x01
 UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD   0x02
-UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO   0x04
-UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD   0x08
-
-EVENT_CACHE_LOCK_CYCLES          0x63   PMC
-UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION      0x01
-UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_DURATION       0x02
-
-EVENT_IDQ               0x79   PMC
-UMASK_IDQ_EMPTY         0x02
-UMASK_IDQ_MITE_UOPS     0x04
-UMASK_IDQ_MITE_UOPS_CYCLES  0x04 0x00 0x01
-UMASK_IDQ_DSB_UOPS      0x08
-UMASK_IDQ_DSB_UOPS_CYCLES  0x08 0x00 0x01
-UMASK_IDQ_MS_DSB_UOPS   0x10
-UMASK_IDQ_MS_DSB_UOPS_CYCLES  0x10 0x00 0x01
-UMASK_IDQ_MS_MITE_UOPS  0x20
-UMASK_IDQ_MS_MITE_UOPS_CYCLES  0x20 0x00 0x01
-UMASK_IDQ_MS_UOPS       0x30
-UMASK_IDQ_MS_UOPS_CYCLES  0x30 0x00 0x01
-UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS      0x18 0x00 0x01
-UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS      0x18 0x00 0x01
-UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS        0x18 0x00 0x04
-UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS       0x24 0x00 0x01
-UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS       0x24 0x00 0x04
-UMASK_IDQ_MITE_ALL_UOPS       0x3C
-
-EVENT_ICACHE                  0x80   PMC
-UMASK_ICACHE_HITS             0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO       0x04
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD      0x08
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DEMAND_DATA_RD EVENT_OPTION_THRESHOLD=0x1
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DEMAND_DATA_RD 0x01
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DATA_RD EVENT_OPTION_THRESHOLD=0x1
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DATA_RD 0x08
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD_C6 EVENT_OPTION_THRESHOLD=0x6
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD_C6 0x01
+
+EVENT_LOCK_CYCLES                               0x63   PMC
+UMASK_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION   0x01
+DEFAULT_OPTIONS_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_COUNT EVENT_OPTION_EDGE=0x1
+UMASK_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_COUNT   0x01
+UMASK_LOCK_CYCLES_CACHE_LOCK_DURATION           0x02
+DEFAULT_OPTIONS_LOCK_CYCLES_CACHE_LOCK_COUNT EVENT_OPTION_EDGE=0x1
+UMASK_LOCK_CYCLES_CACHE_LOCK_COUNT           0x02
+
+EVENT_IDQ                               0x79   PMC
+UMASK_IDQ_EMPTY                         0x02
+UMASK_IDQ_MITE_UOPS                     0x04
+DEFAULT_OPTIONS_IDQ_MITE_CYCLES         EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MITE_CYCLES                   0x04
+UMASK_IDQ_DSB_UOPS                      0x08
+DEFAULT_OPTIONS_IDQ_DSB_CYCLES          EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_DSB_CYCLES                    0x08
+UMASK_IDQ_MS_DSB_UOPS                   0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_CYCLES       EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MS_DSB_CYCLES                 0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_OCCUR        EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_IDQ_MS_DSB_OCCUR                  0x10
+UMASK_IDQ_MS_MITE_UOPS                  0x20
+DEFAULT_OPTIONS_IDQ_MS_MITE_CYCLES      EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MS_MITE_CYCLES                0x20
+UMASK_IDQ_MS_UOPS                       0x30
+DEFAULT_OPTIONS_IDQ_MS_CYCLES           EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MS_CYCLES                     0x30
+DEFAULT_OPTIONS_IDQ_MS_SWITCHES         EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_IDQ_MS_SWITCHES                   0x30
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS       0x18
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS         0x18
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_ANY_UOPS  EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS      0x24
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS        0x24
+UMASK_IDQ_MITE_ALL_UOPS                 0x3C
+
+EVENT_ICACHE                    0x80   PMC
+UMASK_ICACHE_HIT                0x01
 UMASK_ICACHE_MISSES             0x02
 UMASK_ICACHE_ACCESSES           0x03
 UMASK_ICACHE_IFETCH_STALL       0x04
 
-EVENT_ITLB_MISSES                 0x85      PMC
-UMASK_ITLB_MISSES_CAUSES_A_WALK   0x01
-UMASK_ITLB_MISSES_WALK_COMPLETED_4K  0x02
+EVENT_ITLB_MISSES                       0x85      PMC
+UMASK_ITLB_MISSES_CAUSES_A_WALK         0x01
+UMASK_ITLB_MISSES_WALK_COMPLETED_4K     0x02
 UMASK_ITLB_MISSES_WALK_COMPLETED_LARGE  0x04
-UMASK_ITLB_MISSES_WALK_COMPLETED     0x0E
-UMASK_ITLB_MISSES_WALK_DURATION   0x10
-UMASK_ITLB_MISSES_STLB_HIT_4K   0x20
-UMASK_ITLB_MISSES_STLB_HIT_2M   0x40
-UMASK_ITLB_MISSES_STLB_HIT   0x60
+UMASK_ITLB_MISSES_WALK_COMPLETED        0x0E
+UMASK_ITLB_MISSES_WALK_DURATION         0x10
+UMASK_ITLB_MISSES_STLB_HIT_4K           0x20
+UMASK_ITLB_MISSES_STLB_HIT_2M           0x40
+UMASK_ITLB_MISSES_STLB_HIT              0x60
 
 EVENT_ILD_STALL                 0x87      PMC
 UMASK_ILD_STALL_LCP             0x01
@@ -201,25 +290,51 @@ EVENT_BR_INST_EXEC                                      0x88   PMC
 UMASK_BR_INST_EXEC_COND_TAKEN                           0x81
 UMASK_BR_INST_EXEC_COND_NON_TAKEN                       0x41
 UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN                     0x82
+UMASK_BR_INST_EXEC_DIRECT_JMP_NON_TAKEN                 0x42
 UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN  0x44
 UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN                    0x88
+UMASK_BR_INST_EXEC_RETURN_NEAR_NON_TAKEN                0x48
 UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_NON_TAKEN           0x50
 UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN         0x60
+UMASK_BR_INST_EXEC_ALL_CONDITIONAL                      0xC1
+UMASK_BR_INST_EXEC_ALL_DIRECT_JMP                       0xC2
+UMASK_BR_INST_EXEC_ALL_DIRECT_NEAR_CALL                 0xD0
+UMASK_BR_INST_EXEC_ALL_INDIRECT_JUMP_NON_CALL_RET       0xC4
+UMASK_BR_INST_EXEC_ALL_INDIRECT_NEAR_RETURN             0xC8
 UMASK_BR_INST_EXEC_ALL_BRANCHES                         0xFF
 
 EVENT_BR_MISP_EXEC                                      0x89   PMC
 UMASK_BR_MISP_EXEC_COND_TAKEN                           0x81
 UMASK_BR_MISP_EXEC_COND_NON_TAKEN                       0x41
 UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN  0x44
 UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN                    0x88
+UMASK_BR_MISP_EXEC_RETURN_NEAR_NON_TAKEN                0x48
 UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_NON_TAKEN           0x50
 UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN         0x60
+UMASK_BR_MISP_EXEC_ALL_CONDITIONAL                      0xC1
+UMASK_BR_MISP_EXEC_ALL_INDIRECT_JUMP_NON_CALL_RET       0xC4
 UMASK_BR_MISP_EXEC_ALL_BRANCHES                         0xFF
 
 EVENT_IDQ_UOPS_NOT_DELIVERED                    0x9C   PMC
 UMASK_IDQ_UOPS_NOT_DELIVERED_CORE               0x01
-
-EVENT_UOPS_EXECUTED_PORT                 0xA1   PMC
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x3
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x2
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK 0x01
+
+EVENT_UOPS_EXECUTED_PORT                  0xA1   PMC
 UMASK_UOPS_EXECUTED_PORT_PORT_0           0x01
 UMASK_UOPS_EXECUTED_PORT_PORT_1           0x02
 UMASK_UOPS_EXECUTED_PORT_PORT_2           0x04
@@ -228,44 +343,111 @@ UMASK_UOPS_EXECUTED_PORT_PORT_4           0x10
 UMASK_UOPS_EXECUTED_PORT_PORT_5           0x20
 UMASK_UOPS_EXECUTED_PORT_PORT_6           0x40
 UMASK_UOPS_EXECUTED_PORT_PORT_7           0x80
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_0_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_0_CORE      0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_1_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_1_CORE      0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_2_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_2_CORE      0x04
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_3_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_3_CORE      0x08
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_4_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_4_CORE      0x10
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_5_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_5_CORE      0x20
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_6_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_6_CORE      0x40
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_PORT_7_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_PORT_7_CORE      0x80
+UMASK_UOPS_EXECUTED_PORT_ARITH_PORTS      0x63
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_ARITH_PORTS_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_ARITH_PORTS_CORE 0x63
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_DATA_PORTS    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_PORT_DATA_PORTS       0x9C
 
 EVENT_RESOURCE_STALLS                 0xA2   PMC
 UMASK_RESOURCE_STALLS_ANY             0x01
 UMASK_RESOURCE_STALLS_RS              0x04
-UMASK_RESOURCE_STALLS_SB               0x08
+UMASK_RESOURCE_STALLS_SB              0x08
 UMASK_RESOURCE_STALLS_ROB             0x10
 
 EVENT_CYCLE_ACTIVITY                 0xA3   PMC
-UMASK_CYCLE_ACTIVITY_CYCLES_L2_PENDING             0x01
-UMASK_CYCLE_ACTIVITY_CYCLES_L2_PENDING_CYCLES      0x01 0x00 0x02
-UMASK_CYCLE_ACTIVITY_CYCLES_LDM_PENDING              0x02
-UMASK_CYCLE_ACTIVITY_CYCLES_LDM_PENDING_CYCLES      0x01 0x00 0x02
+# Erratum HSW62: May be unreliable in SMT mode
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L2_PENDING  EVENT_OPTION_THRESHOLD=0x1
+UMASK_CYCLE_ACTIVITY_CYCLES_L2_PENDING            0x01
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_MEM_ANY     EVENT_OPTION_THRESHOLD=0x1
+UMASK_CYCLE_ACTIVITY_CYCLES_MEM_ANY               0x02
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE  EVENT_OPTION_THRESHOLD=0x4
+UMASK_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE            0x04
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L2_PENDING  EVENT_OPTION_THRESHOLD=0x5
 UMASK_CYCLE_ACTIVITY_STALLS_L2_PENDING            0x05
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_MEM_ANY     EVENT_OPTION_THRESHOLD=0x6
+UMASK_CYCLE_ACTIVITY_STALLS_MEM_ANY               0x06
 
 EVENT_CYCLE_ACTIVITY_CYCLES                 0xA3   PMC2
-UMASK_CYCLE_ACTIVITY_CYCLES_L1D_PENDING               0x08
-UMASK_CYCLE_ACTIVITY_CYCLES_L1D_PENDING_CYCLES        0x08 0x00 0x08
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L1D_PENDING EVENT_OPTION_THRESHOLD=0x8
+UMASK_CYCLE_ACTIVITY_CYCLES_L1D_PENDING     0x08
 
 EVENT_CYCLE_ACTIVITY_STALLS                 0xA3   PMC2
-UMASK_CYCLE_ACTIVITY_STALLS_L1D_PENDING               0x0C
-UMASK_CYCLE_ACTIVITY_STALLS_L1D_PENDING_CYCLES        0x0C 0x00 0x0C
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L1D_PENDING EVENT_OPTION_THRESHOLD=0xC
+UMASK_CYCLE_ACTIVITY_STALLS_L1D_PENDING     0x0C
+
+EVENT_LSD_UOPS                  0xA8   PMC
+UMASK_LSD_UOPS                  0x01
+DEFAULT_OPTIONS_LSD_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_LSD_CYCLES_4_UOPS         0x01
+DEFAULT_OPTIONS_LSD_CYCLES_ACTIVE EVENT_OPTION_THRESHOLD=0x1
+UMASK_LSD_CYCLES_ACTIVE         0x01
 
-EVENT_LSD_UOPS                 0xA8   PMC
-UMASK_LSD_UOPS             0x01
+EVENT_DSB2MITE_SWITCHES                0xAB PMC
+UMASK_DSB2MITE_SWITCHES_COUNT          0x01
+UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES 0x02
 
-EVENT_ITLB                         0xAE   PMC
-UMASK_ITLB_ITLB_FLUSH            0x01
+EVENT_ITLB                          0xAE   PMC
+UMASK_ITLB_ITLB_FLUSH               0x01
 
-EVENT_OFFCORE_REQUESTS     0xB0   PMC
+EVENT_OFFCORE_REQUESTS                  0xB0   PMC
 UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD   0x01
 UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD   0x02
 UMASK_OFFCORE_REQUESTS_DEMAND_RFO       0x04
 UMASK_OFFCORE_REQUESTS_ALL_DATA_RD      0x08
 
-EVENT_UOPS_EXECUTED               0xB1   PMC
-UMASK_UOPS_EXECUTED_CORE              0x02
-
-EVENT_PAGE_WALKER_LOADS          0xBC  PMC
+EVENT_UOPS_EXECUTED                       0xB1   PMC
+UMASK_UOPS_EXECUTED_THREAD                0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_USED_CYCLES           0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_STALL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_TOTAL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC 0x01
+UMASK_UOPS_EXECUTED_CORE                  0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_USED_CYCLES           0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES          0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_TOTAL_CYCLES          0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC 0x02
+
+EVENT_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0xB2 PMC
+UMASK_OFFCORE_REQUESTS_BUFFER_SQ_FULL 0x01
+
+EVENT_PAGE_WALKER_LOADS             0xBC  PMC
 UMASK_PAGE_WALKER_LOADS_DTLB_L1     0x11
 UMASK_PAGE_WALKER_LOADS_ITLB_L1     0x21
 UMASK_PAGE_WALKER_LOADS_DTLB_L2     0x12
@@ -274,14 +456,25 @@ UMASK_PAGE_WALKER_LOADS_DTLB_L3     0x14
 UMASK_PAGE_WALKER_LOADS_ITLB_L3     0x24
 UMASK_PAGE_WALKER_LOADS_DTLB_MEMORY     0x18
 UMASK_PAGE_WALKER_LOADS_ITLB_MEMORY     0x28
-
-EVENT_TLB_FLUSH          0xBD  PMC
+UMASK_PAGE_WALKER_LOADS_EPT_DTLB_L1 0x41
+UMASK_PAGE_WALKER_LOADS_EPT_ITLB_L1 0x81
+UMASK_PAGE_WALKER_LOADS_EPT_DTLB_L2 0x42
+UMASK_PAGE_WALKER_LOADS_EPT_ITLB_L2 0x82
+UMASK_PAGE_WALKER_LOADS_EPT_DTLB_L3 0x44
+UMASK_PAGE_WALKER_LOADS_EPT_ITLB_L3 0x84
+UMASK_PAGE_WALKER_LOADS_EPT_DTLB_MEMORY 0x48
+UMASK_PAGE_WALKER_LOADS_EPT_ITLB_MEMORY 0x88
+
+EVENT_TLB_FLUSH                 0xBD  PMC
 UMASK_TLB_FLUSH_DTLB_THREAD     0x01
 UMASK_TLB_FLUSH_STLB_ANY        0x20
 
-EVENT_INST_RETIRED                  0xC0  PMC1
+EVENT_INST_RETIRED_ANY              0xC0  PMC
 UMASK_INST_RETIRED_ANY_P            0x00
 
+EVENT_INST_RETIRED_PREC             0xC0  PMC1
+UMASK_INST_RETIRED_PREC_DIST        0x01
+
 EVENT_OTHER_ASSISTS                  0xC1  PMC
 UMASK_OTHER_ASSISTS_AVX_TO_SSE            0x08
 UMASK_OTHER_ASSISTS_SSE_TO_AVX            0x10
@@ -289,9 +482,28 @@ UMASK_OTHER_ASSISTS_ANY_WB_ASSIST         0x40
 
 EVENT_UOPS_RETIRED                  0xC2  PMC
 UMASK_UOPS_RETIRED_ALL              0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL         0x01
 UMASK_UOPS_RETIRED_RETIRE_SLOTS     0x02
-
-EVENT_MACHINE_CLEARS              0xC3  PMC
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_USED_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_STALL_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES     0x01
+
+EVENT_MACHINE_CLEARS                    0xC3  PMC
+UMASK_MACHINE_CLEARS_CYCLES             0x01
+DEFAULT_OPTIONS_MACHINE_CLEARS_COUNT    EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_MACHINE_CLEARS_COUNT              0x01
 UMASK_MACHINE_CLEARS_MEMORY_ORDERING    0x02
 UMASK_MACHINE_CLEARS_SMC                0x04
 UMASK_MACHINE_CLEARS_MASKMOV            0x20
@@ -306,35 +518,63 @@ UMASK_BR_INST_RETIRED_NOT_TAKEN     0x10
 UMASK_BR_INST_RETIRED_NEAR_TAKEN    0x20
 UMASK_BR_INST_RETIRED_FAR_BRANCH    0x40
 
-EVENT_BR_MISP_RETIRED               0xC5  PMC
-UMASK_BR_MISP_RETIRED_ALL_BRANCHES_1  0x00
-UMASK_BR_MISP_RETIRED_CONDITIONAL  0x01
-UMASK_BR_MISP_RETIRED_ALL_BRANCHES_2     0x04
-UMASK_BR_MISP_RETIRED_NEAR_NOT_TAKEN      0x10
-UMASK_BR_MISP_RETIRED_NEAR_TAKEN      0x20
-
-EVENT_FP_ASSIST               0xCA  PMC
-UMASK_FP_ASSIST_X87_OUTPUT               0x02
-UMASK_FP_ASSIST_X87_INPUT                0x04
-UMASK_FP_ASSIST_SIMD_OUTPUT               0x08
-UMASK_FP_ASSIST_SIMD_INPUT               0x10
+EVENT_BR_MISP_RETIRED                0xC5  PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES   0x00
+UMASK_BR_MISP_RETIRED_CONDITIONAL    0x01
+UMASK_BR_MISP_RETIRED_NEAR_NOT_TAKEN 0x10
+UMASK_BR_MISP_RETIRED_NEAR_TAKEN     0x20
+
+EVENT_AVX_INSTS                     0xC6 PMC
+UMASK_AVX_INSTS_LOADS               0x01
+UMASK_AVX_INSTS_STORES              0x02
+UMASK_AVX_INSTS_CALC                0x04
+UMASK_AVX_INSTS_ALL                 0x07
+
+
+EVENT_HLE_RETIRED                    0xC8 PMC
+UMASK_HLE_RETIRED_START              0x01
+UMASK_HLE_RETIRED_COMMIT             0x02
+UMASK_HLE_RETIRED_ABORTED            0x04
+UMASK_HLE_RETIRED_ABORTED_MISC1      0x08
+UMASK_HLE_RETIRED_ABORTED_MISC2      0x10
+UMASK_HLE_RETIRED_ABORTED_MISC3      0x20
+# Errata HSW65: May overcount
+UMASK_HLE_RETIRED_ABORTED_MISC4      0x40
+UMASK_HLE_RETIRED_ABORTED_MISC5      0x80
+
+EVENT_RTM_RETIRED                    0xC9 PMC
+UMASK_RTM_RETIRED_START              0x01
+UMASK_RTM_RETIRED_COMMIT             0x02
+UMASK_RTM_RETIRED_ABORTED            0x04
+UMASK_RTM_RETIRED_ABORTED_MISC1      0x08
+UMASK_RTM_RETIRED_ABORTED_MISC2      0x10
+UMASK_RTM_RETIRED_ABORTED_MISC3      0x20
+# Errata HSW65: May overcount
+UMASK_RTM_RETIRED_ABORTED_MISC4      0x40
+UMASK_RTM_RETIRED_ABORTED_MISC5      0x80
+
+
+EVENT_FP_ASSIST                   0xCA  PMC
+UMASK_FP_ASSIST_X87_OUTPUT        0x02
+UMASK_FP_ASSIST_X87_INPUT         0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT       0x08
+UMASK_FP_ASSIST_SIMD_INPUT        0x10
+DEFAULT_OPTIONS_FP_ASSIST_ANY     EVENT_OPTION_THRESHOLD=0x1
 UMASK_FP_ASSIST_ANY               0x1E
 
 EVENT_ROB_MISC_EVENT_LBR_INSERTS               0xCC  PMC
 UMASK_ROB_MISC_EVENT_LBR_INSERTS               0x20
 
-EVENT_MEM_TRANS_RETIRED_LOAD_LAT               0xCD  PMC
-UMASK_MEM_TRANS_RETIRED_LOAD_LATENCY           0x01
-
-EVENT_MEM_UOP_RETIRED            0xD0    PMC
-UMASK_MEM_UOP_RETIRED_LOADS            0x81
-UMASK_MEM_UOP_RETIRED_STORES           0x82
-UMASK_MEM_UOP_RETIRED_LOADS_STLB_MISS         0x11
-UMASK_MEM_UOP_RETIRED_STORES_STLB_MISS        0x12
-UMASK_MEM_UOP_RETIRED_LOADS_LOCK              0x21
-UMASK_MEM_UOP_RETIRED_STORES_LOCK             0x22
-UMASK_MEM_UOP_RETIRED_LOADS_SPLIT             0x41
-UMASK_MEM_UOP_RETIRED_STORES_SPLIT            0x42
+EVENT_MEM_UOPS_RETIRED            0xD0    PMC
+UMASK_MEM_UOPS_RETIRED_LOADS            0x81
+UMASK_MEM_UOPS_RETIRED_STORES           0x82
+UMASK_MEM_UOPS_RETIRED_ALL              0x83
+UMASK_MEM_UOPS_RETIRED_LOADS_STLB_MISS         0x11
+UMASK_MEM_UOPS_RETIRED_STORES_STLB_MISS        0x12
+UMASK_MEM_UOPS_RETIRED_LOADS_LOCK              0x21
+UMASK_MEM_UOPS_RETIRED_STORES_LOCK             0x22
+UMASK_MEM_UOPS_RETIRED_LOADS_SPLIT             0x41
+UMASK_MEM_UOPS_RETIRED_STORES_SPLIT            0x42
 
 EVENT_MEM_LOAD_UOPS_RETIRED               0xD1   PMC
 UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT       0x01
@@ -347,6 +587,9 @@ UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT       0x04
 UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS      0x20
 UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL       0x24
 UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB      0x40
+UMASK_MEM_LOAD_UOPS_RETIRED_ALL_MISS     0x38
+UMASK_MEM_LOAD_UOPS_RETIRED_ALL_HIT      0x07
+UMASK_MEM_LOAD_UOPS_RETIRED_ALL_ALL      0x3F
 
 EVENT_MEM_LOAD_UOPS_L3_HIT_RETIRED               0xD2   PMC
 UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_MISS         0x01
@@ -354,8 +597,8 @@ UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HIT          0x02
 UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM         0x04
 UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_NONE         0x08
 
-EVENT_MEM_LOAD_UOPS_LLC_MISS_RETIRED               0xD3   PMC
-UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_LOCAL_DRAM      0x01
+EVENT_MEM_LOAD_UOPS_L3_MISS_RETIRED               0xD3   PMC
+UMASK_MEM_LOAD_UOPS_L3_MISS_RETIRED_LOCAL_DRAM      0x01
 
 EVENT_BACLEARS               0xE6   PMC
 UMASK_BACLEARS_ANY           0x1F
@@ -364,55 +607,113 @@ EVENT_L2_TRANS               0xF0  PMC
 UMASK_L2_TRANS_DEMAND_DATA_RD          0x01
 UMASK_L2_TRANS_RFO           0x02
 UMASK_L2_TRANS_CODE_RD       0x04
-UMASK_L2_TRANS_ALL_PREF      0x08
+UMASK_L2_TRANS_ALL_PF        0x08
 UMASK_L2_TRANS_L1D_WB        0x10
 UMASK_L2_TRANS_L2_FILL       0x20
 UMASK_L2_TRANS_L2_WB         0x40
 UMASK_L2_TRANS_ALL_REQUESTS  0x80
 
 EVENT_L2_LINES_IN                   0xF1   PMC
-UMASK_L2_LINES_IN_I           0x01
-UMASK_L2_LINES_IN_S            0x02
-UMASK_L2_LINES_IN_E           0x04
+UMASK_L2_LINES_IN_I                 0x01
+UMASK_L2_LINES_IN_S                 0x02
+UMASK_L2_LINES_IN_E                 0x04
 UMASK_L2_LINES_IN_ALL               0x07
 
 EVENT_L2_LINES_OUT                  0xF2   PMC
 UMASK_L2_LINES_OUT_DEMAND_CLEAN     0x05
 UMASK_L2_LINES_OUT_DEMAND_DIRTY     0x06
 
-EVENT_TX_MEM_ABORT_CONFLICT          0x54   PMC
-UMASK_TX_MEM_ABORT_CONFLICT     0x01
-UMASK_TX_MEM_ABORT_CAPACITY     0x02
-UMASK_TX_MEM_ABORT_HLE_STORE_TO_ELIDED_LOCK     0x04
-UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_NOT_EMPTY     0x08
-UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_MISMATCH     0x10
-UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_UNSUPP_ALIGNMENT     0x20
-UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_FULL     0x40
-
-EVENT_TX_EXEC          0x5D   PMC
-UMASK_TX_EXEC_MISC1     0x01
-UMASK_TX_EXEC_MISC2     0x02
-UMASK_TX_EXEC_MISC3     0x04
-UMASK_TX_EXEC_MISC4     0x08
-UMASK_TX_EXEC_MISC5     0x10
-
-
-EVENT_HLE_RETIRED                  0xC8   PMC
-UMASK_HLE_RETIRED_START            0x01
-UMASK_HLE_RETIRED_COMMIT           0x02
-UMASK_HLE_RETIRED_ABORTED           0x04
-UMASK_HLE_RETIRED_ABORTED_MISC1     0x08
-UMASK_HLE_RETIRED_ABORTED_MISC2     0x10
-UMASK_HLE_RETIRED_ABORTED_MISC3     0x20
-UMASK_HLE_RETIRED_ABORTED_MISC4     0x40
-UMASK_HLE_RETIRED_ABORTED_MISC5     0x80
-
-EVENT_RTM_RETIRED                  0xC9   PMC
-UMASK_RTM_RETIRED_START            0x01
-UMASK_RTM_RETIRED_COMMIT           0x02
-UMASK_RTM_RETIRED_ABORTED           0x04
-UMASK_RTM_RETIRED_ABORTED_MISC1     0x08
-UMASK_RTM_RETIRED_ABORTED_MISC2     0x10
-UMASK_RTM_RETIRED_ABORTED_MISC3     0x20
-UMASK_RTM_RETIRED_ABORTED_MISC4     0x40
-UMASK_RTM_RETIRED_ABORTED_MISC5     0x80
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1                            0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_CACHE_LOOKUP                          0x34 CBOX
+UMASK_CACHE_LOOKUP_M                        0x01
+UMASK_CACHE_LOOKUP_E                        0x02
+UMASK_CACHE_LOOKUP_S                        0x04
+UMASK_CACHE_LOOKUP_I                        0x08
+UMASK_CACHE_LOOKUP_READ_FILTER              0x10
+UMASK_CACHE_LOOKUP_WRITE_FILTER             0x20
+UMASK_CACHE_LOOKUP_EXTSNP_FILTER            0x40
+UMASK_CACHE_LOOKUP_ANY_REQUEST_FILTER       0x80
+UMASK_CACHE_LOOKUP_READ_M                   0x11
+UMASK_CACHE_LOOKUP_WRITE_M                  0x21
+UMASK_CACHE_LOOKUP_EXTSNP_M                 0x41
+UMASK_CACHE_LOOKUP_ANY_M                    0x81
+UMASK_CACHE_LOOKUP_READ_E                   0x12
+UMASK_CACHE_LOOKUP_WRITE_E                  0x22
+UMASK_CACHE_LOOKUP_EXTSNP_E                 0x42
+UMASK_CACHE_LOOKUP_ANY_E                    0x82
+UMASK_CACHE_LOOKUP_READ_S                   0x14
+UMASK_CACHE_LOOKUP_WRITE_S                  0x24
+UMASK_CACHE_LOOKUP_EXTSNP_S                 0x44
+UMASK_CACHE_LOOKUP_ANY_S                    0x84
+UMASK_CACHE_LOOKUP_READ_ES                  0x16
+UMASK_CACHE_LOOKUP_WRITE_ES                 0x26
+UMASK_CACHE_LOOKUP_EXTSNP_ES                0x46
+UMASK_CACHE_LOOKUP_ANY_ES                   0x86
+UMASK_CACHE_LOOKUP_READ_I                   0x18
+UMASK_CACHE_LOOKUP_WRITE_I                  0x28
+UMASK_CACHE_LOOKUP_EXTSNP_I                 0x48
+UMASK_CACHE_LOOKUP_ANY_I                    0x88
+UMASK_CACHE_LOOKUP_READ_MESI                0x1F
+UMASK_CACHE_LOOKUP_WRITE_MESI               0x2F
+UMASK_CACHE_LOOKUP_EXTSNP_MESI              0x4F
+UMASK_CACHE_LOOKUP_ANY_MESI                 0x8F
+
+EVENT_XSNP_RESPONSE                         0x22 CBOX
+UMASK_XSNP_RESPONSE_MISS_EXTERNAL           0x21
+UMASK_XSNP_RESPONSE_MISS_XCORE              0x41
+UMASK_XSNP_RESPONSE_MISS_EVICTION           0x81
+UMASK_XSNP_RESPONSE_HIT_EXTERNAL            0x24
+UMASK_XSNP_RESPONSE_HIT_XCORE               0x44
+UMASK_XSNP_RESPONSE_HIT_EVICTION            0x84
+UMASK_XSNP_RESPONSE_HITM_EXTERNAL           0x28
+UMASK_XSNP_RESPONSE_HITM_XCORE              0x48
+UMASK_XSNP_RESPONSE_HITM_EVICTION           0x88
+
+EVENT_TRK_OCCUPANCY_ALL                     0x80 UBOX0
+UMASK_TRK_OCCUPANCY_ALL                     0x01
+
+EVENT_TRK_REQUESTS                          0x81 UBOX
+UMASK_TRK_REQUESTS_ALL                      0x01
+UMASK_TRK_REQUESTS_WRITES                   0x20
+
+EVENT_COH_TRK_OCCUPANCY                     0x83 UBOX0
+UMASK_COH_TRK_OCCUPANCY                     0x01
+
+EVENT_COH_TRK_REQUESTS                      0x84 UBOX
+UMASK_COH_TRK_REQUESTS_ALL                  0x01
+
+EVENT_UNCORE_CLOCK                          0x00 UBOXFIX
+UMASK_UNCORE_CLOCK                          0x01
diff --git a/src/includes/perfmon_interlagos.h b/src/includes/perfmon_interlagos.h
index d28bb18..b922ce2 100644
--- a/src/includes/perfmon_interlagos.h
+++ b/src/includes/perfmon_interlagos.h
@@ -5,13 +5,14 @@
  *
  *      Description:  Header file of perfmon module for AMD Interlagos
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,219 +30,273 @@
  */
 
 #include <perfmon_interlagos_events.h>
-#include <perfmon_interlagos_groups.h>
 #include <perfmon_interlagos_counters.h>
+#include <error.h>
 
 static int perfmon_numCountersInterlagos = NUM_COUNTERS_INTERLAGOS;
-static int perfmon_numGroupsInterlagos = NUM_GROUPS_INTERLAGOS;
 static int perfmon_numArchEventsInterlagos = NUM_ARCH_EVENTS_INTERLAGOS;
 
 
-void perfmon_init_interlagos(PerfmonThread *thread)
+int perfmon_init_interlagos(int cpu_id)
+{
+    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+    return 0;
+}
+
+int ilg_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
 {
     uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
-
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL2, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL3, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL4, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL5, 0x0ULL);
-
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
-            lock_acquire(
-                (int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id)
-       )
+
+    flags |= (1ULL<<16);
+    flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
+
+    if (event->numberOfOptions > 0)
+    {
+        for(int j=0;j<event->numberOfOptions;j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    if ((event->options[j].value & 0xFFULL) < 0x20)
+                    {
+                        flags |= (event->options[j].value & 0xFFULL) << 24;
+                    }
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])
     {
-        msr_write(cpu_id, MSR_AMD15_NB_PERFEVTSEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_AMD15_NB_PERFEVTSEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_AMD15_NB_PERFEVTSEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_AMD15_NB_PERFEVTSEL3, 0x0ULL);
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
     }
-
-    //flags |= (1<<16);  /* user mode flag */
-    /*msr_write(cpu_id, MSR_AMD15_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL1, flags);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL3, flags);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL4, flags);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL5, flags);*/
+    return 0;
 }
 
-
-void perfmon_setupCounterThread_interlagos(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+int ilg_uncore_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
 {
-    uint64_t flags;
-    uint64_t reg = interlagos_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
-
-    /* only one thread accesses Uncore */
-    if ( (interlagos_counter_map[index].type == UNCORE) &&
-            !(socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) )
+    uint64_t flags = 0x0ULL;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
     {
-        return;
+        return 0;
     }
 
-    flags = (1<<16);
-    /* AMD uses a 12 bit Event mask: [35:32][7:0] */
     flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
-
-    if (perfmon_verbose)
+    if (flags != currentConfig[cpu_id][index])
     {
-        printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
-                cpu_id,
-                LLU_CAST reg,
-                LLU_CAST flags);
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_UNCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
     }
-
-    msr_write(cpu_id, reg , flags);
+    return 0;
 }
 
 
-void perfmon_startCountersThread_interlagos(int thread_id)
+int perfmon_setupCounterThread_interlagos(int thread_id, PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
-    uint64_t flags;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    for ( int i=0; i<NUM_COUNTERS_INTERLAGOS; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
         {
-            if (interlagos_counter_map[i].type == PMC)
-            {
-                msr_write(cpu_id, interlagos_counter_map[i].counterRegister , 0x0ULL);
-                flags = msr_read(cpu_id, interlagos_counter_map[i].configRegister);
-                flags |= (1<<22);  /* enable flag */
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
+        switch(type)
+        {
+            case PMC:
+                ilg_pmc_setup(cpu_id, index, event);
+                break;
+            case UNCORE:
+                ilg_uncore_setup(cpu_id, index, event);
+                break;
+            default:
+                break;
+        }
+    }
+    return 0;
+}
 
-                if (perfmon_verbose) 
-                {
-                    printf("perfmon_start_counters: Write Register 0x%llX , Flags: 0x%llX \n",
-                            LLU_CAST interlagos_counter_map[i].configRegister,
-                            LLU_CAST flags);
-                }
 
-                msr_write(cpu_id, interlagos_counter_map[i].configRegister , flags);
+int perfmon_startCountersThread_interlagos(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    uint64_t flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
 
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
             }
-            else if ( interlagos_counter_map[i].type == UNCORE )
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t counter = counter_map[index].counterRegister;
+            uint32_t reg = counter_map[index].configRegister;
+            eventSet->events[i].threadCounter[thread_id].startData = 0;
+            eventSet->events[i].threadCounter[thread_id].counterData = 0;
+            if (type == PMC || ((type == UNCORE) && (haveLock)))
             {
-                if(haveLock)
-                {
-                    msr_write(cpu_id, interlagos_counter_map[i].counterRegister , 0x0ULL);
-                    flags = msr_read(cpu_id, interlagos_counter_map[i].configRegister);
-                    flags |= (1<<22);  /* enable flag */
-
-                    if (perfmon_verbose)
-                    {
-                        printf("perfmon_start_counters: Write Register 0x%llX , Flags: 0x%llX \n",
-                                LLU_CAST interlagos_counter_map[i].configRegister,
-                                LLU_CAST flags);
-                    }
-
-                    msr_write(cpu_id, interlagos_counter_map[i].configRegister , flags);
-                }
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL));
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
+                flags |= (1<<22);  /* enable flag */
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
             }
         }
     }
+    return 0;
 }
 
-void perfmon_stopCountersThread_interlagos(int thread_id)
+int perfmon_stopCountersThread_interlagos(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t flags;
+    uint64_t flags = 0x0ULL;
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t tmp;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    for ( int i=0; i<NUM_COUNTERS_INTERLAGOS; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if ( interlagos_counter_map[i].type == PMC )
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                flags = msr_read(cpu_id,interlagos_counter_map[i].configRegister);
-                flags &= ~(1<<22);  /* clear enable flag */
-                msr_write(cpu_id, interlagos_counter_map[i].configRegister , flags);
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, interlagos_counter_map[i].counterRegister);
-
-                if (perfmon_verbose)
-                {
-                    printf("perfmon_stop_counters: Write Register 0x%llX , Flags: 0x%llX \n",
-                            LLU_CAST interlagos_counter_map[i].configRegister,
-                            LLU_CAST flags);
-                    printf("perfmon_stop_counters: Read Register 0x%llX , Flags: 0x%llX \n",
-                            LLU_CAST interlagos_counter_map[i].counterRegister,
-                            LLU_CAST perfmon_threadData[thread_id].counters[i].counterData);
-                }
-
+                continue;
             }
-            else if (interlagos_counter_map[i].type == UNCORE)
+            tmp = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t counter = counter_map[index].counterRegister;
+            uint32_t reg = counter_map[index].configRegister;
+            switch (type)
             {
-                if(haveLock)
-                {
-                    flags = msr_read(cpu_id, interlagos_counter_map[i].configRegister);
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
                     flags &= ~(1<<22);  /* clear enable flag */
-                    msr_write(cpu_id, interlagos_counter_map[i].configRegister , flags);
-
-                    if (perfmon_verbose)
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &tmp));
+                    break;
+                case UNCORE:
+                    if (haveLock)
                     {
-                        printf("perfmon_stop_counters: Write Register 0x%llX , Flags: 0x%llX \n",
-                                LLU_CAST interlagos_counter_map[i].configRegister,
-                                LLU_CAST flags);
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
+                        flags &= ~(1<<22);  /* clear enable flag */
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &tmp));
                     }
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, interlagos_counter_map[i].counterRegister);
-                }
+                    break;
+                default:
+                    break;
             }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(tmp, 0, box_map[type].regWidth);
         }
     }
+    return 0;
 }
 
 
-void perfmon_readCountersThread_interlagos(int thread_id)
+int perfmon_readCountersThread_interlagos(int thread_id, PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t tmp;
 
     if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-
-    for (int i=0;i<NUM_COUNTERS_INTERLAGOS;i++)
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if ( interlagos_counter_map[i].type == UNCORE )
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                if ( haveLock )
-                {
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, interlagos_counter_map[i].counterRegister);
-                }
+                continue;
             }
-            else
+            tmp = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t counter = counter_map[index].counterRegister;
+            switch (type)
             {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, interlagos_counter_map[i].counterRegister);
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &tmp));
+                    VERBOSEPRINTREG(cpu_id, counter, LLU_CAST tmp, READ_PMC);
+                    break;
+                case UNCORE:
+                    /* Uncore counters are only started and stopped by the thread that
+                     * holds the socket lock, so only that thread reads them here;
+                     * for every other thread tmp stays 0. */
+                    if (haveLock)
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &tmp));
+                        VERBOSEPRINTREG(cpu_id, counter, LLU_CAST tmp, READ_UNCORE);
+                    }
+                    break;
+                default:
+                    break;
             }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(tmp, 0, box_map[type].regWidth);
         }
     }
+    return 0;
 }
 
+
+int perfmon_finalizeCountersThread_interlagos(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        uint32_t reg = counter_map[index].configRegister;
+        if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UNCORE) && (haveLock))))
+        {
+            VERBOSEPRINTREG(cpu_id, reg, LLU_CAST 0x0ULL, CLEAR_CTRL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
+        }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE;
+    }
+    return 0;
+}
diff --git a/src/includes/perfmon_interlagos_counters.h b/src/includes/perfmon_interlagos_counters.h
index a593f5a..5f7ac2f 100644
--- a/src/includes/perfmon_interlagos_counters.h
+++ b/src/includes/perfmon_interlagos_counters.h
@@ -5,13 +5,14 @@
  *
  *      Description:  Counter Header File of perfmon module for AMD Interlagos
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -31,18 +32,24 @@
 #define NUM_COUNTERS_INTERLAGOS 10
 #define NUM_COUNTERS_CORE_INTERLAGOS 6
 
-static PerfmonCounterMap interlagos_counter_map[NUM_COUNTERS_INTERLAGOS] = {
+/* Options accepted by the core PMC counters. The OR-chain is parenthesized so it
+ * expands as a single operand, and the threshold entry uses its *_MASK form like
+ * the other three entries (the bare EVENT_OPTION_THRESHOLD is an option id, not a bit mask). */
+#define ILG_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+
+static RegisterMap interlagos_counter_map[NUM_COUNTERS_INTERLAGOS] = {
     /* Core counters */
-    {"PMC0",PMC0, PMC, MSR_AMD15_PERFEVTSEL0, MSR_AMD15_PMC0, 0, 0},
-    {"PMC1",PMC1, PMC, MSR_AMD15_PERFEVTSEL1, MSR_AMD15_PMC1, 0, 0},
-    {"PMC2",PMC2, PMC, MSR_AMD15_PERFEVTSEL2, MSR_AMD15_PMC2, 0, 0},
-    {"PMC3",PMC3, PMC, MSR_AMD15_PERFEVTSEL3, MSR_AMD15_PMC3, 0, 0},
-    {"PMC4",PMC4, PMC, MSR_AMD15_PERFEVTSEL4, MSR_AMD15_PMC4, 0, 0},
-    {"PMC5",PMC5, PMC, MSR_AMD15_PERFEVTSEL5, MSR_AMD15_PMC5, 0, 0},
+    {"PMC0",PMC0, PMC, MSR_AMD15_PERFEVTSEL0, MSR_AMD15_PMC0, 0, 0, ILG_VALID_OPTIONS_PMC},
+    {"PMC1",PMC1, PMC, MSR_AMD15_PERFEVTSEL1, MSR_AMD15_PMC1, 0, 0, ILG_VALID_OPTIONS_PMC},
+    {"PMC2",PMC2, PMC, MSR_AMD15_PERFEVTSEL2, MSR_AMD15_PMC2, 0, 0, ILG_VALID_OPTIONS_PMC},
+    {"PMC3",PMC3, PMC, MSR_AMD15_PERFEVTSEL3, MSR_AMD15_PMC3, 0, 0, ILG_VALID_OPTIONS_PMC},
+    {"PMC4",PMC4, PMC, MSR_AMD15_PERFEVTSEL4, MSR_AMD15_PMC4, 0, 0, ILG_VALID_OPTIONS_PMC},
+    {"PMC5",PMC5, PMC, MSR_AMD15_PERFEVTSEL5, MSR_AMD15_PMC5, 0, 0, ILG_VALID_OPTIONS_PMC},
     /* Northbridge counters */
     {"UPMC0",PMC6, UNCORE, MSR_AMD15_NB_PERFEVTSEL0, MSR_AMD15_NB_PMC0, 0, 0},
-    {"UPMC1",PMC7, UNCORE, MSR_AMD15_NB_PERFEVTSEL0, MSR_AMD15_NB_PMC0, 0, 0},
-    {"UPMC2",PMC8, UNCORE, MSR_AMD15_NB_PERFEVTSEL0, MSR_AMD15_NB_PMC0, 0, 0},
-    {"UPMC3",PMC9, UNCORE, MSR_AMD15_NB_PERFEVTSEL0, MSR_AMD15_NB_PMC0, 0, 0}
+    {"UPMC1",PMC7, UNCORE, MSR_AMD15_NB_PERFEVTSEL1, MSR_AMD15_NB_PMC1, 0, 0},
+    {"UPMC2",PMC8, UNCORE, MSR_AMD15_NB_PERFEVTSEL2, MSR_AMD15_NB_PMC2, 0, 0},
+    {"UPMC3",PMC9, UNCORE, MSR_AMD15_NB_PERFEVTSEL3, MSR_AMD15_NB_PMC3, 0, 0}
 };
 
+/* Per-register-type box description for Interlagos; only the PMC and UNCORE
+ * entries are populated, all other entries are zero-initialized.
+ * NOTE(review): the last initializer (48) is presumably regWidth, the value the
+ * read path passes to field64() -- confirm against the BoxMap declaration. */
+static BoxMap interlagos_box_map[NUM_UNITS] = {
+    [PMC] = {0, 0, 0, 0, 0, 0, 48},
+    [UNCORE] = {0, 0, 0, 0, 0, 0, 48},
+};
diff --git a/src/includes/perfmon_interlagos_events.txt b/src/includes/perfmon_interlagos_events.txt
index 1fa0a44..3a79497 100644
--- a/src/includes/perfmon_interlagos_events.txt
+++ b/src/includes/perfmon_interlagos_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_interlagos_events.txt
-# 
+#
 #      Description:  Event list for AMD Interlagos
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -131,17 +132,23 @@ EVENT_UNIFIED_TLB_HIT       0x45    PMC0|PMC1|PMC2
 UMASK_UNIFIED_TLB_HIT_4KB_DATA       0x01
 UMASK_UNIFIED_TLB_HIT_2MB_DATA       0x02
 UMASK_UNIFIED_TLB_HIT_1GB_DATA       0x04
+UMASK_UNIFIED_TLB_HIT_ANY_DATA       0x07
 UMASK_UNIFIED_TLB_HIT_4KB_INSTR      0x10
 UMASK_UNIFIED_TLB_HIT_2MB_INSTR      0x20
 UMASK_UNIFIED_TLB_HIT_1GB_INSTR      0x40
+UMASK_UNIFIED_TLB_HIT_ANY_INSTR      0x70
+UMASK_UNIFIED_TLB_HIT_ANY            0x77
 
 EVENT_UNIFIED_TLB_MISS       0x46    PMC0|PMC1|PMC2
 UMASK_UNIFIED_TLB_MISS_4KB_DATA       0x01
 UMASK_UNIFIED_TLB_MISS_2MB_DATA       0x02
 UMASK_UNIFIED_TLB_MISS_1GB_DATA       0x04
+UMASK_UNIFIED_TLB_MISS_ANY_DATA       0x07
 UMASK_UNIFIED_TLB_MISS_4KB_INSTR      0x10
 UMASK_UNIFIED_TLB_MISS_2MB_INSTR      0x20
 UMASK_UNIFIED_TLB_MISS_1GB_INSTR      0x40
+UMASK_UNIFIED_TLB_MISS_ANY_INSTR      0x70
+UMASK_UNIFIED_TLB_MISS_ANY            0x77
 
 EVENT_MISALIGNED_ACCESS       0x47    PMC
 UMASK_MISALIGNED_ACCESS       0x00
@@ -230,6 +237,7 @@ EVENT_ITLB_L1_MISS_L2_MISS        0x085     PMC0|PMC1|PMC2
 UMASK_ITLB_L1_MISS_L2_MISS_4KB         0x01
 UMASK_ITLB_L1_MISS_L2_MISS_2MB         0x02
 UMASK_ITLB_L1_MISS_L2_MISS_1GB         0x04
+UMASK_ITLB_L1_MISS_L2_MISS_ANY         0x07
 
 EVENT_PIPELINE_RESTART_DUE_TO_ISB        0x086     PMC0|PMC1|PMC2
 UMASK_PIPELINE_RESTART_DUE_TO_ISB         0x00
@@ -387,6 +395,14 @@ UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_REMOTE_IO_MEM          0x92
 UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_REMOTE_IO_IO           0x91
 UMASK_UNC_CPU_REQUEST_TO_MEMORY_REMOTE_LOCAL_CPU_IO          0x64
 UMASK_UNC_CPU_REQUEST_TO_MEMORY_REMOTE_LOCAL_IO_IO           0x61
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_ANY_CPU_MEM            0xB8
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_ANY_CPU_IO             0xB4
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_ANY_IO_MEM             0xB2
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_ANY_IO_IO              0xB1
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_ANY_LOCAL_CPU_IO             0xE4
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_ANY_LOCAL_IO_IO              0xE1
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_ANY_ANY_CPU_IO               0xF4
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_ANY_ANY_IO_IO                0xF1
 
 EVENT_UNC_CACHE_BLOCK_COMMANDS                             0x0EA     UPMC
 UMASK_UNC_CACHE_BLOCK_COMMANDS_VICTIM_BLOCK                0x01
@@ -420,21 +436,97 @@ UMASK_UNC_GART_EVENTS_MISS              0x04
 UMASK_UNC_GART_EVENTS_REQUEST_WALK      0x08
 UMASK_UNC_GART_EVENTS_MULTIPLE_WALK     0x80
 
-EVENT_UNC_LINK_TRANSMIT_BW_L0         0x0F6     UPMC
-UMASK_UNC_LINK_TRANSMIT_BW_L0_USE     0x17
+EVENT_UNC_LINK_TRANSMIT_BW_L0           0x0F6     UPMC
+UMASK_UNC_LINK_TRANSMIT_BW_L0_USE     0x37
 UMASK_UNC_LINK_TRANSMIT_BW_L0_NOP     0x08
-
-EVENT_UNC_LINK_TRANSMIT_BW_L1         0x0F7     UPMC
-UMASK_UNC_LINK_TRANSMIT_BW_L1_USE     0x17
+UMASK_UNC_LINK_TRANSMIT_BW_L0_CMDS    0x00
+UMASK_UNC_LINK_TRANSMIT_BW_L0_DATA    0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L0_BUF_REL 0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L0_ADDR    0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L0_CRC     0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_USE     0x37
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_NOP     0x08
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_CMDS    0x00
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_DATA    0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_BUF_REL 0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_ADDR    0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_CRC     0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_USE     0xB7
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_NOP     0x88
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_CMDS    0x80
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_DATA    0x81
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_BUF_REL 0x82
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_ADDR    0x90
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_CRC     0xA0
+
+EVENT_UNC_LINK_TRANSMIT_BW_L1           0x0F7     UPMC
+UMASK_UNC_LINK_TRANSMIT_BW_L1_USE     0x37
 UMASK_UNC_LINK_TRANSMIT_BW_L1_NOP     0x08
-
-EVENT_UNC_LINK_TRANSMIT_BW_L2         0x0F8     UPMC
-UMASK_UNC_LINK_TRANSMIT_BW_L2_USE     0x17
+UMASK_UNC_LINK_TRANSMIT_BW_L1_CMDS    0x00
+UMASK_UNC_LINK_TRANSMIT_BW_L1_DATA    0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L1_BUF_REL 0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L1_ADDR    0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L1_CRC     0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_USE     0x37
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_NOP     0x08
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_CMDS    0x00
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_DATA    0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_BUF_REL 0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_ADDR    0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_CRC     0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_USE     0xB7
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_NOP     0x88
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_CMDS    0x80
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_DATA    0x81
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_BUF_REL 0x82
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_ADDR    0x90
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_CRC     0xA0
+
+EVENT_UNC_LINK_TRANSMIT_BW_L2           0x0F8     UPMC
+UMASK_UNC_LINK_TRANSMIT_BW_L2_USE     0x37
 UMASK_UNC_LINK_TRANSMIT_BW_L2_NOP     0x08
-
-EVENT_UNC_LINK_TRANSMIT_BW_L3         0x1F9     UPMC
-UMASK_UNC_LINK_TRANSMIT_BW_L3_USE     0x17
+UMASK_UNC_LINK_TRANSMIT_BW_L2_CMDS    0x00
+UMASK_UNC_LINK_TRANSMIT_BW_L2_DATA    0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L2_BUF_REL 0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L2_ADDR    0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L2_CRC     0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_USE     0x37
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_NOP     0x08
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_CMDS    0x00
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_DATA    0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_BUF_REL 0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_ADDR    0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_CRC     0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_USE     0xB7
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_NOP     0x88
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_CMDS    0x80
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_DATA    0x81
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_BUF_REL 0x82
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_ADDR    0x90
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_CRC     0xA0
+
+EVENT_UNC_LINK_TRANSMIT_BW_L3           0x1F9     UPMC
+UMASK_UNC_LINK_TRANSMIT_BW_L3_USE     0x37
 UMASK_UNC_LINK_TRANSMIT_BW_L3_NOP     0x08
+UMASK_UNC_LINK_TRANSMIT_BW_L3_CMDS    0x00
+UMASK_UNC_LINK_TRANSMIT_BW_L3_DATA    0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L3_BUF_REL 0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L3_ADDR    0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L3_CRC     0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_USE     0x37
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_NOP     0x08
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_CMDS    0x00
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_DATA    0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_BUF_REL 0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_ADDR    0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_CRC     0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_USE     0xB7
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_NOP     0x88
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_CMDS    0x80
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_DATA    0x81
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_BUF_REL 0x82
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_ADDR    0x90
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_CRC     0xA0
 
 EVENT_UNC_CPU_TO_DRAM             0x1E0     UPMC
 UMASK_UNC_CPU_TO_DRAM_LOCAL_TO_0  0x01
diff --git a/src/includes/perfmon_ivybridge.h b/src/includes/perfmon_ivybridge.h
index 0615c27..19e03d9 100644
--- a/src/includes/perfmon_ivybridge.h
+++ b/src/includes/perfmon_ivybridge.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_ivybridge.h
  *
- *      Description:  Header File of perfmon module for Ivy Bridge.
+ *      Description:  Header File of perfmon module for Intel Ivy Bridge.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -28,777 +29,1490 @@
  * =======================================================================================
  */
 
+
 #include <perfmon_ivybridge_events.h>
-#include <perfmon_ivybridge_groups.h>
 #include <perfmon_ivybridge_counters.h>
-
-
+#include <perfmon_ivybridgeEP_events.h>
+#include <perfmon_ivybridgeEP_counters.h>
+#include <error.h>
+#include <affinity.h>
+#include <limits.h>
+#include <topology.h>
+
+static int perfmon_numCountersIvybridgeEP = NUM_COUNTERS_IVYBRIDGEEP;
+static int perfmon_numCoreCountersIvybridgeEP = NUM_COUNTERS_CORE_IVYBRIDGEEP;
+static int perfmon_numArchEventsIvybridgeEP = NUM_ARCH_EVENTS_IVYBRIDGEEP;
 static int perfmon_numCountersIvybridge = NUM_COUNTERS_IVYBRIDGE;
-static int perfmon_numGroupsIvybridge = NUM_GROUPS_IVYBRIDGE;
+static int perfmon_numCoreCountersIvybridge = NUM_COUNTERS_CORE_IVYBRIDGE;
 static int perfmon_numArchEventsIvybridge = NUM_ARCH_EVENTS_IVYBRIDGE;
 
-#define OFFSET_PMC 3
+int ivb_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event);
+int ivbep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event);
+int (*ivy_cbox_setup)(int, RegisterIndex, PerfmonEvent*);
+
+/*
+ * Per-CPU initialization for Ivy Bridge.
+ * Calls lock_acquire() on the socket_lock and tile_lock entries belonging to
+ * cpu_id, writes 0x0 to MSR_PEBS_ENABLE and selects the CBOX setup routine
+ * stored in the global function pointer ivy_cbox_setup.
+ * Always returns 0; access errors are only used for the routine selection.
+ */
+int perfmon_init_ivybridge(int cpu_id)
+{
+    int ret;
+    uint64_t data = 0x0ULL;
+    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+    lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+    HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL);
+    /* Probe the uncore MSRs: clear CBO_0_PERFEVTSEL0 and the uncore global
+     * control, then read CBO_0_PERFEVTSEL0 back; ret sums the return codes. */
+    ret = HPMwrite(cpu_id, MSR_DEV, MSR_UNC_CBO_0_PERFEVTSEL0, 0x0ULL);
+    ret += HPMread(cpu_id, MSR_DEV, MSR_UNC_PERF_GLOBAL_CTRL, &data);
+    ret += HPMwrite(cpu_id, MSR_DEV, MSR_UNC_PERF_GLOBAL_CTRL, 0x0ULL);
+    ret += HPMread(cpu_id, MSR_DEV, MSR_UNC_CBO_0_PERFEVTSEL0, &data);
+    /* The EP model always gets the EP routine, regardless of the probe result. */
+    if ((cpuid_info.model == IVYBRIDGE_EP))
+    {
+        ivy_cbox_setup = ivbep_cbox_setup;
+    }
+    /* Otherwise the plain routine is used only if all accesses returned 0 and
+     * the read-back value is 0. NOTE(review): in the remaining case
+     * ivy_cbox_setup is not assigned here -- callers must cope with that. */
+    else if ((ret == 0) && (data == 0x0ULL))
+    {
+        ivy_cbox_setup = ivb_cbox_setup;
+    }
+    return 0;
+}
+
+
+/*
+ * Build the control bits for fixed counter 'index' and return them; no
+ * register is accessed here and cpu_id is not used.
+ * Each counter owns a 4-bit group starting at bit index*4: bit 1 of the group
+ * is always set, bit 0 is added for EVENT_OPTION_COUNT_KERNEL and bit 2 for
+ * EVENT_OPTION_ANYTHREAD. Other options are ignored.
+ */
+uint32_t ivb_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint32_t flags = 0x0UL;
+    flags |= (1ULL<<(1+(index*4)));
+    for(int j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_COUNT_KERNEL:
+                flags |= (1ULL<<(index*4));
+                break;
+            case EVENT_OPTION_ANYTHREAD:
+                flags |= (1ULL<<(2+(index*4)));
+                break;
+            default:
+                break;
+        }
+    }
+    return flags;
+}
+
+
+/*
+ * Program the config register of core counter 'index' on cpu_id for 'event'.
+ * The control value starts with bits 22 and 16 set, followed by umask (bits
+ * 15:8) and event id (bits 7:0); event options add further bits.
+ * Events 0xB7 and 0xBB additionally write an offcore response value to
+ * MSR_OFFCORE_RESP0 resp. MSR_OFFCORE_RESP1.
+ * The config register is written only if the value differs from the cached
+ * currentConfig entry. Returns 0 when the end is reached; failed writes are
+ * handled by CHECK_MSR_WRITE_ERROR (defined elsewhere).
+ */
+int ivb_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint32_t flags = 0x0UL;
+    uint64_t offcore_flags = 0x0ULL;
+    flags = (1ULL<<22)|(1ULL<<16);
+
+    /* Intel with standard 8 bit event mask: [7:0] */
+    flags |= (event->umask<<8) + event->eventId;
+
+    /* set custom cfg and cmask */
+    /* (skipped for the offcore events 0xB7/0xBB, which use cfgBits/cmask as bit positions below) */
+    if ((event->cfgBits != 0) &&
+        (event->eventId != 0xB7) &&
+        (event->eventId != 0xBB))
+    {
+        flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+    }
+
+    if (event->numberOfOptions > 0)
+    {
+        for(int j=0;j<event->numberOfOptions;j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_ANYTHREAD:
+                    flags |= (1ULL<<21);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;
+                    break;
+                /* MATCH0/MATCH1 only collect bits for the offcore response value. */
+                case EVENT_OPTION_MATCH0:
+                    offcore_flags |= (event->options[j].value & 0x8FFF);
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    offcore_flags |= (event->options[j].value<<16);
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    if (event->eventId == 0xB7)
+    {
+        /* Unless both are 0xFF, cfgBits and cmask name the two bits to set and
+         * replace whatever MATCH0/MATCH1 collected above. */
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
+    }
+    else if (event->eventId == 0xBB)
+    {
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, offcore_flags));
+    }
+    /* Skip the register write if the same value is already cached. */
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+/*
+ * Program the config register of BBOX counter 'index' for 'event'.
+ * Does nothing (returns 0) unless cpu_id is the value stored in its
+ * socket_lock entry; returns -ENODEV if HPMcheck() reports the counter's PCI
+ * device as unavailable.
+ * The control value starts with bits 22 and 20 set plus umask (bits 15:8) and
+ * event id (bits 7:0). OPCODE and MATCH0 options are written to the separate
+ * opcode/address match registers of the same device.
+ * All writes in here go to a PCI device and are therefore checked with
+ * CHECK_PCI_WRITE_ERROR, like the final config write.
+ */
+int ivb_bbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint64_t flags = 0x0UL;
+    uint64_t filter = 0x0UL;
+    uint32_t reg = counter_map[index].configRegister;
+    PciDeviceIndex dev = counter_map[index].device;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!HPMcheck(dev, cpu_id))
+    {
+        return -ENODEV;
+    }
+    flags = (1ULL<<22)|(1ULL<<20);
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        for (int j=0;j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0x1FULL) << 24);
+                    break;
+                case EVENT_OPTION_OPCODE:
+                    filter = (event->options[j].value & 0x3FULL);
+                    /* Log the value that is actually written (filter, not flags). */
+                    VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH, filter, SETUP_OPCODE_FILTER);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH, filter));
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    /* Low 32 bits (without the lowest 6) go to ADDRMATCH0 ... */
+                    filter = ((event->options[j].value & 0xFFFFFFC0ULL));
+                    VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, filter, SETUP_ADDR0_FILTER);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, filter));
+                    /* ... and bits 45:32 of the option value go to ADDRMATCH1. */
+                    filter = (((event->options[j].value>>32) & 0x3FFFULL));
+                    VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, filter, SETUP_ADDR1_FILTER);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, filter));
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    /* Skip the register write if the same value is already cached. */
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, reg, flags, SETUP_BBOX);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+/*
+ * Generic setup for a counter that lives on a PCI device.
+ * Does nothing (returns 0) unless cpu_id is the value stored in its
+ * socket_lock entry; returns -ENODEV if HPMcheck() reports the counter's
+ * device as unavailable.
+ * The control value has bits 22 and 20 set plus umask (bits 15:8) and event
+ * id (bits 7:0); only the EDGE and THRESHOLD (5 bit) options are honoured.
+ * The register is written only if the value differs from currentConfig.
+ */
+int ivb_pci_box_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint64_t flags = 0x0UL;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!HPMcheck(counter_map[index].device, cpu_id))
+    {
+        return -ENODEV;
+    }
+    flags = (1ULL<<22)|(1ULL<<20);
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        for (int j=0;j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0x1FULL) << 24);
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTPCIREG(cpu_id, counter_map[index].device, counter_map[index].configRegister,
+                            flags, SETUP_BOX);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, counter_map[index].device,
+                                         counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+/*
+ * Setup for the fixed MBOX counter: the control value is just bit 22; the
+ * event argument is not evaluated.
+ * Does nothing (returns 0) unless cpu_id is the value stored in its
+ * socket_lock entry; returns -ENODEV if HPMcheck() reports the counter's
+ * device as unavailable. The register is written only if the value differs
+ * from the cached currentConfig entry.
+ */
+int ivb_mboxfix_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint64_t flags = 0x0ULL;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!HPMcheck(counter_map[index].device, cpu_id))
+    {
+        return -ENODEV;
+    }
+    flags = (1ULL<<22);
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTPCIREG(cpu_id, counter_map[index].device,
+            counter_map[index].configRegister, flags, SETUP_MBOXFIX);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, counter_map[index].device,
+            counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+/*
+ * Program the config register of SBOX counter 'index' for 'event'.
+ * Does nothing (returns 0) unless cpu_id is the value stored in its
+ * socket_lock entry; returns -ENODEV if HPMcheck() reports the counter's
+ * device as unavailable.
+ * The control value has bits 22 and 20 set plus umask (bits 15:8) and event
+ * id (bits 7:0); a non-zero cfgBits additionally sets bit 21.
+ * MATCH0/MATCH1/MASK0/MASK1 options are written to the match/mask registers
+ * of 'filterdev' -- the device that is checked and logged for them. If that
+ * device is unavailable the filter is skipped with an error message.
+ */
+int ivb_sbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event, PciDeviceIndex filterdev)
+{
+    uint64_t flags = 0x0UL;
+    uint32_t filterreg = 0x0U;
+    uint64_t filterval = 0x0ULL;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!HPMcheck(counter_map[index].device, cpu_id))
+    {
+        return -ENODEV;
+    }
+    PciDeviceIndex dev = counter_map[index].device;
+    flags = (1ULL<<22)|(1ULL<<20);
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->cfgBits != 0x0)
+    {
+        /* OR the bit in: a plain assignment would drop bits 22/20, umask and event id. */
+        flags |= (1ULL<<21);
+    }
+    if (event->numberOfOptions > 0)
+    {
+        for (int j=0;j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0x1FULL) << 24);
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    if (HPMcheck(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_QPI_PMON_MATCH_0;
+                        filterval = event->options[j].value & 0x8003FFF8ULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_MATCH0);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    if (HPMcheck(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_QPI_PMON_MATCH_1;
+                        filterval = event->options[j].value & 0x000F000FULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_MATCH1);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MASK0:
+                    if (HPMcheck(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_QPI_PMON_MASK_0;
+                        filterval = event->options[j].value & 0x8003FFF8ULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_MASK0);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MASK1:
+                    if (HPMcheck(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_QPI_PMON_MASK_1;
+                        filterval = event->options[j].value & 0x000F000FULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_MASK1);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    /* Skip the register write if the same value is already cached. */
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_SBOX);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+/*
+ * CBOX setup used for the non-EP model (selected in perfmon_init_ivybridge).
+ * Does nothing (returns 0) unless cpu_id is the value stored in its
+ * socket_lock entry.
+ * The control value has bits 22 and 20 set plus umask (bits 15:8) and event
+ * id (bits 7:0); the EDGE, THRESHOLD (5 bit) and INVERT options are honoured,
+ * all other options are ignored. The MSR is written only if the value differs
+ * from the cached currentConfig entry.
+ */
+int ivb_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint32_t flags = 0x0UL;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    flags = (1ULL<<22)|(1ULL<<20);
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        for (int j=0;j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0x1FULL) << 24);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_CBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+/*
+ * CBOX setup used for the EP model (selected in perfmon_init_ivybridge).
+ * Does nothing (returns 0) unless cpu_id is the value stored in its
+ * socket_lock entry.
+ * The control value has bit 22 set plus umask (bits 15:8) and event id (bits
+ * 7:0). If the event carries options, two filter values are collected and
+ * written to box_map[type].filterRegister1 / filterRegister2 when non-zero.
+ * The config MSR is written only if the value differs from currentConfig.
+ */
+int ivbep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint32_t flags = 0x0UL;
+    uint64_t mask = 0x0ULL;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    flags = (1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        RegisterType type = counter_map[index].type;
+        uint64_t filter0 = 0x0ULL;
+        uint64_t filter1 = 0x0ULL;
+        int state_set = 0;
+        for (int j=0;j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0x1FULL) << 24);
+                    break;
+                /* TID sets bit 19 in the control value and stores the id in filter0[4:0]. */
+                case EVENT_OPTION_TID:
+                    flags |= (1<<19);
+                    filter0 |= (event->options[j].value & 0x1FULL);
+                    break;
+                case EVENT_OPTION_STATE:
+                    filter0 |= ((event->options[j].value & 0x3FULL) << 17);
+                    state_set = 1;
+                    break;
+                /* NID: build a mask with one bit per NUMA domain and apply the
+                 * option only if it selects at least one of those bits. */
+                case EVENT_OPTION_NID:
+                    mask = 0x0ULL;
+                    for (int i=0; i<affinityDomains.numberOfNumaDomains;i++)
+                        mask |= (1ULL<<i);
+
+                    if (event->options[j].value & mask)
+                    {
+                        filter1 |= (event->options[j].value & 0xFFFFULL);
+                    }
+                    break;
+                case EVENT_OPTION_OPCODE:
+                    filter1 |= ((event->options[j].value & 0x1FFULL) << 20);
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    filter1 |= ((event->options[j].value & 0x3) << 30);
+                    break;
+                default:
+                    break;
+            }
+        }
+        /* Event 0x34 without an explicit STATE option gets 0x1F as state filter. */
+        if (state_set == 0 && event->eventId == 0x34)
+        {
+            filter0 |= (0x1FULL<<17);
+        }
+        if (filter0 != 0x0ULL)
+        {
+            VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, filter0, SETUP_CBOX_FILTER0);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, filter0));
+        }
+        if (filter1 != 0x0ULL)
+        {
+            VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, filter1, SETUP_CBOX_FILTER1);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, filter1));
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_CBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+/*
+ * Program the config MSR of UBOX counter 'index' for 'event'.
+ * Does nothing (returns 0) unless cpu_id is the value stored in its
+ * socket_lock entry.
+ * The control value has bits 22 and 20 set (plus bit 17 on the EP model),
+ * umask (bits 15:8) and event id (bits 7:0). EDGE and THRESHOLD (5 bit) are
+ * always honoured, INVERT only when the model is IVYBRIDGE.
+ * The MSR is written only if the value differs from currentConfig.
+ */
+int ivb_ubox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint32_t flags = 0x0UL;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    flags = (1ULL<<22)|(1ULL<<20);
+    if (cpuid_info.model == IVYBRIDGE_EP)
+    {
+        flags |= (1ULL<<17);
+    }
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        for (int j=0;j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0x1F) << 24);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    if (cpuid_info.model == IVYBRIDGE)
+                    {
+                        flags |= (1ULL<<23);
+                    }
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_UBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+/*
+ * Setup for the fixed UBOX counter: the control value is always bits 22 and
+ * 20; the event argument is not evaluated.
+ * Does nothing (returns 0) unless cpu_id is the value stored in its
+ * socket_lock entry. The MSR is written only if the value differs from the
+ * cached currentConfig entry.
+ */
+int ivb_uboxfix_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint32_t flags = 0x0UL;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    flags = (1ULL<<22)|(1ULL<<20);
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_UBOXFIX)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int ivb_wbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{ /* Compose the WBOX control word for counter `index` from `event` (plus an optional filter register write) and store it in the counter's config MSR when it changed. */
+    uint32_t flags = 0x0UL;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU stored in socket_lock for its node programs the box; others return 0 */
+    {
+        return 0;
+    }
+    flags = (1ULL<<22)|(1ULL<<20); /* set for every event; presumably counter enable and overflow enable - TODO confirm */
+    flags |= event->eventId; /* only the event id is merged; event->umask is not used by this function */
+    if (event->cfgBits != 0x0)
+    {
+        flags |= ((event->cfgBits & 0x1) << 21); /* only bit 0 of cfgBits is honoured; it lands in bit 21 */
+    }
+    if (event->numberOfOptions > 0)
+    {
+        RegisterType type = counter_map[index].type; /* needed to look up the box's filter register */
+        for (int j=0;j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0x1F) << 24); /* threshold truncated to 5 bits, placed at bit 24 */
+                    break;
+                case EVENT_OPTION_OCCUPANCY:
+                    flags |= ((event->options[j].value & 0x3) << 14); /* 2-bit occupancy selector at bits 14-15 */
+                    break;
+                case EVENT_OPTION_OCCUPANCY_INVERT:
+                    flags |= (1ULL<<30);
+                    break;
+                case EVENT_OPTION_OCCUPANCY_EDGE:
+                    flags |= (1ULL<<31);
+                    break;
+                case EVENT_OPTION_MATCH0: /* low 32 bits of the option value go straight to the box's filterRegister1; this write is not cached in currentConfig */
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1,
+                                    event->options[j].value & 0xFFFFFFFFULL, SETUP_WBOX_FILTER);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
+                                    box_map[type].filterRegister1,
+                                    event->options[j].value & 0xFFFFFFFFULL));
+                    break;
+                default: /* unsupported options are ignored without an error */
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index]) /* skip the register write when the same value is already cached */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_WBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int ivb_ibox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{ /* Compose the IBOX control word for counter `index` from `event` and write it to the counter's config register on its PCI device when it changed. */
+    uint32_t flags = 0x0UL;
+    PciDeviceIndex dev = counter_map[index].device; /* the register is written through this device, not through MSR_DEV */
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU stored in socket_lock for its node programs the box; others return 0 */
+    {
+        return 0;
+    }
+    if (!HPMcheck(dev, cpu_id)) /* presumably tests whether the device is accessible from this CPU - verify HPMcheck; failure is reported as -ENODEV */
+    {
+        return -ENODEV;
+    }
+    flags = (1ULL<<22); /* only bit 22 here; bit 20, which the UBOX/WBOX setup functions add, is not set */
+    flags |= (event->umask<<8) + event->eventId; /* umask shifted to bit 8, event id in the low bits */
+    if (event->numberOfOptions > 0)
+    {
+        for (int j=0;j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0xFFULL) << 24); /* 8-bit threshold at bits 24-31 (the UBOX/WBOX variants keep only 5 bits) */
+                    break;
+                default: /* unsupported options are ignored without an error */
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index]) /* skip the register write when the same value is already cached */
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_IBOX);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+
+int ivb_uncore_freeze(int cpu_id, PerfmonEventSet* eventSet, int flags)
+{ /* Write bit 31 to the model's uncore global control MSR and, unless flags == FREEZE_FLAG_ONLYFREEZE, write `flags` to the control register of every box used by eventSet. */
+    uint32_t freeze_reg = (cpuid_info.model == IVYBRIDGE_EP ? MSR_UNC_U_PMON_GLOBAL_CTL : MSR_UNC_PERF_GLOBAL_CTRL); /* the global control MSR differs between the EP and non-EP model */
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU stored in socket_lock for its node touches uncore registers; others return 0 */
+    {
+        return 0;
+    }
+    if (eventSet->regTypeMask & ~(0xF)) /* some register type above the four lowest mask bits is in use; presumably that means any uncore unit - TODO confirm enum order */
+    {
+        VERBOSEPRINTREG(cpu_id, freeze_reg, LLU_CAST (1ULL<<31), FREEZE_UNCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, freeze_reg, (1ULL<<31)));
+    }
+    if ((flags != FREEZE_FLAG_ONLYFREEZE) && (eventSet->regTypeMask & ~(0xF)))
+    {
+        for (int j=UNCORE; j<NUM_UNITS; j++) /* walk every unit type from UNCORE upwards */
+        {
+            if (eventSet->regTypeMask & REG_TYPE_MASK(j))
+            {
+                if ((box_map[j].ctrlRegister != 0x0) && (box_map[j].isPci)) /* box with a control register reached through its PCI device */
+                {
+                    DEBUG_PRINT(DEBUGLEV_DETAIL, Clearing %s registers of %s, (flags == FREEZE_FLAG_CLEAR_CTL ? "control" : "counter"), RegisterTypeNames[j]);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, box_map[j].device,
+                                                    box_map[j].ctrlRegister, flags)); /* the raw `flags` value is what gets written to the box control register */
+                }
+                else if (box_map[j].ctrlRegister != 0x0) /* box with a control register reached through MSR_DEV; boxes without one are skipped */
+                {
+                    DEBUG_PRINT(DEBUGLEV_DETAIL, Clearing %s registers of %s, (flags == FREEZE_FLAG_CLEAR_CTL ? "control" : "counter"), RegisterTypeNames[j]);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
+                                                     box_map[j].ctrlRegister, flags));
+                }
+            }
+        }
+    }
+    return 0;
+}
 
-void perfmon_init_ivybridge(PerfmonThread *thread)
+int ivb_uncore_unfreeze(int cpu_id, PerfmonEventSet* eventSet, int flags) /* Optionally write `flags` to the control register of every box used by eventSet, then write 0 to the overflow MSR and bit 29 to the uncore global control MSR. */
 {
-    uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
-
-    /* Initialize registers */
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC0, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC1, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC2, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC3, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
-
-    /* initialize fixed counters
-     * FIXED 0: Instructions retired
-     * FIXED 1: Clocks unhalted core
-     * FIXED 2: Clocks unhalted ref */
-    //msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
-
-    /* Preinit of PERFEVSEL registers */
-    //flags |= (1<<22);  /* enable flag */
-    //flags |= (1<<16);  /* user mode flag */
-
-    /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/
-
-    /* TODO Robust implementation which also works if stuff is not there */
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
-            lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id))
-    {
-        if ( cpuid_info.model == IVYBRIDGE_EP )
+    uint32_t unfreeze_reg = (cpuid_info.model == IVYBRIDGE_EP ? MSR_UNC_U_PMON_GLOBAL_CTL : MSR_UNC_PERF_GLOBAL_CTRL); /* global control MSR, model dependent */
+    uint32_t ovf_reg = (cpuid_info.model == IVYBRIDGE_EP ? MSR_UNC_U_PMON_GLOBAL_STATUS : MSR_UNC_PERF_GLOBAL_OVF_CTRL); /* overflow status/control MSR, model dependent */
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU stored in socket_lock for its node touches uncore registers; others return 0 */
+    {
+        return 0;
+    }
+    if ((flags != FREEZE_FLAG_ONLYFREEZE) && (eventSet->regTypeMask & ~(0xF))) /* box control registers are written only when a flag other than ONLYFREEZE is passed */
+    {
+        for (int j=UNCORE; j<NUM_UNITS; j++)
         {
-            /* Only root can access pci address space in direct mode */
-            if (accessClient_mode != DAEMON_AM_DIRECT)
+            if (eventSet->regTypeMask & REG_TYPE_MASK(j))
             {
-                uint32_t  uflags = 0x10100U; /* enable freeze (bit 16), freeze (bit 8) */
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-
-                uflags = 0x0U;
-                uflags |= (1<<22);  /* enable flag */
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTL_3, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTL_3, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTL_3, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTL_3, uflags);
-
-                uflags |= (1<<19);  /* reset fixed counter */
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-
-                /* iMC counters need to be manually reset to zero */
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
-
-#if 0
-                /* FIXME: Not yet tested/ working due to BIOS issues on test
-                 * machines */
-
-                /* QPI registers can be zeroed with single write */
-                uflags = 0x0103UL; /* freeze (bit 8), reset */
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_BOX_CTL, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_BOX_CTL, uflags);
-                uflags = 0x0UL;
-                uflags |= (1UL<<22);  /* enable flag */
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_CTL_3, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_CTL_3, uflags);
-
-
-                /* Cbo counters */
-                uflags = 0xF0103UL; /*enable freeze (bit 8), reset */
-                msr_write(cpu_id, MSR_UNC_C0_PMON_BOX_CTL, uflags);
-                msr_write(cpu_id, MSR_UNC_C1_PMON_BOX_CTL, uflags);
-                msr_write(cpu_id, MSR_UNC_C2_PMON_BOX_CTL, uflags);
-                msr_write(cpu_id, MSR_UNC_C3_PMON_BOX_CTL, uflags);
-
-                switch ( cpuid_topology.numCoresPerSocket )
+                if ((box_map[j].ctrlRegister != 0x0) && (box_map[j].isPci)) /* box with a control register reached through its PCI device */
+                {
+                    DEBUG_PRINT(DEBUGLEV_DETAIL, Clearing %s registers of %s, (flags == FREEZE_FLAG_CLEAR_CTL ? "control" : "counter"), RegisterTypeNames[j]);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, box_map[j].device,
+                                                    box_map[j].ctrlRegister, flags));
+                }
+                else if (box_map[j].ctrlRegister != 0x0) /* box with a control register reached through MSR_DEV; boxes without one are skipped */
                 {
-                    case 12:
-                        msr_write(cpu_id, MSR_UNC_C11_PMON_BOX_CTL, uflags);
-                        msr_write(cpu_id, MSR_UNC_C10_PMON_BOX_CTL, uflags);
-                    case 10:
-                        msr_write(cpu_id, MSR_UNC_C9_PMON_BOX_CTL, uflags);
-                        msr_write(cpu_id, MSR_UNC_C8_PMON_BOX_CTL, uflags);
-                    case 8:
-                        msr_write(cpu_id, MSR_UNC_C7_PMON_BOX_CTL, uflags);
-                        msr_write(cpu_id, MSR_UNC_C6_PMON_BOX_CTL, uflags);
-                    case 6:
-                        msr_write(cpu_id, MSR_UNC_C5_PMON_BOX_CTL, uflags);
-                        msr_write(cpu_id, MSR_UNC_C4_PMON_BOX_CTL, uflags);
+                    DEBUG_PRINT(DEBUGLEV_DETAIL, Clearing %s registers of %s, (flags == FREEZE_FLAG_CLEAR_CTL ? "control" : "counter"), RegisterTypeNames[j]);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
+                                                     box_map[j].ctrlRegister, flags));
                 }
-#endif
             }
         }
     }
+    if (eventSet->regTypeMask & ~(0xF)) /* same "some type above the four lowest mask bits" test as in ivb_uncore_freeze */
+    {
+        VERBOSEPRINTREG(cpu_id, ovf_reg, LLU_CAST 0x0ULL, CLEAR_UNCORE_OVF)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, ovf_reg, 0x0ULL)); /* overflow register is written with 0 before counting resumes */
+        VERBOSEPRINTREG(cpu_id, unfreeze_reg, LLU_CAST (1ULL<<29), UNFREEZE_UNCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, unfreeze_reg, (1ULL<<29))); /* bit 29 here versus bit 31 in ivb_uncore_freeze */
+    }
+    return 0;
 }
 
-#define BOX_GATE_IVB(channel,label) \
-    if (perfmon_verbose) { \
-        printf("[%d] perfmon_setup_counter (##label): Write Register 0x%llX , Flags: 0x%llX \n", \
-                cpu_id, \
-                LLU_CAST reg, \
-                LLU_CAST flags); \
-    } \
-    if(haveLock) { \
-        uflags = (1UL<<22);\
-        uflags |= (event->umask<<8) + event->eventId;  \
-        if (event->cfgBits == 0xFF) \
-        { \
-            uflags |= (1<<21); \
-        } \
-        pci_write(cpu_id, channel,  reg, uflags);  \
-    }
-
-
-void perfmon_setupCounterThread_ivybridge(
+
+int perfmon_setupCounterThread_ivybridge( /* Program the config registers of every event in eventSet for the CPU of thread `thread_id`; counting is not re-enabled here. */
         int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+        PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
-    uint64_t flags;
-    uint32_t uflags;
-    uint64_t reg = ivybridge_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
-    uint64_t orig_fixed_flags = fixed_flags;
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
+    uint64_t fixed_flags = 0x0ULL; /* collects the values returned by ivb_fixed_setup for MSR_PERF_FIXED_CTR_CTRL */
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) /* NOTE(review): haveLock is set below but never read again in this function; the box setup helpers do their own lock check */
     {
         haveLock = 1;
     }
 
-    switch (ivybridge_counter_map[index].type)
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC))) /* core counters in use: write 0 to the global control MSR before reprogramming */
     {
-        case PMC:
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
 
+    ivb_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_ONLYFREEZE); /* with ONLYFREEZE the freeze helper skips its per-box control register writes */
 
-            //flags = msr_read(cpu_id,reg);
-            //flags &= ~(0xFFFFU);   /* clear lower 16bits */
-            flags = (1<<22)|(1<<16);
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip events whose register type is not part of the set's mask */
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        eventSet->events[i].threadCounter[thread_id].init = TRUE; /* marked initialised before, and regardless of, the outcome of the setup helper */
+        switch (eventSet->events[i].type)
+        {
+            case PMC:
+                ivb_pmc_setup(cpu_id, index, event); /* return value ignored, as for all helpers below except ivb_fixed_setup */
+                break;
+
+            case FIXED:
+                fixed_flags |= ivb_fixed_setup(cpu_id, index, event);
+                break;
+
+            case POWER: /* nothing to configure for POWER events */
+                break;
+
+            case MBOX0:
+            case MBOX1:
+            case MBOX2:
+            case MBOX3:
+            case MBOX4:
+            case MBOX5:
+            case MBOX6:
+            case MBOX7:
+            case PBOX:
+            case RBOX0:
+            case RBOX1:
+                ivb_pci_box_setup(cpu_id, index, event);
+                break;
+
+            case BBOX0:
+            case BBOX1:
+                ivb_bbox_setup(cpu_id, index, event);
+                break;
+
+            case MBOX0FIX:
+            case MBOX1FIX:
+            case MBOX2FIX:
+            case MBOX3FIX:
+            case MBOX4FIX:
+            case MBOX5FIX:
+            case MBOX6FIX:
+            case MBOX7FIX:
+                ivb_mboxfix_setup(cpu_id, index, event);
+                break;
+
+            case SBOX0: /* each SBOX passes its own QPI mask device to the helper */
+                ivb_sbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_0);
+                break;
+            case SBOX1:
+                ivb_sbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_1);
+                break;
+            case SBOX2:
+                ivb_sbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_2);
+                break;
+
+            case CBOX0:
+            case CBOX1:
+            case CBOX2:
+            case CBOX3:
+            case CBOX4:
+            case CBOX5:
+            case CBOX6:
+            case CBOX7:
+            case CBOX8:
+            case CBOX9:
+            case CBOX10:
+            case CBOX11:
+            case CBOX12:
+            case CBOX13:
+            case CBOX14:
+                ivy_cbox_setup(cpu_id, index, event); /* NOTE(review): `ivy_` prefix while the sibling helpers use `ivb_`; defined outside this view */
+                break;
+
+            case UBOX:
+                ivb_ubox_setup(cpu_id, index, event);
+                break;
+            case UBOXFIX:
+                ivb_uboxfix_setup(cpu_id, index, event);
+                break;
+
+            case WBOX:
+                ivb_wbox_setup(cpu_id, index, event);
+                break;
+
+            case IBOX0:
+            case IBOX1:
+                ivb_ibox_setup(cpu_id, index, event);
+                break;
+
+            default: /* other register types are left unconfigured */
+                break;
+        }
+    }
+    if (fixed_flags > 0x0) /* write the fixed counter control MSR only when some fixed event contributed bits */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
+    }
+    return 0;
+}
 
-            /* Intel with standard 8 bit event mask: [7:0] */
-            flags |= (event->umask<<8) + event->eventId;
 
-            if (event->cfgBits != 0) /* set custom cfg and cmask */
-            {
-                flags &= ~(0xFFFFU<<16);  /* clear upper 16bits */
-                flags |= ((event->cmask<<8) + event->cfgBits)<<16;
-            }
+int perfmon_startCountersThread_ivybridge(int thread_id, PerfmonEventSet* eventSet) /* Reset the start values of all initialised events for thread `thread_id`, unfreeze the uncore and write the enable bits of the used PMC/FIXED counters to MSR_PERF_GLOBAL_CTRL. */
+{
+    int haveLock = 0;
+    uint64_t fixed_flags = 0x0ULL; /* despite the name, collects the MSR_PERF_GLOBAL_CTRL bits of both PMC and FIXED counters */
+    uint64_t tmp = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-            if (perfmon_verbose)
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) /* haveLock gates the POWER read further down */
+    {
+        haveLock = 1;
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) /* only events that went through the setup function for this thread */
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip events whose register type is not part of the set's mask */
             {
-                printf("[%d] perfmon_setup_counter PMC: Write Register 0x%llX , Flags: 0x%llX \n",
-                        cpu_id,
-                        LLU_CAST reg,
-                        LLU_CAST flags);
+                continue;
             }
+            tmp = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            eventSet->events[i].threadCounter[thread_id].startData = 0;
+            eventSet->events[i].threadCounter[thread_id].counterData = 0;
+            switch (type)
+            {
+                case PMC:
+                    if (eventSet->regTypeMask & REG_TYPE_MASK(PMC))
+                    {
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL)); /* zero the counter register */
+                        fixed_flags |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));  /* enable counter */
+                    }
+                    break;
+
+                case FIXED:
+                    if (eventSet->regTypeMask & REG_TYPE_MASK(FIXED))
+                    {
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL)); /* zero the counter register */
+                        fixed_flags |= (1ULL<<(index+32));  /* enable fixed counter */
+                    }
+                    break;
 
-            msr_write(cpu_id, reg , flags);
-            break;
+                case POWER:
+                    if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
+                    {
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&tmp)); /* power_read is handed a uint32_t view of tmp, which was zeroed above */
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST field64(tmp, 0, box_map[type].regWidth), START_POWER)
+                        eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth); /* the current reading becomes the start value; the register is not reset */
+                    }
+                    break;
+
+                default: /* uncore counters get no per-event handling here */
+                    break;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = eventSet->events[i].threadCounter[thread_id].startData; /* counterData starts equal to startData (0 unless a POWER reading was taken) */
+        }
+    }
 
-        case FIXED:
-            fixed_flags |= (0x2ULL<<(index*4));
-            break;
+    ivb_uncore_unfreeze(cpu_id, eventSet, FREEZE_FLAG_CLEAR_CTR); /* the unfreeze helper writes this flag to each used box's control register before releasing the global freeze */
 
-        case POWER:
-            break;
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST fixed_flags, UNFREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, fixed_flags));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|fixed_flags)); /* bits 63 and 62 plus the bits of the enabled counters; presumably clears pending overflow status - TODO confirm */
+    }
+    return 0;
+}
 
-        case MBOX0:
-            BOX_GATE_IVB(PCI_IMC_DEVICE_CH_0,MBOX0);
-            break;
 
-        case MBOX1:
-            BOX_GATE_IVB(PCI_IMC_DEVICE_CH_1,MBOX1);
-            break;
 
-        case MBOX2:
-            BOX_GATE_IVB(PCI_IMC_DEVICE_CH_2,MBOX2);
-            break;
+uint64_t ivb_uncore_read(int cpu_id, RegisterIndex index, PerfmonEvent *event, int flags) /* Read uncore counter `index` (one or two registers) and return its value; `event` is not used. */
+{
+    uint64_t result = 0x0ULL;
+    uint64_t tmp = 0x0ULL;
+    RegisterType type = counter_map[index].type;
+    PciDeviceIndex dev = counter_map[index].device;
+    uint64_t counter1 = counter_map[index].counterRegister;
+    uint64_t counter2 = counter_map[index].counterRegister2; /* 0 when the counter consists of a single register */
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* CPUs other than the one stored in socket_lock get 0 */
+    {
+        return result;
+    }
+    if (box_map[type].isPci && !HPMcheck(dev, cpu_id)) /* PCI box whose device fails HPMcheck: report 0 */
+    {
+        return result;
+    }
 
-        case MBOX3:
-            BOX_GATE_IVB(PCI_IMC_DEVICE_CH_3,MBOX3);
-            break;
+    CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &tmp)); /* NOTE(review): the PCI-flavoured check macros are used regardless of box_map[type].isPci */
+    VERBOSEPRINTPCIREG(cpu_id, dev, counter1, tmp, UNCORE_READ);
 
-        case SBOX0:
+    if (flags & FREEZE_FLAG_CLEAR_CTR) /* caller asked for the counter to be zeroed after reading */
+    {
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0U));
+    }
+    if (counter2 != 0x0)
+    {
+        result = (tmp<<32); /* the first register supplies the upper 32 bits */
+        tmp = 0x0ULL;
+        CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter2, &tmp));
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter2, tmp, UNCORE_READ);
+        result += (tmp & 0xFFFFFFFF); /* the second register supplies the lower 32 bits */
+        if (flags & FREEZE_FLAG_CLEAR_CTR)
+        {
+            CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter2, 0x0U));
+        }
+    }
+    else
+    {
+        result = tmp; /* single-register counter: value is returned as read */
+    }
+    return result;
+}
 
-            /* CTO_COUNT event requires programming of MATCH/MASK registers */
-            if (event->eventId == 0x38)
-            {
-                if(haveLock)
-                {
-                    //uflags = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_0, reg);
-                    //uflags &= ~(0xFFFFU);
-                    uflags = (1UL<<22);
-                    uflags |= (1UL<<21) + event->eventId; /* Set extension bit */
-                    printf("UFLAGS 0x%x \n",uflags);
-                    pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  reg, uflags);
-
-                    /* program MATCH0 */
-                    uflags = 0x0UL;
-                    uflags = (event->cmask<<13) + (event->umask<<8);
-                    printf("MATCH UFLAGS 0x%x \n",uflags);
-                    pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_0, PCI_UNC_QPI_PMON_MATCH_0, uflags);
-
-                    /* program MASK0 */
-                    uflags = 0x0UL;
-                    uflags = (0x3F<<12) + (event->cfgBits<<4);
-                    printf("MASK UFLAGS 0x%x \n",uflags);
-                    pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_0, PCI_UNC_QPI_PMON_MASK_0, uflags);
-                }
-            }
-            else
+int ivb_uncore_overflow(int cpu_id, RegisterIndex index, PerfmonEvent *event, /* Increment *overflows when result < cur_result and the global/box status registers confirm an overflow; the confirmed status bits are written back. */
+                         int* overflows, uint64_t result, uint64_t cur_result,
+                         int global_offset, int box_offset)
+{
+    int test_local = 0;
+    uint64_t ovf_values = 0x0ULL;
+    RegisterType type = counter_map[index].type;
+    PciDeviceIndex dev = counter_map[index].device;
+    if (result < cur_result) /* presumably result is the fresh reading and cur_result the previous one, so a smaller value hints at a wrap - confirm with callers */
+    {
+        if (global_offset != -1) /* -1 means: skip the global status check and look at the box status directly */
+        {
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV,
+                                           MSR_UNC_U_PMON_GLOBAL_STATUS,
+                                           &ovf_values)); /* NOTE(review): hard-coded MSR, whereas ivb_uncore_unfreeze selects the overflow register by model */
+            if (ovf_values & (1<<global_offset)) /* NOTE(review): int-typed shift; an offset of 31 or more would not fit */
             {
-                BOX_GATE_IVB(PCI_QPI_DEVICE_PORT_0,SBOX0);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
+                                                 MSR_UNC_U_PMON_GLOBAL_STATUS,
+                                                 (1<<global_offset))); /* the tested bit is written back; presumably write-1-to-clear - TODO confirm */
+                test_local = 1;
             }
+        }
+        else
+        {
+            test_local = 1;
+        }
 
-            break;
-
-        case SBOX1:
-
-            /* CTO_COUNT event requires programming of MATCH/MASK registers */
-            if (event->eventId == 0x38)
+        if (test_local)
+        {
+            ovf_values = 0x0ULL;
+            if (ivybridge_box_map[type].isPci) /* NOTE(review): uses ivybridge_box_map here but box_map for the register below; presumably the same table */
             {
-                if(haveLock)
-                {
-                    //uflags = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_1, reg);
-                    //uflags &= ~(0xFFFFU);
-                    uflags = (1UL<<22);
-                    uflags |= (1UL<<21) + event->eventId; /* Set extension bit */
-                    pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  reg, uflags);
-
-                    /* program MATCH0 */
-                    uflags = 0x0UL;
-                    uflags = (event->cmask<<13) + (event->umask<<8);
-                    pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_1, PCI_UNC_QPI_PMON_MATCH_0, uflags);
-
-                    /* program MASK0 */
-                    uflags = 0x0UL;
-                    uflags = (0x3F<<12) + (event->cfgBits<<4);
-                    pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_1, PCI_UNC_QPI_PMON_MASK_0, uflags);
-                }
+                CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev,
+                                              box_map[type].statusRegister,
+                                              &ovf_values));
             }
             else
             {
-                BOX_GATE_IVB(PCI_QPI_DEVICE_PORT_0,SBOX0);
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV,
+                                              box_map[type].statusRegister,
+                                              &ovf_values));
             }
-            break;
-
-        case CBOX0:
-        case CBOX1:
-        case CBOX2:
-        case CBOX3:
-        case CBOX4:
-        case CBOX5:
-        case CBOX6:
-        case CBOX7:
-        case CBOX8:
-        case CBOX9:
-        case CBOX10:
-        case CBOX11:
-
-            if(haveLock)
+            if (ovf_values & (1<<box_offset)) /* the counter's own bit in the box status register */
             {
-                perfmon_threadData[thread_id].counters[index].init = TRUE;
-                uflags = 0x0U;
-
-                /* set local enable flag */
-                uflags |= 1<<22;
-                /* Intel with standard 8 bit event mask: [7:0] */
-                uflags |= (event->umask<<8) + event->eventId;
-                msr_write(cpu_id, reg , uflags);
-
-                if (perfmon_verbose)
+                (*overflows)++; /* one overflow is recorded per confirmed status bit */
+                if (ivybridge_box_map[type].isPci)
+                {
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev,
+                                                    box_map[type].statusRegister,
+                                                    (1<<box_offset)));
+                }
+                else
                 {
-                    printf("[%d] perfmon_setup_counter: Write Register 0x%llX , uFlags: 0x%lX \n",
-                            cpu_id,
-                            LLU_CAST reg,
-                            (unsigned long) uflags);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
+                                                     box_map[type].statusRegister,
+                                                     (1<<box_offset)));
                 }
             }
-            break;
-
-        default:
-            /* should never be reached */
-            break;
-    }
-    if (fixed_flags != orig_fixed_flags)
-    {
-        msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+        }
     }
+    return 0;
 }
 
-#define CBOX_START(NUM) \
-if(haveLock) { \
-    msr_write(cpu_id, MSR_UNC_C##NUM##_PMON_BOX_CTL, uflags);  \
-}
-
-#define MBOX_START(NUM) \
-if(haveLock) { \
-    pci_write(cpu_id, PCI_IMC_DEVICE_CH_##NUM,  PCI_UNC_MC_PMON_BOX_CTL, uflags); \
-}
-
-
-
-void perfmon_startCountersThread_ivybridge(int thread_id)
+int perfmon_stopCountersThread_ivybridge(int thread_id, PerfmonEventSet* eventSet) /* Stop measuring on one thread: freeze the counters, store their final values and count overflows. */
 {
+    uint64_t counter_result = 0x0ULL;
     int haveLock = 0;
-    uint64_t flags = 0x0ULL;
-    uint32_t uflags = 0x10000UL; /* Clear freeze bit */
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) /* haveLock gates the POWER and SBOXnFIX reads below */
     {
         haveLock = 1;
     }
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    ivb_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_CLEAR_CTL);
 
-    for ( int i=0; i<perfmon_numCountersIvybridge; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE) 
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            switch (ivybridge_counter_map[i].type)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip register types not enabled in regTypeMask */
+            {
+                continue;
+            }
+            counter_result= 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            PerfmonEvent *event = &(eventSet->events[i].event);
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
+            int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+            switch (type)
             {
                 case PMC:
-                    msr_write(cpu_id, ivybridge_counter_map[i].counterRegister, 0x0ULL);
-                    flags |= (1<<(i-OFFSET_PMC));  /* enable counter */
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, counter1, &counter_result));
+                    if (counter_result < *current) /* value is below the previously stored one: check the overflow status */
+                    {
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index-cpuid_info.perf_num_fixed_ctr))) /* 64-bit mask with explicit grouping, same form as in finalize */
+                        {
+                            (*overflows)++;
+                        }
+                    }
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
                     break;
-
                 case FIXED:
-                    msr_write(cpu_id, ivybridge_counter_map[i].counterRegister, 0x0ULL);
-                    flags |= (1ULL<<(i+32));  /* enable fixed counter */
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, counter1, &counter_result));
+                    if (counter_result < *current)
+                    {
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index+32))) /* 1ULL: a bit index >= 32 is out of range for an int shift */
+                        {
+                            (*overflows)++;
+                        }
+                    }
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
                     break;
 
                 case POWER:
-                    if(haveLock)
+                    if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
                     {
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                            power_read(cpu_id, ivybridge_counter_map[i].counterRegister);
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_POWER)
+                        if (counter_result < *current) /* a reading below the stored value is counted as one overflow */
+                        {
+                            (*overflows)++;
+                        }
                     }
-
                     break;
 
-                case MBOX0:
-                    MBOX_START(0);
+                case THERMAL:
+                    CHECK_TEMP_READ_ERROR(thermal_read(cpu_id, (uint32_t*)&counter_result));
                     break;
 
-                case MBOX1:
-                    MBOX_START(1);
+                case SBOX0FIX:
+                case SBOX1FIX:
+                case SBOX2FIX:
+                    if (haveLock && HPMcheck(dev, cpu_id))
+                    {
+                        CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &counter_result));
+                        VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, READ_SBOX_FIXED)
+                        switch (extractBitField(counter_result,3,0)) /* NOTE(review): presumably a QPI link-rate code mapped to transfers/s - confirm */
+                        {
+                            case 0x2:
+                                counter_result = 5600000000ULL;
+                                break;
+                            case 0x3:
+                                counter_result = 6400000000ULL;
+                                break;
+                            case 0x4:
+                                counter_result = 7200000000ULL;
+                                break;
+                            case 0x5:
+                                counter_result = 8000000000ULL;
+                                break;
+                            case 0x6:
+                                counter_result = 8800000000ULL;
+                                break;
+                            case 0x7:
+                                counter_result = 9600000000ULL;
+                                break;
+                            default:
+                                counter_result = 0x0ULL;
+                                break;
+                        }
+                        VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, READ_SBOX_FIXED_REAL)
+                    }
                     break;
 
+                case MBOX0:
+                case MBOX1:
                 case MBOX2:
-                    MBOX_START(2);
-                    break;
-
                 case MBOX3:
-                    MBOX_START(3);
+                case MBOX4:
+                case MBOX5:
+                case MBOX6:
+                case MBOX7:
+                    counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_CLEAR_CTR);
+                    ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+                                        *current, box_map[type].ovflOffset, getCounterTypeOffset(index)+1);
                     break;
 
-                case MBOXFIX:
-                    break;
 
-                case SBOX0:
-                    if(haveLock)
-                    {
-                        pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_BOX_CTL, 0x0ULL);
-                    }
+                case MBOX0FIX:
+                case MBOX1FIX:
+                case MBOX2FIX:
+                case MBOX3FIX:
+                case MBOX4FIX:
+                case MBOX5FIX:
+                case MBOX6FIX:
+                case MBOX7FIX:
+                    counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_CLEAR_CTR);
+                    ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+                                        *current, box_map[type].ovflOffset, 0);
                     break;
 
-                case SBOX1:
-                    if(haveLock)
-                    {
-                        pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_BOX_CTL, 0x0ULL);
-                    }
+                case IBOX1:
+                    counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_CLEAR_CTR);
+                    ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+                                        *current, -1, getCounterTypeOffset(index)+2);
                     break;
 
+                case SBOX0:
+                case SBOX1:
+                case SBOX2:
                 case CBOX0:
-                    CBOX_START(0);
-                    break;
-
                 case CBOX1:
-                    CBOX_START(1);
-                    break;
-
                 case CBOX2:
-                    CBOX_START(2);
-                    break;
-
                 case CBOX3:
-                    CBOX_START(3);
-                    break;
-
                 case CBOX4:
-                    CBOX_START(4);
-                    break;
-
                 case CBOX5:
-                    CBOX_START(5);
-                    break;
-
                 case CBOX6:
-                    CBOX_START(6);
-                    break;
-
                 case CBOX7:
-                    CBOX_START(7);
-                    break;
-
                 case CBOX8:
-                    CBOX_START(8);
-                    break;
-
                 case CBOX9:
-                    CBOX_START(9);
-                    break;
-
                 case CBOX10:
-                    CBOX_START(10);
-                    break;
-
                 case CBOX11:
-                    CBOX_START(11);
+                case CBOX12:
+                case CBOX13:
+                case CBOX14:
+                case UBOX:
+                case UBOXFIX:
+                case BBOX0:
+                case BBOX1:
+                case WBOX:
+                case PBOX:
+                case RBOX0:
+                case RBOX1:
+                case RBOX2:
+                case IBOX0:
+                    counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_CLEAR_CTR);
+                    ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+                                        *current, box_map[type].ovflOffset, getCounterTypeOffset(index));
                     break;
 
                 default:
-                    /* should never be reached */
                     break;
             }
+            *current = field64(counter_result, 0, box_map[type].regWidth); /* NOTE(review): presumably keeps only the low regWidth bits of the reading - confirm field64 */
         }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE; /* also runs when init was not TRUE; only the 'continue' path above skips it */
     }
 
-    if (perfmon_verbose)
-    {
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_PERF_GLOBAL_CTRL, LLU_CAST flags);
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_UNCORE_PERF_GLOBAL_CTRL, LLU_CAST uflags);
-    }
-
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
-}
-
-#define CBOX_STOP(NUM) \
-if(haveLock) { \
-    msr_write(cpu_id, MSR_UNC_C##NUM##_PMON_BOX_CTL, uflags);  \
-    perfmon_threadData[thread_id].counters[i].counterData =   \
-    msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);    \
-}
-
-#define MBOX_STOP(NUM) \
-if(haveLock) { \
-    pci_write(cpu_id, PCI_IMC_DEVICE_CH_##NUM ,  PCI_UNC_MC_PMON_BOX_CTL, uflags); \
-    counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_##NUM , ivybridge_counter_map[i].counterRegister); \
-    counter_result = (counter_result<<32) + pci_read(cpu_id, PCI_IMC_DEVICE_CH_##NUM , ivybridge_counter_map[i].counterRegister2);  \
-    perfmon_threadData[thread_id].counters[i].counterData = counter_result; \
-}
-
-#define SBOX_STOP(NUM) \
-if(haveLock) { \
-    pci_write(cpu_id, PCI_QPI_DEVICE_PORT_##NUM ,  PCI_UNC_QPI_PMON_BOX_CTL, (1<<8)); \
-    counter_result = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_##NUM , ivybridge_counter_map[i].counterRegister); \
-    counter_result = (counter_result<<32) + pci_read(cpu_id, PCI_QPI_DEVICE_PORT_##NUM , ivybridge_counter_map[i].counterRegister2);  \
-    perfmon_threadData[thread_id].counters[i].counterData = counter_result; \
+    return 0;
 }
 
-
-void perfmon_stopCountersThread_ivybridge(int thread_id)
+int perfmon_readCountersThread_ivybridge(int thread_id, PerfmonEventSet* eventSet) /* Read all counters of one thread mid-measurement: pause, store values and overflow counts, then resume. */
 {
-    uint64_t flags;
-    uint32_t uflags = 0x10100UL; /* Set freeze bit */
     uint64_t counter_result = 0x0ULL;
+    uint64_t pmc_flags = 0x0ULL;
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) /* haveLock gates the POWER and SBOXnFIX reads below */
     {
         haveLock = 1;
     }
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &pmc_flags)); /* saved so the same value can be written back after reading */
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    ivb_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_ONLYFREEZE);
 
-    for ( int i=0; i < NUM_COUNTERS_IVYBRIDGE; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            switch (ivybridge_counter_map[i].type)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip register types not enabled in regTypeMask */
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            PerfmonEvent *event = &(eventSet->events[i].event);
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
+            int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+            switch (type)
             {
                 case PMC:
-
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    if (counter_result < *current) /* value is below the previously stored one: check the overflow status */
+                    {
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index-cpuid_info.perf_num_fixed_ctr))) /* 64-bit mask with explicit grouping, same form as in finalize */
+                        {
+                            (*overflows)++;
+                        }
+                    }
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
+                    break;
                 case FIXED:
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, ivybridge_counter_map[i].counterRegister);
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    if (counter_result < *current)
+                    {
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index+32))) /* 1ULL: a bit index >= 32 is out of range for an int shift */
+                        {
+                            (*overflows)++;
+                        }
+                    }
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
                     break;
 
                 case POWER:
-                    if(haveLock)
+                    if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
                     {
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                            power_info.energyUnit *
-                            ( power_read(cpu_id, ivybridge_counter_map[i].counterRegister) -
-                              perfmon_threadData[thread_id].counters[i].counterData);
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_POWER)
+                        if (counter_result < *current) /* a reading below the stored value is counted as one overflow */
+                        {
+                            (*overflows)++;
+                        }
                     }
                     break;
 
                 case THERMAL:
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                             thermal_read(cpu_id);
+                    CHECK_TEMP_READ_ERROR(thermal_read(cpu_id, (uint32_t*)&counter_result));
                     break;
 
-                case MBOX0:
-                    MBOX_STOP(0);
+                case SBOX0FIX:
+                case SBOX1FIX:
+                case SBOX2FIX:
+                    if (haveLock && HPMcheck(dev, cpu_id))
+                    {
+                        CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &counter_result));
+                        VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, READ_SBOX_FIXED)
+                        switch (extractBitField(counter_result,3,0)) /* NOTE(review): presumably a QPI link-rate code mapped to transfers/s - confirm */
+                        {
+                            case 0x2:
+                                counter_result = 5600000000ULL;
+                                break;
+                            case 0x3:
+                                counter_result = 6400000000ULL;
+                                break;
+                            case 0x4:
+                                counter_result = 7200000000ULL;
+                                break;
+                            case 0x5:
+                                counter_result = 8000000000ULL;
+                                break;
+                            case 0x6:
+                                counter_result = 8800000000ULL;
+                                break;
+                            case 0x7:
+                                counter_result = 9600000000ULL;
+                                break;
+                            default:
+                                counter_result = 0x0ULL;
+                                break;
+                        }
+                        VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, READ_SBOX_FIXED_REAL)
+                        eventSet->events[i].threadCounter[thread_id].startData = 0; /* NOTE(review): presumably so the fixed rate is reported as-is, not as a delta - confirm */
+                    }
                     break;
 
+                case MBOX0:
                 case MBOX1:
-                    MBOX_STOP(1);
-                    break;
-
                 case MBOX2:
-                    MBOX_STOP(2);
-                    break;
-
                 case MBOX3:
-                    MBOX_STOP(3);
+                case MBOX4:
+                case MBOX5:
+                case MBOX6:
+                case MBOX7:
+                    counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_ONLYFREEZE);
+                    ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+                                        *current, box_map[type].ovflOffset, getCounterTypeOffset(index)+1);
                     break;
 
-                case MBOXFIX:
-                    if(haveLock)
-                    {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-
-                        counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
-                                ivybridge_counter_map[i].counterRegister);
-
-                        counter_result = (counter_result<<32) +
-                            pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
-                                    ivybridge_counter_map[i].counterRegister2);
 
-                        perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                    }
+                case MBOX0FIX:
+                case MBOX1FIX:
+                case MBOX2FIX:
+                case MBOX3FIX:
+                case MBOX4FIX:
+                case MBOX5FIX:
+                case MBOX6FIX:
+                case MBOX7FIX:
+                    counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_ONLYFREEZE);
+                    ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+                                        *current, box_map[type].ovflOffset, 0);
                     break;
 
-                case SBOX0:
-                    SBOX_STOP(0);
+                case IBOX1:
+                    counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_ONLYFREEZE);
+                    ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+                                        *current, -1, getCounterTypeOffset(index)+2);
                     break;
 
+                case SBOX0:
                 case SBOX1:
-                    SBOX_STOP(1);
-                    break;
-
+                case SBOX2:
                 case CBOX0:
-                    CBOX_STOP(0);
-                    break;
-
                 case CBOX1:
-                    CBOX_STOP(1);
-                    break;
-
                 case CBOX2:
-                    CBOX_STOP(2);
-                    break;
-
                 case CBOX3:
-                    CBOX_STOP(3);
-                    break;
-
                 case CBOX4:
-                    CBOX_STOP(4);
-                    break;
-
                 case CBOX5:
-                    CBOX_STOP(5);
-                    break;
-
                 case CBOX6:
-                    CBOX_STOP(6);
-                    break;
-
                 case CBOX7:
-                    CBOX_STOP(7);
-                    break;
-
                 case CBOX8:
-                    CBOX_STOP(8);
-                    break;
-
                 case CBOX9:
-                    CBOX_STOP(9);
-                    break;
-
                 case CBOX10:
-                    CBOX_STOP(10);
-                    break;
-
                 case CBOX11:
-                    CBOX_STOP(11);
+                case CBOX12:
+                case CBOX13:
+                case CBOX14:
+                case UBOX:
+                case UBOXFIX:
+                case BBOX0:
+                case BBOX1:
+                case WBOX:
+                case PBOX:
+                case RBOX0:
+                case RBOX1:
+                case RBOX2:
+                case IBOX0:
+                    counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_ONLYFREEZE);
+                    ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+                                        *current, box_map[type].ovflOffset, getCounterTypeOffset(index));
                     break;
 
-
                 default:
                     /* should never be reached */
                     break;
             }
+            *current = field64(counter_result, 0, box_map[type].regWidth); /* NOTE(review): presumably keeps only the low regWidth bits of the reading - confirm field64 */
         }
     }
 
-    flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
-    //    printf ("Status: 0x%llX \n", LLU_CAST flags);
-    if ( (flags & 0x3) || (flags & (0x3ULL<<32)) ) 
+    ivb_uncore_unfreeze(cpu_id, eventSet, FREEZE_FLAG_ONLYFREEZE);
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
     {
-        printf ("Overflow occured \n");
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, pmc_flags)); /* write back the value saved before the read loop */
     }
+    return 0;
 }
 
-void perfmon_readCountersThread_ivybridge(int thread_id)
+/* Finalize one thread's event set: zero the config and counter registers of initialised events, then clear the overflow and global-control MSRs. */
+int perfmon_finalizeCountersThread_ivybridge(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t counter_result = 0x0ULL;
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int haveTileLock = 0;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62); /* bits 62 and 63 are always written to MSR_PERF_GLOBAL_OVF_CTRL; per-counter bits are OR-ed in below */
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
-    for ( int i=0; i<NUM_COUNTERS_IVYBRIDGE; i++ )
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id) /* haveTileLock gates clearing of the MSR_OFFCORE_RESP0/1 registers below */
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        haveTileLock = 1;
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if ((ivybridge_counter_map[i].type == PMC) || (ivybridge_counter_map[i].type == FIXED))
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip register types not enabled in regTypeMask */
             {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, ivybridge_counter_map[i].counterRegister);
+                continue;
             }
-            else
+            RegisterIndex index = eventSet->events[i].index;
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t reg = counter_map[index].configRegister;
+
+            switch(type)
             {
-                if(haveLock)
-                {
-                    switch (ivybridge_counter_map[i].type)
+                case PMC:
+                    ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+                    if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7)) /* event 0xB7: also zero MSR_OFFCORE_RESP0 */
                     {
-                        case POWER:
-                            perfmon_threadData[thread_id].counters[i].counterData =
-                                power_info.energyUnit *
-                                power_read(cpu_id, ivybridge_counter_map[i].counterRegister);
-                            break;
-
-                        case MBOX0:
-                            counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
-                                    ivybridge_counter_map[i].counterRegister);
-
-                            counter_result = (counter_result<<32) +
-                                pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
-                                        ivybridge_counter_map[i].counterRegister2);
-
-                            perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                            break;
-
-                        case MBOX1:
-                            counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_1,
-                                    ivybridge_counter_map[i].counterRegister);
-
-                            counter_result = (counter_result<<32) +
-                                pci_read(cpu_id, PCI_IMC_DEVICE_CH_1,
-                                        ivybridge_counter_map[i].counterRegister2);
-
-                            perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                            break;
-
-                        case MBOX2:
-                            counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_2,
-                                    ivybridge_counter_map[i].counterRegister);
-
-                            counter_result = (counter_result<<32) +
-                                pci_read(cpu_id, PCI_IMC_DEVICE_CH_2,
-                                        ivybridge_counter_map[i].counterRegister2);
-
-                            perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                            break;
-
-                        case MBOX3:
-                            counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_3,
-                                    ivybridge_counter_map[i].counterRegister);
-
-                            counter_result = (counter_result<<32) +
-                                pci_read(cpu_id, PCI_IMC_DEVICE_CH_3,
-                                        ivybridge_counter_map[i].counterRegister2);
-
-                            perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                            break;
-
-                        default:
-                            /* should never be reached */
-                            break;
+                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
                     }
-                }
+                    else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB)) /* event 0xBB: also zero MSR_OFFCORE_RESP1 */
+                    {
+                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+                    }
+                    break;
+                case FIXED:
+                    ovf_values_core |= (1ULL<<(index+32));
+                    break;
+                default:
+                    break;
             }
+            if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UNCORE) && (haveLock)))) /* PMC/FIXED always; types >= UNCORE only with the socket lock (relies on the RegisterType ordering defined elsewhere) */
+            {
+                VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+                CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+                VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR);
+                CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL));
+            }
+            eventSet->events[i].threadCounter[thread_id].init = FALSE;
         }
     }
-}
+    if (haveLock && eventSet->regTypeMask & ~(0xFULL)) /* parses as haveLock && (regTypeMask & ~0xF): any type bit above the lowest four is set */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_U_PMON_GLOBAL_STATUS, LLU_CAST 0x0ULL, CLEAR_UNCORE_OVF)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_U_PMON_GLOBAL_STATUS, 0x0ULL));
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_U_PMON_GLOBAL_CTL, LLU_CAST 0x0ULL, CLEAR_UNCORE_CTRL)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_U_PMON_GLOBAL_CTL, 0x0ULL));
+    }
 
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST ovf_values_core, CLEAR_GLOBAL_OVF)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, CLEAR_GLOBAL_CTRL)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    return 0;
+}
diff --git a/src/includes/perfmon_ivybridgeEP_counters.h b/src/includes/perfmon_ivybridgeEP_counters.h
new file mode 100644
index 0000000..896530c
--- /dev/null
+++ b/src/includes/perfmon_ivybridgeEP_counters.h
@@ -0,0 +1,316 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfmon_ivybridgeEP_counters.h
+ *
+ *      Description: Counter header file of perfmon module for Intel Ivy Bridge EP.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#define NUM_COUNTERS_CORE_IVYBRIDGEEP 8 /* leading per-core entries of ivybridgeEP_counter_map: FIXC0-2, PMC0-3, TMP0 */
+#define NUM_COUNTERS_UNCORE_IVYBRIDGEEP 81 /* NOTE(review): equals the number of entries up to and including WBOX1FIX - confirm intended meaning */
+#define NUM_COUNTERS_IVYBRIDGEEP 161 /* total number of entries in ivybridgeEP_counter_map */
+/* Event options accepted per counter type. Each mask is parenthesized so it evaluates as one value in any expression. */
+#define IVBEP_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|\
+            EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define IVBEP_VALID_OPTIONS_FIXED (EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK)
+#define IVBEP_VALID_OPTIONS_UBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define IVBEP_VALID_OPTIONS_CBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+            EVENT_OPTION_TID_MASK|EVENT_OPTION_STATE_MASK|EVENT_OPTION_NID_MASK|\
+            EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_MATCH0_MASK)
+#define IVBEP_VALID_OPTIONS_WBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+            EVENT_OPTION_OCCUPANCY_MASK|EVENT_OPTION_OCCUPANCY_FILTER_MASK|EVENT_OPTION_OCCUPANCY_EDGE_MASK|\
+            EVENT_OPTION_OCCUPANCY_INVERT_MASK)
+#define IVBEP_VALID_OPTIONS_MBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define IVBEP_VALID_OPTIONS_SBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+            EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK|EVENT_OPTION_MASK0_MASK|\
+            EVENT_OPTION_MASK1_MASK) /* both match (0/1) and both mask (0/1) options */
+#define IVBEP_VALID_OPTIONS_BBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+            EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_MATCH0_MASK)
+#define IVBEP_VALID_OPTIONS_PBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define IVBEP_VALID_OPTIONS_RBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define IVBEP_VALID_OPTIONS_IBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK)
+
+
+static RegisterMap ivybridgeEP_counter_map[NUM_COUNTERS_IVYBRIDGEEP] = {
+    /* Fixed counters FIXC0-FIXC2 (instructions retired, unhalted core cycles, unhalted reference cycles), all configured through MSR_PERF_FIXED_CTR_CTRL */
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, IVBEP_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, IVBEP_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, IVBEP_VALID_OPTIONS_FIXED},
+    /* PMC counters: four general-purpose core counters, 48bit wide */
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, IVBEP_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, IVBEP_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, IVBEP_VALID_OPTIONS_PMC},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, IVBEP_VALID_OPTIONS_PMC},
+    /* Temperature sensor: read-only, no config register and no event options */
+    {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* RAPL energy status counters (package, PP0, PP1, DRAM): read-only, no config register */
+    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* CBOX counters: four 44bit wide counters for each of the 15 boxes CBOX0-CBOX14 */
+    {"CBOX0C0", PMC12, CBOX0, MSR_UNC_C0_PMON_CTL0, MSR_UNC_C0_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX0C1", PMC13, CBOX0, MSR_UNC_C0_PMON_CTL1, MSR_UNC_C0_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX0C2", PMC14, CBOX0, MSR_UNC_C0_PMON_CTL2, MSR_UNC_C0_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX0C3", PMC15, CBOX0, MSR_UNC_C0_PMON_CTL3, MSR_UNC_C0_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX1C0", PMC16, CBOX1, MSR_UNC_C1_PMON_CTL0, MSR_UNC_C1_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX1C1", PMC17, CBOX1, MSR_UNC_C1_PMON_CTL1, MSR_UNC_C1_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX1C2", PMC18, CBOX1, MSR_UNC_C1_PMON_CTL2, MSR_UNC_C1_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX1C3", PMC19, CBOX1, MSR_UNC_C1_PMON_CTL3, MSR_UNC_C1_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX2C0", PMC20, CBOX2, MSR_UNC_C2_PMON_CTL0, MSR_UNC_C2_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX2C1", PMC21, CBOX2, MSR_UNC_C2_PMON_CTL1, MSR_UNC_C2_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX2C2", PMC22, CBOX2, MSR_UNC_C2_PMON_CTL2, MSR_UNC_C2_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX2C3", PMC23, CBOX2, MSR_UNC_C2_PMON_CTL3, MSR_UNC_C2_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX3C0", PMC24, CBOX3, MSR_UNC_C3_PMON_CTL0, MSR_UNC_C3_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX3C1", PMC25, CBOX3, MSR_UNC_C3_PMON_CTL1, MSR_UNC_C3_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX3C2", PMC26, CBOX3, MSR_UNC_C3_PMON_CTL2, MSR_UNC_C3_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX3C3", PMC27, CBOX3, MSR_UNC_C3_PMON_CTL3, MSR_UNC_C3_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX4C0", PMC28, CBOX4, MSR_UNC_C4_PMON_CTL0, MSR_UNC_C4_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX4C1", PMC29, CBOX4, MSR_UNC_C4_PMON_CTL1, MSR_UNC_C4_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX4C2", PMC30, CBOX4, MSR_UNC_C4_PMON_CTL2, MSR_UNC_C4_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX4C3", PMC31, CBOX4, MSR_UNC_C4_PMON_CTL3, MSR_UNC_C4_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX5C0", PMC32, CBOX5, MSR_UNC_C5_PMON_CTL0, MSR_UNC_C5_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX5C1", PMC33, CBOX5, MSR_UNC_C5_PMON_CTL1, MSR_UNC_C5_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX5C2", PMC34, CBOX5, MSR_UNC_C5_PMON_CTL2, MSR_UNC_C5_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX5C3", PMC35, CBOX5, MSR_UNC_C5_PMON_CTL3, MSR_UNC_C5_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX6C0", PMC36, CBOX6, MSR_UNC_C6_PMON_CTL0, MSR_UNC_C6_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX6C1", PMC37, CBOX6, MSR_UNC_C6_PMON_CTL1, MSR_UNC_C6_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX6C2", PMC38, CBOX6, MSR_UNC_C6_PMON_CTL2, MSR_UNC_C6_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX6C3", PMC39, CBOX6, MSR_UNC_C6_PMON_CTL3, MSR_UNC_C6_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX7C0", PMC40, CBOX7, MSR_UNC_C7_PMON_CTL0, MSR_UNC_C7_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX7C1", PMC41, CBOX7, MSR_UNC_C7_PMON_CTL1, MSR_UNC_C7_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX7C2", PMC42, CBOX7, MSR_UNC_C7_PMON_CTL2, MSR_UNC_C7_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX7C3", PMC43, CBOX7, MSR_UNC_C7_PMON_CTL3, MSR_UNC_C7_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX8C0", PMC44, CBOX8, MSR_UNC_C8_PMON_CTL0, MSR_UNC_C8_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX8C1", PMC45, CBOX8, MSR_UNC_C8_PMON_CTL1, MSR_UNC_C8_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX8C2", PMC46, CBOX8, MSR_UNC_C8_PMON_CTL2, MSR_UNC_C8_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX8C3", PMC47, CBOX8, MSR_UNC_C8_PMON_CTL3, MSR_UNC_C8_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX9C0", PMC48, CBOX9, MSR_UNC_C9_PMON_CTL0, MSR_UNC_C9_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX9C1", PMC49, CBOX9, MSR_UNC_C9_PMON_CTL1, MSR_UNC_C9_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX9C2", PMC50, CBOX9, MSR_UNC_C9_PMON_CTL2, MSR_UNC_C9_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX9C3", PMC51, CBOX9, MSR_UNC_C9_PMON_CTL3, MSR_UNC_C9_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX10C0", PMC52, CBOX10, MSR_UNC_C10_PMON_CTL0, MSR_UNC_C10_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX10C1", PMC53, CBOX10, MSR_UNC_C10_PMON_CTL1, MSR_UNC_C10_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX10C2", PMC54, CBOX10, MSR_UNC_C10_PMON_CTL2, MSR_UNC_C10_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX10C3", PMC55, CBOX10, MSR_UNC_C10_PMON_CTL3, MSR_UNC_C10_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX11C0", PMC56, CBOX11, MSR_UNC_C11_PMON_CTL0, MSR_UNC_C11_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX11C1", PMC57, CBOX11, MSR_UNC_C11_PMON_CTL1, MSR_UNC_C11_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX11C2", PMC58, CBOX11, MSR_UNC_C11_PMON_CTL2, MSR_UNC_C11_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX11C3", PMC59, CBOX11, MSR_UNC_C11_PMON_CTL3, MSR_UNC_C11_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX12C0", PMC60, CBOX12, MSR_UNC_C12_PMON_CTL0, MSR_UNC_C12_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX12C1", PMC61, CBOX12, MSR_UNC_C12_PMON_CTL1, MSR_UNC_C12_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX12C2", PMC62, CBOX12, MSR_UNC_C12_PMON_CTL2, MSR_UNC_C12_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX12C3", PMC63, CBOX12, MSR_UNC_C12_PMON_CTL3, MSR_UNC_C12_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX13C0", PMC64, CBOX13, MSR_UNC_C13_PMON_CTL0, MSR_UNC_C13_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX13C1", PMC65, CBOX13, MSR_UNC_C13_PMON_CTL1, MSR_UNC_C13_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX13C2", PMC66, CBOX13, MSR_UNC_C13_PMON_CTL2, MSR_UNC_C13_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX13C3", PMC67, CBOX13, MSR_UNC_C13_PMON_CTL3, MSR_UNC_C13_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX14C0", PMC68, CBOX14, MSR_UNC_C14_PMON_CTL0, MSR_UNC_C14_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX14C1", PMC69, CBOX14, MSR_UNC_C14_PMON_CTL1, MSR_UNC_C14_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX14C2", PMC70, CBOX14, MSR_UNC_C14_PMON_CTL2, MSR_UNC_C14_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    {"CBOX14C3", PMC71, CBOX14, MSR_UNC_C14_PMON_CTL3, MSR_UNC_C14_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_CBOX},
+    /* Uncore management counters: two general-purpose counters plus the fixed UCLK counter. NOTE(review): this comment used to say 48bit wide while ivybridgeEP_box_map lists 44 for UBOX - confirm */
+    {"UBOX0", PMC72, UBOX, MSR_UNC_U_PMON_CTL0, MSR_UNC_U_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_UBOX},
+    {"UBOX1", PMC73, UBOX, MSR_UNC_U_PMON_CTL1, MSR_UNC_U_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_UBOX},
+    {"UBOXFIX", PMC74, UBOXFIX, MSR_UNC_U_UCLK_FIXED_CTL, MSR_UNC_U_UCLK_FIXED_CTR, 0, 0, 0},
+    /* PCU counters: four 48bit wide counters plus two fixed counters that have no config register */
+    {"WBOX0", PMC75, WBOX, MSR_UNC_PCU_PMON_CTL0, MSR_UNC_PCU_PMON_CTR0, 0, 0, IVBEP_VALID_OPTIONS_WBOX},
+    {"WBOX1", PMC76, WBOX, MSR_UNC_PCU_PMON_CTL1, MSR_UNC_PCU_PMON_CTR1, 0, 0, IVBEP_VALID_OPTIONS_WBOX},
+    {"WBOX2", PMC77, WBOX, MSR_UNC_PCU_PMON_CTL2, MSR_UNC_PCU_PMON_CTR2, 0, 0, IVBEP_VALID_OPTIONS_WBOX},
+    {"WBOX3", PMC78, WBOX, MSR_UNC_PCU_PMON_CTL3, MSR_UNC_PCU_PMON_CTR3, 0, 0, IVBEP_VALID_OPTIONS_WBOX},
+    {"WBOX0FIX", PMC79, WBOX0FIX, 0, MSR_UNC_PCU_PMON_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"WBOX1FIX", PMC80, WBOX1FIX, 0, MSR_UNC_PCU_PMON_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* IMC counters: four 48bit wide counters plus one fixed counter per memory channel, each split in two reads (_A/_B registers) */
+    {"MBOX0C0",PMC81, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_0, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX0C1",PMC82, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_0, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX0C2",PMC83, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_0, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX0C3",PMC84, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_0, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX0FIX",PMC85, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+    {"MBOX1C0",PMC86, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_1, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX1C1",PMC87, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_1, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX1C2",PMC88, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_1, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX1C3",PMC89, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_1, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX1FIX",PMC90, MBOX1FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_1, EVENT_OPTION_NONE_MASK},
+    {"MBOX2C0",PMC91, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_2, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX2C1",PMC92, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_2, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX2C2",PMC93, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_2, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX2C3",PMC94, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_2, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX2FIX",PMC95, MBOX2FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_2, EVENT_OPTION_NONE_MASK},
+    {"MBOX3C0",PMC96, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_3, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX3C1",PMC97, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_3, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX3C2",PMC98, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_3, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX3C3",PMC99, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_3, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX3FIX",PMC100, MBOX3FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_3, EVENT_OPTION_NONE_MASK},
+    {"MBOX4C0",PMC101, MBOX4, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_0, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX4C1",PMC102, MBOX4, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_0, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX4C2",PMC103, MBOX4, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_0, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX4C3",PMC104, MBOX4, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_0, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX4FIX",PMC105, MBOX4FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_0, EVENT_OPTION_NONE_MASK},
+    {"MBOX5C0",PMC106, MBOX5, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_1, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX5C1",PMC107, MBOX5, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_1, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX5C2",PMC108, MBOX5, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_1, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX5C3",PMC109, MBOX5, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_1, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX5FIX",PMC110, MBOX5FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_1, EVENT_OPTION_NONE_MASK},
+    {"MBOX6C0",PMC111, MBOX6, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_2, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX6C1",PMC112, MBOX6, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_2, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX6C2",PMC113, MBOX6, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_2, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX6C3",PMC114, MBOX6, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_2, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX6FIX",PMC115, MBOX6FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_2, EVENT_OPTION_NONE_MASK},
+    {"MBOX7C0",PMC116, MBOX7, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_3, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX7C1",PMC117, MBOX7, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_3, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX7C2",PMC118, MBOX7, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_3, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX7C3",PMC119, MBOX7, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_3, IVBEP_VALID_OPTIONS_MBOX},
+    {"MBOX7FIX",PMC120, MBOX7FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_3, EVENT_OPTION_NONE_MASK},
+    /* QPI counters: four 48bit wide counters per port, split in two reads; followed by one rate status register per port (SBOX0FIX and SBOX1FIX share the port 0 misc device) */
+    {"SBOX0C0",PMC121, SBOX0, PCI_UNC_QPI_PMON_CTL_0, PCI_UNC_QPI_PMON_CTR_0_A, PCI_UNC_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_0, IVBEP_VALID_OPTIONS_SBOX},
+    {"SBOX0C1",PMC122, SBOX0, PCI_UNC_QPI_PMON_CTL_1, PCI_UNC_QPI_PMON_CTR_1_A, PCI_UNC_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_0, IVBEP_VALID_OPTIONS_SBOX},
+    {"SBOX0C2",PMC123, SBOX0, PCI_UNC_QPI_PMON_CTL_2, PCI_UNC_QPI_PMON_CTR_2_A, PCI_UNC_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_0, IVBEP_VALID_OPTIONS_SBOX},
+    {"SBOX0C3",PMC124, SBOX0, PCI_UNC_QPI_PMON_CTL_3, PCI_UNC_QPI_PMON_CTR_3_A, PCI_UNC_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_0, IVBEP_VALID_OPTIONS_SBOX},
+    {"SBOX1C0",PMC125, SBOX1, PCI_UNC_QPI_PMON_CTL_0, PCI_UNC_QPI_PMON_CTR_0_A, PCI_UNC_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_1, IVBEP_VALID_OPTIONS_SBOX},
+    {"SBOX1C1",PMC126, SBOX1, PCI_UNC_QPI_PMON_CTL_1, PCI_UNC_QPI_PMON_CTR_1_A, PCI_UNC_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_1, IVBEP_VALID_OPTIONS_SBOX},
+    {"SBOX1C2",PMC127, SBOX1, PCI_UNC_QPI_PMON_CTL_2, PCI_UNC_QPI_PMON_CTR_2_A, PCI_UNC_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_1, IVBEP_VALID_OPTIONS_SBOX},
+    {"SBOX1C3",PMC128, SBOX1, PCI_UNC_QPI_PMON_CTL_3, PCI_UNC_QPI_PMON_CTR_3_A, PCI_UNC_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_1, IVBEP_VALID_OPTIONS_SBOX},
+    {"SBOX2C0",PMC129, SBOX2, PCI_UNC_QPI_PMON_CTL_0, PCI_UNC_QPI_PMON_CTR_0_A, PCI_UNC_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_2, IVBEP_VALID_OPTIONS_SBOX},
+    {"SBOX2C1",PMC130, SBOX2, PCI_UNC_QPI_PMON_CTL_1, PCI_UNC_QPI_PMON_CTR_1_A, PCI_UNC_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_2, IVBEP_VALID_OPTIONS_SBOX},
+    {"SBOX2C2",PMC131, SBOX2, PCI_UNC_QPI_PMON_CTL_2, PCI_UNC_QPI_PMON_CTR_2_A, PCI_UNC_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_2, IVBEP_VALID_OPTIONS_SBOX},
+    {"SBOX2C3",PMC132, SBOX2, PCI_UNC_QPI_PMON_CTL_3, PCI_UNC_QPI_PMON_CTR_3_A, PCI_UNC_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_2, IVBEP_VALID_OPTIONS_SBOX},
+    {"SBOX0FIX",PMC133, SBOX0FIX, 0, PCI_UNC_QPI_RATE_STATUS, 0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+    {"SBOX1FIX",PMC134, SBOX1FIX, 0, PCI_UNC_QPI_RATE_STATUS, 0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+    {"SBOX2FIX",PMC135, SBOX2FIX, 0, PCI_UNC_QPI_RATE_STATUS, 0, PCI_QPI_MISC_DEVICE_PORT_2, EVENT_OPTION_NONE_MASK},
+    /* HA counters: four 48bit wide per box, split in two reads. NOTE(review): the config registers reuse the PCI_UNC_MC_PMON_CTL_* names, presumably the same offsets as the HA control registers - confirm */
+    {"BBOX0C0", PMC136, BBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_0, IVBEP_VALID_OPTIONS_BBOX},
+    {"BBOX0C1", PMC137, BBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_0, IVBEP_VALID_OPTIONS_BBOX},
+    {"BBOX0C2", PMC138, BBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_0, IVBEP_VALID_OPTIONS_BBOX},
+    {"BBOX0C3", PMC139, BBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_0, IVBEP_VALID_OPTIONS_BBOX},
+    {"BBOX1C0", PMC140, BBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_1, IVBEP_VALID_OPTIONS_BBOX},
+    {"BBOX1C1", PMC141, BBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_1, IVBEP_VALID_OPTIONS_BBOX},
+    {"BBOX1C2", PMC142, BBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_1, IVBEP_VALID_OPTIONS_BBOX},
+    {"BBOX1C3", PMC143, BBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_1, IVBEP_VALID_OPTIONS_BBOX},
+    /* R2PCIe counters: four 44bit wide counters, split in two reads */
+    {"PBOX0", PMC144, PBOX, PCI_UNC_R2PCIE_PMON_CTL_0, PCI_UNC_R2PCIE_PMON_CTR_0_A, PCI_UNC_R2PCIE_PMON_CTR_0_B, PCI_R2PCIE_DEVICE, IVBEP_VALID_OPTIONS_PBOX},
+    {"PBOX1", PMC145, PBOX, PCI_UNC_R2PCIE_PMON_CTL_1, PCI_UNC_R2PCIE_PMON_CTR_1_A, PCI_UNC_R2PCIE_PMON_CTR_1_B, PCI_R2PCIE_DEVICE, IVBEP_VALID_OPTIONS_PBOX},
+    {"PBOX2", PMC146, PBOX, PCI_UNC_R2PCIE_PMON_CTL_2, PCI_UNC_R2PCIE_PMON_CTR_2_A, PCI_UNC_R2PCIE_PMON_CTR_2_B, PCI_R2PCIE_DEVICE, IVBEP_VALID_OPTIONS_PBOX},
+    {"PBOX3", PMC147, PBOX, PCI_UNC_R2PCIE_PMON_CTL_3, PCI_UNC_R2PCIE_PMON_CTR_3_A, PCI_UNC_R2PCIE_PMON_CTR_3_B, PCI_R2PCIE_DEVICE, IVBEP_VALID_OPTIONS_PBOX},
+    /* R3QPI counters: three 44bit wide counters per link, split in two reads */
+    {"RBOX0C0", PMC148, RBOX0, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_0, IVBEP_VALID_OPTIONS_RBOX},
+    {"RBOX0C1", PMC149, RBOX0, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_0, IVBEP_VALID_OPTIONS_RBOX},
+    {"RBOX0C2", PMC150, RBOX0, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_0, IVBEP_VALID_OPTIONS_RBOX},
+    {"RBOX1C0", PMC151, RBOX1, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_1, IVBEP_VALID_OPTIONS_RBOX},
+    {"RBOX1C1", PMC152, RBOX1, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_1, IVBEP_VALID_OPTIONS_RBOX},
+    {"RBOX1C2", PMC153, RBOX1, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_1, IVBEP_VALID_OPTIONS_RBOX},
+    {"RBOX2C0", PMC154, RBOX2, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_2, IVBEP_VALID_OPTIONS_RBOX},
+    {"RBOX2C1", PMC155, RBOX2, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_2, IVBEP_VALID_OPTIONS_RBOX},
+    {"RBOX2C2", PMC156, RBOX2, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_2, IVBEP_VALID_OPTIONS_RBOX},
+    /* IRP counters: two 44bit wide counters per IRP box (four in total), each read from a single register */
+    {"IBOX0C0", PMC157, IBOX0, PCI_UNC_IRP0_PMON_CTL_0, PCI_UNC_IRP0_PMON_CTR_0, 0, PCI_IRP_DEVICE, IVBEP_VALID_OPTIONS_IBOX},
+    {"IBOX0C1", PMC158, IBOX0, PCI_UNC_IRP0_PMON_CTL_1, PCI_UNC_IRP0_PMON_CTR_1, 0, PCI_IRP_DEVICE, IVBEP_VALID_OPTIONS_IBOX},
+    {"IBOX1C0", PMC159, IBOX1, PCI_UNC_IRP1_PMON_CTL_0, PCI_UNC_IRP1_PMON_CTR_0, 0, PCI_IRP_DEVICE, IVBEP_VALID_OPTIONS_IBOX},
+    {"IBOX1C1", PMC160, IBOX1, PCI_UNC_IRP1_PMON_CTL_1, PCI_UNC_IRP1_PMON_CTR_1, 0, PCI_IRP_DEVICE, IVBEP_VALID_OPTIONS_IBOX},
+};
+
+static BoxMap ivybridgeEP_box_map[NUM_UNITS] = { /* Per-unit box setup, indexed by unit type. NOTE(review): field order is presumably ctrl reg, status reg, overflow reg, overflow bit offset, isPci flag, PCI device, counter width in bits[, filter regs] - confirm against the BoxMap definition */
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+    [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+    [THERMAL] = {0, 0, 0, 0, 0, MSR_DEV, 8},
+    [POWER] = {0, 0, 0, 0, 0, MSR_DEV, 32},
+    [MBOX0] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 20, 1, PCI_IMC_DEVICE_0_CH_0, 48},
+    [MBOX0FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 20, 1, PCI_IMC_DEVICE_0_CH_0, 48},
+    [MBOX1] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 20, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+    [MBOX1FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 20, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+    [MBOX2] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 20, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+    [MBOX2FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 20, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+    [MBOX3] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 20, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+    [MBOX3FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 20, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+    [MBOX4] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 21, 1, PCI_IMC_DEVICE_1_CH_0, 48},
+    [MBOX4FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 21, 1, PCI_IMC_DEVICE_1_CH_0, 48},
+    [MBOX5] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 21, 1, PCI_IMC_DEVICE_1_CH_1, 48},
+    [MBOX5FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 21, 1, PCI_IMC_DEVICE_1_CH_1, 48},
+    [MBOX6] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 21, 1, PCI_IMC_DEVICE_1_CH_2, 48},
+    [MBOX6FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 21, 1, PCI_IMC_DEVICE_1_CH_2, 48},
+    [MBOX7] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 21, 1, PCI_IMC_DEVICE_1_CH_3, 48},
+    [MBOX7FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 21, 1, PCI_IMC_DEVICE_1_CH_3, 48},
+    [CBOX0] = {MSR_UNC_C0_PMON_BOX_CTL, 0, 0, 3, 0, 0, 44, MSR_UNC_C0_PMON_BOX_FILTER, MSR_UNC_C0_PMON_BOX_FILTER1},
+    [CBOX1] = {MSR_UNC_C1_PMON_BOX_CTL, 0, 0, 4, 0, 0, 44, MSR_UNC_C1_PMON_BOX_FILTER, MSR_UNC_C1_PMON_BOX_FILTER1},
+    [CBOX2] = {MSR_UNC_C2_PMON_BOX_CTL, 0, 0, 5, 0, 0, 44, MSR_UNC_C2_PMON_BOX_FILTER, MSR_UNC_C2_PMON_BOX_FILTER1},
+    [CBOX3] = {MSR_UNC_C3_PMON_BOX_CTL, 0, 0, 6, 0, 0, 44, MSR_UNC_C3_PMON_BOX_FILTER, MSR_UNC_C3_PMON_BOX_FILTER1},
+    [CBOX4] = {MSR_UNC_C4_PMON_BOX_CTL, 0, 0, 7, 0, 0, 44, MSR_UNC_C4_PMON_BOX_FILTER, MSR_UNC_C4_PMON_BOX_FILTER1},
+    [CBOX5] = {MSR_UNC_C5_PMON_BOX_CTL, 0, 0, 8, 0, 0, 44, MSR_UNC_C5_PMON_BOX_FILTER, MSR_UNC_C5_PMON_BOX_FILTER1},
+    [CBOX6] = {MSR_UNC_C6_PMON_BOX_CTL, 0, 0, 9, 0, 0, 44, MSR_UNC_C6_PMON_BOX_FILTER, MSR_UNC_C6_PMON_BOX_FILTER1},
+    [CBOX7] = {MSR_UNC_C7_PMON_BOX_CTL, 0, 0, 10, 0, 0, 44, MSR_UNC_C7_PMON_BOX_FILTER, MSR_UNC_C7_PMON_BOX_FILTER1},
+    [CBOX8] = {MSR_UNC_C8_PMON_BOX_CTL, 0, 0, 11, 0, 0, 44, MSR_UNC_C8_PMON_BOX_FILTER, MSR_UNC_C8_PMON_BOX_FILTER1},
+    [CBOX9] = {MSR_UNC_C9_PMON_BOX_CTL, 0, 0, 12, 0, 0, 44, MSR_UNC_C9_PMON_BOX_FILTER, MSR_UNC_C9_PMON_BOX_FILTER1},
+    [CBOX10] = {MSR_UNC_C10_PMON_BOX_CTL, 0, 0, 13, 0, 0, 44, MSR_UNC_C10_PMON_BOX_FILTER, MSR_UNC_C10_PMON_BOX_FILTER1},
+    [CBOX11] = {MSR_UNC_C11_PMON_BOX_CTL, 0, 0, 14, 0, 0, 44, MSR_UNC_C11_PMON_BOX_FILTER, MSR_UNC_C11_PMON_BOX_FILTER1},
+    [CBOX12] = {MSR_UNC_C12_PMON_BOX_CTL, 0, 0, 15, 0, 0, 44, MSR_UNC_C12_PMON_BOX_FILTER, MSR_UNC_C12_PMON_BOX_FILTER1},
+    [CBOX13] = {MSR_UNC_C13_PMON_BOX_CTL, 0, 0, 16, 0, 0, 44, MSR_UNC_C13_PMON_BOX_FILTER, MSR_UNC_C13_PMON_BOX_FILTER1},
+    [CBOX14] = {MSR_UNC_C14_PMON_BOX_CTL, 0, 0, 17, 0, 0, 44, MSR_UNC_C14_PMON_BOX_FILTER, MSR_UNC_C14_PMON_BOX_FILTER1},
+    [BBOX0] = {PCI_UNC_HA_PMON_BOX_CTL, PCI_UNC_HA_PMON_BOX_STATUS, PCI_UNC_HA_PMON_BOX_STATUS, 18, 1, PCI_HA_DEVICE_0, 48},
+    [BBOX1] = {PCI_UNC_HA_PMON_BOX_CTL, PCI_UNC_HA_PMON_BOX_STATUS, PCI_UNC_HA_PMON_BOX_STATUS, 19, 1, PCI_HA_DEVICE_1, 48},
+    [SBOX0] = {PCI_UNC_QPI_PMON_BOX_CTL, PCI_UNC_QPI_PMON_BOX_STATUS, PCI_UNC_QPI_PMON_BOX_STATUS, 22, 1, PCI_QPI_DEVICE_PORT_0, 48},
+    [SBOX1] = {PCI_UNC_QPI_PMON_BOX_CTL, PCI_UNC_QPI_PMON_BOX_STATUS, PCI_UNC_QPI_PMON_BOX_STATUS, 23, 1, PCI_QPI_DEVICE_PORT_1, 48},
+    [SBOX2] = {PCI_UNC_QPI_PMON_BOX_CTL, PCI_UNC_QPI_PMON_BOX_STATUS, PCI_UNC_QPI_PMON_BOX_STATUS, -1, 1, PCI_QPI_DEVICE_PORT_2, 48},
+    [SBOX0FIX] = {0, 0, 0, 0, 1, PCI_QPI_MISC_DEVICE_PORT_0, 64},
+    [SBOX1FIX] = {0, 0, 0, 0, 1, PCI_QPI_MISC_DEVICE_PORT_0, 64},
+    [SBOX2FIX] = {0, 0, 0, 0, 1, PCI_QPI_MISC_DEVICE_PORT_2, 64},
+    [WBOX] = {MSR_UNC_PCU_PMON_BOX_CTL, MSR_UNC_PCU_PMON_BOX_STATUS, MSR_UNC_PCU_PMON_BOX_STATUS, 2, 0, 0, 48,  MSR_UNC_PCU_PMON_BOX_FILTER},
+    [WBOX0FIX] = {0, 0, 0, 0, 0, 0, 64},
+    [WBOX1FIX] = {0, 0, 0, 0, 0, 0, 64},
+    [UBOX] = {0, MSR_UNC_U_PMON_BOX_STATUS, MSR_UNC_U_PMON_BOX_STATUS, 1, 0, 0, 44},
+    [UBOXFIX] = {0, MSR_UNC_U_PMON_BOX_STATUS, MSR_UNC_U_PMON_BOX_STATUS, 0, 0, 0, 44},
+    [PBOX] = {PCI_UNC_R2PCIE_PMON_BOX_CTL, PCI_UNC_R2PCIE_PMON_BOX_STATUS, PCI_UNC_R2PCIE_PMON_BOX_STATUS, 26, 1,PCI_R2PCIE_DEVICE, 44},
+    [RBOX0] = {PCI_UNC_R3QPI_PMON_BOX_CTL, PCI_UNC_R3QPI_PMON_BOX_STATUS, PCI_UNC_R3QPI_PMON_BOX_STATUS, 24, 1,PCI_R3QPI_DEVICE_LINK_0, 44},
+    [RBOX1] = {PCI_UNC_R3QPI_PMON_BOX_CTL, PCI_UNC_R3QPI_PMON_BOX_STATUS, PCI_UNC_R3QPI_PMON_BOX_STATUS, 25, 1,PCI_R3QPI_DEVICE_LINK_1, 44},
+    [RBOX2] = {PCI_UNC_R3QPI_PMON_BOX_CTL, PCI_UNC_R3QPI_PMON_BOX_STATUS, PCI_UNC_R3QPI_PMON_BOX_STATUS, -1, 1,PCI_R3QPI_DEVICE_LINK_2, 44},
+    [IBOX0] = {PCI_UNC_IRP_PMON_BOX_CTL, PCI_UNC_IRP_PMON_BOX_STATUS, PCI_UNC_IRP_PMON_BOX_STATUS, -1, 1, PCI_IRP_DEVICE, 44},
+    [IBOX1] = {PCI_UNC_IRP_PMON_BOX_CTL, PCI_UNC_IRP_PMON_BOX_STATUS, PCI_UNC_IRP_PMON_BOX_STATUS, -1, 1, PCI_IRP_DEVICE, 44},
+};
+
+static PciDevice ivybridgeEP_pci_devices[MAX_NUM_PCI_DEVICES] = { /* PCI devices referenced by the uncore counter tables. NOTE(review): fields are presumably device type, "device.function" path, device name, LIKWID box name, PCI device ID - confirm against the PciDevice definition */
+ [MSR_DEV] = {NODEVTYPE, "", "", ""},
+ [PCI_R3QPI_DEVICE_LINK_0] = {R3QPI, "13.5", "PCI_R3QPI_DEVICE_LINK_0", "RBOX0", 0x0e36},
+ [PCI_R3QPI_DEVICE_LINK_1] = {R3QPI, "13.6", "PCI_R3QPI_DEVICE_LINK_1", "RBOX1", 0x0e37},
+ [PCI_R3QPI_DEVICE_LINK_2] = {R3QPI, "12.5", "PCI_R3QPI_DEVICE_LINK_2", "RBOX2", 0x0e3e},
+ [PCI_R2PCIE_DEVICE] = {R2PCIE, "13.1", "PCI_R2PCIE_DEVICE", "PBOX0", 0x0e34},
+ [PCI_IMC_DEVICE_0_CH_0] = {IMC, "10.4", "PCI_IMC_DEVICE_0_CH_0", "MBOX0", 0x0eb4},
+ [PCI_IMC_DEVICE_0_CH_1] = {IMC, "10.5", "PCI_IMC_DEVICE_0_CH_1", "MBOX1", 0x0eb5},
+ [PCI_IMC_DEVICE_0_CH_2] = {IMC, "10.0", "PCI_IMC_DEVICE_0_CH_2", "MBOX2", 0x0eb0},
+ [PCI_IMC_DEVICE_0_CH_3] = {IMC, "10.1", "PCI_IMC_DEVICE_0_CH_3", "MBOX3", 0x0eb1},
+ [PCI_HA_DEVICE_0] = {HA, "0e.1", "PCI_HA_DEVICE_0", "BBOX0", 0x0e30},
+ [PCI_HA_DEVICE_1] = {HA, "1c.1", "PCI_HA_DEVICE_1", "BBOX1", 0x0e38},
+ [PCI_IMC_DEVICE_1_CH_0] = {IMC, "1e.4", "PCI_IMC_DEVICE_1_CH_0", "MBOX4", 0x0ef4},
+ [PCI_IMC_DEVICE_1_CH_1] = {IMC, "1e.5", "PCI_IMC_DEVICE_1_CH_1", "MBOX5", 0x0ef5},
+ [PCI_IMC_DEVICE_1_CH_2] = {IMC, "1e.0", "PCI_IMC_DEVICE_1_CH_2", "MBOX6", 0x0ef0},
+ [PCI_IMC_DEVICE_1_CH_3] = {IMC, "1e.1", "PCI_IMC_DEVICE_1_CH_3", "MBOX7", 0x0ef1},
+ [PCI_IRP_DEVICE] = {IRP, "05.6", "PCI_IRP_DEVICE", NULL, 0x0e39},
+ [PCI_QPI_DEVICE_PORT_0] = {QPI, "08.2", "PCI_QPI_DEVICE_PORT_0", "SBOX0", 0x0e32},
+ [PCI_QPI_DEVICE_PORT_1] = {QPI, "09.2", "PCI_QPI_DEVICE_PORT_1", "SBOX1", 0x0e33},
+ [PCI_QPI_DEVICE_PORT_2] = {QPI, "0a.2", "PCI_QPI_DEVICE_PORT_2", "SBOX2", 0x0e3a},
+ [PCI_QPI_MASK_DEVICE_PORT_0] = {QPI, "08.6", "PCI_QPI_MASK_DEVICE_PORT_0", NULL, 0x0e86},
+ [PCI_QPI_MASK_DEVICE_PORT_1] = {QPI, "09.6", "PCI_QPI_MASK_DEVICE_PORT_1", NULL, 0x0e96},
+ [PCI_QPI_MASK_DEVICE_PORT_2] = {QPI, "0a.6", "PCI_QPI_MASK_DEVICE_PORT_2", NULL, 0x0ec6},
+ [PCI_QPI_MISC_DEVICE_PORT_0] = {QPI, "08.0", "PCI_QPI_MISC_DEVICE_PORT_0/1", "SBOX01FIX",0x0e80}, /* one misc device serves both QPI port 0 and port 1 (SBOX0FIX and SBOX1FIX) */
+ [PCI_QPI_MISC_DEVICE_PORT_2] = {QPI, "0a.0", "PCI_QPI_MISC_DEVICE_PORT_2", "SBOX2FIX", 0x0ec0},
+};
diff --git a/src/includes/perfmon_ivybridgeEP_events.txt b/src/includes/perfmon_ivybridgeEP_events.txt
new file mode 100644
index 0000000..e71e1cf
--- /dev/null
+++ b/src/includes/perfmon_ivybridgeEP_events.txt
@@ -0,0 +1,2072 @@
+# =======================================================================================
+#
+#      Filename:  perfmon_ivybridgeEP_events.txt
+#
+#      Description:  Event list for Intel Ivy Bridge EP/EN/EX
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
+#      Project:  likwid
+#
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+#
+#      This program is free software: you can redistribute it and/or modify it under
+#      the terms of the GNU General Public License as published by the Free Software
+#      Foundation, either version 3 of the License, or (at your option) any later
+#      version.
+#
+#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+#      You should have received a copy of the GNU General Public License along with
+#      this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+EVENT_TEMP_CORE               0x00   TMP0
+UMASK_TEMP_CORE               0x00
+
+EVENT_PWR_PKG_ENERGY          0x00   PWR0
+UMASK_PWR_PKG_ENERGY          0x00
+
+EVENT_PWR_PP0_ENERGY          0x00   PWR1
+UMASK_PWR_PP0_ENERGY          0x00
+
+EVENT_PWR_DRAM_ENERGY         0x00   PWR3
+UMASK_PWR_DRAM_ENERGY         0x00
+
+EVENT_INSTR_RETIRED           0x00   FIXC0
+UMASK_INSTR_RETIRED_ANY       0x00
+
+EVENT_CPU_CLK_UNHALTED        0x00   FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE   0x00
+
+EVENT_CPU_CLK_UNHALTED        0x00   FIXC2
+UMASK_CPU_CLK_UNHALTED_REF    0x00
+
+EVENT_LD_BLOCKS                 0x03  PMC
+UMASK_LD_BLOCKS_STORE_FORWARD   0x02
+UMASK_LD_BLOCKS_NO_SR           0x08
+
+EVENT_MISALIGN_MEM_REF            0x05  PMC
+UMASK_MISALIGN_MEM_REF_LOADS      0x01
+UMASK_MISALIGN_MEM_REF_STORES     0x02
+UMASK_MISALIGN_MEM_REF_ANY        0x03
+
+EVENT_LD_BLOCKS_PARTIAL                 0x07  PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS   0x01
+
+EVENT_DTLB_LOAD_MISSES                 0x08  PMC
+UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK   0x81
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED  0x82
+UMASK_DTLB_LOAD_MISSES_WALK_DURATION   0x84
+
+EVENT_INT_MISC                       0x0D  PMC
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_INT_MISC_RECOVERY_CYCLES       0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_INT_MISC_RECOVERY_COUNT        0x03
+
+EVENT_UOPS_ISSUED                     0x0E  PMC
+UMASK_UOPS_ISSUED_ANY                 0x01
+UMASK_UOPS_ISSUED_FLAGS_MERGE         0x10
+UMASK_UOPS_ISSUED_SLOW_LEA            0x20
+UMASK_UOPS_ISSUED_SINGLE_MUL          0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_USED_CYCLES         0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES        0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES        0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_ANY            0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_FLAGS_MERGE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_FLAGS_MERGE    0x10
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_SLOW_LEA EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_SLOW_LEA       0x20
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_SINGLE_MUL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_SINGLE_MUL     0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_USED_CYCLES    0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_STALL_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_TOTAL_CYCLES   0x01
+
+EVENT_FP_COMP_OPS_EXE                          0x10   PMC
+UMASK_FP_COMP_OPS_EXE_X87                      0x01
+UMASK_FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE     0x10
+UMASK_FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE     0x20
+UMASK_FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE     0x40
+UMASK_FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE     0x80
+
+EVENT_SIMD_FP_256_PACKED            0x11   PMC
+UMASK_SIMD_FP_256_PACKED_SINGLE     0x01
+UMASK_SIMD_FP_256_PACKED_DOUBLE     0x02
+
+EVENT_ARITH                      0x14   PMC
+UMASK_ARITH_FPU_DIV_ACTIVE       0x01
+DEFAULT_OPTIONS_ARITH_NUM_DIV    EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_ARITH_NUM_DIV              0x01
+
+EVENT_L2_RQSTS                         0x24   PMC
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD_HIT 0x01
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD     0x03
+UMASK_L2_RQSTS_RFO_HITS                0x04
+UMASK_L2_RQSTS_RFO_MISS                0x08
+UMASK_L2_RQSTS_RFO_ANY                 0x0C
+UMASK_L2_RQSTS_CODE_RD_HITS            0x10
+UMASK_L2_RQSTS_CODE_RD_MISS            0x20
+UMASK_L2_RQSTS_ALL_CODE_CODE_RD        0x30
+UMASK_L2_RQSTS_PF_HIT                  0x40
+UMASK_L2_RQSTS_PF_MISS                 0x80
+UMASK_L2_RQSTS_ALL_PF                  0xC0
+UMASK_L2_RQSTS_MISS                    0xAA
+
+EVENT_L2_STORE_LOCK_RQSTS            0x27   PMC
+UMASK_L2_STORE_LOCK_RQSTS_MISS       0x01
+UMASK_L2_STORE_LOCK_RQSTS_HIT_M      0x08
+UMASK_L2_STORE_LOCK_RQSTS_ALL        0x0F
+
+EVENT_L1D_WB_RQST                  0x28   PMC
+UMASK_L1D_WB_RQST_HIT_E            0x04
+UMASK_L1D_WB_RQST_HIT_M            0x08
+UMASK_L1D_WB_RQST_ALL              0x0F
+
+EVENT_L3_LAT_CACHE               0x2E   PMC
+UMASK_L3_LAT_CACHE_REFERENCE     0x4F
+UMASK_L3_LAT_CACHE_MISS          0x41
+
+EVENT_CPU_CLOCK_UNHALTED           0x3C   PMC
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P  0x00
+UMASK_CPU_CLOCK_UNHALTED_REF_P     0x01
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x2,EVENT_OPTION_INVERT=0x1
+UMASK_CPU_CLOCK_UNHALTED_TOTAL_CYCLES   0x00
+
+EVENT_L1D_PEND_MISS              0x48   PMC1
+UMASK_L1D_PEND_MISS_PENDING      0x01
+
+EVENT_DTLB_STORE_MISSES                      0x49   PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK        0x01
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED       0x02
+UMASK_DTLB_STORE_MISSES_WALK_DURATION        0x04
+UMASK_DTLB_STORE_MISSES_STLB_HIT             0x10
+
+EVENT_LOAD_HIT_PRE                     0x4C    PMC
+UMASK_LOAD_HIT_PRE_SW_PF               0x01
+UMASK_LOAD_HIT_PRE_HW_PF               0x02
+
+EVENT_L1D                         0x51   PMC
+UMASK_L1D_REPLACEMENT             0x01
+UMASK_L1D_ALLOCATED_IN_M          0x02
+UMASK_L1D_M_EVICT                 0x04
+UMASK_L1D_ALL_M_REPLACEMENT       0x08
+
+EVENT_MOVE_ELIMINATION                        0x58   PMC
+UMASK_MOVE_ELIMINATION_INT_NOT_ELIMINATED     0x04
+UMASK_MOVE_ELIMINATION_SIMD_NOT_ELIMINATED    0x08
+UMASK_MOVE_ELIMINATION_INT_ELIMINATED         0x01
+UMASK_MOVE_ELIMINATION_SIMD_ELIMINATED        0x02
+
+EVENT_CPL_CYCLES               0x5C    PMC
+UMASK_CPL_CYCLES_RING0         0x01
+UMASK_CPL_CYCLES_RING123       0x02
+
+EVENT_RS_EVENTS                 0x5E    PMC
+UMASK_RS_EVENTS_EMPTY_CYCLES    0x01
+
+EVENT_OFFCORE_REQUESTS_OUTSTANDING                  0x60   PMC
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO       0x04
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD      0x08
+
+EVENT_CACHE_LOCK_CYCLES                              0x63   PMC
+UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION  0x01
+DEFAULT_OPTIONS_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_COUNT EVENT_OPTION_EDGE=1
+UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_COUNT     0x01
+UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_DURATION          0x02
+DEFAULT_OPTIONS_CACHE_LOCK_CYCLES_CACHE_LOCK_COUNT EVENT_OPTION_EDGE=1
+UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_COUNT             0x02
+
+EVENT_IDQ                              0x79   PMC
+UMASK_IDQ_EMPTY                        0x02
+UMASK_IDQ_MITE_UOPS                    0x04
+UMASK_IDQ_DSB_UOPS                     0x08
+UMASK_IDQ_MS_DSB_UOPS                  0x10
+UMASK_IDQ_MS_MITE_UOPS                 0x20
+UMASK_IDQ_MS_UOPS                      0x30
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS      0x18
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS        0x18
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS     0x24
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS       0x24
+UMASK_IDQ_ALL_MITE_ALL_UOPS            0x3C
+
+EVENT_ICACHE                    0x80   PMC
+UMASK_ICACHE_HITS               0x01
+UMASK_ICACHE_MISSES             0x02
+UMASK_ICACHE_ACCESSES           0x03
+UMASK_ICACHE_IFETCH_STALL       0x04
+
+EVENT_ITLB_MISSES                 0x85      PMC
+UMASK_ITLB_MISSES_CAUSES_A_WALK   0x01
+UMASK_ITLB_MISSES_WALK_COMPLETED  0x02
+UMASK_ITLB_MISSES_WALK_DURATION   0x04
+UMASK_ITLB_MISSES_STLB_HIT        0x10
+
+EVENT_ILD_STALL                 0x87      PMC
+UMASK_ILD_STALL_LCP             0x01
+UMASK_ILD_STALL_IQ_FULL         0x04
+
+EVENT_BR_INST_EXEC                                      0x88   PMC
+UMASK_BR_INST_EXEC_COND_TAKEN                           0x81
+UMASK_BR_INST_EXEC_COND_NON_TAKEN                       0x41
+UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN                     0x82
+UMASK_BR_INST_EXEC_DIRECT_JMP_NON_TAKEN                 0x42
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN  0x44
+UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN                    0x88
+UMASK_BR_INST_EXEC_RETURN_NEAR_NON_TAKEN                0x48
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_NON_TAKEN           0x50
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN         0x60
+UMASK_BR_INST_EXEC_ALL_BRANCHES                         0xFF
+
+EVENT_BR_MISP_EXEC                                      0x89   PMC
+UMASK_BR_MISP_EXEC_COND_TAKEN                           0x81
+UMASK_BR_MISP_EXEC_COND_NON_TAKEN                       0x41
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN  0x44
+UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN                    0x88
+UMASK_BR_MISP_EXEC_RETURN_NEAR_NON_TAKEN                0x48
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_NON_TAKEN           0x50
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN         0x60
+UMASK_BR_MISP_EXEC_ALL_BRANCHES                         0xFF
+
+EVENT_IDQ_UOPS_NOT_DELIVERED                            0x9C   PMC
+UMASK_IDQ_UOPS_NOT_DELIVERED_CORE                       0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE   0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x3
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x2
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK           0x01
+
+EVENT_UOPS_DISPATCHED_PORT                  0xA1   PMC
+UMASK_UOPS_DISPATCHED_PORT_PORT_0           0x01
+UMASK_UOPS_DISPATCHED_PORT_PORT_1           0x02
+UMASK_UOPS_DISPATCHED_PORT_PORT_2_LD        0x04
+UMASK_UOPS_DISPATCHED_PORT_PORT_2_STA       0x08
+UMASK_UOPS_DISPATCHED_PORT_PORT_2           0x0C
+UMASK_UOPS_DISPATCHED_PORT_PORT_3_LD        0x10
+UMASK_UOPS_DISPATCHED_PORT_PORT_3_STA       0x20
+UMASK_UOPS_DISPATCHED_PORT_PORT_3           0x30
+UMASK_UOPS_DISPATCHED_PORT_PORT_4           0x40
+UMASK_UOPS_DISPATCHED_PORT_PORT_5           0x80
+DEFAULT_OPTIONS_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE 0x83
+UMASK_UOPS_DISPATCHED_PORT_DATA_PORTS       0x7C
+UMASK_UOPS_DISPATCHED_PORT_ALL_PORTS        0xFF
+
+EVENT_RESOURCE_STALLS                 0xA2   PMC
+UMASK_RESOURCE_STALLS_ANY             0x01
+UMASK_RESOURCE_STALLS_RS              0x04
+UMASK_RESOURCE_STALLS_SB              0x08
+UMASK_RESOURCE_STALLS_ROB             0x10
+
+EVENT_CYCLE_ACTIVITY                               0xA3   PMC
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L2_PENDING   EVENT_OPTION_THRESHOLD=0x01
+UMASK_CYCLE_ACTIVITY_CYCLES_L2_PENDING             0x01
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_LDM_PENDING  EVENT_OPTION_THRESHOLD=0x02
+UMASK_CYCLE_ACTIVITY_CYCLES_LDM_PENDING            0x02
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_L1D_PENDING         EVENT_OPTION_THRESHOLD=0x08
+UMASK_CYCLE_ACTIVITY_L1D_PENDING                   0x08
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE   EVENT_OPTION_THRESHOLD=0x04
+UMASK_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE             0x04
+
+EVENT_DSB2MITE_SWITCHES                 0xAB   PMC
+UMASK_DSB2MITE_SWITCHES_COUNT           0x01
+UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES  0x02
+
+EVENT_DSB_FILL                          0xAC   PMC
+UMASK_DSB_FILL_EXCEED_DSB_LINES         0x08
+
+EVENT_ITLB                              0xAE   PMC
+UMASK_ITLB_ITLB_FLUSH                   0x01
+
+EVENT_OFFCORE_REQUESTS                  0xB0   PMC
+UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_DEMAND_RFO       0x04
+UMASK_OFFCORE_REQUESTS_ALL_DATA_RD      0x08
+
+EVENT_UOPS_EXECUTED                       0xB1   PMC
+UMASK_UOPS_EXECUTED_THREAD                0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_USED_CYCLES           0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_STALL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_TOTAL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC 0x01
+UMASK_UOPS_EXECUTED_CORE                       0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_USED_CYCLES           0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES          0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_TOTAL_CYCLES          0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC 0x02
+
+EVENT_TLB_FLUSH                 0xBD  PMC
+UMASK_TLB_FLUSH_DTLB_THREAD     0x01
+UMASK_TLB_FLUSH_STLB_ANY        0x20
+
+EVENT_INST_RETIRED                  0xC0  PMC1
+UMASK_INST_RETIRED_ANY_P            0x00
+UMASK_INST_RETIRED_ALL              0x01
+
+EVENT_OTHER_ASSISTS                  0xC1  PMC
+UMASK_OTHER_ASSISTS_AVX_STORE        0x08
+UMASK_OTHER_ASSISTS_AVX_TO_SSE       0x10
+UMASK_OTHER_ASSISTS_SSE_TO_AVX       0x20
+
+EVENT_UOPS_RETIRED                       0xC2  PMC
+UMASK_UOPS_RETIRED_ALL                   0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL              0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS          0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES           0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL              0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_USED_CYCLES      0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_STALL_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES     0x01
+
+EVENT_MACHINE_CLEARS                    0xC3  PMC
+UMASK_MACHINE_CLEARS_CYCLES             0x01
+DEFAULT_OPTIONS_MACHINE_CLEARS_COUNT    EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_MACHINE_CLEARS_COUNT              0x01
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING    0x02
+UMASK_MACHINE_CLEARS_SMC                0x04
+UMASK_MACHINE_CLEARS_MASKMOV            0x20
+
+EVENT_BR_INST_RETIRED               0xC4  PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x00
+UMASK_BR_INST_RETIRED_CONDITIONAL   0x01
+UMASK_BR_INST_RETIRED_NEAR_CALL     0x02
+UMASK_BR_INST_RETIRED_NEAR_RETURN   0x08
+UMASK_BR_INST_RETIRED_NOT_TAKEN     0x10
+UMASK_BR_INST_RETIRED_NEAR_TAKEN    0x20
+UMASK_BR_INST_RETIRED_FAR_BRANCH    0x40
+
+EVENT_BR_MISP_RETIRED               0xC5  PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES  0x00
+UMASK_BR_MISP_RETIRED_CONDITIONAL   0x01
+UMASK_BR_MISP_RETIRED_NEAR_CALL     0x02
+UMASK_BR_MISP_RETIRED_NOT_TAKEN     0x10
+UMASK_BR_MISP_RETIRED_TAKEN         0x20
+
+EVENT_FP_ASSIST                     0xCA  PMC
+UMASK_FP_ASSIST_X87_OUTPUT          0x02
+UMASK_FP_ASSIST_X87_INPUT           0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT         0x08
+UMASK_FP_ASSIST_SIMD_INPUT          0x10
+UMASK_FP_ASSIST_ANY                 0x1E
+
+EVENT_ROB_MISC_EVENT_LBR_INSERTS               0xCC  PMC
+UMASK_ROB_MISC_EVENT_LBR_INSERTS               0x20
+
+EVENT_MEM_UOPS_RETIRED                  0xD0    PMC
+UMASK_MEM_UOPS_RETIRED_LOADS            0x81
+UMASK_MEM_UOPS_RETIRED_STORES           0x82
+UMASK_MEM_UOPS_RETIRED_LOADS_STLB_MISS  0x11
+UMASK_MEM_UOPS_RETIRED_STORES_STLB_MISS 0x12
+UMASK_MEM_UOPS_RETIRED_LOADS_LOCK       0x21
+UMASK_MEM_UOPS_RETIRED_STORES_LOCK      0x22
+UMASK_MEM_UOPS_RETIRED_LOADS_SPLIT      0x41
+UMASK_MEM_UOPS_RETIRED_STORES_SPLIT     0x42
+
+EVENT_MEM_LOAD_UOPS_RETIRED               0xD1   PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT       0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS      0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL       0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT       0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS      0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL       0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT       0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS      0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL       0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB      0x40
+UMASK_MEM_LOAD_UOPS_RETIRED_ALL          0x7F
+
+EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED                   0xD2   PMC
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_MISS         0x01
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT          0x02
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM         0x04
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NONE         0x08
+
+EVENT_BACLEARS               0xE6   PMC
+UMASK_BACLEARS_ANY           0x1F
+
+EVENT_L2_TRANS                0xF0  PMC
+UMASK_L2_TRANS_DEMAND_DATA_RD 0x01
+UMASK_L2_TRANS_RFO            0x02
+UMASK_L2_TRANS_CODE_RD        0x04
+UMASK_L2_TRANS_ALL_PREF       0x08
+UMASK_L2_TRANS_L1D_WB         0x10
+UMASK_L2_TRANS_L2_FILL        0x20
+UMASK_L2_TRANS_L2_WB          0x40
+UMASK_L2_TRANS_ALL_REQUESTS   0x80
+
+EVENT_L2_LINES_IN             0xF1   PMC
+UMASK_L2_LINES_IN_I           0x01
+UMASK_L2_LINES_IN_S           0x02
+UMASK_L2_LINES_IN_E           0x04
+UMASK_L2_LINES_IN_ALL         0x07
+
+EVENT_L2_LINES_OUT                  0xF2   PMC
+UMASK_L2_LINES_OUT_DEMAND_CLEAN     0x01
+UMASK_L2_LINES_OUT_DEMAND_DIRTY     0x02
+UMASK_L2_LINES_OUT_PF_CLEAN         0x04
+UMASK_L2_LINES_OUT_PF_DIRTY         0x08
+UMASK_L2_LINES_OUT_DIRTY_ALL        0x0A
+UMASK_L2_LINES_OUT_CLEAN_ALL        0x05
+UMASK_L2_LINES_OUT_ALL              0x0F
+
+EVENT_MEM_LOAD_UOPS_LLC_MISS_RETIRED                0xD3   PMC
+UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_LOCAL_DRAM     0x01
+UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_DRAM    0x0C
+UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM    0x10
+UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_FWD     0x20
+
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM               EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM       EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x78040
+UMASK_OFFCORE_RESPONSE_0_LOCAL_DRAM                 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM              EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM      EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x7FF80
+UMASK_OFFCORE_RESPONSE_0_REMOTE_DRAM                0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1                            0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM               EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM       EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x78040
+UMASK_OFFCORE_RESPONSE_1_LOCAL_DRAM                 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM              EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM      EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x7FF80
+UMASK_OFFCORE_RESPONSE_1_REMOTE_DRAM                0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_CBOX_CLOCKTICKS                         0x00  CBOX
+UMASK_CBOX_CLOCKTICKS                         0x00
+
+EVENT_COUNTER0_OCCUPANCY              0x1F  CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1|CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX8C2|CBOX9C2|CBOX10C2|CBOX11C2|CBOX12C2|CBOX13C2|CBOX14C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3|CBOX8C3|CBOX9C3|CBOX10C3|CBOX11C3|CBOX12C3|CBOX13C3|CBOX14C3
+UMASK_COUNTER0_OCCUPANCY              0x00
+
+EVENT_LLC_LOOKUP              0x34  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+DEFAULT_OPTIONS_LLC_LOOKUP          EVENT_OPTION_STATE=0x1F
+OPTIONS_LLC_LOOKUP_DATA_READ        EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_DATA_READ          0x03
+OPTIONS_LLC_LOOKUP_WRITE            EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_WRITE              0x05
+OPTIONS_LLC_LOOKUP_REMOTE_SNOOP     EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_REMOTE_SNOOP       0x09
+OPTIONS_LLC_LOOKUP_ANY              EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_ANY                0x11
+OPTIONS_LLC_LOOKUP_NID              EVENT_OPTION_NID_MASK|EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_NID                0x41
+
+EVENT_LLC_VICTIMS              0x37  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_LLC_VICTIMS_M_STATE      0x01
+UMASK_LLC_VICTIMS_E_STATE      0x02
+UMASK_LLC_VICTIMS_S_STATE      0x04
+UMASK_LLC_VICTIMS_ANY          0x07
+UMASK_LLC_VICTIMS_MISS         0x08
+OPTIONS_LLC_VICTIMS_NID        EVENT_OPTION_NID_MASK
+UMASK_LLC_VICTIMS_NID          0x40
+
+EVENT_CBO_MISC              0x39  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_CBO_MISC_RSPI_WAS_FSE      0x01
+UMASK_CBO_MISC_WC_ALIASING       0x02
+UMASK_CBO_MISC_STARTED           0x04
+UMASK_CBO_MISC_RFO_HIT_S         0x08
+
+EVENT_RING_AD_USED               0x1B  CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX8C2|CBOX9C2|CBOX10C2|CBOX11C2|CBOX12C2|CBOX13C2|CBOX14C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3|CBOX8C3|CBOX9C3|CBOX10C3|CBOX11C3|CBOX12C3|CBOX13C3|CBOX14C3
+UMASK_RING_AD_USED_0_UP_EVEN      0x01
+UMASK_RING_AD_USED_0_UP_ODD       0x02
+UMASK_RING_AD_USED_0_DOWN_EVEN    0x04
+UMASK_RING_AD_USED_0_DOWN_ODD     0x08
+UMASK_RING_AD_USED_1_UP_EVEN      0x10
+UMASK_RING_AD_USED_1_UP_ODD       0x20
+UMASK_RING_AD_USED_1_DOWN_EVEN    0x40
+UMASK_RING_AD_USED_1_DOWN_ODD     0x80
+UMASK_RING_AD_USED_DOWN           0xCC
+UMASK_RING_AD_USED_UP             0x33
+UMASK_RING_AD_USED_ANY            0xFF
+
+EVENT_RING_AK_USED              0x1C  CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX8C2|CBOX9C2|CBOX10C2|CBOX11C2|CBOX12C2|CBOX13C2|CBOX14C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3|CBOX8C3|CBOX9C3|CBOX10C3|CBOX11C3|CBOX12C3|CBOX13C3|CBOX14C3
+UMASK_RING_AK_USED_0_UP_EVEN      0x01
+UMASK_RING_AK_USED_0_UP_ODD       0x02
+UMASK_RING_AK_USED_0_DOWN_EVEN    0x04
+UMASK_RING_AK_USED_0_DOWN_ODD     0x08
+UMASK_RING_AK_USED_1_UP_EVEN      0x10
+UMASK_RING_AK_USED_1_UP_ODD       0x20
+UMASK_RING_AK_USED_1_DOWN_EVEN    0x40
+UMASK_RING_AK_USED_1_DOWN_ODD     0x80
+UMASK_RING_AK_USED_DOWN           0xCC
+UMASK_RING_AK_USED_UP             0x33
+UMASK_RING_AK_USED_ANY            0xFF
+
+EVENT_RING_BL_USED              0x1D  CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX8C2|CBOX9C2|CBOX10C2|CBOX11C2|CBOX12C2|CBOX13C2|CBOX14C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3|CBOX8C3|CBOX9C3|CBOX10C3|CBOX11C3|CBOX12C3|CBOX13C3|CBOX14C3
+UMASK_RING_BL_USED_0_UP_EVEN      0x01
+UMASK_RING_BL_USED_0_UP_ODD       0x02
+UMASK_RING_BL_USED_0_DOWN_EVEN    0x04
+UMASK_RING_BL_USED_0_DOWN_ODD     0x08
+UMASK_RING_BL_USED_1_UP_EVEN      0x10
+UMASK_RING_BL_USED_1_UP_ODD       0x20
+UMASK_RING_BL_USED_1_DOWN_EVEN    0x40
+UMASK_RING_BL_USED_1_DOWN_ODD     0x80
+UMASK_RING_BL_USED_DOWN           0xCC
+UMASK_RING_BL_USED_UP             0x33
+UMASK_RING_BL_USED_ANY            0xFF
+
+EVENT_RING_BOUNCES              0x05  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RING_BOUNCES_AK_IRQ       0x02
+UMASK_RING_BOUNCES_AK_CORE      0x04
+UMASK_RING_BOUNCES_BL_CORE      0x08
+UMASK_RING_BOUNCES_IV_CORE      0x01
+
+EVENT_RING_IV_USED              0x1E  CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX8C2|CBOX9C2|CBOX10C2|CBOX11C2|CBOX12C2|CBOX13C2|CBOX14C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3|CBOX8C3|CBOX9C3|CBOX10C3|CBOX11C3|CBOX12C3|CBOX13C3|CBOX14C3
+UMASK_RING_IV_USED_ANY           0xFF
+UMASK_RING_IV_USED_UP            0x33
+UMASK_RING_IV_USED_DOWN          0xCC
+
+EVENT_RING_SRC_THRTL            0x07  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RING_SRC_THRTL            0x00
+
+EVENT_RXR_EXT_STARVED               0x12  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RXR_EXT_STARVED_IRQ           0x01
+UMASK_RXR_EXT_STARVED_IPQ           0x02
+UMASK_RXR_EXT_STARVED_PRQ           0x04
+UMASK_RXR_EXT_STARVED_ISMQ_BIDS     0x08
+
+EVENT_RXR_INSERTS                0x13  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RXR_INSERTS_IRQ            0x01
+UMASK_RXR_INSERTS_IRQ_REJECTED   0x02
+UMASK_RXR_INSERTS_IPQ            0x04
+UMASK_RXR_INSERTS_VFIFO          0x10
+
+EVENT_RXR_IPQ_RETRY                0x31  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RXR_IPQ_RETRY_ANY            0x01
+UMASK_RXR_IPQ_RETRY_FULL           0x02
+UMASK_RXR_IPQ_RETRY_ADDR_CONFLICT  0x04
+UMASK_RXR_IPQ_RETRY_QPI_CREDITS    0x10
+
+EVENT_RXR_IRQ_RETRY                0x32  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RXR_IRQ_RETRY_ANY            0x01
+UMASK_RXR_IRQ_RETRY_FULL           0x02
+UMASK_RXR_IRQ_RETRY_ADDR_CONFLICT  0x04
+UMASK_RXR_IRQ_RETRY_RTID           0x08
+UMASK_RXR_IRQ_RETRY_QPI_CREDITS    0x10
+UMASK_RXR_IRQ_RETRY_HO_CREDITS     0x20
+
+EVENT_RXR_ISMQ_RETRY                0x33  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RXR_ISMQ_RETRY_ANY            0x01
+UMASK_RXR_ISMQ_RETRY_FULL           0x02
+UMASK_RXR_ISMQ_RETRY_RTID           0x08
+UMASK_RXR_ISMQ_RETRY_QPI_CREDITS    0x10
+UMASK_RXR_ISMQ_RETRY_HO_CREDITS     0x20
+UMASK_RXR_ISMQ_RETRY_WB_CREDITS     0x80
+
+EVENT_RXR_OCCUPANCY                0x11  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0
+UMASK_RXR_OCCUPANCY_IRQ            0x01
+UMASK_RXR_OCCUPANCY_IRQ_REJECTED   0x02
+UMASK_RXR_OCCUPANCY_IPQ            0x04
+UMASK_RXR_OCCUPANCY_VFIFO          0x10
+
+EVENT_TOR_INSERTS                    0x35  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_TOR_INSERTS_OPCODE             0x01
+UMASK_TOR_INSERTS_MISS_OPCODE        0x03
+UMASK_TOR_INSERTS_EVICTION           0x04
+UMASK_TOR_INSERTS_ALL                0x08
+UMASK_TOR_INSERTS_WB                 0x10
+UMASK_TOR_INSERTS_MISS_ALL           0x0A
+UMASK_TOR_INSERTS_MISS_LOCAL         0x2A
+UMASK_TOR_INSERTS_MISS_LOCAL_OPCODE  0x23
+UMASK_TOR_INSERTS_NID_OPCODE         0x41
+UMASK_TOR_INSERTS_NID_EVICTION       0x44
+UMASK_TOR_INSERTS_NID_ALL            0x48
+UMASK_TOR_INSERTS_NID_WB             0x50
+UMASK_TOR_INSERTS_NID_MISS_OPCODE    0x43
+UMASK_TOR_INSERTS_NID_MISS_ALL       0x4A
+UMASK_TOR_INSERTS_REMOTE_OPCODE      0x81
+UMASK_TOR_INSERTS_MISS_REMOTE_OPCODE   0x83
+UMASK_TOR_INSERTS_REMOTE             0x88
+UMASK_TOR_INSERTS_MISS_REMOTE        0x8A
+
+EVENT_TOR_OCCUPANCY                    0x36  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0
+UMASK_TOR_OCCUPANCY_OPCODE             0x01
+UMASK_TOR_OCCUPANCY_MISS_OPCODE        0x03
+UMASK_TOR_OCCUPANCY_EVICTION           0x04
+UMASK_TOR_OCCUPANCY_ALL                0x08
+UMASK_TOR_OCCUPANCY_MISS_ALL           0x0A
+UMASK_TOR_OCCUPANCY_WB                 0x10
+UMASK_TOR_OCCUPANCY_LOCAL_OPCODE       0x21
+UMASK_TOR_OCCUPANCY_MISS_LOCAL_OPCODE  0x23
+UMASK_TOR_OCCUPANCY_LOCAL              0x28
+UMASK_TOR_OCCUPANCY_MISS_LOCAL         0x2A
+UMASK_TOR_OCCUPANCY_NID_OPCODE         0x41
+UMASK_TOR_OCCUPANCY_NID_EVICTION       0x44
+UMASK_TOR_OCCUPANCY_NID_ALL            0x48
+UMASK_TOR_OCCUPANCY_NID_MISS_OPCODE    0x43
+UMASK_TOR_OCCUPANCY_NID_MISS_ALL       0x4A
+UMASK_TOR_OCCUPANCY_NID_WB             0x50
+UMASK_TOR_OCCUPANCY_REMOTE_OPCODE       0x81
+UMASK_TOR_OCCUPANCY_MISS_REMOTE_OPCODE  0x83
+UMASK_TOR_OCCUPANCY_REMOTE              0x88
+UMASK_TOR_OCCUPANCY_MISS_REMOTE         0x8A
+
+EVENT_TXR_ADS_USED                0x04  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_TXR_ADS_USED_AD            0x01
+UMASK_TXR_ADS_USED_AK            0x02
+UMASK_TXR_ADS_USED_BL            0x04
+
+EVENT_TXR_INSERTS                0x02  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_TXR_INSERTS_AD_CACHE            0x01
+UMASK_TXR_INSERTS_AK_CACHE            0x02
+UMASK_TXR_INSERTS_BL_CACHE            0x04
+UMASK_TXR_INSERTS_IV_CACHE            0x08
+UMASK_TXR_INSERTS_AD_CORE             0x10
+UMASK_TXR_INSERTS_AK_CORE             0x20
+UMASK_TXR_INSERTS_BL_CORE             0x40
+
+EVENT_DRAM_CLOCKTICKS             0x00  MBOX0FIX|MBOX1FIX|MBOX2FIX|MBOX3FIX
+UMASK_DRAM_CLOCKTICKS             0x00
+
+EVENT_ACT_COUNT                  0x01  MBOX
+UMASK_ACT_COUNT_RD                 0x01
+UMASK_ACT_COUNT_WR                 0x02
+UMASK_ACT_COUNT_BYP                0x08
+
+EVENT_BYP_CMDS                  0xA1  MBOX
+UMASK_BYP_CMDS_ACT                 0x01
+UMASK_BYP_CMDS_CAS                 0x02
+UMASK_BYP_CMDS_PRE                 0x04
+
+EVENT_CAS_COUNT                  0x04  MBOX
+UMASK_CAS_COUNT_RD_REG           0x01
+UMASK_CAS_COUNT_RD_UNDERFILL     0x02
+UMASK_CAS_COUNT_RD               0x03
+UMASK_CAS_COUNT_WR_WMM           0x04
+UMASK_CAS_COUNT_WR_RMM           0x08
+UMASK_CAS_COUNT_WR               0x0C
+UMASK_CAS_COUNT_ALL              0x0F
+UMASK_CAS_COUNT_RD_WMM           0x10
+UMASK_CAS_COUNT_RD_RMM           0x20
+
+EVENT_DRAM_PRE_ALL                  0x06  MBOX
+UMASK_DRAM_PRE_ALL                  0x00
+
+EVENT_DRAM_REFRESH                  0x05  MBOX
+UMASK_DRAM_REFRESH_PANIC            0x02
+UMASK_DRAM_REFRESH_HIGH             0x04
+
+EVENT_ECC_CORRECTABLE_ERRORS           0x09  MBOX
+UMASK_ECC_CORRECTABLE_ERRORS           0x00
+
+EVENT_MAJOR_MODES                  0x07  MBOX
+UMASK_MAJOR_MODES_READ             0x01
+UMASK_MAJOR_MODES_WRITE            0x02
+UMASK_MAJOR_MODES_PARTIAL          0x04
+UMASK_MAJOR_MODES_ISOCH            0x08
+
+EVENT_POWER_CHANNEL_DLLOFF           0x84  MBOX
+UMASK_POWER_CHANNEL_DLLOFF           0x00
+
+EVENT_POWER_CHANNEL_PPD           0x85  MBOX
+UMASK_POWER_CHANNEL_PPD           0x00
+
+EVENT_POWER_CKE_CYCLES                  0x83  MBOX
+UMASK_POWER_CKE_CYCLES_RANK0            0x01
+UMASK_POWER_CKE_CYCLES_RANK1            0x02
+UMASK_POWER_CKE_CYCLES_RANK2            0x04
+UMASK_POWER_CKE_CYCLES_RANK3            0x08
+UMASK_POWER_CKE_CYCLES_RANK4            0x10
+UMASK_POWER_CKE_CYCLES_RANK5            0x20
+UMASK_POWER_CKE_CYCLES_RANK6            0x40
+UMASK_POWER_CKE_CYCLES_RANK7            0x80
+
+EVENT_POWER_CRITICAL_THROTTLE_CYCLES           0x86  MBOX
+UMASK_POWER_CRITICAL_THROTTLE_CYCLES           0x00
+
+EVENT_POWER_PCU_THROTTLING           0x42  MBOX
+UMASK_POWER_PCU_THROTTLING           0x00
+
+EVENT_POWER_SELF_REFRESH           0x43  MBOX
+UMASK_POWER_SELF_REFRESH           0x00
+
+EVENT_POWER_THROTTLE_CYCLES                  0x41  MBOX
+UMASK_POWER_THROTTLE_CYCLES_RANK0            0x01
+UMASK_POWER_THROTTLE_CYCLES_RANK1            0x02
+UMASK_POWER_THROTTLE_CYCLES_RANK2            0x04
+UMASK_POWER_THROTTLE_CYCLES_RANK3            0x08
+UMASK_POWER_THROTTLE_CYCLES_RANK4            0x10
+UMASK_POWER_THROTTLE_CYCLES_RANK5            0x20
+UMASK_POWER_THROTTLE_CYCLES_RANK6            0x40
+UMASK_POWER_THROTTLE_CYCLES_RANK7            0x80
+
+EVENT_PREEMPTION           0x08  MBOX
+UMASK_PREEMPTION_RD_PREEMPT_RD           0x01
+UMASK_PREEMPTION_RD_PREEMPT_WR           0x02
+
+EVENT_PRE_COUNT           0x02  MBOX
+UMASK_PRE_COUNT_PAGE_MISS           0x01
+UMASK_PRE_COUNT_PAGE_CLOSE           0x02
+
+EVENT_RD_CAS_PRIO           0xA0  MBOX
+UMASK_RD_CAS_PRIO_LOW           0x01
+UMASK_RD_CAS_PRIO_MED           0x02
+UMASK_RD_CAS_PRIO_HIGH          0x04
+UMASK_RD_CAS_PRIO_PANIC         0x08
+
+EVENT_RD_CAS_RANK0           0xB0  MBOX
+UMASK_RD_CAS_RANK0_BANK0           0x01
+UMASK_RD_CAS_RANK0_BANK1           0x02
+UMASK_RD_CAS_RANK0_BANK2           0x04
+UMASK_RD_CAS_RANK0_BANK3           0x08
+UMASK_RD_CAS_RANK0_BANK4           0x10
+UMASK_RD_CAS_RANK0_BANK5           0x20
+UMASK_RD_CAS_RANK0_BANK6           0x40
+UMASK_RD_CAS_RANK0_BANK7           0x80
+
+EVENT_RD_CAS_RANK1           0xB1  MBOX
+UMASK_RD_CAS_RANK1_BANK0           0x01
+UMASK_RD_CAS_RANK1_BANK1           0x02
+UMASK_RD_CAS_RANK1_BANK2           0x04
+UMASK_RD_CAS_RANK1_BANK3           0x08
+UMASK_RD_CAS_RANK1_BANK4           0x10
+UMASK_RD_CAS_RANK1_BANK5           0x20
+UMASK_RD_CAS_RANK1_BANK6           0x40
+UMASK_RD_CAS_RANK1_BANK7           0x80
+
+EVENT_RD_CAS_RANK2           0xB2  MBOX
+UMASK_RD_CAS_RANK2_BANK0           0x01
+UMASK_RD_CAS_RANK2_BANK1           0x02
+UMASK_RD_CAS_RANK2_BANK2           0x04
+UMASK_RD_CAS_RANK2_BANK3           0x08
+UMASK_RD_CAS_RANK2_BANK4           0x10
+UMASK_RD_CAS_RANK2_BANK5           0x20
+UMASK_RD_CAS_RANK2_BANK6           0x40
+UMASK_RD_CAS_RANK2_BANK7           0x80
+
+EVENT_RD_CAS_RANK3           0xB3  MBOX
+UMASK_RD_CAS_RANK3_BANK0           0x01
+UMASK_RD_CAS_RANK3_BANK1           0x02
+UMASK_RD_CAS_RANK3_BANK2           0x04
+UMASK_RD_CAS_RANK3_BANK3           0x08
+UMASK_RD_CAS_RANK3_BANK4           0x10
+UMASK_RD_CAS_RANK3_BANK5           0x20
+UMASK_RD_CAS_RANK3_BANK6           0x40
+UMASK_RD_CAS_RANK3_BANK7           0x80
+
+EVENT_RD_CAS_RANK4           0xB4  MBOX
+UMASK_RD_CAS_RANK4_BANK0           0x01
+UMASK_RD_CAS_RANK4_BANK1           0x02
+UMASK_RD_CAS_RANK4_BANK2           0x04
+UMASK_RD_CAS_RANK4_BANK3           0x08
+UMASK_RD_CAS_RANK4_BANK4           0x10
+UMASK_RD_CAS_RANK4_BANK5           0x20
+UMASK_RD_CAS_RANK4_BANK6           0x40
+UMASK_RD_CAS_RANK4_BANK7           0x80
+
+EVENT_RD_CAS_RANK5           0xB5  MBOX
+UMASK_RD_CAS_RANK5_BANK0           0x01
+UMASK_RD_CAS_RANK5_BANK1           0x02
+UMASK_RD_CAS_RANK5_BANK2           0x04
+UMASK_RD_CAS_RANK5_BANK3           0x08
+UMASK_RD_CAS_RANK5_BANK4           0x10
+UMASK_RD_CAS_RANK5_BANK5           0x20
+UMASK_RD_CAS_RANK5_BANK6           0x40
+UMASK_RD_CAS_RANK5_BANK7           0x80
+
+EVENT_RD_CAS_RANK6           0xB6  MBOX
+UMASK_RD_CAS_RANK6_BANK0           0x01
+UMASK_RD_CAS_RANK6_BANK1           0x02
+UMASK_RD_CAS_RANK6_BANK2           0x04
+UMASK_RD_CAS_RANK6_BANK3           0x08
+UMASK_RD_CAS_RANK6_BANK4           0x10
+UMASK_RD_CAS_RANK6_BANK5           0x20
+UMASK_RD_CAS_RANK6_BANK6           0x40
+UMASK_RD_CAS_RANK6_BANK7           0x80
+
+EVENT_RD_CAS_RANK7           0xB7  MBOX
+UMASK_RD_CAS_RANK7_BANK0           0x01
+UMASK_RD_CAS_RANK7_BANK1           0x02
+UMASK_RD_CAS_RANK7_BANK2           0x04
+UMASK_RD_CAS_RANK7_BANK3           0x08
+UMASK_RD_CAS_RANK7_BANK4           0x10
+UMASK_RD_CAS_RANK7_BANK5           0x20
+UMASK_RD_CAS_RANK7_BANK6           0x40
+UMASK_RD_CAS_RANK7_BANK7           0x80
+
+EVENT_RPQ_CYCLES_NE           0x11  MBOX
+UMASK_RPQ_CYCLES_NE           0x00
+
+EVENT_RPQ_INSERTS           0x10  MBOX
+UMASK_RPQ_INSERTS           0x00
+
+EVENT_VMSE_MXB_WR_OCCUPANCY           0x91  MBOX
+UMASK_VMSE_MXB_WR_OCCUPANCY           0x00
+
+EVENT_VMSE_WR_PUSH           0x90  MBOX
+UMASK_VMSE_WR_PUSH           0x00
+
+EVENT_WMM_TO_RMM           0xC0  MBOX
+UMASK_WMM_TO_RMM           0x00
+
+EVENT_WPQ_CYCLES_FULL           0x22  MBOX
+UMASK_WPQ_CYCLES_FULL           0x00
+
+EVENT_WPQ_CYCLES_NE           0x21  MBOX
+UMASK_WPQ_CYCLES_NE           0x00
+
+EVENT_WPQ_INSERTS           0x20  MBOX
+UMASK_WPQ_INSERTS           0x00
+
+EVENT_WPQ_READ_HIT           0x23  MBOX
+UMASK_WPQ_READ_HIT           0x00
+
+EVENT_WPQ_WRITE_HIT           0x24  MBOX
+UMASK_WPQ_WRITE_HIT           0x00
+
+EVENT_WRONG_MM           0xC1  MBOX
+UMASK_WRONG_MM           0x00
+
+EVENT_WR_CAS_RANK0           0xB8  MBOX
+UMASK_WR_CAS_RANK0_BANK0           0x01
+UMASK_WR_CAS_RANK0_BANK1           0x02
+UMASK_WR_CAS_RANK0_BANK2           0x04
+UMASK_WR_CAS_RANK0_BANK3           0x08
+UMASK_WR_CAS_RANK0_BANK4           0x10
+UMASK_WR_CAS_RANK0_BANK5           0x20
+UMASK_WR_CAS_RANK0_BANK6           0x40
+UMASK_WR_CAS_RANK0_BANK7           0x80
+
+EVENT_WR_CAS_RANK1           0xB9  MBOX
+UMASK_WR_CAS_RANK1_BANK0           0x01
+UMASK_WR_CAS_RANK1_BANK1           0x02
+UMASK_WR_CAS_RANK1_BANK2           0x04
+UMASK_WR_CAS_RANK1_BANK3           0x08
+UMASK_WR_CAS_RANK1_BANK4           0x10
+UMASK_WR_CAS_RANK1_BANK5           0x20
+UMASK_WR_CAS_RANK1_BANK6           0x40
+UMASK_WR_CAS_RANK1_BANK7           0x80
+
+EVENT_WR_CAS_RANK2           0xBA  MBOX
+UMASK_WR_CAS_RANK2_BANK0           0x01
+UMASK_WR_CAS_RANK2_BANK1           0x02
+UMASK_WR_CAS_RANK2_BANK2           0x04
+UMASK_WR_CAS_RANK2_BANK3           0x08
+UMASK_WR_CAS_RANK2_BANK4           0x10
+UMASK_WR_CAS_RANK2_BANK5           0x20
+UMASK_WR_CAS_RANK2_BANK6           0x40
+UMASK_WR_CAS_RANK2_BANK7           0x80
+
+EVENT_WR_CAS_RANK3           0xBB  MBOX
+UMASK_WR_CAS_RANK3_BANK0           0x01
+UMASK_WR_CAS_RANK3_BANK1           0x02
+UMASK_WR_CAS_RANK3_BANK2           0x04
+UMASK_WR_CAS_RANK3_BANK3           0x08
+UMASK_WR_CAS_RANK3_BANK4           0x10
+UMASK_WR_CAS_RANK3_BANK5           0x20
+UMASK_WR_CAS_RANK3_BANK6           0x40
+UMASK_WR_CAS_RANK3_BANK7           0x80
+
+EVENT_WR_CAS_RANK4           0xBC  MBOX
+UMASK_WR_CAS_RANK4_BANK0           0x01
+UMASK_WR_CAS_RANK4_BANK1           0x02
+UMASK_WR_CAS_RANK4_BANK2           0x04
+UMASK_WR_CAS_RANK4_BANK3           0x08
+UMASK_WR_CAS_RANK4_BANK4           0x10
+UMASK_WR_CAS_RANK4_BANK5           0x20
+UMASK_WR_CAS_RANK4_BANK6           0x40
+UMASK_WR_CAS_RANK4_BANK7           0x80
+
+EVENT_WR_CAS_RANK5           0xBD  MBOX
+UMASK_WR_CAS_RANK5_BANK0           0x01
+UMASK_WR_CAS_RANK5_BANK1           0x02
+UMASK_WR_CAS_RANK5_BANK2           0x04
+UMASK_WR_CAS_RANK5_BANK3           0x08
+UMASK_WR_CAS_RANK5_BANK4           0x10
+UMASK_WR_CAS_RANK5_BANK5           0x20
+UMASK_WR_CAS_RANK5_BANK6           0x40
+UMASK_WR_CAS_RANK5_BANK7           0x80
+
+EVENT_WR_CAS_RANK6           0xBE  MBOX
+UMASK_WR_CAS_RANK6_BANK0           0x01
+UMASK_WR_CAS_RANK6_BANK1           0x02
+UMASK_WR_CAS_RANK6_BANK2           0x04
+UMASK_WR_CAS_RANK6_BANK3           0x08
+UMASK_WR_CAS_RANK6_BANK4           0x10
+UMASK_WR_CAS_RANK6_BANK5           0x20
+UMASK_WR_CAS_RANK6_BANK6           0x40
+UMASK_WR_CAS_RANK6_BANK7           0x80
+
+EVENT_WR_CAS_RANK7           0xBF  MBOX
+UMASK_WR_CAS_RANK7_BANK0           0x01
+UMASK_WR_CAS_RANK7_BANK1           0x02
+UMASK_WR_CAS_RANK7_BANK2           0x04
+UMASK_WR_CAS_RANK7_BANK3           0x08
+UMASK_WR_CAS_RANK7_BANK4           0x10
+UMASK_WR_CAS_RANK7_BANK5           0x20
+UMASK_WR_CAS_RANK7_BANK6           0x40
+UMASK_WR_CAS_RANK7_BANK7           0x80
+
+
+EVENT_QPI_RATE                     0x00    SBOX0FIX|SBOX1FIX|SBOX2FIX
+UMASK_QPI_RATE                     0x00
+
+EVENT_SBOX_CLOCKTICKS               0x14 SBOX0|SBOX1|SBOX2
+UMASK_SBOX_CLOCKTICKS               0x00
+
+EVENT_CTO_COUNT                     0x38 SBOX0|SBOX1|SBOX2
+OPTIONS_CTO_COUNT                   EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_CTO_COUNT                     0x00 0x01
+
+EVENT_DIRECT2CORE                               0x13 SBOX0|SBOX1|SBOX2
+OPTIONS_DIRECT2CORE_SUCCESS_RBT_HIT             EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_SUCCESS_RBT_HIT               0x01
+OPTIONS_DIRECT2CORE_FAILURE_CREDITS             EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_CREDITS               0x02
+OPTIONS_DIRECT2CORE_FAILURE_RBT_HIT             EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_RBT_HIT               0x04
+OPTIONS_DIRECT2CORE_FAILURE_CREDITS_RBT         EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_CREDITS_RBT           0x08
+OPTIONS_DIRECT2CORE_FAILURE_MISS                EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_MISS                  0x10
+OPTIONS_DIRECT2CORE_FAILURE_CREDITS_MISS        EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_CREDITS_MISS          0x20
+OPTIONS_DIRECT2CORE_FAILURE_RBT_MISS            EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_RBT_MISS              0x40
+OPTIONS_DIRECT2CORE_FAILURE_CREDITS_RBT_MISS    EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_CREDITS_RBT_MISS      0x80
+
+EVENT_L1_POWER_CYCLES           0x12 SBOX0|SBOX1|SBOX2
+UMASK_L1_POWER_CYCLES           0x00
+
+EVENT_RXL0P_POWER_CYCLES        0x10 SBOX0|SBOX1|SBOX2
+UMASK_RXL0P_POWER_CYCLES        0x00
+
+EVENT_RXL0_POWER_CYCLES         0x0F SBOX0|SBOX1|SBOX2
+UMASK_RXL0_POWER_CYCLES         0x00
+
+EVENT_RXL_BYPASSED              0x09 SBOX0|SBOX1|SBOX2
+UMASK_RXL_BYPASSED              0x00
+
+EVENT_RXL_CREDITS_CONSUMED_VN0      0x1E SBOX0|SBOX1|SBOX2
+UMASK_RXL_CREDITS_CONSUMED_VN0_DRS  0x01 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCB  0x02 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCS  0x04 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN0_HOM  0x08 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN0_SNP  0x10 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN0_NDR  0x20 0x01
+
+EVENT_RXL_CREDITS_CONSUMED_VN1      0x39 SBOX0|SBOX1|SBOX2
+UMASK_RXL_CREDITS_CONSUMED_VN1_DRS  0x01 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN1_NCB  0x02 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN1_NCS  0x04 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN1_HOM  0x08 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN1_SNP  0x10 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN1_NDR  0x20 0x01
+
+EVENT_RXL_CREDITS_CONSUMED_VNA  0x1D SBOX0|SBOX1|SBOX2
+UMASK_RXL_CREDITS_CONSUMED_VNA  0x00 0x01
+
+EVENT_RXL_CYCLES_NE             0x0A SBOX0|SBOX1|SBOX2
+UMASK_RXL_CYCLES_NE             0x00
+
+EVENT_RXL_FLITS_G0              0x01 SBOX0|SBOX1|SBOX2
+UMASK_RXL_FLITS_G0_IDLE         0x01
+UMASK_RXL_FLITS_G0_DATA         0x02
+UMASK_RXL_FLITS_G0_NON_DATA     0x04
+
+EVENT_RXL_FLITS_G1              0x02 SBOX0|SBOX1|SBOX2
+UMASK_RXL_FLITS_G1_SNP          0x01 0x01
+UMASK_RXL_FLITS_G1_HOM_REQ      0x02 0x01
+UMASK_RXL_FLITS_G1_HOM_NONREQ   0x04 0x01
+UMASK_RXL_FLITS_G1_HOM          0x06 0x01
+UMASK_RXL_FLITS_G1_DRS_DATA     0x08 0x01
+UMASK_RXL_FLITS_G1_DRS_NONDATA  0x10 0x01
+UMASK_RXL_FLITS_G1_DRS          0x18 0x01
+
+EVENT_RXL_FLITS_G2              0x03 SBOX0|SBOX1|SBOX2
+UMASK_RXL_FLITS_G2_NDR_AD       0x01 0x01
+UMASK_RXL_FLITS_G2_NDR_AK       0x02 0x01
+UMASK_RXL_FLITS_G2_NCB_DATA     0x04 0x01
+UMASK_RXL_FLITS_G2_NCB_NONDATA  0x08 0x01
+UMASK_RXL_FLITS_G2_NCB          0x0C 0x01
+UMASK_RXL_FLITS_G2_NCS          0x10 0x01
+
+EVENT_RXL_INSERTS               0x08 SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS               0x00
+
+EVENT_RXL_INSERTS_DRS           0x09 SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS_DRS_VN0       0x01 0x01
+UMASK_RXL_INSERTS_DRS_VN1       0x02 0x01
+
+EVENT_RXL_INSERTS_HOM           0x0C SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS_HOM_VN0       0x01 0x01
+UMASK_RXL_INSERTS_HOM_VN1       0x02 0x01
+
+EVENT_RXL_INSERTS_NCB           0x0A SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS_NCB_VN0       0x01 0x01
+UMASK_RXL_INSERTS_NCB_VN1       0x02 0x01
+
+EVENT_RXL_INSERTS_NCS           0x0B SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS_NCS_VN0       0x01 0x01
+UMASK_RXL_INSERTS_NCS_VN1       0x02 0x01
+
+EVENT_RXL_INSERTS_NDR           0x0E SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS_NDR_VN0       0x01 0x01
+UMASK_RXL_INSERTS_NDR_VN1       0x02 0x01
+
+EVENT_RXL_INSERTS_SNP           0x0D SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS_SNP_VN0       0x01 0x01
+UMASK_RXL_INSERTS_SNP_VN1       0x02 0x01
+
+EVENT_RXL_OCCUPANCY             0x0B SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY             0x00
+
+EVENT_RXL_OCCUPANCY_DRS         0x15 SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY_DRS_VN0     0x01 0x01
+UMASK_RXL_OCCUPANCY_DRS_VN1     0x02 0x01
+
+EVENT_RXL_OCCUPANCY_HOM         0x18 SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY_HOM_VN0     0x01 0x01
+UMASK_RXL_OCCUPANCY_HOM_VN1     0x02 0x01
+
+EVENT_RXL_OCCUPANCY_NCB         0x16 SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY_NCB_VN0     0x01 0x01
+UMASK_RXL_OCCUPANCY_NCB_VN1     0x02 0x01
+
+EVENT_RXL_OCCUPANCY_NCS         0x17 SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY_NCS_VN0     0x01 0x01
+UMASK_RXL_OCCUPANCY_NCS_VN1     0x02 0x01
+
+EVENT_RXL_OCCUPANCY_NDR         0x1A SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY_NDR_VN0     0x01 0x01
+UMASK_RXL_OCCUPANCY_NDR_VN1     0x02 0x01
+
+EVENT_RXL_OCCUPANCY_SNP         0x19 SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY_SNP_VN0     0x01 0x01
+UMASK_RXL_OCCUPANCY_SNP_VN1     0x02 0x01
+
+EVENT_TXL0P_POWER_CYCLES        0x0D SBOX0|SBOX1|SBOX2
+UMASK_TXL0P_POWER_CYCLES        0x00
+
+EVENT_TXL0_POWER_CYCLES         0x0C SBOX0|SBOX1|SBOX2
+UMASK_TXL0_POWER_CYCLES         0x00
+
+EVENT_TXL_BYPASSED              0x05 SBOX0|SBOX1|SBOX2
+UMASK_TXL_BYPASSED              0x00
+
+EVENT_TXL_CYCLES_NE             0x06 SBOX0|SBOX1|SBOX2
+UMASK_TXL_CYCLES_NE             0x00
+
+EVENT_TXL_FLITS_G0              0x00 SBOX0|SBOX1|SBOX2
+UMASK_TXL_FLITS_G0_DATA         0x02
+UMASK_TXL_FLITS_G0_NON_DATA     0x04
+
+EVENT_TXL_FLITS_G1              0x00 SBOX0|SBOX1|SBOX2
+UMASK_TXL_FLITS_G1_SNP          0x01 0x01
+UMASK_TXL_FLITS_G1_HOM_REQ      0x02 0x01
+UMASK_TXL_FLITS_G1_HOM_NONREQ   0x04 0x01
+UMASK_TXL_FLITS_G1_HOM          0x06 0x01
+UMASK_TXL_FLITS_G1_DRS_DATA     0x08 0x01
+UMASK_TXL_FLITS_G1_DRS_NONDATA  0x10 0x01
+UMASK_TXL_FLITS_G1_DRS          0x18 0x01
+
+EVENT_TXL_FLITS_G2              0x01 SBOX0|SBOX1|SBOX2
+UMASK_TXL_FLITS_G2_NDR_AD       0x01 0x01
+UMASK_TXL_FLITS_G2_NDR_AK       0x02 0x01
+UMASK_TXL_FLITS_G2_NCB_DATA     0x04 0x01
+UMASK_TXL_FLITS_G2_NCB_NONDATA  0x08 0x01
+UMASK_TXL_FLITS_G2_NCB          0x0C 0x01
+UMASK_TXL_FLITS_G2_NCS          0x10 0x01
+
+EVENT_TXL_INSERTS               0x04 SBOX0|SBOX1|SBOX2
+UMASK_TXL_INSERTS               0x00
+
+EVENT_TXL_OCCUPANCY             0x07 SBOX0|SBOX1|SBOX2
+UMASK_TXL_OCCUPANCY             0x00
+
+EVENT_TXL_AD_HOM_CREDIT_ACQUIRED         0x26 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AD_HOM_CREDIT_ACQUIRED_VN0     0x01 0x01
+UMASK_TXL_AD_HOM_CREDIT_ACQUIRED_VN1     0x02 0x01
+
+EVENT_TXL_AD_HOM_CREDIT_OCCUPANCY        0x22 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AD_HOM_CREDIT_OCCUPANCY_VN0    0x01 0x01
+UMASK_TXL_AD_HOM_CREDIT_OCCUPANCY_VN1    0x02 0x01
+
+EVENT_TXL_AD_NDR_CREDIT_ACQUIRED         0x28 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AD_NDR_CREDIT_ACQUIRED_VN0     0x01 0x01
+UMASK_TXL_AD_NDR_CREDIT_ACQUIRED_VN1     0x02 0x01
+
+EVENT_TXL_AD_NDR_CREDIT_OCCUPANCY        0x24 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AD_NDR_CREDIT_OCCUPANCY_VN0    0x01 0x01
+UMASK_TXL_AD_NDR_CREDIT_OCCUPANCY_VN1    0x02 0x01
+
+EVENT_TXL_AD_SNP_CREDIT_ACQUIRED         0x27 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AD_SNP_CREDIT_ACQUIRED_VN0     0x01 0x01
+UMASK_TXL_AD_SNP_CREDIT_ACQUIRED_VN1     0x02 0x01
+
+EVENT_TXL_AD_SNP_CREDIT_OCCUPANCY        0x23 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AD_SNP_CREDIT_OCCUPANCY_VN0    0x01 0x01
+UMASK_TXL_AD_SNP_CREDIT_OCCUPANCY_VN1    0x02 0x01
+
+EVENT_TXL_AK_NDR_CREDIT_ACQUIRED         0x29 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AK_NDR_CREDIT_ACQUIRED         0x00 0x01
+
+EVENT_TXL_AK_NDR_CREDIT_OCCUPANCY        0x25 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AK_NDR_CREDIT_OCCUPANCY        0x00 0x01
+
+EVENT_TXL_BL_DRS_CREDIT_ACQUIRED         0x2A SBOX0|SBOX1|SBOX2
+UMASK_TXL_BL_DRS_CREDIT_ACQUIRED_VN0     0x01 0x01
+UMASK_TXL_BL_DRS_CREDIT_ACQUIRED_VN1     0x02 0x01
+UMASK_TXL_BL_DRS_CREDIT_ACQUIRED_VN_SHR  0x04 0x01
+
+EVENT_TXL_BL_DRS_CREDIT_OCCUPANCY        0x1F SBOX0|SBOX1|SBOX2
+UMASK_TXL_BL_DRS_CREDIT_OCCUPANCY_VN0    0x01 0x01
+UMASK_TXL_BL_DRS_CREDIT_OCCUPANCY_VN1    0x02 0x01
+UMASK_TXL_BL_DRS_CREDIT_OCCUPANCY_VN_SHR 0x04 0x01
+
+EVENT_TXL_BL_NCB_CREDIT_ACQUIRED         0x2B SBOX0|SBOX1|SBOX2
+UMASK_TXL_BL_NCB_CREDIT_ACQUIRED_VN0     0x01 0x01
+UMASK_TXL_BL_NCB_CREDIT_ACQUIRED_VN1     0x02 0x01
+
+EVENT_TXL_BL_NCB_CREDIT_OCCUPANCY        0x20 SBOX0|SBOX1|SBOX2
+UMASK_TXL_BL_NCB_CREDIT_OCCUPANCY_VN0    0x01 0x01
+UMASK_TXL_BL_NCB_CREDIT_OCCUPANCY_VN1    0x02 0x01
+
+EVENT_TXL_BL_NCS_CREDIT_ACQUIRED         0x2C SBOX0|SBOX1|SBOX2
+UMASK_TXL_BL_NCS_CREDIT_ACQUIRED_VN0     0x01 0x01
+UMASK_TXL_BL_NCS_CREDIT_ACQUIRED_VN1     0x02 0x01
+
+EVENT_TXL_BL_NCS_CREDIT_OCCUPANCY        0x21 SBOX0|SBOX1|SBOX2
+UMASK_TXL_BL_NCS_CREDIT_OCCUPANCY_VN0    0x01 0x01
+UMASK_TXL_BL_NCS_CREDIT_OCCUPANCY_VN1    0x02 0x01
+
+EVENT_VNA_CREDIT_RETURNS            0x1C SBOX0|SBOX1|SBOX2
+UMASK_VNA_CREDIT_RETURNS            0x00 0x01
+
+EVENT_VNA_CREDIT_RETURN_OCCUPANCY   0x1B SBOX0|SBOX1|SBOX2
+UMASK_VNA_CREDIT_RETURN_OCCUPANCY   0x00 0x01
+
+EVENT_UNCORE_CLOCK           0x00 UBOXFIX
+UMASK_UNCORE_CLOCK           0x00
+
+EVENT_EVENT_MSG                 0x42 UBOX
+UMASK_EVENT_MSG_VLW_RCVD        0x01
+UMASK_EVENT_MSG_MSI_RCVD        0x02
+UMASK_EVENT_MSG_IPI_RCVD        0x04
+UMASK_EVENT_MSG_DOORBELL_RCVD   0x08
+UMASK_EVENT_MSG_INT_PRIO        0x10
+
+EVENT_LOCK_CYCLES               0x44 UBOX
+UMASK_LOCK_CYCLES               0x00
+
+EVENT_PHOLD_CYCLES               0x45 UBOX
+UMASK_PHOLD_CYCLES_ASSERT_TO_ACK 0x01
+
+EVENT_RACU_REQUESTS              0x46 UBOX
+UMASK_RACU_REQUESTS              0x00
+
+EVENT_BBOX_CLOCKTICKS           0x00 BBOX0|BBOX1
+UMASK_BBOX_CLOCKTICKS           0x00
+
+EVENT_ADDR_OPC_MATCH            0x20 BBOX0|BBOX1
+OPTIONS_ADDR_OPC_MATCH_ADDR     EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_ADDR_OPC_MATCH_ADDR       0x01
+OPTIONS_ADDR_OPC_MATCH_OPC      EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_OPC        0x02
+OPTIONS_ADDR_OPC_MATCH_FILT     EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_FILT       0x03
+OPTIONS_ADDR_OPC_MATCH_AD       EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_AD         0x02
+OPTIONS_ADDR_OPC_MATCH_BL       EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_BL         0x02
+OPTIONS_ADDR_OPC_MATCH_AK       EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_AK         0x02
+
+EVENT_BT_BYPASS                 0x52 BBOX0|BBOX1
+UMASK_BT_BYPASS                 0x00
+
+EVENT_BT_CYCLES_NE              0x42 BBOX0|BBOX1
+UMASK_BT_CYCLES_NE              0x00
+
+EVENT_BT_OCCUPANCY              0x43 BBOX0|BBOX1
+UMASK_BT_OCCUPANCY_LOCAL        0x01
+UMASK_BT_OCCUPANCY_REMOTE       0x02
+UMASK_BT_OCCUPANCY_READS_LOCAL  0x04
+UMASK_BT_OCCUPANCY_READS_REMOTE 0x08
+UMASK_BT_OCCUPANCY_WRITES_LOCAL  0x10
+UMASK_BT_OCCUPANCY_WRITES_REMOTE 0x20
+
+EVENT_BYPASS_IMC                0x14 BBOX0|BBOX1
+UMASK_BYPASS_IMC_TAKEN          0x01
+UMASK_BYPASS_IMC_NOT_TAKEN      0x02
+
+EVENT_CONFLICT_CYCLES           0x0B BBOX0|BBOX1
+UMASK_CONFLICT_CYCLES_CONFLICT  0x02
+UMASK_CONFLICT_CYCLES_LAST      0x04
+UMASK_CONFLICT_CYCLES_ACKCNFLTS 0x08
+UMASK_CONFLICT_CYCLES_CMP_FWDS  0x10
+
+EVENT_DIRECT2CORE_COUNT         0x11 BBOX0|BBOX1
+UMASK_DIRECT2CORE_COUNT         0x00
+
+EVENT_DIRECT2CORE_CYCLES_DISABLED 0x12 BBOX0|BBOX1
+UMASK_DIRECT2CORE_CYCLES_DISABLED 0x00
+
+EVENT_DIRECT2CORE_TXN_OVERRIDE  0x13 BBOX0|BBOX1
+UMASK_DIRECT2CORE_TXN_OVERRIDE  0x00
+
+EVENT_DIRECTORY_LAT_OPT         0x41 BBOX0|BBOX1
+UMASK_DIRECTORY_LAT_OPT         0x00
+
+EVENT_DIRECTORY_LOOKUP          0x0C BBOX0|BBOX1
+UMASK_DIRECTORY_LOOKUP_SNP      0x01
+UMASK_DIRECTORY_LOOKUP_NO_SNP   0x02
+
+EVENT_DIRECTORY_UPDATE          0x0D BBOX0|BBOX1
+UMASK_DIRECTORY_UPDATE_SET      0x01
+UMASK_DIRECTORY_UPDATE_CLEAR    0x02
+UMASK_DIRECTORY_UPDATE_ANY      0x03
+
+EVENT_IGR_CREDITS_AD_QPI2       0x59 BBOX0|BBOX1
+UMASK_IGR_CREDITS_AD_QPI2       0x00
+
+EVENT_IGR_CREDITS_BL_QPI2       0x5A BBOX0|BBOX1
+UMASK_IGR_CREDITS_BL_QPI2       0x00
+
+EVENT_IGR_NO_CREDIT_CYCLES         0x22 BBOX0|BBOX1
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI0 0x01
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI1 0x02
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI0 0x04
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI1 0x08
+
+EVENT_IMC_READS                 0x17 BBOX0|BBOX1
+UMASK_IMC_READS_NORMAL          0x01
+
+EVENT_IMC_RETRY                 0x1E BBOX0|BBOX1
+UMASK_IMC_RETRY                 0x00
+
+EVENT_IMC_WRITES                 0x1A BBOX0|BBOX1
+UMASK_IMC_WRITES_FULL            0x01
+UMASK_IMC_WRITES_PARTIAL         0x02
+UMASK_IMC_WRITES_FULL_ISOCH      0x04
+UMASK_IMC_WRITES_PARTIAL_ISOCH   0x08
+UMASK_IMC_WRITES_ALL             0x0F
+
+EVENT_IODC_CONFLICTS            0x57 BBOX0|BBOX1
+UMASK_IODC_CONFLICTS_ANY        0x01
+UMASK_IODC_CONFLICTS_LAST       0x04
+
+EVENT_IODC_INSERTS              0x56 BBOX0|BBOX1
+UMASK_IODC_INSERTS              0x00
+
+EVENT_IODC_OLEN_WBMTOI          0x58 BBOX0|BBOX1
+UMASK_IODC_OLEN_WBMTOI          0x00
+
+EVENT_OSB                       0x53 BBOX0|BBOX1
+UMASK_OSB_READS_LOCAL           0x02
+UMASK_OSB_INVITOE_LOCAL         0x04
+UMASK_OSB_REMOTE                0x08
+
+EVENT_OSB_EDR                   0x54 BBOX0|BBOX1
+UMASK_OSB_EDR_ALL               0x01
+UMASK_OSB_EDR_READS_LOCAL_I     0x02
+UMASK_OSB_EDR_READS_REMOTE_I    0x04
+UMASK_OSB_EDR_READS_LOCAL_S     0x08
+UMASK_OSB_EDR_READS_REMOTE_S    0x10
+
+EVENT_REQUESTS                  0x01 BBOX0|BBOX1
+UMASK_REQUESTS_READS_LOCAL      0x01
+UMASK_REQUESTS_READS_REMOTE     0x02
+UMASK_REQUESTS_READS            0x03
+UMASK_REQUESTS_WRITES_LOCAL     0x04
+UMASK_REQUESTS_WRITES_REMOTE    0x08
+UMASK_REQUESTS_WRITES           0x0C
+UMASK_REQUESTS_INVITOE_LOCAL    0x10
+UMASK_REQUESTS_INVITOE_REMOTE   0x20
+UMASK_REQUESTS_INVITOE          0x30
+
+EVENT_RING_AD_USED              0x3E BBOX
+UMASK_RING_AD_USED_CW_VR0_EVEN  0x01
+UMASK_RING_AD_USED_CW_VR0_ODD   0x02
+UMASK_RING_AD_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_AD_USED_CCW_VR0_ODD  0x08
+UMASK_RING_AD_USED_CW_VR1_EVEN  0x10
+UMASK_RING_AD_USED_CW_VR1_ODD   0x20
+UMASK_RING_AD_USED_CCW_VR1_EVEN 0x40
+UMASK_RING_AD_USED_CCW_VR1_ODD  0x80
+UMASK_RING_AD_USED_CW           0x33
+UMASK_RING_AD_USED_CCW          0xCC
+UMASK_RING_AD_USED_ANY          0xFF
+
+EVENT_RING_AK_USED              0x3F BBOX
+UMASK_RING_AK_USED_CW_VR0_EVEN  0x01
+UMASK_RING_AK_USED_CW_VR0_ODD   0x02
+UMASK_RING_AK_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_AK_USED_CCW_VR0_ODD  0x08
+UMASK_RING_AK_USED_CW_VR1_EVEN  0x10
+UMASK_RING_AK_USED_CW_VR1_ODD   0x20
+UMASK_RING_AK_USED_CCW_VR1_EVEN 0x40
+UMASK_RING_AK_USED_CCW_VR1_ODD  0x80
+UMASK_RING_AK_USED_CW           0x33
+UMASK_RING_AK_USED_CCW          0xCC
+UMASK_RING_AK_USED_ANY          0xFF
+
+EVENT_RING_BL_USED              0x40 BBOX
+UMASK_RING_BL_USED_CW_VR0_EVEN  0x01
+UMASK_RING_BL_USED_CW_VR0_ODD   0x02
+UMASK_RING_BL_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_BL_USED_CCW_VR0_ODD  0x08
+UMASK_RING_BL_USED_CW_VR1_EVEN  0x10
+UMASK_RING_BL_USED_CW_VR1_ODD   0x20
+UMASK_RING_BL_USED_CCW_VR1_EVEN 0x40
+UMASK_RING_BL_USED_CCW_VR1_ODD  0x80
+UMASK_RING_BL_USED_CW           0x33
+UMASK_RING_BL_USED_CCW          0xCC
+UMASK_RING_BL_USED_ANY          0xFF
+
+EVENT_RPQ_CYCLES_NO_REG_CREDITS         0x15 BBOX0|BBOX1
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN0    0x01
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN1    0x02
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN2    0x04
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN3    0x08
+
+EVENT_SNOOP_RESP                0x21 BBOX0|BBOX1
+UMASK_SNOOP_RESP_RSPI           0x01
+UMASK_SNOOP_RESP_RSPS           0x02
+UMASK_SNOOP_RESP_RSPIFWD        0x04
+UMASK_SNOOP_RESP_RSPSFWD        0x08
+UMASK_SNOOP_RESP_RSP_WB         0x10
+UMASK_SNOOP_RESP_RSP_FWD_WB     0x20
+UMASK_SNOOP_RESP_RSPCNFLCT      0x40
+
+EVENT_SNP_RESP_RECV_LOCAL           0x60 BBOX0|BBOX1
+UMASK_SNP_RESP_RECV_LOCAL_RSPI      0x01
+UMASK_SNP_RESP_RECV_LOCAL_RSPS      0x02
+UMASK_SNP_RESP_RECV_LOCAL_RSPIFWD   0x04
+UMASK_SNP_RESP_RECV_LOCAL_RSPSFWD   0x08
+UMASK_SNP_RESP_RECV_LOCAL_RSPXWB    0x10
+UMASK_SNP_RESP_RECV_LOCAL_RSPxFWDxWB 0x20
+UMASK_SNP_RESP_RECV_LOCAL_RSPCNFLCT 0x40
+UMASK_SNP_RESP_RECV_LOCAL_OTHER     0x80
+
+EVENT_TAD_REQUESTS_G0               0x1B BBOX0|BBOX1
+UMASK_TAD_REQUESTS_G0_REGION0       0x01
+UMASK_TAD_REQUESTS_G0_REGION1       0x02
+UMASK_TAD_REQUESTS_G0_REGION2       0x04
+UMASK_TAD_REQUESTS_G0_REGION3       0x08
+UMASK_TAD_REQUESTS_G0_REGION4       0x10
+UMASK_TAD_REQUESTS_G0_REGION5       0x20
+UMASK_TAD_REQUESTS_G0_REGION6       0x40
+UMASK_TAD_REQUESTS_G0_REGION7       0x80
+
+EVENT_TAD_REQUESTS_G1               0x1C BBOX0|BBOX1
+UMASK_TAD_REQUESTS_G1_REGION8       0x01
+UMASK_TAD_REQUESTS_G1_REGION9       0x02
+UMASK_TAD_REQUESTS_G1_REGION10      0x04
+UMASK_TAD_REQUESTS_G1_REGION11      0x08
+
+EVENT_TXR_AD_CYCLES_FULL            0x2A BBOX0|BBOX1
+UMASK_TXR_AD_CYCLES_FULL_SCHED0     0x01
+UMASK_TXR_AD_CYCLES_FULL_SCHED1     0x02
+UMASK_TXR_AD_CYCLES_FULL_ALL        0x03
+
+EVENT_TXR_AK                        0x0E BBOX0|BBOX1
+UMASK_TXR_AK                        0x00
+
+EVENT_TXR_AK_CYCLES_FULL            0x32 BBOX0|BBOX1
+UMASK_TXR_AK_CYCLES_FULL_SCHED0     0x01
+UMASK_TXR_AK_CYCLES_FULL_SCHED1     0x02
+UMASK_TXR_AK_CYCLES_FULL_ALL        0x03
+
+EVENT_TXR_BL                        0x10 BBOX0|BBOX1
+UMASK_TXR_BL_DRS_CACHE              0x01
+UMASK_TXR_BL_DRS_CORE               0x02
+UMASK_TXR_BL_DRS_QPI                0x04
+
+EVENT_TXR_BL_CYCLES_FULL            0x36 BBOX0|BBOX1
+UMASK_TXR_BL_CYCLES_FULL_SCHED0     0x01
+UMASK_TXR_BL_CYCLES_FULL_SCHED1     0x02
+UMASK_TXR_BL_CYCLES_FULL_ALL        0x03
+
+EVENT_TXR_BL_OCCUPANCY              0x34 BBOX0|BBOX1
+UMASK_TXR_BL_OCCUPANCY_SCHED0       0x01
+UMASK_TXR_BL_OCCUPANCY_SCHED1       0x02
+
+EVENT_WPQ_CYCLES_NO_REG_CREDITS      0x18 BBOX0|BBOX1
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0 0x01
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN1 0x02
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN2 0x04
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN3 0x08
+
+EVENT_CORES_IN_C3               0x00 WBOX0FIX
+UMASK_CORES_IN_C3               0x00
+
+EVENT_CORES_IN_C6               0x00 WBOX1FIX
+UMASK_CORES_IN_C6               0x00
+
+EVENT_WBOX_CLOCKTICKS           0x00 WBOX
+UMASK_WBOX_CLOCKTICKS           0x00
+
+EVENT_CORE0_TRANSITION_CYCLES   0x70 WBOX
+UMASK_CORE0_TRANSITION_CYCLES   0x00
+
+EVENT_CORE1_TRANSITION_CYCLES   0x71 WBOX
+UMASK_CORE1_TRANSITION_CYCLES   0x00
+
+EVENT_CORE2_TRANSITION_CYCLES   0x72 WBOX
+UMASK_CORE2_TRANSITION_CYCLES   0x00
+
+EVENT_CORE3_TRANSITION_CYCLES   0x73 WBOX
+UMASK_CORE3_TRANSITION_CYCLES   0x00
+
+EVENT_CORE4_TRANSITION_CYCLES   0x74 WBOX
+UMASK_CORE4_TRANSITION_CYCLES   0x00
+
+EVENT_CORE5_TRANSITION_CYCLES   0x75 WBOX
+UMASK_CORE5_TRANSITION_CYCLES   0x00
+
+EVENT_CORE6_TRANSITION_CYCLES   0x76 WBOX
+UMASK_CORE6_TRANSITION_CYCLES   0x00
+
+EVENT_CORE7_TRANSITION_CYCLES   0x77 WBOX
+UMASK_CORE7_TRANSITION_CYCLES   0x00
+
+EVENT_CORE8_TRANSITION_CYCLES   0x78 WBOX
+UMASK_CORE8_TRANSITION_CYCLES   0x00
+
+EVENT_CORE9_TRANSITION_CYCLES   0x79 WBOX
+UMASK_CORE9_TRANSITION_CYCLES   0x00
+
+EVENT_CORE10_TRANSITION_CYCLES   0x7A WBOX
+UMASK_CORE10_TRANSITION_CYCLES   0x00
+
+EVENT_CORE11_TRANSITION_CYCLES   0x7B WBOX
+UMASK_CORE11_TRANSITION_CYCLES   0x00
+
+EVENT_CORE12_TRANSITION_CYCLES   0x7C WBOX
+UMASK_CORE12_TRANSITION_CYCLES   0x00
+
+EVENT_CORE13_TRANSITION_CYCLES   0x7D WBOX
+UMASK_CORE13_TRANSITION_CYCLES   0x00
+
+EVENT_CORE14_TRANSITION_CYCLES   0x7E WBOX
+UMASK_CORE14_TRANSITION_CYCLES   0x00
+
+EVENT_DELAYED_C_STATE_ABORT_CORE0 0x17 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE0 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE1 0x18 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE1 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE2 0x19 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE2 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE3 0x1A WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE3 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE4 0x1B WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE4 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE5 0x1C WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE5 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE6 0x1D WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE6 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE7 0x1E WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE7 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE8 0x1F WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE8 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE9 0x20 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE9 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE10 0x21 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE10 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE11 0x22 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE11 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE12 0x23 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE12 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE13 0x24 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE13 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE14 0x25 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE14 0x00 0x01
+
+EVENT_DEMOTIONS_CORE0           0x1E WBOX
+OPTIONS_DEMOTIONS_CORE0         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE0           0x00
+
+EVENT_DEMOTIONS_CORE1           0x1F WBOX
+OPTIONS_DEMOTIONS_CORE1         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE1           0x00
+
+EVENT_DEMOTIONS_CORE2           0x20 WBOX
+OPTIONS_DEMOTIONS_CORE2         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE2           0x00
+
+EVENT_DEMOTIONS_CORE3           0x21 WBOX
+OPTIONS_DEMOTIONS_CORE3         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE3           0x00
+
+EVENT_DEMOTIONS_CORE4           0x22 WBOX
+OPTIONS_DEMOTIONS_CORE4         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE4           0x00
+
+EVENT_DEMOTIONS_CORE5           0x23 WBOX
+OPTIONS_DEMOTIONS_CORE5         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE5           0x00
+
+EVENT_DEMOTIONS_CORE6           0x24 WBOX
+OPTIONS_DEMOTIONS_CORE6         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE6           0x00
+
+EVENT_DEMOTIONS_CORE7           0x25 WBOX
+OPTIONS_DEMOTIONS_CORE7         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE7           0x00
+
+EVENT_DEMOTIONS_CORE8           0x40 WBOX
+OPTIONS_DEMOTIONS_CORE8         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE8           0x00
+
+EVENT_DEMOTIONS_CORE9           0x41 WBOX
+OPTIONS_DEMOTIONS_CORE9         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE9           0x00
+
+EVENT_DEMOTIONS_CORE10           0x42 WBOX
+OPTIONS_DEMOTIONS_CORE10         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE10           0x00
+
+EVENT_DEMOTIONS_CORE11           0x43 WBOX
+OPTIONS_DEMOTIONS_CORE11         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE11           0x00
+
+EVENT_DEMOTIONS_CORE12           0x44 WBOX
+OPTIONS_DEMOTIONS_CORE12         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE12           0x00
+
+EVENT_DEMOTIONS_CORE13           0x45 WBOX
+OPTIONS_DEMOTIONS_CORE13         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE13           0x00
+
+EVENT_DEMOTIONS_CORE14           0x46 WBOX
+OPTIONS_DEMOTIONS_CORE14         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE14           0x00
+
+EVENT_FREQ_BAND0_CYCLES          0x0B WBOX
+OPTIONS_FREQ_BAND0_CYCLES        EVENT_OPTION_MATCH0_MASK
+UMASK_FREQ_BAND0_CYCLES          0x00
+
+EVENT_FREQ_BAND1_CYCLES          0x0C WBOX
+OPTIONS_FREQ_BAND1_CYCLES        EVENT_OPTION_MATCH0_MASK
+UMASK_FREQ_BAND1_CYCLES          0x00
+
+EVENT_FREQ_BAND2_CYCLES          0x0D WBOX
+OPTIONS_FREQ_BAND2_CYCLES        EVENT_OPTION_MATCH0_MASK
+UMASK_FREQ_BAND2_CYCLES          0x00
+
+EVENT_FREQ_BAND3_CYCLES          0x0E WBOX
+OPTIONS_FREQ_BAND3_CYCLES        EVENT_OPTION_MATCH0_MASK
+UMASK_FREQ_BAND3_CYCLES          0x00
+
+EVENT_FREQ_MAX_CURRENT_CYCLES    0x07 WBOX
+UMASK_FREQ_MAX_CURRENT_CYCLES    0x00
+
+EVENT_FREQ_MAX_LIMIT_THERMAL_CYCLES 0x04 WBOX
+UMASK_FREQ_MAX_LIMIT_THERMAL_CYCLES 0x00
+
+EVENT_FREQ_MAX_OS_CYCLES         0x06 WBOX
+UMASK_FREQ_MAX_OS_CYCLES         0x00
+
+EVENT_FREQ_MAX_POWER_CYCLES      0x05 WBOX
+UMASK_FREQ_MAX_POWER_CYCLES      0x00
+
+EVENT_FREQ_MIN_IO_P_CYCLES       0x61 WBOX
+UMASK_FREQ_MIN_IO_P_CYCLES       0x00
+
+EVENT_FREQ_MIN_PERF_P_CYCLES     0x02 WBOX
+UMASK_FREQ_MIN_PERF_P_CYCLES     0x00
+
+EVENT_FREQ_TRANS_CYCLES          0x60 WBOX
+UMASK_FREQ_TRANS_CYCLES          0x00
+
+EVENT_MEMORY_PHASE_SHEDDING_CYCLES 0x2F WBOX
+UMASK_MEMORY_PHASE_SHEDDING_CYCLES 0x00
+
+EVENT_PKG_C_EXIT_LATENCY         0x26 WBOX
+UMASK_PKG_C_EXIT_LATENCY         0x00 0x01
+
+EVENT_POWER_STATE_OCCUPANCY          0x80 WBOX
+UMASK_POWER_STATE_OCCUPANCY_CORES_C0 0x40
+UMASK_POWER_STATE_OCCUPANCY_CORES_C3 0x80
+UMASK_POWER_STATE_OCCUPANCY_CORES_C6 0xC0
+
+EVENT_PROCHOT_EXTERNAL_CYCLES    0x0A WBOX
+UMASK_PROCHOT_EXTERNAL_CYCLES    0x00
+
+EVENT_PROCHOT_INTERNAL_CYCLES    0x09 WBOX
+UMASK_PROCHOT_INTERNAL_CYCLES    0x00
+
+EVENT_TOTAL_TRANSITION_CYCLES    0x63 WBOX
+UMASK_TOTAL_TRANSITION_CYCLES    0x00
+
+EVENT_VOLT_TRANS_CYCLES_CHANGE   0x03 WBOX
+UMASK_VOLT_TRANS_CYCLES_CHANGE   0x00
+
+EVENT_VOLT_TRANS_CYCLES_DECREASE 0x02 WBOX
+UMASK_VOLT_TRANS_CYCLES_DECREASE 0x00
+
+EVENT_VOLT_TRANS_CYCLES_INCREASE 0x01 WBOX
+UMASK_VOLT_TRANS_CYCLES_INCREASE 0x00
+
+EVENT_VR_HOT_CYCLES              0x32 WBOX
+UMASK_VR_HOT_CYCLES              0x00
+
+EVENT_PBOX_CLOCKTICKS           0x01 PBOX
+UMASK_PBOX_CLOCKTICKS           0x00
+
+EVENT_RING_AD_USED              0x07 PBOX
+UMASK_RING_AD_USED_CW_VR0_EVEN  0x01
+UMASK_RING_AD_USED_CW_VR0_ODD   0x02
+UMASK_RING_AD_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_AD_USED_CCW_VR0_ODD  0x08
+UMASK_RING_AD_USED_CW_VR1_EVEN  0x10
+UMASK_RING_AD_USED_CW_VR1_ODD   0x20
+UMASK_RING_AD_USED_CCW_VR1_EVEN 0x40
+UMASK_RING_AD_USED_CCW_VR1_ODD  0x80
+UMASK_RING_AD_USED_CW           0x33
+UMASK_RING_AD_USED_CCW          0xCC
+UMASK_RING_AD_USED_ANY          0xFF
+
+EVENT_RING_AK_USED              0x08 PBOX
+UMASK_RING_AK_USED_CW_VR0_EVEN  0x01
+UMASK_RING_AK_USED_CW_VR0_ODD   0x02
+UMASK_RING_AK_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_AK_USED_CCW_VR0_ODD  0x08
+UMASK_RING_AK_USED_CW_VR1_EVEN  0x10
+UMASK_RING_AK_USED_CW_VR1_ODD   0x20
+UMASK_RING_AK_USED_CCW_VR1_EVEN 0x40
+UMASK_RING_AK_USED_CCW_VR1_ODD  0x80
+UMASK_RING_AK_USED_CW           0x33
+UMASK_RING_AK_USED_CCW          0xCC
+UMASK_RING_AK_USED_ANY          0xFF
+
+EVENT_RING_BL_USED              0x09 PBOX
+UMASK_RING_BL_USED_CW_VR0_EVEN  0x01
+UMASK_RING_BL_USED_CW_VR0_ODD   0x02
+UMASK_RING_BL_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_BL_USED_CCW_VR0_ODD  0x08
+UMASK_RING_BL_USED_CW_VR1_EVEN  0x10
+UMASK_RING_BL_USED_CW_VR1_ODD   0x20
+UMASK_RING_BL_USED_CCW_VR1_EVEN 0x40
+UMASK_RING_BL_USED_CCW_VR1_ODD  0x80
+UMASK_RING_BL_USED_CW           0x33
+UMASK_RING_BL_USED_CCW          0xCC
+UMASK_RING_BL_USED_ANY          0xFF
+
+EVENT_RING_IV_USED              0x0A PBOX
+UMASK_RING_IV_USED_CW           0x33
+UMASK_RING_IV_USED_CCW          0xCC
+UMASK_RING_IV_USED_ANY          0xFF
+
+EVENT_RXR_AK_BOUNCES            0x12 PBOX
+UMASK_RXR_AK_BOUNCES_CW         0x01
+UMASK_RXR_AK_BOUNCES_CCW        0x02
+
+EVENT_RXR_CYCLES_NE             0x10 PBOX
+UMASK_RXR_CYCLES_NE_NCB         0x10
+UMASK_RXR_CYCLES_NE_NCS         0x20
+
+EVENT_RXR_INSERTS               0x11 PBOX
+UMASK_RXR_INSERTS_NCB           0x10
+UMASK_RXR_INSERTS_NCS           0x20
+
+EVENT_RXR_OCCUPANCY             0x13 PBOX
+UMASK_RXR_OCCUPANCY_DRS         0x08
+
+EVENT_TXR_CYCLES_FULL           0x25 PBOX
+UMASK_TXR_CYCLES_FULL_AD        0x01
+UMASK_TXR_CYCLES_FULL_AK        0x02
+UMASK_TXR_CYCLES_FULL_BL        0x04
+
+EVENT_TXR_CYCLES_NE             0x23 PBOX
+UMASK_TXR_CYCLES_NE_AD          0x01
+UMASK_TXR_CYCLES_NE_AK          0x02
+UMASK_TXR_CYCLES_NE_BL          0x04
+
+EVENT_TXR_NACK_CW               0x26 PBOX
+UMASK_TXR_NACK_CW_AD            0x01
+UMASK_TXR_NACK_CW_AK            0x02
+UMASK_TXR_NACK_CW_BL            0x04
+
+EVENT_TXR_NACK_CCW              0x28 PBOX
+UMASK_TXR_NACK_CCW_AD           0x01
+UMASK_TXR_NACK_CCW_AK           0x02
+UMASK_TXR_NACK_CCW_BL           0x04
+
+EVENT_RBOX_CLOCKTICKS           0x01 RBOX
+UMASK_RBOX_CLOCKTICKS           0x00
+
+EVENT_C_LO_AD_CREDITS_EMPTY       0x2B RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO0  0x01
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO1  0x02
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO2  0x04
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO3  0x08
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO4  0x10
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO5  0x20
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO6  0x40
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO7  0x80
+
+EVENT_C_HI_AD_CREDITS_EMPTY       0x2C RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO8  0x01
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO9  0x02
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO10 0x04
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO11 0x08
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO12 0x10
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO13 0x20
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO14 0x40
+
+EVENT_HA_R2_BL_CREDITS_EMPTY        0x2F RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_HA_R2_BL_CREDITS_EMPTY_HA0    0x01
+UMASK_HA_R2_BL_CREDITS_EMPTY_HA1    0x02
+UMASK_HA_R2_BL_CREDITS_EMPTY_R2_NCB 0x04
+UMASK_HA_R2_BL_CREDITS_EMPTY_R2_NCS 0x08
+
+EVENT_QPI0_AD_CREDITS_EMPTY         0x29 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI0_AD_CREDITS_EMPTY_VNA     0x01
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_HOM 0x02
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_SNP 0x04
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_NDR 0x08
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_HOM 0x10
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_SNP 0x20
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_NDR 0x40
+
+EVENT_QPI0_BL_CREDITS_EMPTY         0x2D RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI0_BL_CREDITS_EMPTY_VNA     0x01
+UMASK_QPI0_BL_CREDITS_EMPTY_VN0_HOM 0x02
+UMASK_QPI0_BL_CREDITS_EMPTY_VN0_SNP 0x04
+UMASK_QPI0_BL_CREDITS_EMPTY_VN0_NDR 0x08
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_HOM 0x10
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_SNP 0x20
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_NDR 0x40
+
+EVENT_QPI1_AD_CREDITS_EMPTY         0x2A RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI1_AD_CREDITS_EMPTY_VNA     0x01
+UMASK_QPI1_AD_CREDITS_EMPTY_VN0_HOM 0x02
+UMASK_QPI1_AD_CREDITS_EMPTY_VN0_SNP 0x04
+UMASK_QPI1_AD_CREDITS_EMPTY_VN0_NDR 0x08
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_HOM 0x10
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_SNP 0x20
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_NDR 0x40
+
+EVENT_QPI1_BL_CREDITS_EMPTY         0x2E RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI1_BL_CREDITS_EMPTY_VNA     0x01
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_HOM 0x02
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_SNP 0x04
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_NDR 0x08
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_HOM 0x10
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_SNP 0x20
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_NDR 0x40
+
+EVENT_RING_AD_USED              0x07 RBOX
+UMASK_RING_AD_USED_CW_VR0_EVEN  0x01
+UMASK_RING_AD_USED_CW_VR0_ODD   0x02
+UMASK_RING_AD_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_AD_USED_CCW_VR0_ODD  0x08
+UMASK_RING_AD_USED_CW           0x33
+UMASK_RING_AD_USED_CCW          0xCC
+UMASK_RING_AD_USED_ANY          0xFF
+
+EVENT_RING_AK_USED              0x08 RBOX
+UMASK_RING_AK_USED_CW_VR0_EVEN  0x01
+UMASK_RING_AK_USED_CW_VR0_ODD   0x02
+UMASK_RING_AK_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_AK_USED_CCW_VR0_ODD  0x08
+UMASK_RING_AK_USED_CW           0x33
+UMASK_RING_AK_USED_CCW          0xCC
+UMASK_RING_AK_USED_ANY          0xFF
+
+EVENT_RING_BL_USED              0x09 RBOX
+UMASK_RING_BL_USED_CW_VR0_EVEN  0x01
+UMASK_RING_BL_USED_CW_VR0_ODD   0x02
+UMASK_RING_BL_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_BL_USED_CCW_VR0_ODD  0x08
+UMASK_RING_BL_USED_CW           0x33
+UMASK_RING_BL_USED_CCW          0xCC
+UMASK_RING_BL_USED_ANY          0xFF
+
+EVENT_RING_IV_USED              0x0A RBOX
+UMASK_RING_IV_USED_CW           0x33
+UMASK_RING_IV_USED_CCW          0xCC
+UMASK_RING_IV_USED_ANY          0xFF
+
+EVENT_RXR_AD_BYPASSED           0x12 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_AD_BYPASSED           0x00
+
+EVENT_RXR_CYCLES_NE             0x10 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_CYCLES_NE_HOM         0x01
+UMASK_RXR_CYCLES_NE_SNP         0x02
+UMASK_RXR_CYCLES_NE_NDR         0x04
+
+EVENT_RXR_INSERTS               0x11 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_INSERTS_HOM           0x01
+UMASK_RXR_INSERTS_SNP           0x02
+UMASK_RXR_INSERTS_NDR           0x04
+UMASK_RXR_INSERTS_DRS           0x08
+UMASK_RXR_INSERTS_NCB           0x10
+UMASK_RXR_INSERTS_NCS           0x20
+
+EVENT_RXR_OCCUPANCY             0x13 RBOX0C0|RBOX1C0
+UMASK_RXR_OCCUPANCY_HOM         0x01
+UMASK_RXR_OCCUPANCY_SNP         0x02
+UMASK_RXR_OCCUPANCY_NDR         0x04
+UMASK_RXR_OCCUPANCY_DRS         0x08
+UMASK_RXR_OCCUPANCY_NCB         0x10
+UMASK_RXR_OCCUPANCY_NCS         0x20
+
+EVENT_TXR_CYCLES_FULL           0x25 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_CYCLES_FULL           0x00
+
+EVENT_TXR_CYCLES_NE             0x23 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_CYCLES_NE             0x00
+
+EVENT_TXR_NACK_CW               0x26 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_NACK_CW_AD            0x01
+UMASK_TXR_NACK_CW_AK            0x02
+UMASK_TXR_NACK_CW_BL            0x04
+
+EVENT_TXR_NACK_CCW              0x28 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_NACK_CCW_AD           0x01
+UMASK_TXR_NACK_CCW_AK           0x02
+UMASK_TXR_NACK_CCW_BL           0x04
+
+EVENT_VN0_CREDITS_REJECT        0x37 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_REJECT_HOM    0x01
+UMASK_VN0_CREDITS_REJECT_SNP    0x02
+UMASK_VN0_CREDITS_REJECT_NDR    0x04
+UMASK_VN0_CREDITS_REJECT_DRS    0x08
+UMASK_VN0_CREDITS_REJECT_NCB    0x10
+UMASK_VN0_CREDITS_REJECT_NCS    0x20
+
+EVENT_VN0_CREDITS_USED          0x36 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_USED_HOM      0x01
+UMASK_VN0_CREDITS_USED_SNP      0x02
+UMASK_VN0_CREDITS_USED_NDR      0x04
+UMASK_VN0_CREDITS_USED_DRS      0x08
+UMASK_VN0_CREDITS_USED_NCB      0x10
+UMASK_VN0_CREDITS_USED_NCS      0x20
+
+EVENT_VN1_CREDITS_REJECT        0x39 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN1_CREDITS_REJECT_HOM    0x01
+UMASK_VN1_CREDITS_REJECT_SNP    0x02
+UMASK_VN1_CREDITS_REJECT_NDR    0x04
+UMASK_VN1_CREDITS_REJECT_DRS    0x08
+UMASK_VN1_CREDITS_REJECT_NCB    0x10
+UMASK_VN1_CREDITS_REJECT_NCS    0x20
+
+EVENT_VN1_CREDITS_USED          0x38 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN1_CREDITS_USED_HOM      0x01
+UMASK_VN1_CREDITS_USED_SNP      0x02
+UMASK_VN1_CREDITS_USED_NDR      0x04
+UMASK_VN1_CREDITS_USED_DRS      0x08
+UMASK_VN1_CREDITS_USED_NCB      0x10
+UMASK_VN1_CREDITS_USED_NCS      0x20
+
+EVENT_VNA_CREDITS_ACQUIRED      0x33 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_ACQUIRED_AD   0x01
+UMASK_VNA_CREDITS_ACQUIRED_BL   0x04
+
+EVENT_VNA_CREDITS_REJECT        0x34 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_REJECT_HOM    0x01
+UMASK_VNA_CREDITS_REJECT_SNP    0x02
+UMASK_VNA_CREDITS_REJECT_NDR    0x04
+UMASK_VNA_CREDITS_REJECT_DRS    0x08
+UMASK_VNA_CREDITS_REJECT_NCB    0x10
+UMASK_VNA_CREDITS_REJECT_NCS    0x20
+
+EVENT_VNA_CREDIT_CYCLES_OUT     0x31 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDIT_CYCLES_OUT     0x00
+
+EVENT_VNA_CREDIT_CYCLES_USED    0x32 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDIT_CYCLES_USED    0x00
+
+EVENT_IBOX_CLOCKTICKS           0x00 IBOX
+UMASK_IBOX_CLOCKTICKS           0x00
+
+EVENT_ADDRESS_MATCH             0x17 IBOX
+UMASK_ADDRESS_MATCH_STALL_COUNT 0x01
+UMASK_ADDRESS_MATCH_MERGE_COUNT 0x02
+
+EVENT_CACHE_ACK_PENDING_OCCUPANCY        0x14 IBOX
+UMASK_CACHE_ACK_PENDING_OCCUPANCY_ANY    0x01
+UMASK_CACHE_ACK_PENDING_OCCUPANCY_SOURCE 0x02
+
+EVENT_CACHE_OWN_OCCUPANCY        0x13 IBOX
+UMASK_CACHE_OWN_OCCUPANCY_ANY    0x01
+UMASK_CACHE_OWN_OCCUPANCY_SOURCE 0x02
+
+EVENT_CACHE_READ_OCCUPANCY        0x10 IBOX
+UMASK_CACHE_READ_OCCUPANCY_ANY    0x01
+UMASK_CACHE_READ_OCCUPANCY_SOURCE 0x02
+
+EVENT_CACHE_TOTAL_OCCUPANCY        0x12 IBOX
+UMASK_CACHE_TOTAL_OCCUPANCY_ANY    0x01
+UMASK_CACHE_TOTAL_OCCUPANCY_SOURCE 0x02
+
+EVENT_CACHE_WRITE_OCCUPANCY        0x11 IBOX
+UMASK_CACHE_WRITE_OCCUPANCY_ANY    0x01
+UMASK_CACHE_WRITE_OCCUPANCY_SOURCE 0x02
+
+EVENT_RXR_AK_CYCLES_FULL        0x0B IBOX
+UMASK_RXR_AK_CYCLES_FULL        0x00
+
+EVENT_RXR_AK_INSERTS            0x0A IBOX
+UMASK_RXR_AK_INSERTS            0x00
+
+EVENT_RXR_AK_OCCUPANCY          0x0C IBOX
+UMASK_RXR_AK_OCCUPANCY          0x00
+
+EVENT_RXR_BL_DRS_CYCLES_FULL    0x04 IBOX
+UMASK_RXR_BL_DRS_CYCLES_FULL    0x00
+
+EVENT_RXR_BL_DRS_INSERTS        0x01 IBOX
+UMASK_RXR_BL_DRS_INSERTS        0x00
+
+EVENT_RXR_BL_DRS_OCCUPANCY      0x07 IBOX
+UMASK_RXR_BL_DRS_OCCUPANCY      0x00
+
+EVENT_RXR_BL_NCB_CYCLES_FULL    0x05 IBOX
+UMASK_RXR_BL_NCB_CYCLES_FULL    0x00
+
+EVENT_RXR_BL_NCB_INSERTS        0x02 IBOX
+UMASK_RXR_BL_NCB_INSERTS        0x00
+
+EVENT_RXR_BL_NCB_OCCUPANCY      0x08 IBOX
+UMASK_RXR_BL_NCB_OCCUPANCY      0x00
+
+EVENT_RXR_BL_NCS_CYCLES_FULL    0x06 IBOX
+UMASK_RXR_BL_NCS_CYCLES_FULL    0x00
+
+EVENT_RXR_BL_NCS_INSERTS        0x03 IBOX
+UMASK_RXR_BL_NCS_INSERTS        0x00
+
+EVENT_RXR_BL_NCS_OCCUPANCY      0x09 IBOX
+UMASK_RXR_BL_NCS_OCCUPANCY      0x00
+
+EVENT_TICKLES                   0x16 IBOX
+UMASK_TICKLES_LOST_OWNERSHIP    0x01
+UMASK_TICKLES_TOP_OF_QUEUE      0x02
+
+EVENT_TRANSACTIONS              0x15 IBOX
+UMASK_TRANSACTIONS_READS        0x01
+UMASK_TRANSACTIONS_WRITES       0x02
+UMASK_TRANSACTIONS_RD_PREFETCHES 0x04
+UMASK_TRANSACTIONS_ORDERINGQ    0x08
+
+EVENT_TXR_AD_STALL_CREDIT_CYCLES 0x18 IBOX
+UMASK_TXR_AD_STALL_CREDIT_CYCLES 0x00
+
+EVENT_TXR_BL_STALL_CREDIT_CYCLES 0x19 IBOX
+UMASK_TXR_BL_STALL_CREDIT_CYCLES 0x00
+
+EVENT_TXR_DATA_INSERTS_NCB      0x0E IBOX
+UMASK_TXR_DATA_INSERTS_NCB      0x00
+
+EVENT_TXR_DATA_INSERTS_NCS      0x0F IBOX
+UMASK_TXR_DATA_INSERTS_NCS      0x00
+
+EVENT_TXR_REQUEST_OCCUPANCY     0x0D IBOX
+UMASK_TXR_REQUEST_OCCUPANCY     0x00
+
+EVENT_WRITE_ORDERING_STALL_CYCLES 0x1A IBOX
+UMASK_WRITE_ORDERING_STALL_CYCLES 0x00
diff --git a/src/includes/perfmon_ivybridge_counters.h b/src/includes/perfmon_ivybridge_counters.h
index e63dfb0..742b230 100644
--- a/src/includes/perfmon_ivybridge_counters.h
+++ b/src/includes/perfmon_ivybridge_counters.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_ivybridge_counters.h
  *
- *      Description: Counter header file of perfmon module for Ivy Bridge.
+ *      Description: Counter header file of perfmon module for Intel Ivy Bridge.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -30,46 +31,57 @@
 
 #define NUM_COUNTERS_CORE_IVYBRIDGE 8
 #define NUM_COUNTERS_UNCORE_IVYBRIDGE 12
-#define NUM_COUNTERS_IVYBRIDGE 32
+#define NUM_COUNTERS_IVYBRIDGE 23
 
-static PerfmonCounterMap ivybridge_counter_map[NUM_COUNTERS_IVYBRIDGE] = {
+
+#define IVB_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|\
+            EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define IVB_VALID_OPTIONS_FIXED EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK
+#define IVB_VALID_OPTIONS_CBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK
+#define IVB_VALID_OPTIONS_UBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK
+
+static RegisterMap ivybridge_counter_map[NUM_COUNTERS_IVYBRIDGE] = {
     /* Fixed Counters: instructions retired, cycles unhalted core */
-    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
-    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
-    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, IVB_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, IVB_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, IVB_VALID_OPTIONS_FIXED},
     /* PMC Counters: 4 48bit wide */
-    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
-    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0},
-    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0},
-    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0},
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, IVB_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, IVB_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, IVB_VALID_OPTIONS_PMC},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, IVB_VALID_OPTIONS_PMC},
     /* Temperature Sensor*/
-    {"TMP0", PMC7, THERMAL, 0, 0, 0, 0},
+    {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
     /* RAPL counters */
-    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
-    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0},
-    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0},
-    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0},
-    /* IMC Counters: 4 48bit wide per memory channel, split in two reads */
-    {"MBOX0C0",PMC12, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX0C1",PMC13, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX0C2",PMC14, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX0C3",PMC15, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX0FIX",PMC16, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX1C0",PMC17, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX1C1",PMC18, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX1C2",PMC19, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX1C3",PMC20, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX1FIX",PMC21, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX2C0",PMC22, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX2C1",PMC23, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX2C2",PMC24, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX2C3",PMC25, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX2FIX",PMC26, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX3C0",PMC27, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX3C1",PMC28, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX3C2",PMC29, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX3C3",PMC30, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX3FIX",PMC31, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_1},
+    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX0C0", PMC12, CBOX0, MSR_UNC_CBO_0_PERFEVTSEL0, MSR_UNC_CBO_0_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX0C1", PMC13, CBOX0, MSR_UNC_CBO_0_PERFEVTSEL1, MSR_UNC_CBO_0_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX1C0", PMC14, CBOX1, MSR_UNC_CBO_1_PERFEVTSEL0, MSR_UNC_CBO_1_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX1C1", PMC15, CBOX1, MSR_UNC_CBO_1_PERFEVTSEL1, MSR_UNC_CBO_1_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX2C0", PMC16, CBOX2, MSR_UNC_CBO_2_PERFEVTSEL0, MSR_UNC_CBO_2_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX2C1", PMC17, CBOX2, MSR_UNC_CBO_2_PERFEVTSEL1, MSR_UNC_CBO_2_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX3C0", PMC18, CBOX3, MSR_UNC_CBO_3_PERFEVTSEL0, MSR_UNC_CBO_3_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX3C1", PMC19, CBOX3, MSR_UNC_CBO_3_PERFEVTSEL1, MSR_UNC_CBO_3_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"UBOX0", PMC20, UBOX, MSR_UNC_ARB_PERFEVTSEL0, MSR_UNC_ARB_CTR0, 0, 0, IVB_VALID_OPTIONS_UBOX},
+    {"UBOX1", PMC21, UBOX, MSR_UNC_ARB_PERFEVTSEL1, MSR_UNC_ARB_CTR1, 0, 0, IVB_VALID_OPTIONS_UBOX},
+    {"UBOXFIX", PMC22, UBOXFIX, MSR_UNC_PERF_FIXED_CTRL, MSR_UNC_PERF_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+};
+
+
+static BoxMap ivybridge_box_map[NUM_UNITS] = {
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+    [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+    [THERMAL] = {0, 0, 0, 0, 0, MSR_DEV, 8},
+    [POWER] = {0, 0, 0, 0, 0, MSR_DEV, 32},
+    [CBOX0] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+    [CBOX1] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+    [CBOX2] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+    [CBOX3] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+    [UBOX] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 1, 0, MSR_DEV, 44},
+    [UBOXFIX] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 0, 0, MSR_DEV, 44},
 };
 
 
diff --git a/src/includes/perfmon_ivybridge_events.txt b/src/includes/perfmon_ivybridge_events.txt
index 5318ce6..1ff619a 100644
--- a/src/includes/perfmon_ivybridge_events.txt
+++ b/src/includes/perfmon_ivybridge_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_ivybridge_events.txt
-# 
+#
 #      Description:  Event list for Intel Ivy Bridge
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -26,8 +27,8 @@
 #
 # =======================================================================================
 
-EVENT_TEMP_CORE          0x00   TMP0
-UMASK_TEMP_CORE          0x00
+EVENT_TEMP_CORE               0x00   TMP0
+UMASK_TEMP_CORE               0x00
 
 EVENT_PWR_PKG_ENERGY          0x00   PWR0
 UMASK_PWR_PKG_ENERGY          0x00
@@ -35,101 +36,134 @@ UMASK_PWR_PKG_ENERGY          0x00
 EVENT_PWR_PP0_ENERGY          0x00   PWR1
 UMASK_PWR_PP0_ENERGY          0x00
 
-EVENT_PWR_DRAM_ENERGY          0x00   PWR3
-UMASK_PWR_DRAM_ENERGY          0x00
+EVENT_PWR_PP1_ENERGY          0x00   PWR2
+UMASK_PWR_PP1_ENERGY          0x00
 
-EVENT_INSTR_RETIRED              0x00   FIXC0
-UMASK_INSTR_RETIRED_ANY          0x00
+EVENT_PWR_DRAM_ENERGY         0x00   PWR3
+UMASK_PWR_DRAM_ENERGY         0x00
 
-EVENT_CPU_CLK_UNHALTED           0x00   FIXC1
-UMASK_CPU_CLK_UNHALTED_CORE      0x00
+EVENT_INSTR_RETIRED           0x00   FIXC0
+UMASK_INSTR_RETIRED_ANY       0x00
 
-EVENT_CPU_CLK_UNHALTED           0x00   FIXC2
-UMASK_CPU_CLK_UNHALTED_REF       0x00
+EVENT_CPU_CLK_UNHALTED        0x00   FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE   0x00
+
+EVENT_CPU_CLK_UNHALTED        0x00   FIXC2
+UMASK_CPU_CLK_UNHALTED_REF    0x00
 
 EVENT_LD_BLOCKS                 0x03  PMC
 UMASK_LD_BLOCKS_STORE_FORWARD   0x02
+UMASK_LD_BLOCKS_NO_SR           0x08
 
-EVENT_MISALIGN_MEM_REF           0x05  PMC
+EVENT_MISALIGN_MEM_REF            0x05  PMC
 UMASK_MISALIGN_MEM_REF_LOADS      0x01
 UMASK_MISALIGN_MEM_REF_STORES     0x02
 UMASK_MISALIGN_MEM_REF_ANY        0x03
 
-EVENT_LD_BLOCKS_PARTIAL      0x07  PMC
-UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS   0x01  PMC
+EVENT_LD_BLOCKS_PARTIAL                 0x07  PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS   0x01
 
 EVENT_DTLB_LOAD_MISSES                 0x08  PMC
 UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK   0x81
 UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED  0x82
 UMASK_DTLB_LOAD_MISSES_WALK_DURATION   0x84
 
-EVENT_UOPS_ISSUED                0x0E  PMC
-UMASK_UOPS_ISSUED_ANY            0x01
-UMASK_UOPS_ISSUED_FLAGS_MERGE    0x10
-UMASK_UOPS_ISSUED_SLOW_LEA       0x20
-UMASK_UOPS_ISSUED_SINGLE_MUL     0x40
-
-EVENT_FP_COMP_OPS_EXE            0x10   PMC
+EVENT_INT_MISC                       0x0D  PMC
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_INT_MISC_RECOVERY_CYCLES       0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_INT_MISC_RECOVERY_COUNT        0x03
+
+EVENT_UOPS_ISSUED                     0x0E  PMC
+UMASK_UOPS_ISSUED_ANY                 0x01
+UMASK_UOPS_ISSUED_FLAGS_MERGE         0x10
+UMASK_UOPS_ISSUED_SLOW_LEA            0x20
+UMASK_UOPS_ISSUED_SINGLE_MUL          0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_USED_CYCLES         0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES        0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES        0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_ANY            0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_FLAGS_MERGE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_FLAGS_MERGE    0x10
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_SLOW_LEA EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_SLOW_LEA       0x20
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_SINGLE_MUL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_SINGLE_MUL     0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_USED_CYCLES    0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_STALL_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_TOTAL_CYCLES   0x01
+
+EVENT_FP_COMP_OPS_EXE                          0x10   PMC
 UMASK_FP_COMP_OPS_EXE_X87                      0x01
 UMASK_FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE     0x10
 UMASK_FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE     0x20
 UMASK_FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE     0x40
 UMASK_FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE     0x80
 
-EVENT_SIMD_FP_256_PACKED       0x11   PMC
+EVENT_SIMD_FP_256_PACKED            0x11   PMC
 UMASK_SIMD_FP_256_PACKED_SINGLE     0x01
 UMASK_SIMD_FP_256_PACKED_DOUBLE     0x02
 
 EVENT_ARITH                      0x14   PMC
 UMASK_ARITH_FPU_DIV_ACTIVE       0x01
-UMASK_ARITH_NUM_DIV              0x01 0xC5 0x01
+DEFAULT_OPTIONS_ARITH_NUM_DIV    EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_ARITH_NUM_DIV              0x01
 
-EVENT_L2_RQSTS                   0x24   PMC
+EVENT_L2_RQSTS                         0x24   PMC
 UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD_HIT 0x01
 UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD     0x03
-UMASK_L2_RQSTS_RFO_HITS           0x04
-UMASK_L2_RQSTS_RFO_MISS          0x08
-UMASK_L2_RQSTS_RFO_ANY           0x0C
-UMASK_L2_RQSTS_CODE_RD_HITS        0x10
-UMASK_L2_RQSTS_CODE_RD_MISS       0x20
-UMASK_L2_RQSTS_ALL_CODE_CODE_RD   0x30
-UMASK_L2_RQSTS_PF_HIT      0x40
-UMASK_L2_RQSTS_PF_MISS     0x80
-UMASK_L2_RQSTS_ALL_PF        0xC0
-UMASK_L2_RQSTS_MISS              0xAA
+UMASK_L2_RQSTS_RFO_HITS                0x04
+UMASK_L2_RQSTS_RFO_MISS                0x08
+UMASK_L2_RQSTS_RFO_ANY                 0x0C
+UMASK_L2_RQSTS_CODE_RD_HITS            0x10
+UMASK_L2_RQSTS_CODE_RD_MISS            0x20
+UMASK_L2_RQSTS_ALL_CODE_CODE_RD        0x30
+UMASK_L2_RQSTS_PF_HIT                  0x40
+UMASK_L2_RQSTS_PF_MISS                 0x80
+UMASK_L2_RQSTS_ALL_PF                  0xC0
+UMASK_L2_RQSTS_MISS                    0xAA
 
 EVENT_L2_STORE_LOCK_RQSTS            0x27   PMC
 UMASK_L2_STORE_LOCK_RQSTS_MISS       0x01
-UMASK_L2_STORE_LOCK_RQSTS_HIT_M       0x08
+UMASK_L2_STORE_LOCK_RQSTS_HIT_M      0x08
 UMASK_L2_STORE_LOCK_RQSTS_ALL        0x0F
 
 EVENT_L1D_WB_RQST                  0x28   PMC
-UMASK_L1D_WB_RQST_HIT_E          0x04
-UMASK_L1D_WB_RQST_HIT_M          0x08
-UMASK_L1D_WB_RQST_ALL            0x0F
+UMASK_L1D_WB_RQST_HIT_E            0x04
+UMASK_L1D_WB_RQST_HIT_M            0x08
+UMASK_L1D_WB_RQST_ALL              0x0F
 
 EVENT_L3_LAT_CACHE               0x2E   PMC
 UMASK_L3_LAT_CACHE_REFERENCE     0x4F
 UMASK_L3_LAT_CACHE_MISS          0x41
 
-EVENT_CPU_CLOCK_UNHALTED         0x3C   PMC
+EVENT_CPU_CLOCK_UNHALTED           0x3C   PMC
 UMASK_CPU_CLOCK_UNHALTED_THREAD_P  0x00
 UMASK_CPU_CLOCK_UNHALTED_REF_P     0x01
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x2,EVENT_OPTION_INVERT=0x1
+UMASK_CPU_CLOCK_UNHALTED_TOTAL_CYCLES   0x00
 
 EVENT_L1D_PEND_MISS              0x48   PMC1
 UMASK_L1D_PEND_MISS_PENDING      0x01
 
-EVENT_DTLB_STORE_MISSES                0x49   PMC
-UMASK_DTLB_STORE_MISSES_MISS_CAUSES_A_WALK   0x01
+EVENT_DTLB_STORE_MISSES                      0x49   PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK        0x01
 UMASK_DTLB_STORE_MISSES_WALK_COMPLETED       0x02
-UMASK_DTLB_STORE_MISSES_WALK_DURATION       0x04
+UMASK_DTLB_STORE_MISSES_WALK_DURATION        0x04
 UMASK_DTLB_STORE_MISSES_STLB_HIT             0x10
 
-EVENT_LOAD_HIT_PRE               0x4C    PMC
+EVENT_LOAD_HIT_PRE                     0x4C    PMC
 UMASK_LOAD_HIT_PRE_SW_PF               0x01
 UMASK_LOAD_HIT_PRE_HW_PF               0x02
 
-EVENT_L1D                        0x51   PMC
+EVENT_L1D                         0x51   PMC
 UMASK_L1D_REPLACEMENT             0x01
 UMASK_L1D_ALLOCATED_IN_M          0x02
 UMASK_L1D_M_EVICT                 0x04
@@ -142,40 +176,45 @@ UMASK_MOVE_ELIMINATION_INT_ELIMINATED         0x01
 UMASK_MOVE_ELIMINATION_SIMD_ELIMINATED        0x02
 
 EVENT_CPL_CYCLES               0x5C    PMC
-UMASK_CPL_CYCLES_RING0             0x01
-UMASK_CPL_CYCLES_RING123             0x02
+UMASK_CPL_CYCLES_RING0         0x01
+UMASK_CPL_CYCLES_RING123       0x02
 
-EVENT_RS_EVENTS               0x5E    PMC
+EVENT_RS_EVENTS                 0x5E    PMC
 UMASK_RS_EVENTS_EMPTY_CYCLES    0x01
 
-EVENT_DTLB_LOAD_MISSES_STLB	0x5F PMC
-UMASK_DTLB_LOAD_MISSES_STLB_HIT 0x04
-
-EVENT_OFFCORE_REQUESTS_OUTSTANDING          0x60   PMC
+EVENT_OFFCORE_REQUESTS_OUTSTANDING                  0x60   PMC
 UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD   0x01
 UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD   0x02
-UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO   0x04
-UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD   0x08
-
-EVENT_CACHE_LOCK_CYCLES          0x63   PMC
-UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION      0x01
-UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_DURATION       0x02
-
-EVENT_IDQ               0x79   PMC
-UMASK_IDQ_EMPTY         0x02
-UMASK_IDQ_MITE_UOPS     0x04
-UMASK_IDQ_DSB_UOPS      0x08
-UMASK_IDQ_MS_DSB_UOPS   0x10
-UMASK_IDQ_MS_MITE_UOPS  0x20
-UMASK_IDQ_MS_UOPS       0x30
-UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS      0x18 0x00 0x01
-UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS        0x18 0x00 0x04
-UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS       0x24 0x00 0x01
-UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS       0x24 0x00 0x04
-UMASK_IDQ_ALL_MITE_ALL_UOPS       0x3C
-
-EVENT_ICACHE                  0x80   PMC
-UMASK_ICACHE_HITS             0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO       0x04
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD      0x08
+
+EVENT_CACHE_LOCK_CYCLES                              0x63   PMC
+UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION  0x01
+DEFAULT_OPTIONS_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_COUNT EVENT_OPTION_EDGE=1
+UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_COUNT     0x01
+UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_DURATION          0x02
+DEFAULT_OPTIONS_CACHE_LOCK_CYCLES_CACHE_LOCK_COUNT EVENT_OPTION_EDGE=1
+UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_COUNT             0x02
+
+EVENT_IDQ                              0x79   PMC
+UMASK_IDQ_EMPTY                        0x02
+UMASK_IDQ_MITE_UOPS                    0x04
+UMASK_IDQ_DSB_UOPS                     0x08
+UMASK_IDQ_MS_DSB_UOPS                  0x10
+UMASK_IDQ_MS_MITE_UOPS                 0x20
+UMASK_IDQ_MS_UOPS                      0x30
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS      0x18
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS        0x18
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS     0x24
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS       0x24
+UMASK_IDQ_ALL_MITE_ALL_UOPS            0x3C
+
+EVENT_ICACHE                    0x80   PMC
+UMASK_ICACHE_HITS               0x01
 UMASK_ICACHE_MISSES             0x02
 UMASK_ICACHE_ACCESSES           0x03
 UMASK_ICACHE_IFETCH_STALL       0x04
@@ -184,7 +223,7 @@ EVENT_ITLB_MISSES                 0x85      PMC
 UMASK_ITLB_MISSES_CAUSES_A_WALK   0x01
 UMASK_ITLB_MISSES_WALK_COMPLETED  0x02
 UMASK_ITLB_MISSES_WALK_DURATION   0x04
-UMASK_ITLB_MISSES_STLB_HIT   0x10
+UMASK_ITLB_MISSES_STLB_HIT        0x10
 
 EVENT_ILD_STALL                 0x87      PMC
 UMASK_ILD_STALL_LCP             0x01
@@ -201,9 +240,9 @@ UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN                    0x88
 UMASK_BR_INST_EXEC_RETURN_NEAR_NON_TAKEN                0x48
 UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
 UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_NON_TAKEN           0x50
-UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0 
-UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN         0x60 
-UMASK_BR_INST_EXEC_ALL_BRANCHES                         0xFF 
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN         0x60
+UMASK_BR_INST_EXEC_ALL_BRANCHES                         0xFF
 
 EVENT_BR_MISP_EXEC                                      0x89   PMC
 UMASK_BR_MISP_EXEC_COND_TAKEN                           0x81
@@ -218,54 +257,100 @@ UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
 UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN         0x60
 UMASK_BR_MISP_EXEC_ALL_BRANCHES                         0xFF
 
-EVENT_IDQ_UOPS_NOT_DELIVERED                    0x9C   PMC
-UMASK_IDQ_UOPS_NOT_DELIVERED_CORE               0x01
-
-EVENT_UOPS_DISPATCHED_PORT                 0xA1   PMC
+EVENT_IDQ_UOPS_NOT_DELIVERED                            0x9C   PMC
+UMASK_IDQ_UOPS_NOT_DELIVERED_CORE                       0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE   0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x3
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x2
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK           0x01
+
+EVENT_UOPS_DISPATCHED_PORT                  0xA1   PMC
 UMASK_UOPS_DISPATCHED_PORT_PORT_0           0x01
 UMASK_UOPS_DISPATCHED_PORT_PORT_1           0x02
 UMASK_UOPS_DISPATCHED_PORT_PORT_2_LD        0x04
 UMASK_UOPS_DISPATCHED_PORT_PORT_2_STA       0x08
 UMASK_UOPS_DISPATCHED_PORT_PORT_2           0x0C
-UMASK_UOPS_DISPATCHED_PORT_PORT_3_LD           0x10
-UMASK_UOPS_DISPATCHED_PORT_PORT_3_STA           0x20
+UMASK_UOPS_DISPATCHED_PORT_PORT_3_LD        0x10
+UMASK_UOPS_DISPATCHED_PORT_PORT_3_STA       0x20
 UMASK_UOPS_DISPATCHED_PORT_PORT_3           0x30
 UMASK_UOPS_DISPATCHED_PORT_PORT_4           0x40
 UMASK_UOPS_DISPATCHED_PORT_PORT_5           0x80
+DEFAULT_OPTIONS_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE 0x83
+UMASK_UOPS_DISPATCHED_PORT_DATA_PORTS       0x7C
+UMASK_UOPS_DISPATCHED_PORT_ALL_PORTS        0xFF
 
 EVENT_RESOURCE_STALLS                 0xA2   PMC
 UMASK_RESOURCE_STALLS_ANY             0x01
 UMASK_RESOURCE_STALLS_RS              0x04
-UMASK_RESOURCE_STALLS_B               0x08
+UMASK_RESOURCE_STALLS_SB              0x08
 UMASK_RESOURCE_STALLS_ROB             0x10
 
-EVENT_CYCLE_ACTIVITY                 0xA3   PMC
+EVENT_CYCLE_ACTIVITY                               0xA3   PMC
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L2_PENDING   EVENT_OPTION_THRESHOLD=0x01
 UMASK_CYCLE_ACTIVITY_CYCLES_L2_PENDING             0x01
-UMASK_CYCLE_ACTIVITY_CYCLES_LDM_PENDING              0x02
-UMASK_CYCLE_ACTIVITY_L1D_PENDING               0x08
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_LDM_PENDING  EVENT_OPTION_THRESHOLD=0x02
+UMASK_CYCLE_ACTIVITY_CYCLES_LDM_PENDING            0x02
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_L1D_PENDING         EVENT_OPTION_THRESHOLD=0x08
+UMASK_CYCLE_ACTIVITY_L1D_PENDING                   0x08
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE   EVENT_OPTION_THRESHOLD=0x04
 UMASK_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE             0x04
 
-EVENT_DSB2MITE_SWITCHES                  0xAB   PMC
-UMASK_DSB2MITE_SWITCHES_COUNT            0x01
-UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES   0x02
+EVENT_DSB2MITE_SWITCHES                 0xAB   PMC
+UMASK_DSB2MITE_SWITCHES_COUNT           0x01
+UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES  0x02
 
-EVENT_DSB_FILL                         0xAC   PMC
-UMASK_DSB_FILL_EXCEED_DSB_LINES        0x08
+EVENT_DSB_FILL                          0xAC   PMC
+UMASK_DSB_FILL_EXCEED_DSB_LINES         0x08
 
-EVENT_ITLB                         0xAE   PMC
-UMASK_ITLB_ITLB_FLUSH            0x01
+EVENT_ITLB                              0xAE   PMC
+UMASK_ITLB_ITLB_FLUSH                   0x01
 
-EVENT_OFFCORE_REQUESTS     0xB0   PMC
+EVENT_OFFCORE_REQUESTS                  0xB0   PMC
 UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD   0x01
 UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD   0x02
 UMASK_OFFCORE_REQUESTS_DEMAND_RFO       0x04
 UMASK_OFFCORE_REQUESTS_ALL_DATA_RD      0x08
 
-EVENT_UOPS_EXECUTED               0xB1   PMC
-UMASK_UOPS_EXECUTED_THREAD            0x01
-UMASK_UOPS_EXECUTED_CORE              0x02
-
-EVENT_TLB_FLUSH          0xBD  PMC
+EVENT_UOPS_EXECUTED                       0xB1   PMC
+UMASK_UOPS_EXECUTED_THREAD                0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_USED_CYCLES           0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_STALL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_TOTAL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC 0x01
+UMASK_UOPS_EXECUTED_CORE                       0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_USED_CYCLES           0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES          0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_TOTAL_CYCLES          0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC 0x02
+
+EVENT_TLB_FLUSH                 0xBD  PMC
 UMASK_TLB_FLUSH_DTLB_THREAD     0x01
 UMASK_TLB_FLUSH_STLB_ANY        0x20
 
@@ -274,15 +359,38 @@ UMASK_INST_RETIRED_ANY_P            0x00
 UMASK_INST_RETIRED_ALL              0x01
 
 EVENT_OTHER_ASSISTS                  0xC1  PMC
-UMASK_OTHER_ASSISTS_AVX_STORE     0x08
-UMASK_OTHER_ASSISTS_AVX_TO_SSE            0x10
-UMASK_OTHER_ASSISTS_SSE_TO_AVX            0x20
-
-EVENT_UOPS_RETIRED                  0xC2  PMC
-UMASK_UOPS_RETIRED_ALL              0x01
-UMASK_UOPS_RETIRED_RETIRE_SLOTS     0x02
-
-EVENT_MACHINE_CLEARS              0xC3  PMC
+UMASK_OTHER_ASSISTS_AVX_STORE        0x08
+UMASK_OTHER_ASSISTS_AVX_TO_SSE       0x10
+UMASK_OTHER_ASSISTS_SSE_TO_AVX       0x20
+
+EVENT_UOPS_RETIRED                       0xC2  PMC
+UMASK_UOPS_RETIRED_ALL                   0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL              0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS          0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES           0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL              0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_USED_CYCLES      0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_STALL_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES     0x01
+
+EVENT_MACHINE_CLEARS                    0xC3  PMC
+UMASK_MACHINE_CLEARS_CYCLES             0x01
+DEFAULT_OPTIONS_MACHINE_CLEARS_COUNT    EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_MACHINE_CLEARS_COUNT              0x01
 UMASK_MACHINE_CLEARS_MEMORY_ORDERING    0x02
 UMASK_MACHINE_CLEARS_SMC                0x04
 UMASK_MACHINE_CLEARS_MASKMOV            0x20
@@ -291,7 +399,6 @@ EVENT_BR_INST_RETIRED               0xC4  PMC
 UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x00
 UMASK_BR_INST_RETIRED_CONDITIONAL   0x01
 UMASK_BR_INST_RETIRED_NEAR_CALL     0x02
-UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x04
 UMASK_BR_INST_RETIRED_NEAR_RETURN   0x08
 UMASK_BR_INST_RETIRED_NOT_TAKEN     0x10
 UMASK_BR_INST_RETIRED_NEAR_TAKEN    0x20
@@ -299,31 +406,30 @@ UMASK_BR_INST_RETIRED_FAR_BRANCH    0x40
 
 EVENT_BR_MISP_RETIRED               0xC5  PMC
 UMASK_BR_MISP_RETIRED_ALL_BRANCHES  0x00
-UMASK_BR_MISP_RETIRED_CONDITIONAL  0x01
+UMASK_BR_MISP_RETIRED_CONDITIONAL   0x01
 UMASK_BR_MISP_RETIRED_NEAR_CALL     0x02
-UMASK_BR_MISP_RETIRED_ALL_BRANCHES     0x04
-UMASK_BR_MISP_RETIRED_NOT_TAKEN      0x10
-UMASK_BR_MISP_RETIRED_TAKEN      0x20
+UMASK_BR_MISP_RETIRED_NOT_TAKEN     0x10
+UMASK_BR_MISP_RETIRED_TAKEN         0x20
 
-EVENT_FP_ASSIST               0xCA  PMC
-UMASK_FP_ASSIST_X87_OUTPUT               0x02
-UMASK_FP_ASSIST_X87_INPUT                0x04
-UMASK_FP_ASSIST_SIMD_OUTPUT               0x08
-UMASK_FP_ASSIST_SIMD_INPUT               0x10
-UMASK_FP_ASSIST_ANY               0x1E
+EVENT_FP_ASSIST                     0xCA  PMC
+UMASK_FP_ASSIST_X87_OUTPUT          0x02
+UMASK_FP_ASSIST_X87_INPUT           0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT         0x08
+UMASK_FP_ASSIST_SIMD_INPUT          0x10
+UMASK_FP_ASSIST_ANY                 0x1E
 
 EVENT_ROB_MISC_EVENT_LBR_INSERTS               0xCC  PMC
 UMASK_ROB_MISC_EVENT_LBR_INSERTS               0x20
 
-EVENT_MEM_UOP_RETIRED            0xD0    PMC
-UMASK_MEM_UOP_RETIRED_LOADS            0x81
-UMASK_MEM_UOP_RETIRED_STORES           0x82
-UMASK_MEM_UOP_RETIRED_LOADS_STLB_MISS         0x11
-UMASK_MEM_UOP_RETIRED_STORES_STLB_MISS        0x12
-UMASK_MEM_UOP_RETIRED_LOADS_LOCK              0x21
-UMASK_MEM_UOP_RETIRED_STORES_LOCK             0x22
-UMASK_MEM_UOP_RETIRED_LOADS_SPLIT             0x41
-UMASK_MEM_UOP_RETIRED_STORES_SPLIT            0x42
+EVENT_MEM_UOPS_RETIRED                  0xD0    PMC
+UMASK_MEM_UOPS_RETIRED_LOADS            0x81
+UMASK_MEM_UOPS_RETIRED_STORES           0x82
+UMASK_MEM_UOPS_RETIRED_LOADS_STLB_MISS  0x11
+UMASK_MEM_UOPS_RETIRED_STORES_STLB_MISS 0x12
+UMASK_MEM_UOPS_RETIRED_LOADS_LOCK       0x21
+UMASK_MEM_UOPS_RETIRED_STORES_LOCK      0x22
+UMASK_MEM_UOPS_RETIRED_LOADS_SPLIT      0x41
+UMASK_MEM_UOPS_RETIRED_STORES_SPLIT     0x42
 
 EVENT_MEM_LOAD_UOPS_RETIRED               0xD1   PMC
 UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT       0x01
@@ -336,326 +442,153 @@ UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT       0x04
 UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS      0x20
 UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL       0x24
 UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB      0x40
+UMASK_MEM_LOAD_UOPS_RETIRED_ALL          0x7F
 
-EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED               0xD2   PMC
+EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED                   0xD2   PMC
 UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_MISS         0x01
 UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT          0x02
 UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM         0x04
 UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NONE         0x08
 
-EVENT_MEM_LOAD_UOPS_LLC_MISS_RETIRED               0xD3   PMC
-UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_LOCAL_DRAM      0x01
-
 EVENT_BACLEARS               0xE6   PMC
 UMASK_BACLEARS_ANY           0x1F
 
-EVENT_L2_TRANS               0xF0  PMC
-UMASK_L2_TRANS_DEMAND_DATA_RD          0x01
-UMASK_L2_TRANS_RFO           0x02
-UMASK_L2_TRANS_CODE_RD       0x04
-UMASK_L2_TRANS_ALL_PREF      0x08
-UMASK_L2_TRANS_L1D_WB        0x10
-UMASK_L2_TRANS_L2_FILL       0x20
-UMASK_L2_TRANS_L2_WB         0x40
-UMASK_L2_TRANS_ALL_REQUESTS  0x80
-
-EVENT_L2_LINES_IN                   0xF1   PMC
+EVENT_L2_TRANS                0xF0  PMC
+UMASK_L2_TRANS_DEMAND_DATA_RD 0x01
+UMASK_L2_TRANS_RFO            0x02
+UMASK_L2_TRANS_CODE_RD        0x04
+UMASK_L2_TRANS_ALL_PREF       0x08
+UMASK_L2_TRANS_L1D_WB         0x10
+UMASK_L2_TRANS_L2_FILL        0x20
+UMASK_L2_TRANS_L2_WB          0x40
+UMASK_L2_TRANS_ALL_REQUESTS   0x80
+
+EVENT_L2_LINES_IN             0xF1   PMC
 UMASK_L2_LINES_IN_I           0x01
-UMASK_L2_LINES_IN_S            0x02
+UMASK_L2_LINES_IN_S           0x02
 UMASK_L2_LINES_IN_E           0x04
-UMASK_L2_LINES_IN_ALL               0x07
+UMASK_L2_LINES_IN_ALL         0x07
 
 EVENT_L2_LINES_OUT                  0xF2   PMC
 UMASK_L2_LINES_OUT_DEMAND_CLEAN     0x01
 UMASK_L2_LINES_OUT_DEMAND_DIRTY     0x02
-UMASK_L2_LINES_OUT_PF_CLEAN   0x04
-UMASK_L2_LINES_OUT_PF_DIRTY   0x08
-UMASK_L2_LINES_OUT_DIRTY_ALL              0x0A
-
-EVENT_MEM_LOAD_UOPS_LLC_MISS_RETIRED               0xD3   PMC
-UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_LOCAL_DRAM     0x03
-UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_DRAM     0x0C
-UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM     0x10
-UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_FWD     0x20
-
-EVENT_DRAM_CLOCKTICKS             0x00  MBOX
-UMASK_DRAM_CLOCKTICKS             0x00
-
-EVENT_ACT_COUNT                  0x01  MBOX
-UMASK_ACT_COUNT_RD                 0x01
-UMASK_ACT_COUNT_WR                 0x02
-UMASK_ACT_COUNT_BYP                0x08
-
-EVENT_BYP_CMDS                  0xA1  MBOX
-UMASK_BYP_CMDS_ACT                 0x01
-UMASK_BYP_CMDS_CAS                 0x02
-UMASK_BYP_CMDS_PRE                 0x04
-
-EVENT_CAS_COUNT                  0x04  MBOX
-UMASK_CAS_COUNT_RD_REG           0x01
-UMASK_CAS_COUNT_RD_UNDERFILL     0x02
-UMASK_CAS_COUNT_RD               0x03
-UMASK_CAS_COUNT_WR_WMM           0x04
-UMASK_CAS_COUNT_WR_RMM           0x08
-UMASK_CAS_COUNT_WR               0x0C
-UMASK_CAS_COUNT_ALL              0x0F
-UMASK_CAS_COUNT_RD_WMM           0x01
-UMASK_CAS_COUNT_RD_RMM           0x02
-
-EVENT_DRAM_PRE_ALL                  0x06  MBOX
-UMASK_DRAM_PRE_ALL                  0x00
-
-EVENT_DRAM_REFRESH                  0x05  MBOX
-UMASK_DRAM_REFRESH_PANIC            0x02
-UMASK_DRAM_REFRESH_HIGH             0x04
-
-EVENT_ECC_CORRECTABLE_ERRORS           0x09  MBOX
-UMASK_ECC_CORRECTABLE_ERRORS           0x00
-
-EVENT_MAJOR_MODES                  0x07  MBOX
-UMASK_MAJOR_MODES_READ             0x01
-UMASK_MAJOR_MODES_WRITE            0x02
-UMASK_MAJOR_MODES_PARTIAL          0x04
-UMASK_MAJOR_MODES_ISOCH            0x08
-
-EVENT_POWER_CHANNEL_DLLOFF           0x84  MBOX
-UMASK_POWER_CHANNEL_DLLOFF           0x00
-
-EVENT_POWER_CHANNEL_PPD           0x85  MBOX
-UMASK_POWER_CHANNEL_PPD           0x00
-
-EVENT_POWER_CKE_CYCLES                  0x83  MBOX
-UMASK_POWER_CKE_CYCLES_RANK0            0x01
-UMASK_POWER_CKE_CYCLES_RANK1            0x02
-UMASK_POWER_CKE_CYCLES_RANK2            0x04
-UMASK_POWER_CKE_CYCLES_RANK3            0x08
-UMASK_POWER_CKE_CYCLES_RANK4            0x10
-UMASK_POWER_CKE_CYCLES_RANK5            0x20
-UMASK_POWER_CKE_CYCLES_RANK6            0x40
-UMASK_POWER_CKE_CYCLES_RANK7            0x80
-
-EVENT_POWER_CRITICAL_THROTTLE_CYCLES           0x86  MBOX
-UMASK_POWER_CRITICAL_THROTTLE_CYCLES           0x00
-
-EVENT_POWER_PCU_THROTTLING           0x42  MBOX
-UMASK_POWER_PCU_THROTTLING           0x00
-
-EVENT_POWER_SELF_REFRESH           0x43  MBOX
-UMASK_POWER_SELF_REFRESH           0x00
-
-EVENT_POWER_THROTTLE_CYCLES                  0x41  MBOX
-UMASK_POWER_THROTTLE_CYCLES_RANK0            0x01
-UMASK_POWER_THROTTLE_CYCLES_RANK1            0x02
-UMASK_POWER_THROTTLE_CYCLES_RANK2            0x04
-UMASK_POWER_THROTTLE_CYCLES_RANK3            0x08
-UMASK_POWER_THROTTLE_CYCLES_RANK4            0x10
-UMASK_POWER_THROTTLE_CYCLES_RANK5            0x20
-UMASK_POWER_THROTTLE_CYCLES_RANK6            0x40
-UMASK_POWER_THROTTLE_CYCLES_RANK7            0x80
-
-EVENT_PREEMPTION           0x08  MBOX
-UMASK_PREEMPTION_RD_PREEMPT_RD           0x01
-UMASK_PREEMPTION_RD_PREEMPT_WR           0x02
-
-EVENT_PRE_COUNT           0x02  MBOX
-UMASK_PRE_COUNT_PAGE_MISS           0x01
-UMASK_PRE_COUNT_PAGE_CLOSE           0x02
-
-EVENT_RD_CAS_PRIO           0xA0  MBOX
-UMASK_RD_CAS_PRIO_LOW           0x01
-UMASK_RD_CAS_PRIO_MED           0x02
-UMASK_RD_CAS_PRIO_HIGH          0x04
-UMASK_RD_CAS_PRIO_PANIC         0x08
-
-EVENT_RD_CAS_RANK0           0xB0  MBOX
-UMASK_RD_CAS_RANK0_BANK0           0x01
-UMASK_RD_CAS_RANK0_BANK1           0x02
-UMASK_RD_CAS_RANK0_BANK2           0x04
-UMASK_RD_CAS_RANK0_BANK3           0x08
-UMASK_RD_CAS_RANK0_BANK4           0x10
-UMASK_RD_CAS_RANK0_BANK5           0x20
-UMASK_RD_CAS_RANK0_BANK6           0x40
-UMASK_RD_CAS_RANK0_BANK7           0x80
-
-EVENT_RD_CAS_RANK1           0xB1  MBOX
-UMASK_RD_CAS_RANK1_BANK0           0x01
-UMASK_RD_CAS_RANK1_BANK1           0x02
-UMASK_RD_CAS_RANK1_BANK2           0x04
-UMASK_RD_CAS_RANK1_BANK3           0x08
-UMASK_RD_CAS_RANK1_BANK4           0x10
-UMASK_RD_CAS_RANK1_BANK5           0x20
-UMASK_RD_CAS_RANK1_BANK6           0x40
-UMASK_RD_CAS_RANK1_BANK7           0x80
-
-EVENT_RD_CAS_RANK2           0xB2  MBOX
-UMASK_RD_CAS_RANK2_BANK0           0x01
-UMASK_RD_CAS_RANK2_BANK1           0x02
-UMASK_RD_CAS_RANK2_BANK2           0x04
-UMASK_RD_CAS_RANK2_BANK3           0x08
-UMASK_RD_CAS_RANK2_BANK4           0x10
-UMASK_RD_CAS_RANK2_BANK5           0x20
-UMASK_RD_CAS_RANK2_BANK6           0x40
-UMASK_RD_CAS_RANK2_BANK7           0x80
-
-EVENT_RD_CAS_RANK3           0xB3  MBOX
-UMASK_RD_CAS_RANK3_BANK0           0x01
-UMASK_RD_CAS_RANK3_BANK1           0x02
-UMASK_RD_CAS_RANK3_BANK2           0x04
-UMASK_RD_CAS_RANK3_BANK3           0x08
-UMASK_RD_CAS_RANK3_BANK4           0x10
-UMASK_RD_CAS_RANK3_BANK5           0x20
-UMASK_RD_CAS_RANK3_BANK6           0x40
-UMASK_RD_CAS_RANK3_BANK7           0x80
-
-EVENT_RD_CAS_RANK4           0xB4  MBOX
-UMASK_RD_CAS_RANK4_BANK0           0x01
-UMASK_RD_CAS_RANK4_BANK1           0x02
-UMASK_RD_CAS_RANK4_BANK2           0x04
-UMASK_RD_CAS_RANK4_BANK3           0x08
-UMASK_RD_CAS_RANK4_BANK4           0x10
-UMASK_RD_CAS_RANK4_BANK5           0x20
-UMASK_RD_CAS_RANK4_BANK6           0x40
-UMASK_RD_CAS_RANK4_BANK7           0x80
-
-EVENT_RD_CAS_RANK5           0xB5  MBOX
-UMASK_RD_CAS_RANK5_BANK0           0x01
-UMASK_RD_CAS_RANK5_BANK1           0x02
-UMASK_RD_CAS_RANK5_BANK2           0x04
-UMASK_RD_CAS_RANK5_BANK3           0x08
-UMASK_RD_CAS_RANK5_BANK4           0x10
-UMASK_RD_CAS_RANK5_BANK5           0x20
-UMASK_RD_CAS_RANK5_BANK6           0x40
-UMASK_RD_CAS_RANK5_BANK7           0x80
-
-EVENT_RD_CAS_RANK6           0xB6  MBOX
-UMASK_RD_CAS_RANK6_BANK0           0x01
-UMASK_RD_CAS_RANK6_BANK1           0x02
-UMASK_RD_CAS_RANK6_BANK2           0x04
-UMASK_RD_CAS_RANK6_BANK3           0x08
-UMASK_RD_CAS_RANK6_BANK4           0x10
-UMASK_RD_CAS_RANK6_BANK5           0x20
-UMASK_RD_CAS_RANK6_BANK6           0x40
-UMASK_RD_CAS_RANK6_BANK7           0x80
-
-EVENT_RD_CAS_RANK7           0xB7  MBOX
-UMASK_RD_CAS_RANK7_BANK0           0x01
-UMASK_RD_CAS_RANK7_BANK1           0x02
-UMASK_RD_CAS_RANK7_BANK2           0x04
-UMASK_RD_CAS_RANK7_BANK3           0x08
-UMASK_RD_CAS_RANK7_BANK4           0x10
-UMASK_RD_CAS_RANK7_BANK5           0x20
-UMASK_RD_CAS_RANK7_BANK6           0x40
-UMASK_RD_CAS_RANK7_BANK7           0x80
-
-EVENT_RPQ_CYCLES_NE           0x11  MBOX
-UMASK_RPQ_CYCLES_NE           0x00
-
-EVENT_RPQ_INSERTS           0x10  MBOX
-UMASK_RPQ_INSERTS           0x00
-
-EVENT_VMSE_MXB_WR_OCCUPANCY           0x91  MBOX
-UMASK_VMSE_MXB_WR_OCCUPANCY           0x00
-
-EVENT_VMSE_WR_PUSH           0x90  MBOX
-UMASK_VMSE_WR_PUSH           0x00
-
-EVENT_WMM_TO_RMM           0xC0  MBOX
-UMASK_WMM_TO_RMM           0x00
-
-EVENT_WPQ_CYCLES_FULL           0x22  MBOX
-UMASK_WPQ_CYCLES_FULL           0x00
-
-EVENT_WPQ_CYCLES_NE           0x21  MBOX
-UMASK_WPQ_CYCLES_NE           0x00
-
-EVENT_WPQ_INSERTS           0x20  MBOX
-UMASK_WPQ_INSERTS           0x00
-
-EVENT_WPQ_READ_HIT           0x23  MBOX
-UMASK_WPQ_READ_HIT           0x00
-
-EVENT_WPQ_WRITE_HIT           0x24  MBOX
-UMASK_WPQ_WRITE_HIT           0x00
-
-EVENT_WRONG_MM           0xC1  MBOX
-UMASK_WRONG_MM           0x00
-
-EVENT_WR_CAS_RANK0           0xB8  MBOX
-UMASK_WR_CAS_RANK0_BANK0           0x01
-UMASK_WR_CAS_RANK0_BANK1           0x02
-UMASK_WR_CAS_RANK0_BANK2           0x04
-UMASK_WR_CAS_RANK0_BANK3           0x08
-UMASK_WR_CAS_RANK0_BANK4           0x10
-UMASK_WR_CAS_RANK0_BANK5           0x20
-UMASK_WR_CAS_RANK0_BANK6           0x40
-UMASK_WR_CAS_RANK0_BANK7           0x80
-
-EVENT_WR_CAS_RANK1           0xB9  MBOX
-UMASK_WR_CAS_RANK1_BANK0           0x01
-UMASK_WR_CAS_RANK1_BANK1           0x02
-UMASK_WR_CAS_RANK1_BANK2           0x04
-UMASK_WR_CAS_RANK1_BANK3           0x08
-UMASK_WR_CAS_RANK1_BANK4           0x10
-UMASK_WR_CAS_RANK1_BANK5           0x20
-UMASK_WR_CAS_RANK1_BANK6           0x40
-UMASK_WR_CAS_RANK1_BANK7           0x80
-
-EVENT_WR_CAS_RANK2           0xBA  MBOX
-UMASK_WR_CAS_RANK2_BANK0           0x01
-UMASK_WR_CAS_RANK2_BANK1           0x02
-UMASK_WR_CAS_RANK2_BANK2           0x04
-UMASK_WR_CAS_RANK2_BANK3           0x08
-UMASK_WR_CAS_RANK2_BANK4           0x10
-UMASK_WR_CAS_RANK2_BANK5           0x20
-UMASK_WR_CAS_RANK2_BANK6           0x40
-UMASK_WR_CAS_RANK2_BANK7           0x80
-
-EVENT_WR_CAS_RANK3           0xBB  MBOX
-UMASK_WR_CAS_RANK3_BANK0           0x01
-UMASK_WR_CAS_RANK3_BANK1           0x02
-UMASK_WR_CAS_RANK3_BANK2           0x04
-UMASK_WR_CAS_RANK3_BANK3           0x08
-UMASK_WR_CAS_RANK3_BANK4           0x10
-UMASK_WR_CAS_RANK3_BANK5           0x20
-UMASK_WR_CAS_RANK3_BANK6           0x40
-UMASK_WR_CAS_RANK3_BANK7           0x80
-
-EVENT_WR_CAS_RANK4           0xBC  MBOX
-UMASK_WR_CAS_RANK4_BANK0           0x01
-UMASK_WR_CAS_RANK4_BANK1           0x02
-UMASK_WR_CAS_RANK4_BANK2           0x04
-UMASK_WR_CAS_RANK4_BANK3           0x08
-UMASK_WR_CAS_RANK4_BANK4           0x10
-UMASK_WR_CAS_RANK4_BANK5           0x20
-UMASK_WR_CAS_RANK4_BANK6           0x40
-UMASK_WR_CAS_RANK4_BANK7           0x80
-
-EVENT_WR_CAS_RANK5           0xBD  MBOX
-UMASK_WR_CAS_RANK5_BANK0           0x01
-UMASK_WR_CAS_RANK5_BANK1           0x02
-UMASK_WR_CAS_RANK5_BANK2           0x04
-UMASK_WR_CAS_RANK5_BANK3           0x08
-UMASK_WR_CAS_RANK5_BANK4           0x10
-UMASK_WR_CAS_RANK5_BANK5           0x20
-UMASK_WR_CAS_RANK5_BANK6           0x40
-UMASK_WR_CAS_RANK5_BANK7           0x80
-
-EVENT_WR_CAS_RANK6           0xBE  MBOX
-UMASK_WR_CAS_RANK6_BANK0           0x01
-UMASK_WR_CAS_RANK6_BANK1           0x02
-UMASK_WR_CAS_RANK6_BANK2           0x04
-UMASK_WR_CAS_RANK6_BANK3           0x08
-UMASK_WR_CAS_RANK6_BANK4           0x10
-UMASK_WR_CAS_RANK6_BANK5           0x20
-UMASK_WR_CAS_RANK6_BANK6           0x40
-UMASK_WR_CAS_RANK6_BANK7           0x80
-
-EVENT_WR_CAS_RANK7           0xBF  MBOX
-UMASK_WR_CAS_RANK7_BANK0           0x01
-UMASK_WR_CAS_RANK7_BANK1           0x02
-UMASK_WR_CAS_RANK7_BANK2           0x04
-UMASK_WR_CAS_RANK7_BANK3           0x08
-UMASK_WR_CAS_RANK7_BANK4           0x10
-UMASK_WR_CAS_RANK7_BANK5           0x20
-UMASK_WR_CAS_RANK7_BANK6           0x40
-UMASK_WR_CAS_RANK7_BANK7           0x80
+UMASK_L2_LINES_OUT_PF_CLEAN         0x04
+UMASK_L2_LINES_OUT_PF_DIRTY         0x08
+UMASK_L2_LINES_OUT_DIRTY_ALL        0x0A
+UMASK_L2_LINES_OUT_CLEAN_ALL        0x05
+UMASK_L2_LINES_OUT_ALL              0x0F
+
+EVENT_MEM_LOAD_UOPS_LLC_MISS_RETIRED                 0xD3   PMC
+UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_LOCAL_DRAM      0x01
+
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1                            0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY                  0x01 0x0F 0x10
+
+
+EVENT_CACHE_LOOKUP                          0x34 CBOX
+UMASK_CACHE_LOOKUP_M                        0x01
+UMASK_CACHE_LOOKUP_E                        0x02
+UMASK_CACHE_LOOKUP_S                        0x04
+UMASK_CACHE_LOOKUP_I                        0x08
+UMASK_CACHE_LOOKUP_READ_FILTER              0x10
+UMASK_CACHE_LOOKUP_WRITE_FILTER             0x20
+UMASK_CACHE_LOOKUP_EXTSNP_FILTER            0x40
+UMASK_CACHE_LOOKUP_ANY_REQUEST_FILTER       0x80
+UMASK_CACHE_LOOKUP_READ_M                   0x11
+UMASK_CACHE_LOOKUP_WRITE_M                  0x21
+UMASK_CACHE_LOOKUP_EXTSNP_M                 0x41
+UMASK_CACHE_LOOKUP_ANY_M                    0x81
+UMASK_CACHE_LOOKUP_READ_E                   0x12
+UMASK_CACHE_LOOKUP_WRITE_E                  0x22
+UMASK_CACHE_LOOKUP_EXTSNP_E                 0x42
+UMASK_CACHE_LOOKUP_ANY_E                    0x82
+UMASK_CACHE_LOOKUP_READ_S                   0x14
+UMASK_CACHE_LOOKUP_WRITE_S                  0x24
+UMASK_CACHE_LOOKUP_EXTSNP_S                 0x44
+UMASK_CACHE_LOOKUP_ANY_S                    0x84
+UMASK_CACHE_LOOKUP_READ_ES                  0x16
+UMASK_CACHE_LOOKUP_WRITE_ES                 0x26
+UMASK_CACHE_LOOKUP_EXTSNP_ES                0x46
+UMASK_CACHE_LOOKUP_ANY_ES                   0x86
+UMASK_CACHE_LOOKUP_READ_I                   0x18
+UMASK_CACHE_LOOKUP_WRITE_I                  0x28
+UMASK_CACHE_LOOKUP_EXTSNP_I                 0x48
+UMASK_CACHE_LOOKUP_ANY_I                    0x88
+UMASK_CACHE_LOOKUP_READ_MESI                0x1F
+UMASK_CACHE_LOOKUP_WRITE_MESI               0x2F
+UMASK_CACHE_LOOKUP_EXTSNP_MESI              0x4F
+UMASK_CACHE_LOOKUP_ANY_MESI                 0x8F
+
+EVENT_XSNP_RESPONSE                         0x22 CBOX
+UMASK_XSNP_RESPONSE_MISS                    0x01
+UMASK_XSNP_RESPONSE_INVAL                   0x02
+UMASK_XSNP_RESPONSE_HIT                     0x04
+UMASK_XSNP_RESPONSE_HITM                    0x08
+UMASK_XSNP_RESPONSE_INVAL_M                 0x10
+UMASK_XSNP_RESPONSE_EXTERNAL_FILTER         0x20
+UMASK_XSNP_RESPONSE_XCORE_FILTER            0x40
+UMASK_XSNP_RESPONSE_EVICTION_FILTER         0x80
+UMASK_XSNP_RESPONSE_MISS_EXTERNAL           0x21
+UMASK_XSNP_RESPONSE_MISS_XCORE              0x41
+UMASK_XSNP_RESPONSE_MISS_EVICTION           0x81
+UMASK_XSNP_RESPONSE_INVAL_EXTERNAL          0x22
+UMASK_XSNP_RESPONSE_INVAL_XCORE             0x42
+UMASK_XSNP_RESPONSE_INVAL_EVICTION          0x82
+UMASK_XSNP_RESPONSE_HIT_EXTERNAL            0x24
+UMASK_XSNP_RESPONSE_HIT_XCORE               0x44
+UMASK_XSNP_RESPONSE_HIT_EVICTION            0x84
+UMASK_XSNP_RESPONSE_HITM_EXTERNAL           0x28
+UMASK_XSNP_RESPONSE_HITM_XCORE              0x48
+UMASK_XSNP_RESPONSE_HITM_EVICTION           0x88
+
+EVENT_TRK_OCCUPANCY_ALL                     0x80 UBOX0
+UMASK_TRK_OCCUPANCY_ALL                     0x01
+DEFAULT_OPTIONS_TRK_OCCUPANCY_CYCLES_WITH_ANY_REQUEST EVENT_OPTION_THRESHOLD=0x1
+UMASK_TRK_OCCUPANCY_CYCLES_WITH_ANY_REQUEST 0x01
+DEFAULT_OPTIONS_TRK_OCCUPANCY_CYCLES_OVER_HALF_FULL EVENT_OPTION_THRESHOLD=0xA
+UMASK_TRK_OCCUPANCY_CYCLES_OVER_HALF_FULL   0x01
+
+EVENT_TRK_REQUESTS                          0x81 UBOX
+UMASK_TRK_REQUESTS_ALL                      0x01
+UMASK_TRK_REQUESTS_WRITES                   0x20
+UMASK_TRK_REQUESTS_EVICTIONS                0x80
+
+EVENT_COH_TRK_OCCUPANCY                     0x83 UBOX0
+UMASK_COH_TRK_OCCUPANCY                     0x01
+
+EVENT_COH_TRK_REQUESTS                      0x84 UBOX
+UMASK_COH_TRK_REQUESTS_ALL                  0x01
+
+EVENT_UNCORE_CLOCK                          0x00 UBOXFIX
+UMASK_UNCORE_CLOCK                          0x01
diff --git a/src/includes/perfmon_k10.h b/src/includes/perfmon_k10.h
index cc614af..2a7bc59 100644
--- a/src/includes/perfmon_k10.h
+++ b/src/includes/perfmon_k10.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_k10.h
  *
- *      Description:  Header file of perfmon module for K10
+ *      Description:  Header file of perfmon module for AMD K10
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,119 +30,197 @@
  */
 
 #include <perfmon_k10_events.h>
-#include <perfmon_k10_groups.h>
 #include <perfmon_k10_counters.h>
+#include <error.h>
 
 static int perfmon_numCountersK10 = NUM_COUNTERS_K10;
-static int perfmon_numGroupsK10 = NUM_GROUPS_K10;
 static int perfmon_numArchEventsK10 = NUM_ARCH_EVENTS_K10;
 
-void perfmon_init_k10(PerfmonThread *thread)
+int perfmon_init_k10(int cpu_id)
 {
-    uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
+    return 0;
+}
 
-    msr_write(cpu_id, MSR_AMD_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD_PERFEVTSEL2, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD_PERFEVTSEL3, 0x0ULL);
+int k10_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
+{
+    uint64_t flags = 0x0ULL;
 
-    //flags |= (1<<16);  /* user mode flag */
+    flags |= (1ULL<<16);
+    flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
 
-    /*msr_write(cpu_id, MSR_AMD_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_AMD_PERFEVTSEL1, flags);
-    msr_write(cpu_id, MSR_AMD_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_AMD_PERFEVTSEL3, flags);*/
+    if (event->numberOfOptions > 0)
+    {
+        for(int j=0;j<event->numberOfOptions;j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    if ((event->options[j].value & 0xFFULL) < 0x04ULL)
+                    {
+                        flags |= (event->options[j].value & 0xFFULL) << 24;
+                    }
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
 }
 
-
-void perfmon_setupCounterThread_k10(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+int perfmon_setupCounterThread_k10(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t flags;
-    uint64_t reg = k10_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
-
-    flags |= (1<<16);
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        if (type == PMC)
+        {
+            k10_pmc_setup(cpu_id, index, event);
+            eventSet->events[i].threadCounter[thread_id].init = TRUE;
+        }
+    }
+    return 0;
+}
 
-    /* AMD uses a 12 bit Event mask: [35:32][7:0] */
-    flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
+int perfmon_startCountersThread_k10(int thread_id, PerfmonEventSet* eventSet)
+{
+    uint64_t flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if (perfmon_verbose)
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
-                cpu_id,
-                LLU_CAST reg,
-                LLU_CAST flags);
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t reg = counter_map[index].configRegister;
+            uint32_t counter = counter_map[index].counterRegister;
+            eventSet->events[i].threadCounter[thread_id].startData = 0;
+            eventSet->events[i].threadCounter[thread_id].counterData = 0;
+            VERBOSEPRINTREG(cpu_id, counter, 0x0ULL, CLEAR_PMC);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL));
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
+            VERBOSEPRINTREG(cpu_id, reg, flags, READ_PMC_CTRL);
+            flags |= (1ULL<<22);  /* enable flag */
+            VERBOSEPRINTREG(cpu_id, reg, flags, START_PMC);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+        }
     }
-    msr_write(cpu_id, reg , flags);
+    return 0;
 }
 
-void perfmon_startCountersThread_k10(int thread_id)
+int perfmon_stopCountersThread_k10(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t flags;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t flags = 0x0ULL;
+    uint64_t tmp;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    for ( int i=0; i<NUM_COUNTERS_K10; i++)
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            msr_write(cpu_id, k10_counter_map[i].counterRegister , 0x0ULL);
-            flags = msr_read(cpu_id, k10_counter_map[i].configRegister);
-            flags |= (1<<22);  /* enable flag */
-
-            if (perfmon_verbose)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                printf("perfmon_start_counters: Write Register 0x%llX , Flags: 0x%llX \n",
-                        LLU_CAST k10_counter_map[i].configRegister,
-                        LLU_CAST flags);
+                continue;
             }
-
-            msr_write(cpu_id, k10_counter_map[i].configRegister , flags);
+            tmp = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t reg = counter_map[index].configRegister;
+            uint32_t counter = counter_map[index].counterRegister;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
+            VERBOSEPRINTREG(cpu_id, reg, flags, READ_PMC_CTRL);
+            flags &= ~(1ULL<<22);  /* clear enable flag */
+            VERBOSEPRINTREG(cpu_id, reg, flags, STOP_PMC);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &tmp));
+            VERBOSEPRINTREG(cpu_id, counter, tmp, READ_PMC);
+            if (tmp < eventSet->events[i].threadCounter[thread_id].counterData)
+            {
+                eventSet->events[i].threadCounter[thread_id].overflows++;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(tmp, 0, box_map[type].regWidth);
         }
     }
+    return 0;
 }
 
-void perfmon_stopCountersThread_k10(int thread_id)
+int perfmon_readCountersThread_k10(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t flags;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t tmp;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    for ( int i=0; i<NUM_COUNTERS_K10; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            flags = msr_read(cpu_id, k10_counter_map[i].configRegister);
-            flags &= ~(1<<22);  /* clear enable flag */
-            msr_write(cpu_id, k10_counter_map[i].configRegister , flags);
-
-            if (perfmon_verbose)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                printf("perfmon_stop_counters: Write Register 0x%llX , Flags: 0x%llX \n",
-                        LLU_CAST k10_counter_map[i].configRegister,
-                        LLU_CAST flags);
+                continue;
             }
-
-            perfmon_threadData[thread_id].counters[i].counterData =
-                msr_read(cpu_id, k10_counter_map[i].counterRegister);
+            tmp = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t counter = counter_map[index].counterRegister;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &tmp));
+            VERBOSEPRINTREG(cpu_id, counter, tmp, READ_PMC);
+            if (tmp < eventSet->events[i].threadCounter[thread_id].counterData)
+            {
+                eventSet->events[i].threadCounter[thread_id].overflows++;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(tmp, 0, box_map[type].regWidth);
         }
     }
+    return 0;
 }
 
-void perfmon_readCountersThread_k10(int thread_id)
+
+int perfmon_finalizeCountersThread_k10(int thread_id, PerfmonEventSet* eventSet)
 {
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    for ( int i=0; i<NUM_COUNTERS_K10; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
         {
-            perfmon_threadData[thread_id].counters[i].counterData =
-                msr_read(cpu_id, k10_counter_map[i].counterRegister);
+            continue;
         }
+        RegisterIndex index = eventSet->events[i].index;
+        uint32_t reg = counter_map[index].configRegister;
+        if (reg)
+        {
+            VERBOSEPRINTREG(cpu_id, reg, 0x0ULL, CLEAR_CTRL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
+        }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
+    return 0;
 }
-
diff --git a/src/includes/perfmon_k10_counters.h b/src/includes/perfmon_k10_counters.h
index d01be3d..e94e29a 100644
--- a/src/includes/perfmon_k10_counters.h
+++ b/src/includes/perfmon_k10_counters.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_k10_counters.h
  *
- *      Description:  AMD K10 specific subroutines
+ *      Description:  AMD K10 performance counter definition. Also used for AMD K8.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -31,10 +32,15 @@
 #define NUM_COUNTERS_K10 4
 #define NUM_COUNTERS_CORE_K10 4
 
-static PerfmonCounterMap k10_counter_map[NUM_COUNTERS_K10] = {
-    {"PMC0",PMC0, PMC, MSR_AMD_PERFEVTSEL0, MSR_AMD_PMC0, 0, 0},
-    {"PMC1",PMC1, PMC, MSR_AMD_PERFEVTSEL1, MSR_AMD_PMC1, 0, 0},
-    {"PMC2",PMC2, PMC, MSR_AMD_PERFEVTSEL2, MSR_AMD_PMC2, 0, 0},
-    {"PMC3",PMC3, PMC, MSR_AMD_PERFEVTSEL3, MSR_AMD_PMC3, 0, 0}
+#define K10_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD
+
+static RegisterMap k10_counter_map[NUM_COUNTERS_K10] = {
+    {"PMC0",PMC0, PMC, MSR_AMD_PERFEVTSEL0, MSR_AMD_PMC0, 0, 0, K10_VALID_OPTIONS_PMC},
+    {"PMC1",PMC1, PMC, MSR_AMD_PERFEVTSEL1, MSR_AMD_PMC1, 0, 0, K10_VALID_OPTIONS_PMC},
+    {"PMC2",PMC2, PMC, MSR_AMD_PERFEVTSEL2, MSR_AMD_PMC2, 0, 0, K10_VALID_OPTIONS_PMC},
+    {"PMC3",PMC3, PMC, MSR_AMD_PERFEVTSEL3, MSR_AMD_PMC3, 0, 0, K10_VALID_OPTIONS_PMC}
 };
 
+static BoxMap k10_box_map[NUM_UNITS] = {
+    [PMC] = {0, 0, 0, 0, 0, 0, 48}
+};
diff --git a/src/includes/perfmon_k10_events.txt b/src/includes/perfmon_k10_events.txt
index 64c20e9..d45d790 100644
--- a/src/includes/perfmon_k10_events.txt
+++ b/src/includes/perfmon_k10_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_k10_events.txt
-# 
+#
 #      Description:  Event list for AMD K10
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -132,17 +133,17 @@ UMASK_DATA_CACHE_EVICTED_PREFETCH_NTA         0x20
 UMASK_DATA_CACHE_EVICTED_NOT_PREFETCH_NTA     0x40
 UMASK_DATA_CACHE_EVICTED_ALL     0x1F
 
-EVENT_DTLB_L2_HIT                0x45   PMC
-UMASK_DTLB_L2_HIT_4K             0x01
-UMASK_DTLB_L2_HIT_2M             0x02
-UMASK_DTLB_L2_HIT_1G             0x04
-UMASK_DTLB_L2_HIT_ALL             0x07
+EVENT_DTLB_L2_HIT                  0x45   PMC
+UMASK_DTLB_L2_HIT_4KB              0x01
+UMASK_DTLB_L2_HIT_2MB              0x02
+UMASK_DTLB_L2_HIT_1GB              0x04
+UMASK_DTLB_L2_HIT_ALL              0x07
 
-EVENT_DTLB_L2_MISS                0x46   PMC
-UMASK_DTLB_L2_MISS_4K             0x01
-UMASK_DTLB_L2_MISS_2M             0x02
-UMASK_DTLB_L2_MISS_1G             0x04
-UMASK_DTLB_L2_MISS_ALL            0x07
+EVENT_DTLB_L2_MISS                 0x46   PMC
+UMASK_DTLB_L2_MISS_4KB             0x01
+UMASK_DTLB_L2_MISS_2MB             0x02
+UMASK_DTLB_L2_MISS_1GB             0x04
+UMASK_DTLB_L2_MISS_ALL             0x07
 
 EVENT_MISALIGNED_ACCESS           0x47   PMC
 UMASK_MISALIGNED_ACCESS           0x00
@@ -167,10 +168,11 @@ UMASK_PREFETCH_INSTRUCTION_DISPATCHED_NTA    0x04
 EVENT_DCACHE_LOCK_MISS           0x4C   PMC
 UMASK_DCACHE_LOCK_MISS           0x02
 
-EVENT_DTLB_L1_HIT                0x4D   PMC
-UMASK_DTLB_L1_HIT_4K             0x01
-UMASK_DTLB_L1_HIT_2M             0x02
-UMASK_DTLB_L1_HIT_1G             0x04
+EVENT_DTLB_L1_HIT                 0x4D   PMC
+UMASK_DTLB_L1_HIT_4KB             0x01
+UMASK_DTLB_L1_HIT_2MB             0x02
+UMASK_DTLB_L1_HIT_1GB             0x04
+UMASK_DTLB_L1_HIT_ANY             0x07
 
 EVENT_SW_PREFETCH_HIT                0x52   PMC
 UMASK_SW_PREFETCH_HIT_L1             0x01
@@ -238,9 +240,10 @@ UMASK_ICACHE_REFILLS_MEM          0x00
 EVENT_ITLB_L2_HIT          0x84   PMC
 UMASK_ITLB_L2_HIT          0x00
 
-EVENT_ITLB_L2_MISS          0x85   PMC
-UMASK_ITLB_L2_MISS_4K       0x01
-UMASK_ITLB_L2_MISS_2M       0x02
+EVENT_ITLB_L2_MISS           0x85   PMC
+UMASK_ITLB_L2_MISS_4KB       0x01
+UMASK_ITLB_L2_MISS_2MB       0x02
+UMASK_ITLB_L2_MISS_ANY       0x03
 
 EVENT_PIPELINE_RESTART_STREAM_PROBE    0x86   PMC
 UMASK_PIPELINE_RESTART_STREAM_PROBE    0x00
diff --git a/src/includes/perfmon_k8.h b/src/includes/perfmon_k8.h
index 9313168..513929b 100644
--- a/src/includes/perfmon_k8.h
+++ b/src/includes/perfmon_k8.h
@@ -3,17 +3,17 @@
  *
  *      Filename:  perfmon_k8.h
  *
- *      Description:  Header File of perfmon module for K8 support.
- *                    Configures and reads out performance counters
- *                    on x86 based architectures. Supports multi threading.
+ *      Description:  Header File of perfmon module for AMD K8 support.
+ *                    The setup routines and registers are similar to AMD K10
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -31,10 +31,9 @@
  */
 
 #include <perfmon_k8_events.h>
-#include <perfmon_k8_groups.h>
+#include <error.h>
 
 
-static int perfmon_numGroupsK8 = NUM_GROUPS_K8;
 static int perfmon_numArchEventsK8 = NUM_ARCH_EVENTS_K8;
 
 
diff --git a/src/includes/perfmon_k8_events.txt b/src/includes/perfmon_k8_events.txt
index 127b56f..48d0614 100644
--- a/src/includes/perfmon_k8_events.txt
+++ b/src/includes/perfmon_k8_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_k8_events.txt
-# 
+#
 #      Description:  Event list for AMD K8
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -97,15 +98,17 @@ UMASK_DATA_CACHE_EVICTED_PREFETCH_NTA         0x20
 UMASK_DATA_CACHE_EVICTED_NOT_PREFETCH_NTA     0x40
 UMASK_DATA_CACHE_EVICTED_ALL     0x1F
 
-EVENT_DTLB_L2_HIT                0x45     PMC
-UMASK_DTLB_L2_HIT_4K             0x01
-UMASK_DTLB_L2_HIT_2M             0x02
-UMASK_DTLB_L2_HIT_1G             0x04
+EVENT_DTLB_L2_HIT                 0x45     PMC
+UMASK_DTLB_L2_HIT_4KB             0x01
+UMASK_DTLB_L2_HIT_2MB             0x02
+UMASK_DTLB_L2_HIT_1GB             0x04
+UMASK_DTLB_L2_HIT_ANY             0x07
 
-EVENT_DTLB_L2_MISS                0x46     PMC
-UMASK_DTLB_L2_MISS_4K             0x01
-UMASK_DTLB_L2_MISS_2M             0x02
-UMASK_DTLB_L2_MISS_1G             0x04
+EVENT_DTLB_L2_MISS                 0x46     PMC
+UMASK_DTLB_L2_MISS_4KB             0x01
+UMASK_DTLB_L2_MISS_2MB             0x02
+UMASK_DTLB_L2_MISS_1GB             0x04
+UMASK_DTLB_L2_MISS_ANY             0x07
 
 EVENT_MISALIGNED_ACCESS           0x47     PMC
 UMASK_MISALIGNED_ACCESS           0x00
@@ -178,9 +181,10 @@ UMASK_ICACHE_REFILLS_MEM          0x00
 EVENT_ITLB_L2_HIT          0x84     PMC
 UMASK_ITLB_L2_HIT          0x00
 
-EVENT_ITLB_L2_MISS          0x85     PMC
-UMASK_ITLB_L2_MISS_4K       0x01
-UMASK_ITLB_L2_MISS_2M       0x02
+EVENT_ITLB_L2_MISS           0x85     PMC
+UMASK_ITLB_L2_MISS_4KB       0x01
+UMASK_ITLB_L2_MISS_2MB       0x02
+UMASK_ITLB_L2_MISS_ANY       0x03
 
 EVENT_PIPELINE_RESTART_STREAM_PROBE    0x86     PMC
 UMASK_PIPELINE_RESTART_STREAM_PROBE    0x00
diff --git a/src/includes/perfmon_kabini.h b/src/includes/perfmon_kabini.h
index 018eb04..323e713 100644
--- a/src/includes/perfmon_kabini.h
+++ b/src/includes/perfmon_kabini.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_kabini.h
  *
- *      Description:  Header file of perfmon module for AMD Family16
+ *      Description:  Header file of perfmon module for AMD Family 16
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,218 +30,331 @@
  */
 
 #include <perfmon_kabini_events.h>
-#include <perfmon_kabini_groups.h>
 #include <perfmon_kabini_counters.h>
+#include <error.h>
+#include <affinity.h>
 
 static int perfmon_numCountersKabini = NUM_COUNTERS_KABINI;
-static int perfmon_numGroupsKabini = NUM_GROUPS_KABINI;
 static int perfmon_numArchEventsKabini = NUM_ARCH_EVENTS_KABINI;
 
-void perfmon_init_kabini(PerfmonThread *thread)
+int perfmon_init_kabini(int cpu_id)
+{
+    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+    lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+    return 0;
+}
+
+
+int k16_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
 {
     uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
 
-    msr_write(cpu_id, MSR_AMD16_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD16_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD16_PERFEVTSEL2, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD16_PERFEVTSEL3, 0x0ULL);
+    flags |= (1ULL<<16);
+    flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
-            lock_acquire(
-                (int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id)
-       )
+    if (event->numberOfOptions > 0)
     {
-        msr_write(cpu_id, MSR_AMD16_NB_PERFEVTSEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_AMD16_NB_PERFEVTSEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_AMD16_NB_PERFEVTSEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_AMD16_NB_PERFEVTSEL3, 0x0ULL);
+        for(int j=0;j<event->numberOfOptions;j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    if ((event->options[j].value & 0xFFULL) < 0x04)
+                    {
+                        flags |= (event->options[j].value & 0xFFULL) << 24;
+                    }
+                    break;
+                default:
+                    break;
+            }
+        }
     }
-
-    //flags |= (1<<16);  /* user mode flag */
-    /*msr_write(cpu_id, MSR_AMD16_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_AMD16_PERFEVTSEL1, flags);
-    msr_write(cpu_id, MSR_AMD16_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_AMD16_PERFEVTSEL3, flags);*/
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
 }
 
-
-void perfmon_setupCounterThread_kabini(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+int k16_uncore_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
 {
     uint64_t flags = 0x0ULL;
-    uint64_t reg = kabini_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
 
-    /* only one thread accesses Uncore */
-    if ( (kabini_counter_map[index].type == UNCORE) &&
-            !(socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) )
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
     {
-        return;
+        return 0;
     }
 
-    if (kabini_counter_map[index].type == PMC)
+    flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
+    if (flags != currentConfig[cpu_id][index])
     {
-        flags |= (1<<16);
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_UNCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
     }
+    return 0;
+}
 
-    /* AMD uses a 12 bit Event mask: [35:32][7:0] */
-    flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
+int k16_cache_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
+{
+    uint64_t flags = 0x0ULL;
 
-    if (perfmon_verbose)
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] != cpu_id)
     {
-        printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
-                cpu_id,
-                LLU_CAST reg,
-                LLU_CAST flags);
+        return 0;
     }
 
-    msr_write(cpu_id, reg , flags);
+    flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
+    if (event->numberOfOptions > 0)
+    {
+        for(int j=0;j<event->numberOfOptions;j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    if ((event->options[j].value & 0xFFULL) < 0x04)
+                    {
+                        flags |= (event->options[j].value & 0xFFULL) << 24;
+                    }
+                    break;
+                case EVENT_OPTION_TID:
+                    flags |= (~((uint64_t)(event->options[j].value & 0xFULL))) << 56;
+                    break;
+                case EVENT_OPTION_NID:
+                    flags |= (~((uint64_t)(event->options[j].value & 0xFULL))) << 48;
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_CBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int perfmon_setupCounterThread_kabini(int thread_id, PerfmonEventSet* eventSet)
+{
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        switch (type)
+        {
+            case PMC:
+                k16_pmc_setup(cpu_id, index, event);
+                break;
+            case UNCORE:
+                k16_uncore_setup(cpu_id, index, event);
+                break;
+            case CBOX0:
+                k16_cache_setup(cpu_id, index, event);
+                break;
+            default:
+                break;
+        }
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
+    }
+    return 0;
 }
 
 
-void perfmon_startCountersThread_kabini(int thread_id)
+int perfmon_startCountersThread_kabini(int thread_id, PerfmonEventSet* eventSet)
 {
-    int haveLock = 0;
-    uint64_t flags;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int haveSLock = 0;
+    int haveTLock = 0;
+    uint64_t flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveSLock = 1;
+    }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
     {
-        haveLock = 1;
+        haveTLock = 1;
     }
 
-    for ( int i=0; i<NUM_COUNTERS_KABINI; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if (kabini_counter_map[i].type == PMC)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                msr_write(cpu_id, kabini_counter_map[i].counterRegister , 0x0ULL);
-                flags = msr_read(cpu_id, kabini_counter_map[i].configRegister);
-                flags |= (1<<22);  /* enable flag */
-
-                if (perfmon_verbose) 
-                {
-                    printf("perfmon_start_counters: Write Register 0x%llX , Flags: 0x%llX \n",
-                            LLU_CAST kabini_counter_map[i].configRegister,
-                            LLU_CAST flags);
-                }
-
-                msr_write(cpu_id, kabini_counter_map[i].configRegister , flags);
-
+                continue;
             }
-            else if ( kabini_counter_map[i].type == UNCORE )
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t reg = counter_map[index].configRegister;
+            uint32_t counter = counter_map[index].counterRegister;
+            eventSet->events[i].threadCounter[thread_id].startData = 0;
+            eventSet->events[i].threadCounter[thread_id].counterData = 0;
+            if ((type == PMC) ||
+                ((type == UNCORE) && (haveSLock)) ||
+                ((type == CBOX0) && (haveTLock)))
             {
-                if(haveLock)
-                {
-                    msr_write(cpu_id, kabini_counter_map[i].counterRegister , 0x0ULL);
-                    flags = msr_read(cpu_id, kabini_counter_map[i].configRegister);
-                    flags |= (1<<22);  /* enable flag */
-
-                    if (perfmon_verbose)
-                    {
-                        printf("perfmon_start_counters: Write Register 0x%llX , Flags: 0x%llX \n",
-                                LLU_CAST kabini_counter_map[i].configRegister,
-                                LLU_CAST flags);
-                    }
-
-                    msr_write(cpu_id, kabini_counter_map[i].configRegister , flags);
-                }
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL));
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
+                flags |= (1ULL<<22);  /* enable flag */
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
             }
         }
     }
+    return 0;
 }
 
-void perfmon_stopCountersThread_kabini(int thread_id)
+int perfmon_stopCountersThread_kabini(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t flags;
-    int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t flags = 0x0ULL;
+    int haveSLock = 0;
+    int haveTLock = 0;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveSLock = 1;
+    }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
     {
-        haveLock = 1;
+        haveTLock = 1;
     }
 
-    for ( int i=0; i<NUM_COUNTERS_KABINI; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if ( kabini_counter_map[i].type == PMC )
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                flags = msr_read(cpu_id,kabini_counter_map[i].configRegister);
-                flags &= ~(1<<22);  /* clear enable flag */
-                msr_write(cpu_id, kabini_counter_map[i].configRegister , flags);
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, kabini_counter_map[i].counterRegister);
-
-                if (perfmon_verbose)
-                {
-                    printf("perfmon_stop_counters: Write Register 0x%llX , Flags: 0x%llX \n",
-                            LLU_CAST kabini_counter_map[i].configRegister,
-                            LLU_CAST flags);
-                    printf("perfmon_stop_counters: Read Register 0x%llX , Flags: 0x%llX \n",
-                            LLU_CAST kabini_counter_map[i].counterRegister,
-                            LLU_CAST perfmon_threadData[thread_id].counters[i].counterData);
-                }
-
+                continue;
             }
-            else if (kabini_counter_map[i].type == UNCORE)
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t reg = counter_map[index].configRegister;
+            uint32_t counter = counter_map[index].counterRegister;
+            if ((type == PMC) ||
+                ((type == UNCORE) && (haveSLock)) ||
+                ((type == CBOX0) && (haveTLock)))
             {
-                if(haveLock)
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
+                flags &= ~(1ULL<<22);  /* clear enable flag */
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
                 {
-                    flags = msr_read(cpu_id, kabini_counter_map[i].configRegister);
-                    flags &= ~(1<<22);  /* clear enable flag */
-                    msr_write(cpu_id, kabini_counter_map[i].configRegister , flags);
-
-                    if (perfmon_verbose)
-                    {
-                        printf("perfmon_stop_counters: Write Register 0x%llX , Flags: 0x%llX \n",
-                                LLU_CAST kabini_counter_map[i].configRegister,
-                                LLU_CAST flags);
-                    }
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, kabini_counter_map[i].counterRegister);
+                    eventSet->events[i].threadCounter[thread_id].overflows++;
                 }
+                eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
             }
         }
     }
+    return 0;
 }
 
 
-void perfmon_readCountersThread_kabini(int thread_id)
+int perfmon_readCountersThread_kabini(int thread_id, PerfmonEventSet* eventSet)
 {
-    int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int haveSLock = 0;
+    int haveTLock = 0;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
     if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
-        haveLock = 1;
+        haveSLock = 1;
+    }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+    {
+        haveTLock = 1;
     }
 
-
-    for (int i=0;i<NUM_COUNTERS_KABINI;i++)
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if ( kabini_counter_map[i].type == UNCORE )
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                if ( haveLock )
-                {
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, kabini_counter_map[i].counterRegister);
-                }
+                continue;
             }
-            else
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t counter = counter_map[index].counterRegister;
+
+            if ((type == PMC) ||
+                ((type == UNCORE) && (haveSLock)) ||
+                ((type == CBOX0) && (haveTLock)))
             {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, kabini_counter_map[i].counterRegister);
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                VERBOSEPRINTREG(cpu_id, counter, counter_result, CLEAR_CTRL);
+                if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                {
+                    eventSet->events[i].threadCounter[thread_id].overflows++;
+                }
+                eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
             }
         }
     }
+    return 0;
 }
 
+
+int perfmon_finalizeCountersThread_kabini(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveSLock = 0;
+    int haveTLock = 0;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveSLock = 1;
+    }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+    {
+        haveTLock = 1;
+    }
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        if ((type == PMC) ||
+            ((type == UNCORE) && (haveSLock)) ||
+            ((type == CBOX0) && (haveTLock)))
+        {
+            VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, 0x0ULL, CLEAR_CTRL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, 0x0ULL));
+            eventSet->events[i].threadCounter[thread_id].init = FALSE;
+        }
+    }
+    return 0;
+}
diff --git a/src/includes/perfmon_kabini_counters.h b/src/includes/perfmon_kabini_counters.h
index 8662522..e303341 100644
--- a/src/includes/perfmon_kabini_counters.h
+++ b/src/includes/perfmon_kabini_counters.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_kabini_counters.h
  *
- *      Description:  Counter Header File of perfmon module for AMD Family16
+ *      Description:  Counter Header File of perfmon module for AMD Family 16
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -28,19 +29,33 @@
  * =======================================================================================
  */
 
-#define NUM_COUNTERS_KABINI 8 
-#define NUM_COUNTERS_CORE_KABINI 4
+#define NUM_COUNTERS_KABINI 12
+#define NUM_COUNTERS_CORE_KABINI 8
 
-static PerfmonCounterMap kabini_counter_map[NUM_COUNTERS_KABINI] = {
+#define KAB_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD
+#define KAB_VALID_OPTIONS_CBOX EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD|EVENT_OPTION_TID_MASK|EVENT_OPTION_NID_MASK
+
+static RegisterMap kabini_counter_map[NUM_COUNTERS_KABINI] = {
     /* Core counters */
     {"PMC0",PMC0, PMC, MSR_AMD16_PERFEVTSEL0, MSR_AMD16_PMC0, 0, 0},
     {"PMC1",PMC1, PMC, MSR_AMD16_PERFEVTSEL1, MSR_AMD16_PMC1, 0, 0},
     {"PMC2",PMC2, PMC, MSR_AMD16_PERFEVTSEL2, MSR_AMD16_PMC2, 0, 0},
     {"PMC3",PMC3, PMC, MSR_AMD16_PERFEVTSEL3, MSR_AMD16_PMC3, 0, 0},
+    /* L2 cache counters */
+    {"CPMC0",PMC4, CBOX0, MSR_AMD16_L2_PERFEVTSEL0, MSR_AMD16_L2_PMC0, 0, 0},
+    {"CPMC1",PMC5, CBOX0, MSR_AMD16_L2_PERFEVTSEL1, MSR_AMD16_L2_PMC1, 0, 0},
+    {"CPMC2",PMC6, CBOX0, MSR_AMD16_L2_PERFEVTSEL2, MSR_AMD16_L2_PMC2, 0, 0},
+    {"CPMC3",PMC7, CBOX0, MSR_AMD16_L2_PERFEVTSEL3, MSR_AMD16_L2_PMC3, 0, 0},
     /* Northbridge counters */
-    {"UPMC0",PMC4, UNCORE, MSR_AMD16_NB_PERFEVTSEL0, MSR_AMD16_NB_PMC0, 0, 0},
-    {"UPMC1",PMC5, UNCORE, MSR_AMD16_NB_PERFEVTSEL1, MSR_AMD16_NB_PMC1, 0, 0},
-    {"UPMC2",PMC6, UNCORE, MSR_AMD16_NB_PERFEVTSEL2, MSR_AMD16_NB_PMC2, 0, 0},
-    {"UPMC3",PMC7, UNCORE, MSR_AMD16_NB_PERFEVTSEL3, MSR_AMD16_NB_PMC3, 0, 0}
+    {"UPMC0",PMC8, UNCORE, MSR_AMD16_NB_PERFEVTSEL0, MSR_AMD16_NB_PMC0, 0, 0},
+    {"UPMC1",PMC9, UNCORE, MSR_AMD16_NB_PERFEVTSEL1, MSR_AMD16_NB_PMC1, 0, 0},
+    {"UPMC2",PMC10, UNCORE, MSR_AMD16_NB_PERFEVTSEL2, MSR_AMD16_NB_PMC2, 0, 0},
+    {"UPMC3",PMC11, UNCORE, MSR_AMD16_NB_PERFEVTSEL3, MSR_AMD16_NB_PMC3, 0, 0}
+};
+
+static BoxMap kabini_box_map[NUM_UNITS] = {
+    [PMC] = {0, 0, 0, 0, 0, 0, 48},
+    [UNCORE] = {0, 0, 0, 0, 0, 0, 48},
+    [CBOX0] = {0, 0, 0, 0, 0, 0, 48},
 };
 
diff --git a/src/includes/perfmon_kabini_events.txt b/src/includes/perfmon_kabini_events.txt
index 9ccc726..a1bac4f 100644
--- a/src/includes/perfmon_kabini_events.txt
+++ b/src/includes/perfmon_kabini_events.txt
@@ -1,16 +1,16 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_kabini_events.txt
-# 
+#
 #      Description:  Event list for AMD Kabini
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author: saravanan.ekanathan at amd.com
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   saravanan.ekanathan at amd.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -148,9 +148,10 @@ UMASK_PREFETCH_INSTR_DISPATCHED_NTA       0x04
 EVENT_DCACHE_LOCK_MISS           0x4C   PMC
 UMASK_DCACHE_LOCK_MISS           0x02
 
-EVENT_DTLB_L1_HIT                0x4D   PMC
-UMASK_DTLB_L1_HIT_4K             0x01
-UMASK_DTLB_L1_HIT_2M             0x02
+EVENT_DTLB_L1_HIT                 0x4D   PMC
+UMASK_DTLB_L1_HIT_4KB             0x01
+UMASK_DTLB_L1_HIT_2MB             0x02
+UMASK_DTLB_L1_HIT_ANY             0x03
 
 EVENT_INEFFECTIVE_PREFETCHES        0x52    PMC
 UMASK_INEFFECTIVE_PREFETCHES_DATA_CACHE     0x01
@@ -234,12 +235,13 @@ UMASK_INSTRUCTION_CACHE_L2_REFILLS         0x00
 EVENT_INSTRUCTION_CACHE_SYSTEM_REFILLS        0x083     PMC
 UMASK_INSTRUCTION_CACHE_SYSTEM_REFILLS         0x00
 
-EVENT_ITLB_L1_MISS_L2_HIT        0x084     PMC
-UMASK_ITLB_L1_MISS_L2_HIT         0x00
+EVENT_ITLB_L1_MISS_L2_HIT              0x084     PMC
+UMASK_ITLB_L1_MISS_L2_HIT              0x00
 
-EVENT_ITLB_L1_MISS_L2_MISS        0x085     PMC
+EVENT_ITLB_L1_MISS_L2_MISS             0x085     PMC
 UMASK_ITLB_L1_MISS_L2_MISS_4KB         0x01
 UMASK_ITLB_L1_MISS_L2_MISS_2MB         0x02
+UMASK_ITLB_L1_MISS_L2_MISS_ANY         0x03
 
 EVENT_INSTRUCTION_FETCH_STALL        0x087     PMC
 UMASK_INSTRUCTION_FETCH_STALL         0x00
diff --git a/src/includes/perfmon_nehalem.h b/src/includes/perfmon_nehalem.h
index b3e7907..6f23bd0 100644
--- a/src/includes/perfmon_nehalem.h
+++ b/src/includes/perfmon_nehalem.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_nehalem.h
  *
- *      Description:  Header File of perfmon module for Nehalem.
+ *      Description:  Header File of perfmon module for Intel Nehalem.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,300 +30,593 @@
  */
 
 #include <perfmon_nehalem_events.h>
-#include <perfmon_nehalem_groups.h>
 #include <perfmon_nehalem_counters.h>
+#include <error.h>
+#include <affinity.h>
+
 
 static int perfmon_numCountersNehalem = NUM_COUNTERS_NEHALEM;
-static int perfmon_numGroupsNehalem = NUM_GROUPS_NEHALEM;
 static int perfmon_numArchEventsNehalem = NUM_ARCH_EVENTS_NEHALEM;
 
-#define OFFSET_PMC 3
-#define OFFSET_UPMC 7
 
-void perfmon_init_nehalem(PerfmonThread *thread)
+int perfmon_init_nehalem(int cpu_id)
 {
-    uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
-
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC0, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC1, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC2, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC3, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
-
-    /* initialize fixed counters
-     * FIXED 0: Instructions retired
-     * FIXED 1: Clocks unhalted core
-     * FIXED 2: Clocks unhalted ref */
-    //msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
-
-    //    flags |= (1<<22);  /* enable flag */
-    //    flags |= (1<<16);  /* user mode flag */
-    //setBit(flags,16); /* set user mode flag */
-    //setBit(flags,22); /* set enable flag */
-
-    /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/
-
-
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
-            lock_acquire(
-                (int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id)
-       )
-    {
-        /* UNCORE FIXED 0: Uncore cycles */
-        msr_write(cpu_id, MSR_UNCORE_FIXED_CTR_CTRL, 0x01ULL);
-        msr_write(cpu_id, MSR_UNCORE_FIXED_CTR_CTRL, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_FIXED_CTR0, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL4, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL5, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL6, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL7, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PMC0, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PMC1, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PMC2, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PMC3, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PMC4, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PMC5, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PMC6, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PMC7, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_ADDR_OPCODE_MATCH, 0x0ULL);
-        msr_write(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL);
-
-        /* Preinit of PERFEVSEL registers */
-        //clearBit(flags,16); /* set enable flag */
-
-        /*msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL0, flags);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL1, flags);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL2, flags);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL3, flags);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL4, flags);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL5, flags);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL6, flags);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL7, flags);*/
-    }
+    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+    lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    return 0;
 }
 
+uint32_t neh_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint32_t flags = (1ULL<<(1+(index*4)));
+    for(j = 0; j < event->numberOfOptions; j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_COUNT_KERNEL:
+                flags |= (1ULL<<(index*4));
+                break;
+            case EVENT_OPTION_ANYTHREAD:
+                flags |= (1ULL<<(2+(index*4)));
+            default:
+                break;
+        }
+    }
+    return flags;
+}
 
-void perfmon_setupCounterThread_nehalem(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+int neh_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
 {
-    int haveLock = 0;
+    int j;
     uint64_t flags = 0x0ULL;
-    uint64_t reg = nehalem_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
-
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    uint64_t offcore_flags = 0x0ULL;
+
+    flags = (1ULL<<22)|(1ULL<<16);
+    flags |= (event->umask<<8) + event->eventId;
+    /* set custom cfg and cmask */
+    if ((event->cfgBits != 0) &&
+        (event->eventId != 0xB7) &&
+        (event->eventId != 0xBB))
     {
-        haveLock = 1;
+        flags |= ((event->cmask<<8) + event->cfgBits)<<16;
     }
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_ANYTHREAD:
+                    flags |= (1ULL<<21);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL)<<24;
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    offcore_flags |= (event->options[j].value & 0xFF);
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    offcore_flags |= (event->options[j].value & 0xF7)<<7;
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    // Offcore event with additional configuration register
+    // cfgBits contain offset of "request type" bit
+    // cmask contain offset of "response type" bit
+    if (event->eventId == 0xB7)
+    {
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
+    }
+    if ((event->eventId == 0xBB) &&
+        ((cpuid_info.model == NEHALEM_WESTMERE) || (cpuid_info.model == NEHALEM_WESTMERE_M)))
+    {
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, offcore_flags));
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
 
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
+int neh_uncore_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* Build and write the config value for one uncore counter; returns 0 when it runs to the end */
+{
+    int j; /* index into event->options */
+    uint64_t flags = 0x0ULL; /* value for counter_map[index].configRegister */
+    uint64_t mask_flags = 0x0ULL; /* value for MSR_UNCORE_ADDR_OPCODE_MATCH, only written for event 0x35 */
 
-    if ( nehalem_counter_map[index].type == PMC )
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU recorded in socket_lock programs the uncore */
     {
-        flags = (1<<16)|(1<<22);
-
-        /* Intel with standard 8 bit event mask: [7:0] */
-        flags |= (event->umask<<8) + event->eventId;
+        return 0;
+    }
 
-        if (event->cfgBits != 0) /* set custom cfg and cmask */
+    flags = (1ULL<<22); /* bit 22 is always set (presumably the counter enable bit) */
+    flags |= (event->umask<<8) + event->eventId; /* umask shifted to bit 8, event id in the low bits */
+    if (event->cfgBits != 0x0 && event->eventId != 0x35) /* set custom cfg and cmask */
+    {
+        flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+    }
+    else if (event->cfgBits != 0x0 && event->eventId == 0x35) /* event 0x35: cfgBits and cmask go into the match register value instead */
+    {
+        mask_flags |= ((uint64_t)event->cfgBits)<<61;
+        if (event->cmask != 0x0)
         {
-            flags &= ~(0xFFFFU<<16);  /* clear upper 16bits */
-            flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+            mask_flags |= ((uint64_t)event->cmask)<<40;
         }
-
-        msr_write(cpu_id, reg , flags);
-
-        if (perfmon_verbose)
+    }
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++) /* fold every user-supplied option into flags or mask_flags */
         {
-            printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
-                    cpu_id,
-                    LLU_CAST reg,
-                    LLU_CAST flags);
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_ANYTHREAD:
+                    flags |= (1ULL<<21);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL)<<24; /* 8-bit threshold placed at bit 24 */
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    mask_flags |= field64(event->options[j].value,3,37)<<2; /* NOTE(review): field64 is defined elsewhere; confirm a shift of 2 (not 3) is intended for the address field */
+                    break;
+                case EVENT_OPTION_OPCODE:
+                    mask_flags |= field64(event->options[j].value,0,8)<<40; /* same bit position as the cmask value above */
+                    break;
+                default: /* other option types are ignored */
+                    break;
+            }
         }
     }
-    else if ( nehalem_counter_map[index].type == UNCORE )
+    if ((mask_flags != 0x0ULL) && (event->eventId == 0x35)) /* match register is written only if something was configured for event 0x35 */
     {
-        if(haveLock)
+        if ((cpuid_info.model == NEHALEM_BLOOMFIELD) ||
+             (cpuid_info.model == NEHALEM_LYNNFIELD) ||
+             (cpuid_info.model == NEHALEM_LYNNFIELD_M))
         {
-            flags = (1<<22);
+            DEBUG_PLAIN_PRINT(DEBUGLEV_ONLY_ERROR, Register documented in SDM but ADDR_OPCODE_MATCH event not documented for Nehalem architectures); /* message only; the write below is not skipped */
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_UNCORE_ADDR_OPCODE_MATCH, LLU_CAST mask_flags, SETUP_UNCORE_MATCH);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_ADDR_OPCODE_MATCH, mask_flags));
+    }
+    if (flags != currentConfig[cpu_id][index]) /* skip the MSR write when the cached value already matches */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_UNCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags; /* remember what was written for the next call */
+    }
+    return 0;
+}
 
-            /* Intel with standard 8 bit event mask: [7:0] */
-            flags |= (event->umask<<8) + event->eventId;
+int perfmon_setupCounterThread_nehalem(int thread_id, PerfmonEventSet* eventSet) /* Program every event of eventSet on the CPU of thread_id; returns 0 when it runs to the end */
+{
+    int haveLock = 0; /* 1 if this CPU is the one recorded in socket_lock */
+    uint64_t fixed_flags = 0x0ULL; /* OR of all neh_fixed_setup() results, written once after the loop */
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-            if (event->cfgBits != 0) /* set custom cfg and cmask */
-            {
-                flags &= ~(0xFFFFU<<16);  /* clear upper 16bits */
-                flags |= ((event->cmask<<8) + event->cfgBits)<<16;
-            }
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
 
-            msr_write(cpu_id, reg , flags);
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC))) /* core counters in use: zero global control and overflow control first */
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL));
+    }
+    if (haveLock && (eventSet->regTypeMask & ~(0xF))) /* a type bit outside the low four is set: also zero the uncore global control */
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
 
-            if (perfmon_verbose)
-            {
-                printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
-                        cpu_id,
-                        LLU_CAST reg,
-                        LLU_CAST flags);
-            }
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip events whose register type is not enabled in regTypeMask */
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        eventSet->events[i].threadCounter[thread_id].init = TRUE; /* mark the event as set up for this thread */
 
+        switch (type)
+        {
+            case PMC:
+                neh_pmc_setup(cpu_id, index, event); /* return value is not checked */
+                break;
+            case FIXED:
+                fixed_flags |= neh_fixed_setup(cpu_id, index, event);
+                break;
+            case UNCORE:
+                if (haveLock) /* uncore registers are only touched by the socket-lock CPU */
+                {
+                    if (index < NUM_COUNTERS_UNCORE_NEHALEM-1) /* all but the last index go through neh_uncore_setup */
+                    {
+                        neh_uncore_setup(cpu_id, index, event); /* return value is not checked */
+                    }
+                    else
+                    {
+                        VERBOSEPRINTREG(cpu_id, MSR_UNCORE_FIXED_CTR_CTRL, LLU_CAST 0x1ULL, SETUP_UPMCFIX);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_FIXED_CTR_CTRL, 0x1ULL)); /* last index: write 1 to the uncore fixed counter control */
+                    }
+                }
+                break;
+            default: /* other register types are not handled here */
+                break;
         }
     }
-    else if (nehalem_counter_map[index].type == FIXED)
+    if (fixed_flags != 0x0ULL) /* at least one fixed counter was configured */
     {
-        fixed_flags |= (0x2 <<(index*4));
-        msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
     }
+    return 0;
 }
 
-void perfmon_startCountersThread_nehalem(int thread_id)
+int perfmon_startCountersThread_nehalem(int thread_id, PerfmonEventSet* eventSet) /* Zero the counters of all initialized events and enable them through the global control registers; returns 0 when it runs to the end */
 {
     int haveLock = 0;
     uint64_t flags = 0x0ULL;
     uint64_t uflags = 0x0ULL;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) /* this CPU is the one recorded in socket_lock */
     {
         haveLock = 1;
-        msr_write(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL);
-        /* Fixed Uncore counter */
-        uflags = 0x100000000ULL;
     }
 
-    for ( int i=0; i<NUM_PMC; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) /* only events that went through setup */
         {
-            if (nehalem_counter_map[i].type == PMC)
-            {
-                msr_write(cpu_id, nehalem_counter_map[i].counterRegister , 0x0ULL);
-                flags |= (1<<(i-OFFSET_PMC));  /* enable counter */
-            }
-            else if (nehalem_counter_map[i].type == FIXED)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip register types not enabled in regTypeMask */
             {
-                msr_write(cpu_id, nehalem_counter_map[i].counterRegister , 0x0ULL);
-                flags |= (1ULL<<(i+32));  /* enable fixed counter */
+                continue;
             }
-            else if (nehalem_counter_map[i].type == UNCORE)
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter = counter_map[index].counterRegister;
+            eventSet->events[i].threadCounter[thread_id].startData = 0; /* reset the stored start value */
+            eventSet->events[i].threadCounter[thread_id].counterData = 0; /* reset the stored current value */
+            switch(type)
             {
-                if(haveLock)
-                {
-                    msr_write(cpu_id, nehalem_counter_map[i].counterRegister , 0x0ULL);
-                    uflags |= (1<<(i-OFFSET_UPMC));  /* enable uncore counter */
-                }
+                case PMC:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL)); /* zero the hardware counter */
+                    flags |= (1ULL<<(index - cpuid_info.perf_num_fixed_ctr));  /* enable counter */
+                    break;
+                case FIXED:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL)); /* zero the hardware counter */
+                    flags |= (1ULL<<(index+32));  /* enable fixed counter */
+                    break;
+                case UNCORE:
+                    if(haveLock) /* uncore registers are only touched by the socket-lock CPU */
+                    {
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL));
+                        if (index < NUM_COUNTERS_UNCORE_NEHALEM-1)
+                        {
+                            uflags |= (1ULL<<(index-NUM_COUNTERS_CORE_NEHALEM));  /* enable uncore counter */
+                        }
+                        else
+                        {
+                            uflags |= (1ULL<<32); /* last index uses bit 32 of the uncore global control */
+                        }
+                    }
+                    break;
+                default:
+                    break;
             }
         }
     }
 
-    if (perfmon_verbose)
+    if (haveLock && (uflags != 0x0ULL) && (eventSet->regTypeMask & ~(0xF))) /* start the uncore counters collected in uflags */
     {
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_PERF_GLOBAL_CTRL, LLU_CAST flags);
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_UNCORE_PERF_GLOBAL_CTRL, LLU_CAST uflags);
+        VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, LLU_CAST uflags, UNFREEZE_UNCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, uflags));
     }
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
-    if (haveLock) msr_write(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, uflags);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
+    if ((flags != 0x0ULL) && (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))) /* start the core counters collected in flags */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, UNFREEZE_PMC_AND_FIXED);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|flags)); /* bits 63, 62 and the enabled counter bits (presumably clears stale overflow status) */
+    }
+    return 0;
 }
 
-void perfmon_stopCountersThread_nehalem(int thread_id)
+#define NEH_CHECK_OVERFLOW(offset) /* offset: status bit to test; uses i, thread_id, cpu_id, eventSet and counter_result from the expansion site */ \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* new reading is below the stored one */ \
+    { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &tmp)); \
+        if (tmp & (1ULL<<(offset))) /* argument parenthesized so any expression can be passed safely */ \
+        { \
+            eventSet->events[i].threadCounter[thread_id].overflows++; \
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (tmp & (1ULL<<(offset))))); \
+        } \
+    }
+
+#define NEH_CHECK_UNCORE_OVERFLOW(offset) /* offset: uncore status bit to test; uses i, thread_id, cpu_id, eventSet and counter_result from the expansion site */ \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* new reading is below the stored one */ \
+    { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_STATUS, &tmp)); \
+        if (tmp & (1ULL<<(offset))) /* argument parenthesized so any expression can be passed safely */ \
+        { \
+            eventSet->events[i].threadCounter[thread_id].overflows++; \
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_OVF_CTRL, (tmp & (1ULL<<(offset))))); \
+        } \
+    }
+
+int perfmon_stopCountersThread_nehalem(int thread_id, PerfmonEventSet* eventSet) /* Freeze the counters, then read every initialized event into counterData; returns 0 when it runs to the end */
 {
-    uint64_t flags;
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    uint64_t counter_result = 0x0ULL; /* raw value of the counter currently being read */
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
     if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
-        msr_write(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL);
     }
 
-    for ( int i=0; i<NUM_COUNTERS_NEHALEM; i++ )
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED))) /* core counters in use: zero the global control before reading */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+
+    if (haveLock && (eventSet->regTypeMask & ~(0xF))) /* same for the uncore global control, socket-lock CPU only */
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_UNCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) /* only events that went through setup */
         {
-            if (nehalem_counter_map[i].type == UNCORE)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip register types not enabled in regTypeMask */
             {
-                if(haveLock)
-                {
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, nehalem_counter_map[i].counterRegister);
-                }
+                continue;
             }
-            else
+            counter_result = 0x0ULL; /* stays 0 if the switch below reads nothing */
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter = counter_map[index].counterRegister;
+            switch (type)
             {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, nehalem_counter_map[i].counterRegister);
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                    VERBOSEPRINTREG(cpu_id, counter, counter_result, READ_PMC);
+                    NEH_CHECK_OVERFLOW(index - cpuid_info.perf_num_fixed_ctr); /* same bit position used when the counter was enabled */
+                    break;
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                    VERBOSEPRINTREG(cpu_id, counter, counter_result, READ_FIXED);
+                    NEH_CHECK_OVERFLOW(index + 32);
+                    break;
+                case UNCORE:
+                    if(haveLock) /* other CPUs leave counter_result at 0 for uncore events */
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                        VERBOSEPRINTREG(cpu_id, counter, counter_result, READ_UNCORE);
+                        if (index < NUM_COUNTERS_UNCORE_NEHALEM-1)
+                        {
+                            NEH_CHECK_UNCORE_OVERFLOW(index - NUM_COUNTERS_CORE_NEHALEM);
+                        }
+                        else
+                        {
+                            NEH_CHECK_UNCORE_OVERFLOW(32); /* last index is checked at bit 32 */
+                        }
+                    }
+                    break;
+                default:
+                    break;
             }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth); /* store the reading limited to box_map[type].regWidth (field64 is defined elsewhere) */
         }
     }
+    return 0;
+}
+
+int perfmon_readCountersThread_nehalem(int thread_id, PerfmonEventSet* eventSet) /* Read every initialized event while counting is paused, then restore the saved global control values; returns 0 when it runs to the end */
+{
+    int haveLock = 0;
+    uint64_t counter_result = 0x0ULL; /* raw value of the counter currently being read */
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t pmc_flags = 0x0ULL; /* saved MSR_PERF_GLOBAL_CTRL value */
+    uint64_t uncore_flags = 0x0ULL; /* saved MSR_UNCORE_PERF_GLOBAL_CTRL value */
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED))) /* save the core global control, then zero it */
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &pmc_flags));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
 
-    flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
+    if (haveLock && (eventSet->regTypeMask & ~(0xF))) /* save the uncore global control, then zero it (socket-lock CPU only) */
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, &uncore_flags));
+        VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_UNCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
 
-    if((flags & 0x3) || (flags & (0x3ULL<<32)) )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        printf ("Overflow occured \n");
-        printf ("Status: 0x%llX \n", LLU_CAST flags);
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) /* only events that went through setup */
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip register types not enabled in regTypeMask */
+            {
+                continue;
+            }
+            counter_result = 0x0ULL; /* stays 0 if the switch below reads nothing */
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter = counter_map[index].counterRegister;
+            switch (type)
+            {
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                    VERBOSEPRINTREG(cpu_id, counter, counter_result, READ_PMC);
+                    NEH_CHECK_OVERFLOW(index - cpuid_info.perf_num_fixed_ctr);
+                    break;
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                    VERBOSEPRINTREG(cpu_id, counter, counter_result, READ_FIXED);
+                    NEH_CHECK_OVERFLOW(index + 32);
+                    break;
+                case UNCORE:
+                    if(haveLock) /* other CPUs leave counter_result at 0 for uncore events */
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                        VERBOSEPRINTREG(cpu_id, counter, counter_result, READ_UNCORE);
+                        if (index < NUM_COUNTERS_UNCORE_NEHALEM-1)
+                        {
+                            NEH_CHECK_UNCORE_OVERFLOW(index - NUM_COUNTERS_CORE_NEHALEM);
+                        }
+                        else
+                        {
+                            NEH_CHECK_UNCORE_OVERFLOW(32); /* last index is checked at bit 32 */
+                        }
+                    }
+                    break;
+                default:
+                    break;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth); /* store the reading limited to box_map[type].regWidth (field64 is defined elsewhere) */
+        }
     }
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED))) /* write back the core global control value saved above */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, pmc_flags, UNFREEZE_PMC_AND_FIXED);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, pmc_flags));
+    }
+    if (haveLock && (eventSet->regTypeMask & ~(0xF))) /* write back the uncore global control value saved above */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, uncore_flags, UNFREEZE_UNCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, uncore_flags));
+    }
+    return 0;
 }
 
-void perfmon_readCountersThread_nehalem(int thread_id)
+int perfmon_finalizeCountersThread_nehalem(int thread_id, PerfmonEventSet* eventSet) /* Zero the config registers of all initialized events and clear the global control/overflow registers; returns 0 when it runs to the end */
 {
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int haveTileLock = 0; /* 1 if this CPU is the one recorded in tile_lock */
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62); /* value for MSR_PERF_GLOBAL_OVF_CTRL; counter bits are OR-ed in below */
 
     if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+    {
+        haveTileLock = 1;
+    }
 
-    for ( int i=0; i<NUM_COUNTERS_NEHALEM; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) /* only events that went through setup */
         {
-            if (nehalem_counter_map[i].type == UNCORE)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip register types not enabled in regTypeMask */
             {
-                if(haveLock)
-                {
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, nehalem_counter_map[i].counterRegister);
-                }
+                continue;
             }
-            else
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t reg = counter_map[index].configRegister; /* config register to zero; 0 means there is none to write */
+            PciDeviceIndex dev = counter_map[index].device;
+            switch (type)
             {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, nehalem_counter_map[i].counterRegister);
+                case PMC:
+                    ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+                    if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7)) /* event 0xB7: also zero MSR_OFFCORE_RESP0 */
+                    {
+                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+                    }
+                    else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB) &&
+                             ((cpuid_info.model == NEHALEM_WESTMERE) || (cpuid_info.model == NEHALEM_WESTMERE_M)))
+                    {
+                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+                    }
+                    else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0x35) && /* NOTE(review): neh_uncore_setup writes the match register for uncore event 0x35, but this clear sits under case PMC; confirm it is reachable */
+                             ((cpuid_info.model == NEHALEM_WESTMERE) || (cpuid_info.model == NEHALEM_WESTMERE_M)))
+                    {
+                        VERBOSEPRINTREG(cpu_id, MSR_UNCORE_ADDR_OPCODE_MATCH, 0x0ULL, CLEAR_UNCORE_MATCH);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_ADDR_OPCODE_MATCH, 0x0ULL));
+                    }
+                    break;
+                case FIXED:
+                    ovf_values_core |= (1ULL<<(index+32));
+                    break;
+                default:
+                    break;
             }
+            if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UNCORE) && (haveLock)))) /* core types always; UNCORE and higher types only on the socket-lock CPU */
+            {
+                VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+            }
+            eventSet->events[i].threadCounter[thread_id].init = FALSE; /* event must be set up again before the next start */
         }
     }
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC))) /* core counters were in use: write collected overflow bits, zero the global control */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core, CLEAR_OVF_CTRL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, CLEAR_PMC_AND_FIXED_CTRL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+
+    if (haveLock && eventSet->regTypeMask & ~(0xFULL)) /* parsed as haveLock && (regTypeMask & ~0xF), same test as in the other functions */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_OVF_CTRL, 0x0ULL, CLEAR_UNCORE_OVF);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_OVF_CTRL, 0x0ULL)); /* NOTE(review): the core path writes set bits to its overflow control, this writes 0; confirm intended */
+        VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL, CLEAR_UNCORE_CTRL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    return 0;
 }
 
diff --git a/src/includes/perfmon_nehalemEX.h b/src/includes/perfmon_nehalemEX.h
index ea632cf..b093ba9 100644
--- a/src/includes/perfmon_nehalemEX.h
+++ b/src/includes/perfmon_nehalemEX.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_nehalemEX.h
  *
- *      Description:  Header File of perfmon module for Nehalem EX.
+ *      Description:  Header File of perfmon module for Intel Nehalem EX.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,839 +30,1248 @@
  */
 
 #include <perfmon_nehalemEX_events.h>
-#include <perfmon_nehalemEX_groups.h>
+#include <perfmon_nehalemEX_counters.h>
+#include <perfmon_nehalemEX_westmereEX_common.h>
+#include <error.h>
+#include <affinity.h>
 
-#define NUM_COUNTERS_NEHALEMEX 7
 
-//static int perfmon_numCountersNehalemEX = NUM_COUNTERS_NEHALEMEX;
-static int perfmon_numGroupsNehalemEX = NUM_GROUPS_NEHALEMEX;
 static int perfmon_numArchEventsNehalemEX = NUM_ARCH_EVENTS_NEHALEMEX;
+static int perfmon_numCountersNehalemEX = NUM_COUNTERS_NEHALEMEX;
 
 /* This SUCKS: There are only subtle difference between NehalemEX
- * and Westmere EX Uncore. Still one of them is that one field is
- * 1 bit shifted. Thank you Intel for this mess!!! Do you want
+ * and Westmere EX Uncore. Still one of them is that one field is 
+ * 1 bit shifted. Thank you Intel for this mess!!! Do you want 
  * to change the register definitions for every architecture?*/
 
+/*
+ * Per-CPU initialisation for Nehalem EX.
+ *
+ * Calls lock_acquire() on the tile lock and on the socket lock that belong to
+ * cpu_id (presumably claiming them for this CPU if they are still free; the
+ * return values are deliberately not inspected here -- code such as
+ * nex_mbox_setup() later checks ownership by comparing socket_lock[...] with
+ * cpu_id). Afterwards 0 is written to MSR_PEBS_ENABLE.
+ *
+ * Returns 0 when the end of the function is reached. CHECK_MSR_WRITE_ERROR is
+ * a macro defined elsewhere; NOTE(review): it presumably returns early with
+ * an error code when the MSR write fails -- confirm against its definition.
+ */
+int perfmon_init_nehalemEX(int cpu_id)
+{
+    lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    return 0;
+}
 
-void perfmon_init_nehalemEX(PerfmonThread *thread)
+uint32_t nex_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
 {
-    uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
-    perfmon_verbose = 1;
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC0, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC1, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC2, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC3, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
-
-    /* initialize fixed counters
-     * FIXED 0: Instructions retired
-     * FIXED 1: Clocks unhalted core
-     * FIXED 2: Clocks unhalted ref */
-    //msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
-
-    /* Preinit of PERFEVSEL registers */
-    //flags |= (1<<22);  /* enable flag */
-    //flags |= (1<<16);  /* user mode flag */
-
-    /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/
-
-    /* Initialize uncore */
-    /* MBOX */
-    thread->counters[PMC7].id  = 0;
-    thread->counters[PMC8].id  = 1;
-    thread->counters[PMC9].id  = 2;
-    thread->counters[PMC10].id = 3;
-    thread->counters[PMC11].id = 4;
-    thread->counters[PMC12].id = 5;
-    westmereEX_PMunits[MBOX0].ctrlRegister = MSR_M0_PMON_BOX_CTRL;
-    westmereEX_PMunits[MBOX0].statusRegister = MSR_M0_PMON_BOX_STATUS;
-    westmereEX_PMunits[MBOX0].ovflRegister = MSR_M0_PMON_BOX_OVF_CTRL;
-
-    thread->counters[PMC13].id = 0;
-    thread->counters[PMC14].id = 1;
-    thread->counters[PMC15].id = 2;
-    thread->counters[PMC16].id = 3;
-    thread->counters[PMC17].id = 4;
-    thread->counters[PMC18].id = 5;
-    westmereEX_PMunits[MBOX1].ctrlRegister = MSR_M1_PMON_BOX_CTRL;
-    westmereEX_PMunits[MBOX1].statusRegister = MSR_M1_PMON_BOX_STATUS;
-    westmereEX_PMunits[MBOX1].ovflRegister = MSR_M1_PMON_BOX_OVF_CTRL;
-
-    /* BBOX */
-    thread->counters[PMC19].id = 0;
-    thread->counters[PMC20].id = 1;
-    thread->counters[PMC21].id = 2;
-    thread->counters[PMC22].id = 3;
-    westmereEX_PMunits[BBOX0].ctrlRegister = MSR_B0_PMON_BOX_CTRL;
-    westmereEX_PMunits[BBOX0].statusRegister =  MSR_B0_PMON_BOX_STATUS;
-    westmereEX_PMunits[BBOX0].ovflRegister = MSR_B0_PMON_BOX_OVF_CTRL;
-
-    thread->counters[PMC23].id = 0;
-    thread->counters[PMC24].id = 1;
-    thread->counters[PMC25].id = 2;
-    thread->counters[PMC26].id = 3;
-    westmereEX_PMunits[BBOX1].ctrlRegister = MSR_B1_PMON_BOX_CTRL;
-    westmereEX_PMunits[BBOX1].statusRegister =  MSR_B1_PMON_BOX_STATUS;
-    westmereEX_PMunits[BBOX1].ovflRegister = MSR_B1_PMON_BOX_OVF_CTRL;
-
-    /* RBOX */
-    thread->counters[PMC27].id = 0;
-    thread->counters[PMC28].id = 1;
-    thread->counters[PMC29].id = 2;
-    thread->counters[PMC30].id = 3;
-    thread->counters[PMC31].id = 4;
-    thread->counters[PMC32].id = 5;
-    thread->counters[PMC33].id = 6;
-    thread->counters[PMC34].id = 7;
-    westmereEX_PMunits[RBOX0].ctrlRegister = MSR_R0_PMON_BOX_CTRL;
-    westmereEX_PMunits[RBOX0].statusRegister =  MSR_R0_PMON_BOX_STATUS;
-    westmereEX_PMunits[RBOX0].ovflRegister = MSR_R0_PMON_BOX_OVF_CTRL;
-
-    thread->counters[PMC35].id = 0;
-    thread->counters[PMC36].id = 1;
-    thread->counters[PMC37].id = 2;
-    thread->counters[PMC38].id = 3;
-    thread->counters[PMC39].id = 4;
-    thread->counters[PMC40].id = 5;
-    thread->counters[PMC41].id = 6;
-    thread->counters[PMC42].id = 7;
-    westmereEX_PMunits[RBOX1].ctrlRegister = MSR_R1_PMON_BOX_CTRL;
-    westmereEX_PMunits[RBOX1].statusRegister =  MSR_R1_PMON_BOX_STATUS;
-    westmereEX_PMunits[RBOX1].ovflRegister = MSR_R1_PMON_BOX_OVF_CTRL;
-
-    /* WBOX */
-    thread->counters[PMC43].id = 0;
-    thread->counters[PMC44].id = 1;
-    thread->counters[PMC45].id = 2;
-    thread->counters[PMC46].id = 3;
-    thread->counters[PMC47].id = 31;
-    westmereEX_PMunits[WBOX].ctrlRegister   = MSR_W_PMON_BOX_CTRL;
-    westmereEX_PMunits[WBOX].statusRegister = MSR_W_PMON_BOX_STATUS;
-    westmereEX_PMunits[WBOX].ovflRegister   = MSR_W_PMON_BOX_OVF_CTRL;
-
-    thread->counters[PMC48].id = 0;
-    westmereEX_PMunits[UBOX].ctrlRegister   = MSR_U_PMON_GLOBAL_CTRL;
-    westmereEX_PMunits[UBOX].statusRegister = MSR_U_PMON_GLOBAL_STATUS;
-    westmereEX_PMunits[UBOX].ovflRegister   = MSR_U_PMON_GLOBAL_OVF_CTRL;
-
-    /* Set IDs for all CBOXes */
-    for (int i=PMC49; i<=PMC88; i+= 5)
-    {
-        for(int j=0; j<5; j++)
+    int j;
+    /* Every fixed counter owns a 4-bit field at bit offset index*4. Bit 1 of
+     * that field is always set; the options below add bit 0 (count kernel)
+     * and bit 2 (any thread). */
+    uint32_t flags = (1ULL<<(1+(index*4)));
+    for(j = 0; j < event->numberOfOptions; j++)
+    {
+        switch (event->options[j].type)
         {
-            thread->counters[i].id = j;
+            case EVENT_OPTION_COUNT_KERNEL:
+                flags |= (1ULL<<(index*4));
+                break;
+            case EVENT_OPTION_ANYTHREAD:
+                flags |= (1ULL<<(2+(index*4)));
+                /* explicit break: do not rely on falling into 'default' */
+                break;
+            default:
+                break;
         }
     }
-    westmereEX_PMunits[CBOX0].ctrlRegister   = MSR_C0_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX0].statusRegister = MSR_C0_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX0].ovflRegister   = MSR_C0_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX1].ctrlRegister   = MSR_C1_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX1].statusRegister = MSR_C1_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX1].ovflRegister   = MSR_C1_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX2].ctrlRegister   = MSR_C2_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX2].statusRegister = MSR_C2_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX2].ovflRegister   = MSR_C2_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX3].ctrlRegister   = MSR_C3_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX3].statusRegister = MSR_C3_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX3].ovflRegister   = MSR_C3_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX4].ctrlRegister   = MSR_C4_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX4].statusRegister = MSR_C4_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX4].ovflRegister   = MSR_C4_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX5].ctrlRegister   = MSR_C5_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX5].statusRegister = MSR_C5_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX5].ovflRegister   = MSR_C5_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX6].ctrlRegister   = MSR_C6_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX6].statusRegister = MSR_C6_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX6].ovflRegister   = MSR_C6_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX7].ctrlRegister   = MSR_C7_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX7].statusRegister = MSR_C7_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX7].ovflRegister   = MSR_C7_PMON_BOX_OVF_CTRL;
-
-    thread->counters[PMC99].id = 0;
-    thread->counters[PMC100].id = 1;
-    thread->counters[PMC101].id = 2;
-    thread->counters[PMC102].id = 3;
-    westmereEX_PMunits[SBOX0].ctrlRegister   = MSR_S0_PMON_BOX_CTRL;
-    westmereEX_PMunits[SBOX0].statusRegister = MSR_S0_PMON_BOX_STATUS;
-    westmereEX_PMunits[SBOX0].ovflRegister   = MSR_S0_PMON_BOX_OVF_CTRL;
-    thread->counters[PMC103].id = 0;
-    thread->counters[PMC104].id = 1;
-    thread->counters[PMC105].id = 2;
-    thread->counters[PMC106].id = 3;
-    westmereEX_PMunits[SBOX1].ctrlRegister   = MSR_S1_PMON_BOX_CTRL;
-    westmereEX_PMunits[SBOX1].statusRegister = MSR_S1_PMON_BOX_STATUS;
-    westmereEX_PMunits[SBOX1].ovflRegister   = MSR_S1_PMON_BOX_OVF_CTRL;
-
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
-            lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id))
-    {
-        msr_write(cpu_id, MSR_W_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_W_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_W_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_W_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_W_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_W_PMON_FIXED_CTR, 0x0ULL);
-
-        msr_write(cpu_id, MSR_M0_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL4, 0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL5, 0x0ULL);
-
-        msr_write(cpu_id, MSR_M1_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL4, 0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL5, 0x0ULL);
-
-        msr_write(cpu_id, MSR_B0_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL3, 0x0ULL);
-
-        msr_write(cpu_id, MSR_B1_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL3, 0x0ULL);
-
-        msr_write(cpu_id, MSR_R0_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL4, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL5, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL6, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL7, 0x0ULL);
-
-        msr_write(cpu_id, MSR_R1_PMON_BOX_CTRL,   0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL8,  0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL9,  0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL10, 0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL11, 0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL12, 0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL13, 0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL14, 0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL15, 0x0ULL);
-
-        msr_write(cpu_id, MSR_U_PMON_GLOBAL_EVNT_SEL, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL3, 0x0ULL);
-
-        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL3, 0x0ULL);
-
-        flags = 0x0UL;
-        flags |= (1<<29); /* reset all */
-        msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, flags );
-    }
+    return flags;
 }
 
-/* MBOX macros */
-#define MBOX_GATE_NEHEX(NUM)  \
-flags = 0x41ULL; \
-switch (event->cfgBits)  \
-{  \
-    case 0x00:   /* primary Event */  \
-        flags |= (event->eventId<<9);  \
-        break;  \
-    case 0x01: /* secondary Events */  \
-        /* TODO fvid index is missing defaults to 0 */   \
-        flags |= (1<<7); /* toggle flag mode */   \
-        flags |= (event->eventId<<19);   \
-        switch (event->eventId)   \
-        {   \
-            case 0x00: /* CYCLES_DSP_FILL: DSP */   \
-                {   \
-                    uint64_t dsp_flags = 0x0ULL;   \
-                    dsp_flags |= (event->umask<<7);  \
-                    msr_write(cpu_id, MSR_M##NUM##_PMON_DSP, dsp_flags);   \
-                }   \
-                break;   \
-            case 0x01: /* CYCLES_SCHED_MODE: ISS */   \
-                {   \
-                    uint32_t iss_flags = 0x0UL;   \
-                    iss_flags |= (event->umask<<4);   \
-                    msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
-                }    \
-                break;   \
-            case 0x05: /* CYCLES_PGT_STATE: PGT */   \
-                {   \
-                    uint32_t pgt_flags = 0x0UL;   \
-                    pgt_flags |= (event->umask<<6);   \
-                    msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags);   \
-                }    \
-                break;   \
-            case 0x06: /* BCMD_SCHEDQ_OCCUPANCY: MAP */   \
-                {   \
-                    uint32_t map_flags = 0x0UL;   \
-                    map_flags |= (event->umask<<6);   \
-                    msr_write(cpu_id, MSR_M##NUM##_PMON_MAP, map_flags);   \
-                }   \
-                break;   \
-        }    \
-    break;   \
-    case 0x02: /* DRAM_CMD: PLD/ISS */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t pld_flags = 0x0UL;   \
-            uint32_t iss_flags = 0x0UL;   \
-            pld_flags |= (event->umask<<8);   \
-            if (event->cmask != 0)   \
-            {   \
-                iss_flags |= (event->cmask<<7);   \
-                pld_flags |= 1; /* toggle cmd flag */   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_PLD, pld_flags);   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
-        }   \
-        break;   \
-    case 0x03: /* DSP_FILL: DSP */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint64_t dsp_flags = 0x0ULL;   \
-            dsp_flags |= (event->umask<<7);   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_DSP, dsp_flags);   \
-        }   \
-        break;   \
-    case 0x04: /* DRAM_MISC: PLD */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint64_t pld_flags = 0x0ULL;   \
-            switch (event->cmask)   \
-            {   \
-                case 0x0:   \
-                    pld_flags |= (1<<16);   \
-                    pld_flags |= (event->umask<<19);   \
-                    break;   \
-                case 0x1:   \
-                    pld_flags |= (event->umask<<18);   \
-                    break;   \
-                case 0x2:   \
-                    pld_flags |= (event->umask<<17);   \
-                    break;   \
-                case 0x3:   \
-                    pld_flags |= (event->umask<<7);   \
-                    break;   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_PLD, pld_flags);   \
-        }   \
-        break;   \
-    case 0x05: /* FRM_TYPE: ISS */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t iss_flags = 0x0UL;   \
-            iss_flags |= event->umask;   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
-        }   \
-    break;   \
-    case 0x06: /* FVC_EV0: FVC */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t fvc_flags = 0x0UL;   \
-            fvc_flags |= (event->umask<<11);   \
-            if (event->umask == 0x5)   \
-            {   \
-                fvc_flags |= (event->cmask<<5);   \
-            }   \
-            else   \
-            {   \
-                fvc_flags |= (event->cmask<<8);   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
-            VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV0) \
-        }   \
-        break;   \
-    case 0x07: /* FVC_EV1: FVC */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t fvc_flags = 0x0UL;   \
-            fvc_flags |= (event->umask<<14);   \
-            if (event->umask == 0x5)   \
-            {   \
-                fvc_flags |= (event->cmask<<5);   \
-            }   \
-            else   \
-            {   \
-                fvc_flags |= (event->cmask<<8);   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
-            VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV1) \
-        }   \
-        break;   \
-    case 0x08: /* FVC_EV2: FVC */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t fvc_flags = 0x0UL;   \
-            fvc_flags |= (event->umask<<17);   \
-            if (event->umask == 0x5)   \
-            {   \
-                fvc_flags |= (event->cmask<<5);   \
-            }   \
-            else   \
-            {   \
-                fvc_flags |= (event->cmask<<8);   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
-            VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV2) \
-        }   \
-        break;   \
-    case 0x09: /* FVC_EV3: FVC(ZDP) */   \
-    flags |= (event->eventId<<9);   \
-    {   \
-        uint32_t fvc_flags = 0x0UL;   \
-        fvc_flags |= (event->umask<<20);   \
-        if (event->umask == 0x5)   \
-        {   \
-            fvc_flags |= (event->cmask<<5);   \
-        }   \
-        else   \
-        {   \
-            fvc_flags |= (event->cmask<<8);   \
-        }   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
-    }   \
-    break;   \
-    case 0x0A: /* ISS_SCHED: ISS */   \
-    flags |= (event->eventId<<9);   \
-    {   \
-        uint32_t iss_flags = 0x0UL;   \
-        iss_flags |= (event->umask<<10);   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
-    }   \
-    break;   \
-    case 0x0B: /* PGT_PAGE_EV: PGT */   \
-    flags |= (event->eventId<<9);   \
-    {   \
-        uint32_t pgt_flags = 0x0UL;   \
-        pgt_flags |= event->umask;   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags);   \
-    }   \
-    break;   \
-    case 0x0C: /* PGT_PAGE_EV2: PGT */   \
-    flags |= (event->eventId<<9);   \
-    {   \
-        uint32_t pgt_flags = 0x0UL;   \
-        pgt_flags |= (event->umask<<11);   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags);   \
-    }   \
-    break;   \
-    case 0x0D: /* THERM_TRP_DN: THR */   \
-    flags |= (event->eventId<<9);   \
-    {   \
-        uint32_t thr_flags = 0x0UL;   \
-        thr_flags |= (1<<3);   \
-        thr_flags |= (event->umask<<9);   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, thr_flags);   \
-    }   \
-    break;   \
-}
-
-
-void perfmon_setupCounterThread_nehalemEX(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+/*
+ * Build and program the event-select register of general-purpose counter
+ * 'index' on 'cpu_id' for 'event'.
+ *
+ * The value always has bits 16 and 22 set and carries umask/eventId in its
+ * low 16 bits; cfgBits/cmask and the event options contribute further bits.
+ * For event 0xB7 the function additionally writes MSR_OFFCORE_RESP0. The
+ * config register itself is only rewritten when the new value differs from
+ * the cached currentConfig[cpu_id][index].
+ *
+ * Returns 0 when the end of the function is reached. CHECK_MSR_WRITE_ERROR is
+ * a macro defined elsewhere; NOTE(review): it presumably returns early with
+ * an error code when a write fails -- confirm against its definition.
+ */
+int nex_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
 {
+    int j;
     uint64_t flags = 0x0ULL;
-    int haveLock = 0;
+    uint64_t offcore_flags = 0x0ULL;
     uint64_t reg = counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    flags |= (1ULL<<22)|(1ULL<<16);
+    /* Intel with standard 8 bit event mask: [7:0] */
+    flags |= (event->umask<<8) + event->eventId;
+
+    /* The offcore response events (0xB7, 0xBB) must be skipped here: for 0xB7
+     * cfgBits/cmask are consumed below as bit indices for MSR_OFFCORE_RESP0
+     * and must not leak into the event select. Both IDs have to be excluded,
+     * hence '&&' -- '!= 0xB7 || != 0xBB' would be true for every event. */
+    if (event->cfgBits != 0 &&
+       ((event->eventId != 0xB7) && (event->eventId != 0xBB)))
     {
-        haveLock = 1;
+        /* set custom cfg and cmask */
+        flags |= ((event->cmask<<8) + event->cfgBits)<<16;
     }
 
-    switch (counter_map[index].type)
+    if (event->numberOfOptions > 0)
     {
-        case PMC:
-            flags = (1<<22)|(1<<16);
-
-            /* Intel with standard 8 bit event mask: [7:0] */
-            flags |= (event->umask<<8) + event->eventId;
-
-            if (event->cfgBits != 0) /* set custom cfg and cmask */
+        for (j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
             {
-                flags &= ~(0xFFFFU<<16);  /* clear upper 16bits */
-                flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL)<<24;
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    offcore_flags |= (event->options[j].value & 0xFFULL);
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    offcore_flags |= (event->options[j].value & 0xF7ULL)<<8;
+                    break;
+                default:
+                    break;
             }
+        }
+    }
+    if (event->eventId == 0xB7)
+    {
+        /* Unless cfgBits or cmask is 0xFF, they name the two bits to set and
+         * replace whatever MATCH0/MATCH1 assembled above. */
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
+    }
+    /* Skip the MSR write when the register already holds this configuration. */
+    if (flags != currentConfig[cpu_id][index])
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+        VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_PMC)
+        currentConfig[cpu_id][index] = flags;
+    }
 
-            msr_write(cpu_id, reg , flags);
-            VERBOSEPRINTREG(cpu_id, reg, flags, PMC_EV_SEL)
-            break;
+    return 0;
+}
 
-        case FIXED:
-            fixed_flags |= (0x2<<(index*4));
-            msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
-            break;
 
-        case MBOX0:
-            if (haveLock)
+
+
+
+int nex_mbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint64_t flags = 0x41ULL;
+    uint64_t subflags1 = 0x0ULL;
+    uint64_t subflags2 = 0x0ULL;
+    int number;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (((counter_map[index].configRegister& 0xFF0) == 0xCA0) ||
+       ((counter_map[index].configRegister& 0xFF0) == 0xCB0))
+        number = 0;
+    else
+        number = 1;
+
+    if (event->numberOfOptions > 0 && (event->cfgBits == 0x02 || event->cfgBits == 0x04))
+    {
+        for (int j=0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
             {
-                MBOX_GATE_NEHEX(0);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, MBOX0_CTRL)
+                case EVENT_OPTION_MATCH0:
+                    subflags2 = (event->options[j].value & 0x3FFFFFFFFULL);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ADDR_MATCH], subflags2));
+                    VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ADDR_MATCH], subflags2, SETUP_MBOX_ADDR_MATCH);
+                    break;
+                case EVENT_OPTION_MASK0:
+                    subflags2 = ((event->options[j].value & 0x1FFFFFFC0ULL)>>6);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ADDR_MASK], subflags2));
+                    VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ADDR_MASK], subflags2, SETUP_MBOX_ADDR_MASK);
+                    break;
+                default:
+                    break;
             }
+        }
+        subflags2 = 0x0ULL;
+    }
+    switch (event->cfgBits)
+    {
+        case 0x00:
+            flags |= (event->eventId & 0x1FULL)<<9; 
             break;
-
-        case MBOX1:
-            if (haveLock)
+        case 0x01:
+            flags |= (1ULL<<7);
+            flags |= (event->eventId & 0x7ULL)<<19;
+            switch (event->eventId)
             {
-                MBOX_GATE_NEHEX(1);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, MBOX1_CTRL)
+                case 0x00:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][DSP], &subflags1));
+                    subflags1 |= (event->umask & 0xFULL)<<7;
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][DSP], subflags1));
+                    VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][DSP], subflags1, SETUP_MBOX_DSP);
+                    break;
+                case 0x01:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], &subflags1));
+                    subflags1 |= (event->umask & 0x7ULL)<<4;
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], subflags1));
+                    VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ISS], subflags1, SETUP_MBOX_ISS);
+                    break;
+                case 0x05:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], &subflags1));
+                    subflags1 |= (event->umask & 0x1ULL)<<6;
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], subflags1));
+                    VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][PGT], subflags1, SETUP_MBOX_PGT);
+                    break;
+                case 0x06:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][MAP], &subflags1));
+                    subflags1 |= (event->umask & 0x7ULL)<<6;
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][MAP], subflags1));
+                    VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][MAP], subflags1, SETUP_MBOX_MAP);
+                    break;
             }
             break;
-
-        case BBOX0:
-        case BBOX1:
-            if (haveLock)
+        case 0x02:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PLD], &subflags1));
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], &subflags2));
+            subflags1 |= (event->umask & 0x1FULL)<<8;
+            if ((event->cmask & 0xF0ULL) != 0)
             {
-                flags = 0x1ULL; /* set enable bit */
-                flags |=  (event->eventId<<1);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, BBOX_CTRL)
+                subflags1 |= (1ULL<<0);
             }
+            if ((event->cmask & 0xFULL) != 0)
+            {
+                subflags2 |= (event->cmask & 0x7ULL)<<7;
+            }
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PLD], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][PLD], subflags1, SETUP_MBOX_PLD);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], subflags2));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ISS], subflags2, SETUP_MBOX_ISS);
             break;
-
-        case RBOX0:
-            if (haveLock)
+        case 0x03:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][DSP], &subflags1));
+            subflags1 |= (event->umask & 0xFULL)<<7;
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][DSP], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][DSP], subflags1, SETUP_MBOX_DSP);
+            break;
+        case 0x04:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PLD], &subflags1));
+            switch (event->cmask)
             {
-                RBOX_GATE(0);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, RBOX0_CTRL)
+                case 0x0:
+                    subflags1 |= (1ULL<<16);
+                    subflags1 |= (event->umask & 0x1FULL)<<19;
+                    break;
+                case 0x1:
+                    subflags1 |= (event->umask & 0x1ULL)<<18;
+                    break;
+                case 0x2:
+                    subflags1 |= (event->umask & 0x1ULL)<<17;
+                    break;
+                case 0x3:
+                    subflags1 |= (event->umask & 0x1ULL)<<7;
+                    break;
             }
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PLD], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][PLD], subflags1, SETUP_MBOX_PLD);
             break;
-
-        case RBOX1:
-            if (haveLock)
+        case 0x05:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], &subflags1));
+            subflags1 |= (event->umask & 0xFULL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ISS], subflags1, SETUP_MBOX_ISS);
+            break;
+        case 0x06:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], &subflags1));
+            subflags1 |= (event->umask & 0x7ULL)<<12;
+            if (event->umask == 0x5)
             {
-                RBOX_GATE(1);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, RBOX1_CTRL)
+                subflags1 |= (event->cmask & 0x7ULL)<<6;
             }
+            else
+            {
+                subflags1 |= (event->cmask & 0x7ULL)<<9;
+            }
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ZDP], subflags1, SETUP_MBOX_ZDP);
             break;
-
-        case WBOX:
-            if (haveLock)
+        case 0x07:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], &subflags1));
+            subflags1 |= (event->umask & 0x7ULL)<<15;
+            if (event->umask == 0x5)
             {
-                if (event->eventId == 0xFF)  /* Fixed Counter */
-                {
-                    flags = 0x1ULL; /* set enable bit */
-                }
-                else
-                {
-                    flags |= (1<<22); /* set enable bit */
-                    flags |= (event->umask<<8) + event->eventId;
-                }
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, WBOX_CTRL)
+                subflags1 |= (event->cmask & 0x7ULL)<<6;
             }
+            else
+            {
+                subflags1 |= (event->cmask & 0x7ULL)<<9;
+            }
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ZDP], subflags1, SETUP_MBOX_ZDP);
             break;
-
-        case UBOX:
-            if (haveLock)
+        case 0x08:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], &subflags1));
+            subflags1 |= (event->umask & 0x7ULL)<<18;
+            if (event->umask == 0x5)
             {
-                flags = 0x0ULL;
-                flags |= (1<<22);
-                flags |= event->eventId;
-                fprintf(stderr, "Setup UBOX with value 0x%llx in register 0x%llx, event 0x%x \n", LLU_CAST flags, LLU_CAST reg,event->eventId);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, UBOX_CTRL)
+                subflags1 |= (event->cmask & 0x7ULL)<<6;
             }
+            else
+            {
+                subflags1 |= (event->cmask & 0x7ULL)<<9;
+            }
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ZDP], subflags1, SETUP_MBOX_ZDP);
             break;
-
-        case CBOX0:
-        case CBOX1:
-        case CBOX2:
-        case CBOX3:
-        case CBOX4:
-        case CBOX5:
-        case CBOX6:
-        case CBOX7:
-            if (haveLock)
-            {
-                flags = 0x0ULL;
-                flags |= (1<<22);
-                flags |= (event->umask<<8) + event->eventId;
-                fprintf(stderr, "Setup CBOX with value 0x%llx in register 0x%llx, event 0x%x umask 0x%x \n", LLU_CAST flags, LLU_CAST reg,event->eventId, event->umask);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, CBOX_CTRL)
+        case 0x09:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], &subflags1));
+            subflags1 |= (event->umask & 0x7ULL)<<21;
+            if (event->umask == 0x5)
+            {
+                subflags1 |= (event->cmask & 0x7ULL)<<6;
+            }
+            else
+            {
+                subflags1 |= (event->cmask & 0x7ULL)<<9;
             }
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ZDP], subflags1, SETUP_MBOX_ZDP);
+            break;
+        case 0x0A:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], &subflags1));
+            subflags1 |= (event->umask & 0x1ULL)<<10;
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ISS], subflags1, SETUP_MBOX_ISS);
+            break;
+        case 0x0B:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], &subflags1));
+            subflags1 |= (event->umask & 0x1ULL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][PGT], subflags1, SETUP_MBOX_PGT);
+            break;
+        case 0x0C:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], &subflags1));
+            subflags1 |= (event->umask & 0x1ULL)<<11;
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][PGT], subflags1, SETUP_MBOX_PGT);
             break;
-        case SBOX0:
-        case SBOX1:
-            if (haveLock)
+        case 0x0D:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][THR], &subflags1));
+            subflags1 |= (event->umask & 0x3ULL)<<9;
+            if (event->cmask == 0x0)
+            {
+                subflags1 |= (1ULL<<3);
+            }
+            else
+            {
+                subflags1 &= ~(1ULL<<3);
+                subflags1 |= (event->cmask & 0x7ULL)<<4;
+            }
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][THR], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][THR], subflags1, SETUP_MBOX_THR);
+            break;
+        case 0x0E:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][THR], &subflags1));
+            subflags1 |= (event->umask & 0x3ULL)<<7;
+            if (event->cmask == 0x0)
+            {
+                subflags1 |= (1ULL<<3);
+            }
+            else
             {
-                flags = 0x0ULL;
-                flags |= (1<<22);
-                flags |= (event->umask<<8);
-                flags |= (event->eventId);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, SBOX_CTRL)
+                subflags1 &= ~(1ULL<<3);
+                subflags1 |= (event->cmask & 0x7ULL)<<4;
             }
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][THR], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][THR], subflags1, SETUP_MBOX_THR);
             break;
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_MBOX)
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
 
-        default:
-            /* should never be reached */
+
+int nex_rbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{   /* Set up one R-Box counter: write the sub-register chosen by eventId/umask, then the counter's config register. Returns 0 at the end of the visible path, -1 if index is in neither R-Box register range. */
+    uint64_t flags = 0x01ULL;   /* value for the config register; bit 0 is always set */
+    uint64_t subflags = 0x0ULL; /* value for the IPERF/QLX sub-register */
+    int number;                 /* first index into nex_wex_rbox_regs (0 or 1) */
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0; /* this cpu is not the one recorded in socket_lock: nothing to program */
+    }
+    if ((counter_map[index].configRegister & 0xFF0) == 0xE10)
+        number = 0;
+    else if ((counter_map[index].configRegister & 0xFF0) == 0xE30)
+        number = 1;
+    else return -1; /* neither register range matched: 'number' would otherwise be read uninitialized below */
+    switch (event->eventId) {
+        case 0x00: /* eventId 0: one of the IPERF0/IPERF1 sub-registers, selected by umask */
+            flags |= (event->umask & 0x1FULL)<<1;
+            subflags |= (event->cfgBits<<event->cmask); /* cmask is used as the shift amount here */
+            switch (event->umask)
+            {
+                case 0x00:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF0][0], subflags));
+                    break;
+                case 0x01:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF1][0], subflags));
+                    break;
+                case 0x06:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF0][1], subflags));
+                    break;
+                case 0x07:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF1][1], subflags));
+                    break;
+                case 0x0C:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF0][2], subflags));
+                    break;
+                case 0x0D:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF1][2], subflags));
+                    break;
+                case 0x12:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF0][3], subflags));
+                    break;
+                case 0x13:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF1][3], subflags));
+                    break;
+            }
+            break;
+        case 0x01: /* eventId 1: one of the QLX sub-registers, selected by umask */
+            flags |= (event->umask & 0x1FULL)<<1;
+            subflags |= (event->cfgBits & 0xFULL);
+            if (event->cmask != 0x0)
+            {
+                subflags |= (event->cmask & 0xFULL)<<4;
+            }
+            switch (event->umask) /* the odd umask of each pair writes the same register with subflags shifted left by 8 */
+            {
+                case 0x02:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][0], subflags));
+                    break;
+                case 0x03:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][0], (subflags<<8)));
+                    break;
+                case 0x08:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][1], subflags));
+                    break;
+                case 0x09:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][1], (subflags<<8)));
+                    break;
+                case 0x0E:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][2], subflags));
+                    break;
+                case 0x0F:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][2], (subflags<<8)));
+                    break;
+                case 0x14:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][3], subflags));
+                    break;
+                case 0x15:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][3], (subflags<<8)));
+                    break;
+            }
             break;
     }
+    if (flags != currentConfig[cpu_id][index]) /* skip the MSR write when the cached value already matches */
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_RBOX)
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
 }
 
+int nex_bbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{ /* Set up one B-Box counter: optional match/mask filter registers first, then the counter's config register. */
+    int j;
+    uint64_t flags = 0x1ULL; /* set enable bit */
+    uint64_t reg = counter_map[index].configRegister;
+    RegisterType type = counter_map[index].type; /* used to index box_map for the filter registers */
 
-/* Actions for Performance Monitoring Session:
- *
- * Core Counters (counter is always enabled in PERVSEL register):
- * 1) Disable counters in global ctrl Register MSR_PERF_GLOBAL_CTRL
- * 2) Zero according counter registers
- * 3) Set enable bit in global register flag
- * 4) Write global register flag
- *
- * Uncore Counters (only one core per socket):
- * 1) Set reset flag in global U Box control register
- * 2) Zero according counter registers
- * 3) Set enable bit in Box control register
- * 4) Write according uncore Box ctrl register
- * 3) Set enable bit in global U Box control register
- * */
-
-void perfmon_startCountersThread_nehalemEX(int thread_id)
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0; /* this cpu is not the one recorded in socket_lock: nothing to program */
+    }
+
+    flags |= (event->eventId<<1); /* event id sits directly above the enable bit */
+    if (event->numberOfOptions > 0)
+    {
+        for (j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_MATCH0: /* option value, limited to its low 60 bits, goes to filterRegister1 */
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, event->options[j].value & 0xFFFFFFFFFFFFFFFULL));
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, LLU_CAST event->options[j].value & 0xFFFFFFFFFFFFFFFULL, SETUP_BBOX_MATCH);
+                    break;
+                case EVENT_OPTION_MASK0: /* same masking, written to filterRegister2 */
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, event->options[j].value & 0xFFFFFFFFFFFFFFFULL));
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, LLU_CAST event->options[j].value & 0xFFFFFFFFFFFFFFFULL, SETUP_BBOX_MASK);
+                    break;
+                default: /* every other option type is ignored */
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index]) /* skip the MSR write when the cached value already matches */
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags));
+        VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_BBOX);
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int nex_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* Set up one C-Box counter by building and writing its config register value. */
 {
-    int haveLock = 0;
+    int j;
     uint64_t flags = 0x0ULL;
-    uint32_t uflags[NUM_UNITS];
-    int enable_ubox = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t reg = counter_map[index].configRegister;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0; /* this cpu is not the one recorded in socket_lock: nothing to program */
+    }
+
+    flags = (1ULL<<22); /* bit 22 is labelled "set enable bit" in the W-Box setup of this file */
+    flags |=(event->umask<<8) + event->eventId; /* eventId in the low bits, umask starting at bit 8 */
+    if (event->numberOfOptions > 0)
+    {
+        for (j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD: /* NOTE(review): 5-bit mask here, the W-Box/S-Box setups use 0xFF - confirm */
+                    flags |= ((event->options[j].value & 0x1FULL) << 24);
+                    break;
+                default: /* every other option type is ignored */
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index]) /* skip the MSR write when the cached value already matches */
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags));
+        VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_CBOX);
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+int nex_wbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{ /* Set up one W-Box counter by building and writing its config register value. */
+    uint64_t flags = 0x0ULL;
+    uint64_t reg = counter_map[index].configRegister;
+    int j;
 
-    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the cpu recorded in socket_lock goes on */
     {
-        uint32_t ubflags = 0x0UL;
-        ubflags |= (1<<29); /* reset all */
-        haveLock = 1;
-        //        msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
-        //       VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags, UBOX_GLOBAL_CTRL)
+        return 0;
     }
+    flags |= (1ULL<<22); /* set enable bit */
+    flags |= (event->umask<<8) + event->eventId; /* eventId in the low bits, umask starting at bit 8 */
+    if (event->numberOfOptions > 0)
+    {
+        for (j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD: /* low 8 bits of the option value, placed at bit 24 */
+                    flags |= ((event->options[j].value & 0xFFULL) << 24);
+                    break;
+                default: /* every other option type is ignored */
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index]) /* skip the MSR write when the cached value already matches */
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags));
+        VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_WBOX);
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int nex_sbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{ /* Set up one S-Box counter; for eventId 0 with match/mask options the MM_CFG register is cleared, the filters written, then MM_CFG bit 63 set. */
+    int j;
+    int match_mask = 0; /* 1 when eventId is 0 and a MATCH0 or MASK0 option is present */
+    uint64_t flags = 0x0ULL;
+    uint64_t reg = counter_map[index].configRegister;
+    RegisterType type = counter_map[index].type; /* SBOX0 selects the S0 MM_CFG register, anything else the S1 one */
 
-    for ( int i=0; i<NUM_UNITS; i++ )
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the cpu recorded in socket_lock goes on */
     {
-        uflags[i] = 0x0UL;
+        return 0;
     }
 
-    for ( int i=0; i<NUM_PMC; i++ )
+    flags = (1ULL<<22); /* bit 22 is labelled "set enable bit" in the W-Box setup of this file */
+    flags |=(event->umask<<8) + event->eventId; /* eventId in the low bits, umask starting at bit 8 */
+    if (event->numberOfOptions > 0)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE) {
-            if (westmereEX_counter_map[i].type == PMC)
+        if (event->eventId == 0x0) /* match/mask filtering is only handled for eventId 0 */
+        {
+            for (j = 0; j < event->numberOfOptions; j++)
             {
-                msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
-                flags |= (1<<(i-OFFSET_PMC));  /* enable counter */
+                if ((event->options[j].type == EVENT_OPTION_MATCH0) ||
+                    (event->options[j].type == EVENT_OPTION_MASK0))
+                {
+                    match_mask = 1;
+                    break;
+                }
+            }
+            if (match_mask) { /* write 0 to MM_CFG before the filter registers are changed */
+                
+                if (type == SBOX0)
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_S0_PMON_MM_CFG, 0x0ULL, CLEAR_MM_CFG);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S0_PMON_MM_CFG, 0x0ULL));
+                }
+                else
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_S1_PMON_MM_CFG, 0x0ULL, CLEAR_MM_CFG);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S1_PMON_MM_CFG, 0x0ULL));
+                }
             }
-            else if (westmereEX_counter_map[i].type == FIXED)
+        }
+        for (j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD: /* low 8 bits of the option value, placed at bit 24 */
+                    flags |= ((event->options[j].value & 0xFFULL) << 24);
+                    break;
+                case EVENT_OPTION_MATCH0: /* unmasked option value goes to filterRegister1 */
+                    if (event->eventId == 0x0)
+                    {
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, event->options[j].value));
+                        VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, LLU_CAST event->options[j].value, SETUP_SBOX_MATCH);
+                    }
+                    break;
+                case EVENT_OPTION_MASK0: /* unmasked option value goes to filterRegister2 */
+                    if (event->eventId == 0x0)
+                    {
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, event->options[j].value));
+                        VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, LLU_CAST event->options[j].value, SETUP_SBOX_MASK);
+                    }
+                    break;
+                default: /* every other option type is ignored */
+                    break;
+            }
+        }
+        if (match_mask) /* filters are written: set bit 63 of MM_CFG */
+        {
+            if (type == SBOX0)
             {
-                msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
-                flags |= (1ULL<<(i+32));  /* enable fixed counter */
+                VERBOSEPRINTREG(cpu_id, MSR_S0_PMON_MM_CFG, (1ULL<<63), SET_MM_CFG);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S0_PMON_MM_CFG, (1ULL<<63)));
             }
-            else if (westmereEX_counter_map[i].type > UNCORE)
+            else
             {
-                if(haveLock)
+                VERBOSEPRINTREG(cpu_id, MSR_S1_PMON_MM_CFG, (1ULL<<63), SET_MM_CFG);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S1_PMON_MM_CFG, (1ULL<<63)));
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index]) /* skip the MSR write when the cached value already matches */
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags));
+        VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_SBOX);
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+#define NEX_FREEZE_UNCORE /* clears bit 28 of MSR_U_PMON_GLOBAL_CTRL; uses haveLock, eventSet and cpu_id from the expansion site */ \
+    if (haveLock && (eventSet->regTypeMask & ~(0xF))) /* only when a type bit above the low four is set */ \
+    { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &tmp)); \
+        tmp &= ~(1ULL<<28); /* read-modify-write: all other bits are kept */ \
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST tmp, FREEZE_UNCORE) \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, tmp)); \
+    }
+
+
+int perfmon_setupCounterThread_nehalemEX(int thread_id, PerfmonEventSet* eventSet)
+{ /* Programs every event of eventSet on the cpu behind thread_id: freeze, clear sub-registers, per-event setup, then write the collected fixed/U-Box values. Returns 0 when the end is reached. */
+    int haveLock = 0; /* 1 when this cpu is the one recorded in socket_lock */
+    int haveTileLock = 0; /* NOTE(review): set below but not read anywhere in this function */
+    uint64_t flags = 0x0ULL;
+    uint64_t fixed_flags = 0x0ULL; /* OR of all nex_fixed_setup() results, written once after the loop */
+    uint64_t ubox_flags = 0x0ULL; /* becomes 0x1 once a UBOX event was programmed */
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+    {
+        haveTileLock = 1;
+    }
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC))) /* core counters in use: write 0 to the global control register */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+
+    if (haveLock && (eventSet->regTypeMask & ~(0xFULL))) /* a type bit above the low four is set: write 0 to the U-Box global control */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL, FREEZE_UNCORE)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL));
+    }
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX0)))) /* zero the M0 sub-registers before events are programmed */
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_TIMESTAMP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_DSP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_ISS, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_MAP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_MSC_THR, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_PGT, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_PLD, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_ZDP, 0x0ULL));
+    }
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX1)))) /* same for the M1 sub-registers */
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_TIMESTAMP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_DSP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_ISS, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_MAP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_MSC_THR, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_PGT, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_PLD, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_ZDP, 0x0ULL));
+    }
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(RBOX0)))) /* zero the R0 IPERF and QLX sub-registers */
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P3, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P3, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P3, 0x0ULL));
+    }
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(RBOX1)))) /* same for the R1 sub-registers */
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P3, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P3, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P3, 0x0ULL));
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip events whose register type is not in the mask */
+        {
+            continue;
+        }
+        flags = 0x0ULL;
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        uint64_t reg = counter_map[index].configRegister;
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
+        switch (type) /* return values of the nex_*_setup helpers are not checked, except nex_fixed_setup whose result is collected */
+        {
+            case PMC:
+                nex_pmc_setup(cpu_id, index, event);
+                break;
+
+            case FIXED:
+                fixed_flags |= nex_fixed_setup(cpu_id, index, event);
+                break;
+
+            case MBOX0:
+            case MBOX1:
+                nex_mbox_setup(cpu_id, index, event);
+                break;
+
+            case BBOX0:
+            case BBOX1:
+                nex_bbox_setup(cpu_id, index, event);
+                break;
+
+            case RBOX0:
+            case RBOX1:
+                nex_rbox_setup(cpu_id, index, event);
+                break;
+
+            case CBOX0:
+            case CBOX1:
+            case CBOX2:
+            case CBOX3:
+            case CBOX4:
+            case CBOX5:
+            case CBOX6:
+            case CBOX7:
+            case CBOX8:
+            case CBOX9:
+                nex_cbox_setup(cpu_id, index, event);
+                break;
+
+            case SBOX0:
+            case SBOX1:
+                nex_sbox_setup(cpu_id, index, event);
+                break;
+
+            case WBOX:
+                nex_wbox_setup(cpu_id, index, event);
+                break;
+
+            case WBOX0FIX:
+                if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(WBOX0FIX))) /* the mask test repeats the check at the top of the loop */
                 {
-                    msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
-                    uflags[westmereEX_counter_map[i].type] |=
-                        (1<<(perfmon_threadData[thread_id].counters[i].id));  /* enable uncore counter */
-                    if (westmereEX_counter_map[i].type == UBOX)
+                    flags = 0x1ULL;
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags));
+                    VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_WBOXFIX)
+                    eventSet->regTypeMask |= REG_TYPE_MASK(WBOX); /* also marks WBOX as used in the event set */
+                }
+                break;
+
+            case UBOX:
+                if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(UBOX)))
+                {
+                    flags |= (1ULL<<22); /* set enable bit */
+                    flags |= event->eventId;
+                    for (int j=0;j<event->numberOfOptions;j++) /* EDGE is the only option evaluated for UBOX */
                     {
-                        enable_ubox = 1;
+                        if (event->options[j].type == EVENT_OPTION_EDGE)
+                        {
+                            flags |= (1ULL<<18);
+                            break;
+                        }
                     }
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags));
+                    VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_UBOX);
+                    ubox_flags = 0x1ULL;
                 }
-            }
+                break;
+
+            default: /* register types without a case above are left unprogrammed */
+                break;
         }
     }
 
-    VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, GLOBAL_CTRL);
+    if (fixed_flags != 0x0ULL) /* one write with the values collected from all FIXED events */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
+    }
+    if (ubox_flags != 0x0ULL) /* a UBOX event was programmed: write 0x1 to the U-Box global control */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST ubox_flags, ACTIVATE_UBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, ubox_flags));
+    }
+    return 0;
+}
 
-    if (haveLock)
+#define NEX_RESET_ALL_UNCORE_COUNTERS /* socket-lock holder only: set bit 29 in MSR_U_PMON_GLOBAL_CTRL, then zero the register; uses haveLock, eventSet and cpu_id from the caller's scope */ \
+    if (haveLock && (eventSet->regTypeMask & ~(0xF))) /* some type bit above the low four is set; presumably those are the uncore types - confirm against RegisterType */ \
+    { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &tmp)); \
+        tmp |= (1ULL<<29); /* bit 29 was labelled "reset all" in the code this patch replaces */ \
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST tmp, RESET_ALL_UNCORE_COUNTERS); \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, tmp)); \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, 0x0U)); /* second write leaves the global control register zeroed */ \
+    }
+
+#define NEX_UNFREEZE_UNCORE /* socket-lock holder only: set bit 28 in MSR_U_PMON_GLOBAL_CTRL while keeping the bits already there; uses haveLock, eventSet and cpu_id from the caller's scope */ \
+    if (haveLock && (eventSet->regTypeMask & ~(0xF))) /* some type bit above the low four is set; presumably those are the uncore types - confirm against RegisterType */ \
+    { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &tmp)); \
+        tmp |= (1ULL<<28); /* bit 28 was labelled "enable all" in the code this patch replaces */ \
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST tmp, UNFREEZE_UNCORE); \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, tmp)); \
+    }
+
+#define NEX_UNFREEZE_BOX(id, flags) /* socket-lock holder only, and only when unit id is in regTypeMask: write flags to that unit's control and overflow-control registers */ \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id)))) \
+    { \
+        VERBOSEPRINTREG(cpu_id, box_map[id].ctrlRegister, LLU_CAST flags, UNFREEZE_BOX); \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, flags)); \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ovflRegister, flags)); /* same bit pattern goes to the overflow register, presumably to clear stale overflow bits of the counters being enabled - confirm */ \
+    }
+
+int perfmon_startCountersThread_nehalemEX(int thread_id, PerfmonEventSet* eventSet) /* zeroes and enables the set-up counters of eventSet on thread_id's CPU; returns 0 when the end is reached */
+{
+    int haveLock = 0;
+    uint64_t core_ctrl_flags = 0x0ULL; /* enable bits collected for MSR_PERF_GLOBAL_CTRL */
+    uint32_t uflags[NUM_UNITS] = { [0 ... NUM_UNITS-1] = 0x0U }; /* per-unit enable bits, indexed by register type */
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) /* this CPU is the lock holder of its socket; uncore registers are only touched when haveLock is set */
     {
-        for ( int i=0; i<NUM_UNITS; i++ )
+        haveLock = 1;
+    }
+
+    NEX_RESET_ALL_UNCORE_COUNTERS;
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) /* only events that were set up for this thread */
         {
-            /* if counters are enabled write the according box ctrl register */
-            if (uflags[i])
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* register type not selected in this event set */
+            {
+                continue;
+            }
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            eventSet->events[i].threadCounter[thread_id].startData = 0; /* software-side values restart from zero */
+            eventSet->events[i].threadCounter[thread_id].counterData = 0;
+            switch (type)
             {
-                msr_write(cpu_id, westmereEX_PMunits[i].ctrlRegister, uflags[i]);
-                VERBOSEPRINTREG(cpu_id, westmereEX_PMunits[i].ctrlRegister, LLU_CAST uflags[i], BOXCTRL);
+                case PMC:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL)); /* zero the hardware counter */
+                    core_ctrl_flags |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr)); /* bit position = index minus the number of fixed counters (the fixed counters come first in the counter map) */
+                    break;
+                case FIXED:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL)); /* zero the hardware counter */
+                    core_ctrl_flags |= (1ULL<<(index+32)); /* fixed counters use the bits from 32 upwards */
+                    break;
+                case WBOX0FIX:
+                    if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(WBOX0FIX)))
+                    {
+                        uflags[WBOX] |= (1ULL<<31); /* the fixed W-box counter is enabled through bit 31 of the WBOX unit flags */
+                    }
+                    break;
+                default:
+                    if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(counter_map[index].type))) /* every other type is an uncore box counter: lock holder only */
+                    {
+                        uflags[counter_map[index].type] |= (1<<getCounterTypeOffset(index)); /* one enable bit per counter inside its box */
+                    }
+                    break;
             }
         }
+    }
 
-        /* set global enable flag in U BOX ctrl register */
-        uint32_t ubflags = 0x0UL;
-        ubflags |= (1<<28); /* enable all */
-        if (enable_ubox)
+    if (haveLock)
+    {
+        for ( int i=0; i<NUM_UNITS; i++ )
         {
-            ubflags |= (1<<0);
+            if (uflags[i] != 0x0U) /* only boxes that have at least one counter to enable */
+            {
+                NEX_UNFREEZE_BOX(i, uflags[i]);
+            }
         }
-        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST ubflags, UBOX_GLOBAL_CTRL);
-        msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
     }
+
+    NEX_UNFREEZE_UNCORE;
+
     /* Finally enable counters */
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST core_ctrl_flags, GLOBAL_CTRL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, core_ctrl_flags));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|core_ctrl_flags)); /* bits 63 and 62 plus the bits of every counter just enabled */
+    }
+    return 0;
 }
 
-void perfmon_stopCountersThread_nehalemEX(int thread_id)
+#define NEX_CHECK_OVERFLOW(id, offset) /* core-counter wrap check for box id, status bit offset; uses i, counter_result, cpu_id, thread_id and eventSet from the caller's scope */ \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* new reading is below the stored one: the counter may have wrapped */ \
+    { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, box_map[id].statusRegister, &tmp)); \
+        if (tmp & (1ULL<<(offset))) /* hardware confirms the overflow for this counter */ \
+        { \
+            eventSet->events[i].threadCounter[thread_id].overflows++; \
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ovflRegister, (tmp & (1ULL<<(offset))))); /* acknowledge via the overflow-control register; the status register (MSR_PERF_GLOBAL_STATUS for PMC/FIXED) is read-only */ \
+        } \
+    }
+
+#define NEX_CHECK_UNCORE_OVERFLOW(id, offset) /* uncore-counter wrap check for box id, status bit offset; uses i, counter_result, cpu_id, thread_id and eventSet from the caller's scope */ \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* new reading is below the stored one: the counter may have wrapped */ \
+    { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, box_map[id].statusRegister, &tmp)); \
+        if (tmp & (1ULL<<offset)) /* the box status register confirms the overflow for this counter */ \
+        { \
+            eventSet->events[i].threadCounter[thread_id].overflows++; \
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ovflRegister, (tmp & (1ULL<<offset)))); /* write the same bit to the box overflow-control register */ \
+        } \
+    }
+
+int perfmon_stopCountersThread_nehalemEX(int thread_id, PerfmonEventSet* eventSet) /* freezes the counters, then reads every set-up counter of eventSet into its counterData; returns 0 when the end is reached */
 {
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    uint64_t counter_result = 0x0ULL; /* raw value of the counter currently being read */
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
     if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
-        uint32_t ubflags = 0x0UL;
         haveLock = 1;
-        //        ubflags |= (1<<29); /* reset all */
-        msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
     }
 
-    for ( int i=0; i<NUM_COUNTERS_WESTMEREEX; i++ )
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED))) /* core counters in use: stop them by zeroing the global control register */
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, FREEZE_PMC_AND_FIXED);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    NEX_FREEZE_UNCORE;
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) /* only events that were set up for this thread */
         {
-            if (westmereEX_counter_map[i].type > UNCORE)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* register type not selected in this event set */
             {
-                if(haveLock)
-                {
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
-
-                    VERBOSEPRINTREG(cpu_id, westmereEX_counter_map[i].counterRegister,
-                            LLU_CAST perfmon_threadData[thread_id].counters[i].counterData, READ_UNCORE);
-                }
+                continue;
             }
-            else
+            counter_result = 0x0ULL; /* stays 0 when the switch below does not read a register */
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t reg = counter_map[index].configRegister; /* NOTE(review): reg is not referenced again in the visible body - confirm it is needed */
+            switch (type)
             {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
-
-                VERBOSEPRINTREG(cpu_id, westmereEX_counter_map[i].counterRegister,
-                        LLU_CAST perfmon_threadData[thread_id].counters[i].counterData, READ_CORE);
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter_map[index].counterRegister, &counter_result));
+                    NEX_CHECK_OVERFLOW(PMC, index-cpuid_info.perf_num_fixed_ctr); /* status bit = index minus the number of fixed counters */
+                    VERBOSEPRINTREG(cpu_id, counter_map[index].counterRegister, LLU_CAST counter_result, READ_PMC);
+                    break;
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter_map[index].counterRegister, &counter_result));
+                    NEX_CHECK_OVERFLOW(PMC, index+32); /* fixed counters are checked through the PMC box entry, status bits from 32 upwards */
+                    VERBOSEPRINTREG(cpu_id, counter_map[index].counterRegister, LLU_CAST counter_result, READ_FIXED);
+                    break;
+                default:
+                    if(haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(counter_map[index].type))) /* every other type is read by the socket-lock holder only */
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter_map[index].counterRegister, &counter_result));
+                        NEX_CHECK_UNCORE_OVERFLOW(counter_map[index].type, getCounterTypeOffset(index));
+                        VERBOSEPRINTREG(cpu_id, counter_map[index].counterRegister, LLU_CAST counter_result, READ_UNCORE);
+                    }
+                    break;
             }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth); /* store the reading cut to the box's register width; 0 is stored when nothing was read */
         }
     }
 
-#if 0
-    flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
-    printf ("Status: 0x%llX \n", LLU_CAST flags);
-    if((flags & 0x3) || (flags & (0x3ULL<<32)) )
-    {
-        printf ("Overflow occured \n");
-    }
-#endif
+    return 0;
 }
 
-void perfmon_readCountersThread_nehalemEX(int thread_id)
+int perfmon_readCountersThread_nehalemEX(int thread_id, PerfmonEventSet* eventSet) /* reads every set-up counter of eventSet into its counterData and lets counting continue; returns 0 when the end is reached */
 {
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t counter_result = 0x0ULL; /* raw value of the counter currently being read */
+    uint64_t core_ctrl_flags = 0x0ULL; /* saved content of MSR_PERF_GLOBAL_CTRL, written back at the end */
 
     if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    for ( int i=0; i<NUM_COUNTERS_WESTMEREEX; i++ )
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &core_ctrl_flags)); /* NOTE(review): the register is only read, not zeroed, so the core counters keep running during the read - confirm intended */
+    }
+    NEX_FREEZE_UNCORE;
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) /* only events that were set up for this thread */
         {
-            if (westmereEX_counter_map[i].type > UNCORE)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* register type not selected in this event set */
             {
-                if(haveLock)
-                {
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
-                }
+                continue;
             }
-            else
+            counter_result = 0x0ULL; /* stays 0 when the switch below does not read a register */
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter = counter_map[index].counterRegister;
+            switch (type)
             {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                    NEX_CHECK_OVERFLOW(PMC, index-cpuid_info.perf_num_fixed_ctr); /* status bit = index minus the number of fixed counters */
+                    VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_PMC);
+                    break;
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                    NEX_CHECK_OVERFLOW(PMC, index+32); /* fixed counters are checked through the PMC box entry, status bits from 32 upwards */
+                    VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_FIXED);
+                    break;
+                default:
+                    if(haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(counter_map[index].type))) /* every other type is read by the socket-lock holder only */
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                        NEX_CHECK_UNCORE_OVERFLOW(counter_map[index].type, getCounterTypeOffset(index));
+                        VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_UNCORE);
+                    }
+                    break;
             }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth); /* store the reading cut to the box's register width; 0 is stored when nothing was read */
         }
     }
+
+    NEX_UNFREEZE_UNCORE;
+    if ((eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED))) && (core_ctrl_flags != 0x0ULL)) /* write back the saved core enable bits, skipped when nothing was enabled */
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, core_ctrl_flags));
+    }
+    return 0;
 }
 
+int perfmon_finalizeCountersThread_nehalemEX(int thread_id, PerfmonEventSet* eventSet) /* clears the filter, config and global control registers used by eventSet and marks its events as not initialized; returns 0 when the end is reached */
+{
+    int haveLock = 0;
+    int haveTileLock = 0;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62); /* value later written to MSR_PERF_GLOBAL_OVF_CTRL; per-counter bits are ORed in below */
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) /* this CPU is the lock holder of its socket: it cleans up the uncore registers */
+    {
+        haveLock = 1;
+    }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id) /* this CPU is the lock holder of its tile: it cleans up the offcore response registers */
+    {
+        haveTileLock = 1;
+    }
+    for (int i=0;i < eventSet->numberOfEvents;i++) /* unlike the start/stop/read loops, the init flag is not tested here */
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* register type not selected in this event set */
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        uint64_t reg = counter_map[index].configRegister;
+        PciDeviceIndex dev = counter_map[index].device;
+        switch (type)
+        {
+            case PMC:
+                ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr)); /* bit position = index minus the number of fixed counters */
+                if ((haveTileLock) && (event->eventId == 0xB7)) /* event 0xB7 is paired with MSR_OFFCORE_RESP0 here */
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+                }
+                else if ((haveTileLock) && (event->eventId == 0xBB)) /* event 0xBB is paired with MSR_OFFCORE_RESP1 here */
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+                }
+                break;
+            case FIXED:
+                ovf_values_core |= (1ULL<<(index+32)); /* fixed counters use the bits from 32 upwards */
+                break;
+            case MBOX0:
+            case MBOX1:
+                if (haveLock && ((event->cfgBits == 0x02) || (event->cfgBits == 0x04))) /* only these two cfgBits values lead to clearing the box's two filter registers */
+                {
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, 0x0ULL, CLEAR_MATCH0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, 0x0ULL));
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, 0x0ULL, CLEAR_MASK0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, 0x0ULL));
+                }
+                break;
+            case SBOX0:
+                if (haveLock && (event->eventId == 0x00)) /* event 0x00: clear MSR_S0_PMON_MM_CFG and both filter registers */
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_S0_PMON_MM_CFG, 0x0ULL, CLEAR_MM_CFG);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S0_PMON_MM_CFG, 0x0ULL));
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, 0x0ULL, CLEAR_MATCH0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, 0x0ULL));
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, 0x0ULL, CLEAR_MASK0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, 0x0ULL));
+                }
+                break;
+            case SBOX1:
+                if (haveLock && (event->eventId == 0x00)) /* same as SBOX0, with MSR_S1_PMON_MM_CFG */
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_S1_PMON_MM_CFG, 0x0ULL, CLEAR_MM_CFG);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S1_PMON_MM_CFG, 0x0ULL));
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, 0x0ULL, CLEAR_MATCH0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, 0x0ULL));
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, 0x0ULL, CLEAR_MASK0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, 0x0ULL));
+                }
+                break;
+            case BBOX0:
+            case BBOX1:
+                if (haveLock && ((event->eventId == 0x01) || /* events 0x01 to 0x06 lead to clearing the box's two filter registers */
+                                 (event->eventId == 0x02) ||
+                                 (event->eventId == 0x03) ||
+                                 (event->eventId == 0x04) ||
+                                 (event->eventId == 0x05) ||
+                                 (event->eventId == 0x06)))
+                {
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, 0x0ULL, CLEAR_MATCH0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, 0x0ULL));
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, 0x0ULL, CLEAR_MASK0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, 0x0ULL));
+                }
+                break;
+            default:
+                break;
+        }
+        if ((reg) && (((dev == MSR_DEV) && (type < UNCORE)) || (((haveLock) && (type > UNCORE))))) /* zero the config register: types below UNCORE on every CPU, types above UNCORE only on the socket-lock holder */
+        {
+            VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+        }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE; /* start/stop/read skip this event from now on */
+    }
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED))) /* core side: write the collected overflow bits, then zero the global control register */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core, CLEAR_OVF_CTRL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, CLEAR_PMC_AND_FIXED_CTRL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+
+    if (haveLock && (eventSet->regTypeMask & ~(0xFULL))) /* uncore side, socket-lock holder only: zero the global overflow-control and control registers */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_OVF_CTRL, 0x0ULL, CLEAR_UNCORE_OVF);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_OVF_CTRL, 0x0ULL));
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL, CLEAR_UNCORE_CTRL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL));
+    }
+    return 0;
+}
diff --git a/src/includes/perfmon_nehalemEX_counters.h b/src/includes/perfmon_nehalemEX_counters.h
new file mode 100644
index 0000000..d40da5c
--- /dev/null
+++ b/src/includes/perfmon_nehalemEX_counters.h
@@ -0,0 +1,185 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfmon_nehalemEX_counters.h
+ *
+ *      Description: Counter Header File of perfmon module for Intel Nehalem EX.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#define NUM_COUNTERS_CORE_NEHALEMEX 7 /* 3 fixed (FIXC0-2) + 4 general-purpose (PMC0-3) core counters */
+#define NUM_COUNTERS_UNCORE_NEHALEMEX 105 /* NOTE(review): same value as the total although 7 of the 105 map entries are core counters - confirm intended */
+#define NUM_COUNTERS_NEHALEMEX 105 /* number of entries in nehalemEX_counter_map */
+
+#define NEX_VALID_OPTIONS_FIXED (EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_ANYTHREAD_MASK) /* option masks are parenthesized so they expand safely inside larger expressions */
+#define NEX_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_THRESHOLD_MASK)
+
+#define NEX_VALID_OPTIONS_MBOX (EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK)
+#define NEX_VALID_OPTIONS_BBOX (EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK)
+#define NEX_VALID_OPTIONS_CBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_EDGE_MASK)
+#define NEX_VALID_OPTIONS_SBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_EDGE_MASK)
+#define NEX_VALID_OPTIONS_WBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_EDGE_MASK)
+
+static RegisterMap nehalemEX_counter_map[NUM_COUNTERS_NEHALEMEX] = {
+    /* Fixed Counters: instructions retired, cycles unhalted core */
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, NEX_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, NEX_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, NEX_VALID_OPTIONS_FIXED},
+    /* PMC Counters: 4 48bit wide */
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, NEX_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, NEX_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, NEX_VALID_OPTIONS_PMC},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, NEX_VALID_OPTIONS_PMC},
+    /* MBOX */
+    {"MBOX0C0",PMC7, MBOX0, MSR_M0_PMON_EVNT_SEL0, MSR_M0_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX0C1",PMC8, MBOX0, MSR_M0_PMON_EVNT_SEL1, MSR_M0_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX0C2",PMC9, MBOX0, MSR_M0_PMON_EVNT_SEL2, MSR_M0_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX0C3",PMC10, MBOX0, MSR_M0_PMON_EVNT_SEL3, MSR_M0_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX0C4",PMC11, MBOX0, MSR_M0_PMON_EVNT_SEL4, MSR_M0_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX0C5",PMC12, MBOX0, MSR_M0_PMON_EVNT_SEL5, MSR_M0_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C0",PMC13, MBOX1, MSR_M1_PMON_EVNT_SEL0, MSR_M1_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C1",PMC14, MBOX1, MSR_M1_PMON_EVNT_SEL1, MSR_M1_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C2",PMC15, MBOX1, MSR_M1_PMON_EVNT_SEL2, MSR_M1_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C3",PMC16, MBOX1, MSR_M1_PMON_EVNT_SEL3, MSR_M1_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C4",PMC17, MBOX1, MSR_M1_PMON_EVNT_SEL4, MSR_M1_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C5",PMC18, MBOX1, MSR_M1_PMON_EVNT_SEL5, MSR_M1_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    /* BBOX */
+    {"BBOX0C0",PMC19, BBOX0, MSR_B0_PMON_EVNT_SEL0, MSR_B0_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_BBOX},
+    {"BBOX0C1",PMC20, BBOX0, MSR_B0_PMON_EVNT_SEL1, MSR_B0_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_BBOX},
+    {"BBOX0C2",PMC21, BBOX0, MSR_B0_PMON_EVNT_SEL2, MSR_B0_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_BBOX},
+    {"BBOX0C3",PMC22, BBOX0, MSR_B0_PMON_EVNT_SEL3, MSR_B0_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_BBOX},
+    {"BBOX1C0",PMC23, BBOX1, MSR_B1_PMON_EVNT_SEL0, MSR_B1_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_BBOX},
+    {"BBOX1C1",PMC24, BBOX1, MSR_B1_PMON_EVNT_SEL1, MSR_B1_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_BBOX},
+    {"BBOX1C2",PMC25, BBOX1, MSR_B1_PMON_EVNT_SEL2, MSR_B1_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_BBOX},
+    {"BBOX1C3",PMC26, BBOX1, MSR_B1_PMON_EVNT_SEL3, MSR_B1_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_BBOX},
+    /* RBOX */
+    {"RBOX0C0",PMC27, RBOX0, MSR_R0_PMON_EVNT_SEL0, MSR_R0_PMON_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C1",PMC28, RBOX0, MSR_R0_PMON_EVNT_SEL1, MSR_R0_PMON_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C2",PMC29, RBOX0, MSR_R0_PMON_EVNT_SEL2, MSR_R0_PMON_CTR2, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C3",PMC30, RBOX0, MSR_R0_PMON_EVNT_SEL3, MSR_R0_PMON_CTR3, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C4",PMC31, RBOX0, MSR_R0_PMON_EVNT_SEL4, MSR_R0_PMON_CTR4, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C5",PMC32, RBOX0, MSR_R0_PMON_EVNT_SEL5, MSR_R0_PMON_CTR5, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C6",PMC33, RBOX0, MSR_R0_PMON_EVNT_SEL6, MSR_R0_PMON_CTR6, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C7",PMC34, RBOX0, MSR_R0_PMON_EVNT_SEL7, MSR_R0_PMON_CTR7, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C0",PMC35, RBOX1, MSR_R1_PMON_EVNT_SEL8, MSR_R1_PMON_CTR8, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C1",PMC36, RBOX1, MSR_R1_PMON_EVNT_SEL9, MSR_R1_PMON_CTR9, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C2",PMC37, RBOX1, MSR_R1_PMON_EVNT_SEL10, MSR_R1_PMON_CTR10, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C3",PMC38, RBOX1, MSR_R1_PMON_EVNT_SEL11, MSR_R1_PMON_CTR11, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C4",PMC39, RBOX1, MSR_R1_PMON_EVNT_SEL12, MSR_R1_PMON_CTR12, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C5",PMC40, RBOX1, MSR_R1_PMON_EVNT_SEL13, MSR_R1_PMON_CTR13, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C6",PMC41, RBOX1, MSR_R1_PMON_EVNT_SEL14, MSR_R1_PMON_CTR14, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C7",PMC42, RBOX1, MSR_R1_PMON_EVNT_SEL15, MSR_R1_PMON_CTR15, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* WBOX */
+    {"WBOX0",PMC43, WBOX, MSR_W_PMON_EVNT_SEL0, MSR_W_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_WBOX},
+    {"WBOX1",PMC44, WBOX, MSR_W_PMON_EVNT_SEL1, MSR_W_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_WBOX},
+    {"WBOX2",PMC45, WBOX, MSR_W_PMON_EVNT_SEL2, MSR_W_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_WBOX},
+    {"WBOX3",PMC46, WBOX, MSR_W_PMON_EVNT_SEL3, MSR_W_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_WBOX},
+    {"WBOXFIX",PMC47, WBOX0FIX, MSR_W_PMON_FIXED_CTR_CTL, MSR_W_PMON_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* UBOX */
+    {"UBOX0",PMC48, UBOX, MSR_U_PMON_GLOBAL_EVNT_SEL, MSR_U_PMON_GLOBAL_CTR, 0, 0, EVENT_OPTION_EDGE_MASK},
+    /* CBOXes */
+    {"CBOX0C0",PMC49, CBOX0, MSR_C0_PMON_EVNT_SEL0, MSR_C0_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX0C1",PMC50, CBOX0, MSR_C0_PMON_EVNT_SEL1, MSR_C0_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX0C2",PMC51, CBOX0, MSR_C0_PMON_EVNT_SEL2, MSR_C0_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX0C3",PMC52, CBOX0, MSR_C0_PMON_EVNT_SEL3, MSR_C0_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX0C4",PMC53, CBOX0, MSR_C0_PMON_EVNT_SEL4, MSR_C0_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX0C5",PMC54, CBOX0, MSR_C0_PMON_EVNT_SEL5, MSR_C0_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C0",PMC55, CBOX1, MSR_C1_PMON_EVNT_SEL0, MSR_C1_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C1",PMC56, CBOX1, MSR_C1_PMON_EVNT_SEL1, MSR_C1_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C2",PMC57, CBOX1, MSR_C1_PMON_EVNT_SEL2, MSR_C1_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C3",PMC58, CBOX1, MSR_C1_PMON_EVNT_SEL3, MSR_C1_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C4",PMC59, CBOX1, MSR_C1_PMON_EVNT_SEL4, MSR_C1_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C5",PMC60, CBOX1, MSR_C1_PMON_EVNT_SEL5, MSR_C1_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C0",PMC61, CBOX2, MSR_C2_PMON_EVNT_SEL0, MSR_C2_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C1",PMC62, CBOX2, MSR_C2_PMON_EVNT_SEL1, MSR_C2_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C2",PMC63, CBOX2, MSR_C2_PMON_EVNT_SEL2, MSR_C2_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C3",PMC64, CBOX2, MSR_C2_PMON_EVNT_SEL3, MSR_C2_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C4",PMC65, CBOX2, MSR_C2_PMON_EVNT_SEL4, MSR_C2_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C5",PMC66, CBOX2, MSR_C2_PMON_EVNT_SEL5, MSR_C2_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C0",PMC67, CBOX3, MSR_C3_PMON_EVNT_SEL0, MSR_C3_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C1",PMC68, CBOX3, MSR_C3_PMON_EVNT_SEL1, MSR_C3_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C2",PMC69, CBOX3, MSR_C3_PMON_EVNT_SEL2, MSR_C3_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C3",PMC70, CBOX3, MSR_C3_PMON_EVNT_SEL3, MSR_C3_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C4",PMC71, CBOX3, MSR_C3_PMON_EVNT_SEL4, MSR_C3_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C5",PMC72, CBOX3, MSR_C3_PMON_EVNT_SEL5, MSR_C3_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C0",PMC73, CBOX4, MSR_C4_PMON_EVNT_SEL0, MSR_C4_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C1",PMC74, CBOX4, MSR_C4_PMON_EVNT_SEL1, MSR_C4_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C2",PMC75, CBOX4, MSR_C4_PMON_EVNT_SEL2, MSR_C4_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C3",PMC76, CBOX4, MSR_C4_PMON_EVNT_SEL3, MSR_C4_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C4",PMC77, CBOX4, MSR_C4_PMON_EVNT_SEL4, MSR_C4_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C5",PMC78, CBOX4, MSR_C4_PMON_EVNT_SEL5, MSR_C4_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C0",PMC79, CBOX5, MSR_C5_PMON_EVNT_SEL0, MSR_C5_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C1",PMC80, CBOX5, MSR_C5_PMON_EVNT_SEL1, MSR_C5_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C2",PMC81, CBOX5, MSR_C5_PMON_EVNT_SEL2, MSR_C5_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C3",PMC82, CBOX5, MSR_C5_PMON_EVNT_SEL3, MSR_C5_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C4",PMC83, CBOX5, MSR_C5_PMON_EVNT_SEL4, MSR_C5_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C5",PMC84, CBOX5, MSR_C5_PMON_EVNT_SEL5, MSR_C5_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C0",PMC85, CBOX6, MSR_C6_PMON_EVNT_SEL0, MSR_C6_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C1",PMC86, CBOX6, MSR_C6_PMON_EVNT_SEL1, MSR_C6_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C2",PMC87, CBOX6, MSR_C6_PMON_EVNT_SEL2, MSR_C6_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C3",PMC88, CBOX6, MSR_C6_PMON_EVNT_SEL3, MSR_C6_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C4",PMC89, CBOX6, MSR_C6_PMON_EVNT_SEL4, MSR_C6_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C5",PMC90, CBOX6, MSR_C6_PMON_EVNT_SEL5, MSR_C6_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX}, /* sixth CBOX6 counter (EVNT_SEL5/CTR5); name was a duplicate of "CBOX6C4" */
+    {"CBOX7C0",PMC91, CBOX7, MSR_C7_PMON_EVNT_SEL0, MSR_C7_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C1",PMC92, CBOX7, MSR_C7_PMON_EVNT_SEL1, MSR_C7_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C2",PMC93, CBOX7, MSR_C7_PMON_EVNT_SEL2, MSR_C7_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C3",PMC94, CBOX7, MSR_C7_PMON_EVNT_SEL3, MSR_C7_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C4",PMC95, CBOX7, MSR_C7_PMON_EVNT_SEL4, MSR_C7_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C5",PMC96, CBOX7, MSR_C7_PMON_EVNT_SEL5, MSR_C7_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    /* SBOXes */
+    {"SBOX0C0",PMC97, SBOX0, MSR_S0_PMON_EVNT_SEL0, MSR_S0_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_SBOX},
+    {"SBOX0C1",PMC98, SBOX0, MSR_S0_PMON_EVNT_SEL1, MSR_S0_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_SBOX},
+    {"SBOX0C2",PMC99, SBOX0, MSR_S0_PMON_EVNT_SEL2, MSR_S0_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_SBOX},
+    {"SBOX0C3",PMC100, SBOX0, MSR_S0_PMON_EVNT_SEL3, MSR_S0_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_SBOX},
+    {"SBOX1C0",PMC101, SBOX1, MSR_S1_PMON_EVNT_SEL0, MSR_S1_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_SBOX},
+    {"SBOX1C1",PMC102, SBOX1, MSR_S1_PMON_EVNT_SEL1, MSR_S1_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_SBOX},
+    {"SBOX1C2",PMC103, SBOX1, MSR_S1_PMON_EVNT_SEL2, MSR_S1_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_SBOX},
+    {"SBOX1C3",PMC104, SBOX1, MSR_S1_PMON_EVNT_SEL3, MSR_S1_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_SBOX}
+};
+
+
+static BoxMap nehalemEX_box_map[NUM_UNITS] = { /* Nehalem EX per-unit box registers, indexed by the unit constants (PMC, FIXED, MBOX0, ...); units not listed here stay zero-initialized. */
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48}, /* Entry layout: control, status and overflow-control register, three fields left at 0, then 48 (presumably the counter width in bits -- confirm against the BoxMap definition). */
+    [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48}, /* Uses the same global registers as PMC. */
+    [MBOX0] = {MSR_M0_PMON_BOX_CTRL, MSR_M0_PMON_BOX_STATUS, MSR_M0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_M0_PMON_ADDR_MATCH, MSR_M0_PMON_ADDR_MASK}, /* MBOX, BBOX and SBOX entries append two more registers: a match and a mask register. */
+    [MBOX1] = {MSR_M1_PMON_BOX_CTRL, MSR_M1_PMON_BOX_STATUS, MSR_M1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_M1_PMON_ADDR_MATCH, MSR_M1_PMON_ADDR_MASK},
+    [BBOX0] = {MSR_B0_PMON_BOX_CTRL, MSR_B0_PMON_BOX_STATUS, MSR_B0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_B0_PMON_MATCH,MSR_B0_PMON_MASK},
+    [BBOX1] = {MSR_B1_PMON_BOX_CTRL, MSR_B1_PMON_BOX_STATUS, MSR_B1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_B1_PMON_MATCH,MSR_B1_PMON_MASK},
+    [RBOX0] = {MSR_R0_PMON_BOX_CTRL, MSR_R0_PMON_BOX_STATUS, MSR_R0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [RBOX1] = {MSR_R1_PMON_BOX_CTRL, MSR_R1_PMON_BOX_STATUS, MSR_R1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [SBOX0] = {MSR_S0_PMON_BOX_CTRL, MSR_S0_PMON_BOX_STATUS, MSR_S0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_S0_PMON_MATCH, MSR_S0_PMON_MASK},
+    [SBOX1] = {MSR_S1_PMON_BOX_CTRL, MSR_S1_PMON_BOX_STATUS, MSR_S1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_S1_PMON_MATCH, MSR_S1_PMON_MASK},
+    [CBOX0] = {MSR_C0_PMON_BOX_CTRL, MSR_C0_PMON_BOX_STATUS, MSR_C0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48}, /* CBOX0..CBOX7 each have their own control/status/overflow-control register set. */
+    [CBOX1] = {MSR_C1_PMON_BOX_CTRL, MSR_C1_PMON_BOX_STATUS, MSR_C1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX2] = {MSR_C2_PMON_BOX_CTRL, MSR_C2_PMON_BOX_STATUS, MSR_C2_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX3] = {MSR_C3_PMON_BOX_CTRL, MSR_C3_PMON_BOX_STATUS, MSR_C3_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX4] = {MSR_C4_PMON_BOX_CTRL, MSR_C4_PMON_BOX_STATUS, MSR_C4_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX5] = {MSR_C5_PMON_BOX_CTRL, MSR_C5_PMON_BOX_STATUS, MSR_C5_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX6] = {MSR_C6_PMON_BOX_CTRL, MSR_C6_PMON_BOX_STATUS, MSR_C6_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX7] = {MSR_C7_PMON_BOX_CTRL, MSR_C7_PMON_BOX_STATUS, MSR_C7_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [WBOX] = {MSR_W_PMON_BOX_CTRL, MSR_W_PMON_BOX_STATUS, MSR_W_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [WBOX0FIX] = {MSR_W_PMON_BOX_CTRL, MSR_W_PMON_BOX_STATUS, MSR_W_PMON_BOX_OVF_CTRL, 0, 0, 0, 48}, /* Uses the same box registers as WBOX. */
+    [UBOX] = {MSR_U_PMON_GLOBAL_CTRL, MSR_U_PMON_GLOBAL_STATUS, MSR_U_PMON_GLOBAL_OVF_CTRL, 0, 0, 0, 48}, /* UBOX is mapped to the uncore-global (MSR_U_PMON_GLOBAL_*) registers rather than per-box ones. */
+};
diff --git a/src/includes/perfmon_nehalemEX_events.txt b/src/includes/perfmon_nehalemEX_events.txt
index 565f5ca..1c4cf31 100644
--- a/src/includes/perfmon_nehalemEX_events.txt
+++ b/src/includes/perfmon_nehalemEX_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_nehalemEX_events.txt
-# 
-#      Description:  Event list for Intel NehalemEX
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Description:  Event list for Intel Nehalem EX
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -69,12 +70,6 @@ UMASK_DTLB_LOAD_MISSES_PDE_MISS        0x20
 UMASK_DTLB_LOAD_MISSES_PDP_MISS        0x40
 UMASK_DTLB_LOAD_MISSES_LARGE_WALK_COMPLETED  0x80
 
-EVENT_MEMORY_DISAMBIGURATION      0x09   PMC
-UMASK_MEMORY_DISAMBIGURATION_RESET         0x01 
-UMASK_MEMORY_DISAMBIGURATION_SUCCESS       0x01 
-UMASK_MEMORY_DISAMBIGURATION_WATCHDOG      0x01 
-UMASK_MEMORY_DISAMBIGURATION_WATCH_CYCLES  0x01 
-
 EVENT_MEM_INST_RETIRED           0x0B  PMC
 UMASK_MEM_INST_RETIRED_LOADS     0x01
 UMASK_MEM_INST_RETIRED_STORES    0x02
@@ -84,8 +79,8 @@ EVENT_MEM_STORE_RETIRED_DTLB        0x0C  PMC
 UMASK_MEM_STORE_RETIRED_DTLB_MISS   0x01
 
 EVENT_UOPS_ISSUED                0x0E   PMC
-UMASK_UOPS_ISSUED_ANY            0x01 
-UMASK_UOPS_ISSUED_FUSED          0x02 
+UMASK_UOPS_ISSUED_ANY            0x01
+UMASK_UOPS_ISSUED_FUSED          0x02
 
 EVENT_MEM_UNCORE_RETIRED         0x0F    PMC
 UMASK_MEM_UNCORE_RETIRED_OTHER_CORE_L2_HITM            0x02 
@@ -519,8 +514,12 @@ UMASK_SIMD_INT_64_PACKED_LOGICAL        0x10
 UMASK_SIMD_INT_64_PACKED_ARITH          0x20
 UMASK_SIMD_INT_64_SHUFFLE_MOVE          0x40
 
-EVENT_UNCORE_CYCLES                  0xFF  WBOX4
-UMASK_UNCORE_CYCLES                  0x00
+EVENT_OFFCORE_RESPONSE_0                              0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                      EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                         0x01 0xFF 0xFF
+
+EVENT_UNCORE_CLOCK                  0xFF  WBOXFIX
+UMASK_UNCORE_CLOCK                  0x00
 
 EVENT_C_CYCLES_TURBO                  0x04  WBOX
 UMASK_C_CYCLES_TURBO_C0               0x01
@@ -592,40 +591,32 @@ UMASK_TM1_ON_C7               0x80
 UMASK_TM1_ON_C_ALL            0xFF
 
 EVENT_BBOX_CMDS_ALL                  0x1A  MBOX
-UMASK_BBOX_CMDS_ALL                  0xFF
+UMASK_BBOX_CMDS_ALL                  0x00 0x00 0x00
 
-EVENT_BCMD_SCHEDQ_OCCUPANCY           0x06  MBOX
-UMASK_BCMD_SCHEDQ_OCCUPANCY_READS     0x00 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_WRITES    0x01 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_MERGE     0x02 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_V2F       0x03 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_V2V       0x04 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_F2V       0x05 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_F2B       0x06 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_SPRWR     0x07 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_ALL       0x08 0x01 0x00
+EVENT_REFRESH                        0x06  MBOX
+UMASK_REFRESH                        0x00 0x00 0x00
 
-EVENT_BBOX_CYCLES                  0x1B  MBOX
-UMASK_BBOX_CYCLES                  0xFF
+EVENT_MBOX_CLOCKTICKS                0x1B  MBOX
+UMASK_MBOX_CLOCKTICKS                0x00 0x00 0x00
 
-EVENT_CYCLES_DSP_FILL                  0x00  MBOX
-UMASK_CYCLES_DSP_FILL_RDQ_FULL         0x01 0x01 0x00
-UMASK_CYCLES_DSP_FILL_WRQ_FULL         0x02 0x01 0x00
-UMASK_CYCLES_DSP_FILL_RDQ_EMPTY        0x04 0x01 0x00
-UMASK_CYCLES_DSP_FILL_WRQ_EMPTY        0x08 0x01 0x00
+EVENT_CYCLES_DSP_FILL                0x00  MBOX
+UMASK_CYCLES_DSP_FILL_RDQ_FULL       0x01 0x01 0x00
+UMASK_CYCLES_DSP_FILL_WRQ_FULL       0x02 0x01 0x00
+UMASK_CYCLES_DSP_FILL_RDQ_EMPTY      0x04 0x01 0x00
+UMASK_CYCLES_DSP_FILL_WRQ_EMPTY      0x08 0x01 0x00
 
-EVENT_CYCLES_MFULL                  0x01  MBOX
-UMASK_CYCLES_MFULL                  0x00 0x01 0x00
+EVENT_CYCLES_MFULL                   0x01  MBOX
+UMASK_CYCLES_MFULL                   0x00 0x00 0x00
 
-EVENT_CYCLES_PGT_STATE                  0x05  MBOX
-UMASK_CYCLES_PGT_STATE_CLOSED           0x00 0x01 0x00
-UMASK_CYCLES_PGT_STATE_OPEN             0x01 0x01 0x00
+EVENT_CYCLES_PGT_STATE               0x05  MBOX
+UMASK_CYCLES_PGT_STATE_CLOSED        0x00 0x01 0x00
+UMASK_CYCLES_PGT_STATE_OPEN          0x01 0x01 0x00
 
-EVENT_CYCLES_RETRYQ_STARVED                  0x04  MBOX
-UMASK_CYCLES_RETRYQ_STARVED                  0x00 0x01 0x00
+EVENT_CYCLES_RETRYQ_STARVED          0x04  MBOX
+UMASK_CYCLES_RETRYQ_STARVED          0x00 0x01 0x00
 
-EVENT_CYCLES_RETRYQ_MFULL                  0x03  MBOX
-UMASK_CYCLES_RETRYQ_MFULL                  0x00 0x01 0x00
+EVENT_CYCLES_RETRYQ_MFULL            0x03  MBOX
+UMASK_CYCLES_RETRYQ_MFULL            0x00 0x01 0x00
 
 EVENT_CYCLES_SCHED_MODE                  0x01  MBOX
 UMASK_CYCLES_SCHED_MODE_TRADEOFF         0x00 0x01 0x00
@@ -634,34 +625,35 @@ UMASK_CYCLES_SCHED_MODE_WRPRIO           0x02 0x01 0x00
 UMASK_CYCLES_SCHED_MODE_ADAPTIVE         0x03 0x01 0x00
 
 EVENT_DRAM_CMD                              0x0A  MBOX
+OPTIONS_DRAM_CMD                            EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
 UMASK_DRAM_CMD_ALL                          0x00 0x02 0x00
-UMASK_DRAM_CMD_ILLEGAL                      0x01 0x02 0x00
+UMASK_DRAM_CMD_ILLEGAL                      0x00 0x02 0x00
 UMASK_DRAM_CMD_PREALL                       0x01 0x02 0x00
-UMASK_DRAM_CMD_PREALL_TRDOFF                0x01 0x02 0x00
-UMASK_DRAM_CMD_PREALL_RDPRIO                0x01 0x02 0x01
-UMASK_DRAM_CMD_PREALL_WRPRIO                0x01 0x02 0x02
-UMASK_DRAM_CMD_PREALL_ADAPTIVE              0x01 0x02 0x02
+UMASK_DRAM_CMD_PREALL_TRDOFF                0x01 0x02 0x10
+UMASK_DRAM_CMD_PREALL_RDPRIO                0x01 0x02 0x11
+UMASK_DRAM_CMD_PREALL_WRPRIO                0x01 0x02 0x12
+UMASK_DRAM_CMD_PREALL_ADAPTIVE              0x01 0x02 0x13
 UMASK_DRAM_CMD_RAS                          0x02 0x02 0x00
-UMASK_DRAM_CMD_RAS_TRDOFF                   0x02 0x02 0x00
-UMASK_DRAM_CMD_RAS_RDPRIO                   0x02 0x02 0x01
-UMASK_DRAM_CMD_RAS_WRPRIO                   0x02 0x02 0x02
-UMASK_DRAM_CMD_RAS_ADAPTIVE                 0x02 0x02 0x02
-UMASK_DRAM_CMD_CAS_RD_OPN                   0x02 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_OPN                   0x02 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_OPN_TRDOFF            0x02 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_OPN_RDPRIO            0x04 0x02 0x01
-UMASK_DRAM_CMD_CAS_WR_OPN_WRPRIO            0x04 0x02 0x02
-UMASK_DRAM_CMD_CAS_WR_OPN_ADAPTIVE          0x04 0x02 0x03
+UMASK_DRAM_CMD_RAS_TRDOFF                   0x02 0x02 0x10
+UMASK_DRAM_CMD_RAS_RDPRIO                   0x02 0x02 0x11
+UMASK_DRAM_CMD_RAS_WRPRIO                   0x02 0x02 0x12
+UMASK_DRAM_CMD_RAS_ADAPTIVE                 0x02 0x02 0x13
+UMASK_DRAM_CMD_CAS_RD_OPN                   0x03 0x02 0x00
+UMASK_DRAM_CMD_CAS_WR_OPN                   0x04 0x02 0x00
+UMASK_DRAM_CMD_CAS_WR_OPN_TRDOFF            0x04 0x02 0x10
+UMASK_DRAM_CMD_CAS_WR_OPN_RDPRIO            0x04 0x02 0x11
+UMASK_DRAM_CMD_CAS_WR_OPN_WRPRIO            0x04 0x02 0x12
+UMASK_DRAM_CMD_CAS_WR_OPN_ADAPTIVE          0x04 0x02 0x13
 UMASK_DRAM_CMD_CAS_RD_CLS                   0x05 0x02 0x00
-UMASK_DRAM_CMD_CAS_RD_CLS_TRDOFF            0x05 0x02 0x00
-UMASK_DRAM_CMD_CAS_RD_CLS_RDPRIO            0x05 0x02 0x01
-UMASK_DRAM_CMD_CAS_RD_CLS_WRPRIO            0x05 0x02 0x02
-UMASK_DRAM_CMD_CAS_RD_CLS_ADAPTIVE          0x05 0x02 0x03
+UMASK_DRAM_CMD_CAS_RD_CLS_TRDOFF            0x05 0x02 0x10
+UMASK_DRAM_CMD_CAS_RD_CLS_RDPRIO            0x05 0x02 0x11
+UMASK_DRAM_CMD_CAS_RD_CLS_WRPRIO            0x05 0x02 0x12
+UMASK_DRAM_CMD_CAS_RD_CLS_ADAPTIVE          0x05 0x02 0x13
 UMASK_DRAM_CMD_CAS_WR_CLS                   0x06 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_CLS_TRDOFF            0x06 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_CLS_RDPRIO            0x06 0x02 0x01
-UMASK_DRAM_CMD_CAS_WR_CLS_WRPRIO            0x06 0x02 0x02
-UMASK_DRAM_CMD_CAS_WR_CLS_ADAPTIVE          0x06 0x02 0x03
+UMASK_DRAM_CMD_CAS_WR_CLS_TRDOFF            0x06 0x02 0x10
+UMASK_DRAM_CMD_CAS_WR_CLS_RDPRIO            0x06 0x02 0x11
+UMASK_DRAM_CMD_CAS_WR_CLS_WRPRIO            0x06 0x02 0x12
+UMASK_DRAM_CMD_CAS_WR_CLS_ADAPTIVE          0x06 0x02 0x13
 UMASK_DRAM_CMD_MRS                          0x07 0x02 0x00
 UMASK_DRAM_CMD_RFR                          0x09 0x02 0x00
 UMASK_DRAM_CMD_ENSR                         0x0A 0x02 0x00
@@ -688,33 +680,16 @@ UMASK_DSP_FILL_WRQ_FULL                     0x02 0x03 0x00
 UMASK_DSP_FILL_RDQ_EMPTY                    0x04 0x03 0x00
 UMASK_DSP_FILL_WRQ_EMPTY                    0x08 0x03 0x00
 
-EVENT_DRAM_MISC                          0x0B  MBOX
-UMASK_DRAM_MISC_RETRIES_ALL              0x00 0x04 0x03
-UMASK_DRAM_MISC_RETRIES_FVID             0x01 0x04 0x03
-UMASK_DRAM_MISC_VALID                    0x01 0x04 0x02
-UMASK_DRAM_MISC_NON_NOP_TRKL             0x01 0x04 0x01
-
-UMASK_DRAM_MISC_ILLEGAL                  0x00 0x04 0x00
-UMASK_DRAM_MISC_PREALL                   0x01 0x04 0x00
-UMASK_DRAM_MISC_RAS                      0x02 0x04 0x00
-UMASK_DRAM_MISC_CAS_RD_OPN               0x03 0x04 0x00
-UMASK_DRAM_MISC_CAS_WR_OPN               0x04 0x04 0x00
-UMASK_DRAM_MISC_CAS_RD_CLS               0x05 0x04 0x00
-UMASK_DRAM_MISC_CAS_WR_CLS               0x06 0x04 0x00
-UMASK_DRAM_MISC_MRS                      0x07 0x04 0x00
-UMASK_DRAM_MISC_RFR                      0x09 0x04 0x00
-UMASK_DRAM_MISC_ENSR                     0x0A 0x04 0x00
-UMASK_DRAM_MISC_EXSR                     0x0B 0x04 0x00
-UMASK_DRAM_MISC_NOP                      0x0C 0x04 0x00
-UMASK_DRAM_MISC_TRKL                     0x10 0x04 0x00
-UMASK_DRAM_MISC_PRE                      0x11 0x04 0x00
-UMASK_DRAM_MISC_SYNC                     0x12 0x04 0x00
-UMASK_DRAM_MISC_CKE_HI                   0x14 0x04 0x00
-UMASK_DRAM_MISC_CKE_LO                   0x15 0x04 0x00
-UMASK_DRAM_MISC_SOFT_RST                 0x17 0x04 0x00
-UMASK_DRAM_MISC_WR_CFG                   0x1C 0x04 0x00
-UMASK_DRAM_MISC_RD_CFG                   0x1D 0x04 0x00
-UMASK_DRAM_MISC_ZQCAL                    0x1E 0x04 0x00
+EVENT_BCMD_SCHEDQ_OCCUPANCY           0x06  MBOX
+UMASK_BCMD_SCHEDQ_OCCUPANCY_READS     0x00 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_WRITES    0x01 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_MERGE     0x02 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_V2F       0x03 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_V2V       0x04 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_F2V       0x05 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_F2B       0x06 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_SPRWR     0x07 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_ALL       0x08 0x01 0x00
 
 EVENT_FRM_TYPE                        0x09  MBOX
 UMASK_FRM_TYPE_3CMD                   0x00 0x05 0x00
@@ -750,12 +725,12 @@ UMASK_FVC_EV1_FAST_RESET              0x04 0x07 0x00
 UMASK_FVC_EV1_BBOX_CMDS_READS         0x05 0x07 0x00
 UMASK_FVC_EV1_BBOX_CMDS_WRITES        0x05 0x07 0x01
 UMASK_FVC_EV1_BBOX_RSP_ACK            0x06 0x07 0x00
-UMASK_FVC_EV1_BBOX_RSP_RETRY          0x06 0x07 0x10
-UMASK_FVC_EV1_BBOX_RSP_COR            0x06 0x07 0x20
-UMASK_FVC_EV1_BBOX_RSP_UNCOR          0x06 0x07 0x30
-UMASK_FVC_EV1_BBOX_RSP_SPEC_ACK       0x06 0x07 0x40
-UMASK_FVC_EV1_BBOX_RSP_SPR_ACK        0x06 0x07 0x50
-UMASK_FVC_EV1_BBOX_RSP_SPR_UNCORE     0x06 0x07 0x70
+UMASK_FVC_EV1_BBOX_RSP_RETRY          0x06 0x07 0x01
+UMASK_FVC_EV1_BBOX_RSP_COR            0x06 0x07 0x02
+UMASK_FVC_EV1_BBOX_RSP_UNCOR          0x06 0x07 0x03
+UMASK_FVC_EV1_BBOX_RSP_SPEC_ACK       0x06 0x07 0x04
+UMASK_FVC_EV1_BBOX_RSP_SPR_ACK        0x06 0x07 0x05
+UMASK_FVC_EV1_BBOX_RSP_SPR_UNCORE     0x06 0x07 0x07
 UMASK_FVC_EV1_SMI_NB_TRIG             0x07 0x07 0x00
 
 EVENT_FVC_EV2                         0x0F  MBOX
@@ -767,30 +742,30 @@ UMASK_FVC_EV2_FAST_RESET              0x04 0x08 0x00
 UMASK_FVC_EV2_BBOX_CMDS_READS         0x05 0x08 0x00
 UMASK_FVC_EV2_BBOX_CMDS_WRITES        0x05 0x08 0x01
 UMASK_FVC_EV2_BBOX_RSP_ACK            0x06 0x08 0x00
-UMASK_FVC_EV2_BBOX_RSP_RETRY          0x06 0x08 0x10
-UMASK_FVC_EV2_BBOX_RSP_COR            0x06 0x08 0x20
-UMASK_FVC_EV2_BBOX_RSP_UNCOR          0x06 0x08 0x30
-UMASK_FVC_EV2_BBOX_RSP_SPEC_ACK       0x06 0x08 0x40
-UMASK_FVC_EV2_BBOX_RSP_SPR_ACK        0x06 0x08 0x50
-UMASK_FVC_EV2_BBOX_RSP_SPR_UNCORE     0x06 0x08 0x70
+UMASK_FVC_EV2_BBOX_RSP_RETRY          0x06 0x08 0x01
+UMASK_FVC_EV2_BBOX_RSP_COR            0x06 0x08 0x02
+UMASK_FVC_EV2_BBOX_RSP_UNCOR          0x06 0x08 0x03
+UMASK_FVC_EV2_BBOX_RSP_SPEC_ACK       0x06 0x08 0x04
+UMASK_FVC_EV2_BBOX_RSP_SPR_ACK        0x06 0x08 0x05
+UMASK_FVC_EV2_BBOX_RSP_SPR_UNCORE     0x06 0x08 0x07
 UMASK_FVC_EV2_SMI_NB_TRIG             0x07 0x08 0x00
 
 EVENT_FVC_EV3                         0x10  MBOX
 UMASK_FVC_EV3_SMI_CRC_ERR             0x00 0x09 0x00
-UMASK_FVC_EV3_MEM_ECC_ERR             0x00 0x09 0x00
-UMASK_FVC_EV3_POISON_TXN              0x00 0x09 0x00
-UMASK_FVC_EV3_ALERT_FRAMES            0x00 0x09 0x00
-UMASK_FVC_EV3_FAST_RESET              0x00 0x09 0x00
-UMASK_FVC_EV3_BBOX_CMDS_READS         0x00 0x09 0x00
-UMASK_FVC_EV3_BBOX_CMDS_WRITES        0x00 0x09 0x01
-UMASK_FVC_EV3_BBOX_RSP_ACK            0x00 0x09 0x00
-UMASK_FVC_EV3_BBOX_RSP_RETRY          0x00 0x09 0x10
-UMASK_FVC_EV3_BBOX_RSP_COR            0x00 0x09 0x20
-UMASK_FVC_EV3_BBOX_RSP_UNCOR          0x00 0x09 0x30
-UMASK_FVC_EV3_BBOX_RSP_SPEC_ACK       0x00 0x09 0x40
-UMASK_FVC_EV3_BBOX_RSP_SPR_ACK        0x00 0x09 0x50
-UMASK_FVC_EV3_BBOX_RSP_SPR_UNCORE     0x00 0x09 0x70
-UMASK_FVC_EV3_SMI_NB_TRIG             0x00 0x09 0x00
+UMASK_FVC_EV3_MEM_ECC_ERR             0x01 0x09 0x00
+UMASK_FVC_EV3_POISON_TXN              0x02 0x09 0x00
+UMASK_FVC_EV3_ALERT_FRAMES            0x03 0x09 0x00
+UMASK_FVC_EV3_FAST_RESET              0x04 0x09 0x00
+UMASK_FVC_EV3_BBOX_CMDS_READS         0x05 0x09 0x00
+UMASK_FVC_EV3_BBOX_CMDS_WRITES        0x05 0x09 0x01
+UMASK_FVC_EV3_BBOX_RSP_ACK            0x06 0x09 0x00
+UMASK_FVC_EV3_BBOX_RSP_RETRY          0x06 0x09 0x01
+UMASK_FVC_EV3_BBOX_RSP_COR            0x06 0x09 0x02
+UMASK_FVC_EV3_BBOX_RSP_UNCOR          0x06 0x09 0x03
+UMASK_FVC_EV3_BBOX_RSP_SPEC_ACK       0x06 0x09 0x04
+UMASK_FVC_EV3_BBOX_RSP_SPR_ACK        0x06 0x09 0x05
+UMASK_FVC_EV3_BBOX_RSP_SPR_UNCORE     0x06 0x09 0x07
+UMASK_FVC_EV3_SMI_NB_TRIG             0x07 0x09 0x00
 
 EVENT_FVID_RACE                       0x18  MBOX
 UMASK_FVID_RACE                       0x00 0x00 0x00
@@ -798,9 +773,8 @@ UMASK_FVID_RACE                       0x00 0x00 0x00
 EVENT_INFLIGHT_CMDS                   0x1D  MBOX
 UMASK_INFLIGHT_CMDS                   0x00 0x00 0x00
 
-EVENT_ISS_SCHED                       0x08  MBOX
-UMASK_ISS_SCHED_CHANGES               0x00 0x0A 0x00
-UMASK_ISS_SCHED_FRAME_BEAT            0x01 0x0A 0x00
+EVENT_SCHED_MODE_CHANGES              0x08  MBOX
+UMASK_SCHED_MODE_CHANGES              0x00 0x00 0x00
 
 EVENT_MA_PAR_ERR                      0x0C  MBOX
 UMASK_MA_PAR_ERR                      0x00 0x00 0x00
@@ -808,6 +782,9 @@ UMASK_MA_PAR_ERR                      0x00 0x00 0x00
 EVENT_MULTICAS                        0x17  MBOX
 UMASK_MULTICAS                        0x00 0x00 0x00
 
+EVENT_PAGE_EMPTY                      0x15  MBOX
+UMASK_PAGE_EMPTY                      0x00 0x00 0x00
+
 EVENT_PAGE_HIT                        0x14  MBOX
 UMASK_PAGE_HIT                        0x00 0x00 0x00
 
@@ -821,9 +798,8 @@ EVENT_PGT_PAGE_EV                     0x16  MBOX
 UMASK_PGT_PAGE_EV_OPN2CLS             0x00 0x0B 0x00
 UMASK_PGT_PAGE_EV_CLS2OPN             0x01 0x0B 0x00
 
-EVENT_PGT_PAGE_EV2                    0x15  MBOX
-UMASK_PGT_PAGE_EV2_AUTO_CLS           0x00 0x0C 0x00
-UMASK_PGT_PAGE_EV2_PAGE_EMPTY         0x01 0x0C 0x00
+EVENT_RETRIES                         0x0B  MBOX
+UMASK_RETRIES_ALL                     0x00 0x00 0x00
 
 EVENT_REFRESH                         0x06  MBOX
 UMASK_REFRESH                         0x00 0x00 0x00
@@ -845,12 +821,44 @@ UMASK_THERM_TRP_DN_ALL_GT_MID_RISE    0x03 0x0D 0x00
 UMASK_THERM_TRP_DN_ALL_GT_MID_FALL    0x02 0x0D 0x00
 UMASK_THERM_TRP_DN_ALL_GT_LO          0x01 0x0D 0x00
 UMASK_THERM_TRP_DN_ALL_LT_LO          0x00 0x0D 0x00
+UMASK_THERM_TRP_DN_DIMM0_GT_MID_RISE  0x03 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_GT_MID_RISE  0x03 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_GT_MID_RISE  0x03 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_GT_MID_RISE  0x03 0x0D 0x04
+UMASK_THERM_TRP_DN_DIMM0_GT_MID_FALL  0x02 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_GT_MID_FALL  0x02 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_GT_MID_FALL  0x02 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_GT_MID_FALL  0x02 0x0D 0x04
+UMASK_THERM_TRP_DN_DIMM0_GT_LO        0x01 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_GT_LO        0x01 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_GT_LO        0x01 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_GT_LO        0x01 0x0D 0x04
+UMASK_THERM_TRP_DN_DIMM0_LT_LO        0x00 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_LT_LO        0x00 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_LT_LO        0x00 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_LT_LO        0x00 0x0D 0x04
 
 EVENT_THERM_TRP_UP                    0x04  MBOX
 UMASK_THERM_TRP_UP_ALL_GT_MID_RISE    0x03 0x0E 0x00
 UMASK_THERM_TRP_UP_ALL_GT_MID_FALL    0x02 0x0E 0x00
 UMASK_THERM_TRP_UP_ALL_GT_LO          0x01 0x0E 0x00
 UMASK_THERM_TRP_UP_ALL_LT_LO          0x00 0x0E 0x00
+UMASK_THERM_TRP_UP_DIMM0_GT_MID_RISE  0x03 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_GT_MID_RISE  0x03 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_GT_MID_RISE  0x03 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_GT_MID_RISE  0x03 0x0E 0x04
+UMASK_THERM_TRP_UP_DIMM0_GT_MID_FALL  0x02 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_GT_MID_FALL  0x02 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_GT_MID_FALL  0x02 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_GT_MID_FALL  0x02 0x0E 0x04
+UMASK_THERM_TRP_UP_DIMM0_GT_LO        0x01 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_GT_LO        0x01 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_GT_LO        0x01 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_GT_LO        0x01 0x0E 0x04
+UMASK_THERM_TRP_UP_DIMM0_LT_LO        0x00 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_LT_LO        0x00 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_LT_LO        0x00 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_LT_LO        0x00 0x0E 0x04
 
 EVENT_TRANS_CMDS                      0x12  MBOX
 UMASK_TRANS_CMDS                      0x00 0x00 0x00
@@ -859,112 +867,164 @@ EVENT_TT_CMD_CONFLICT                 0x19  MBOX
 UMASK_TT_CMD_CONFLICT                 0x00 0x00 0x00
 
 EVENT_ACK_BEFORE_LAST_SNP             0x19  BBOX0C3|BBOX1C3
-UMASK_ACK_BEFORE_LAST_SNP             0x03
+UMASK_ACK_BEFORE_LAST_SNP             0x00
 
 EVENT_ADDR_IN_MATCH             0x04  BBOX0C2|BBOX1C2
-UMASK_ADDR_IN_MATCH             0x02
+OPTIONS_ADDR_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_ADDR_IN_MATCH             0x00
 
 EVENT_CONFLICTS             0x17  BBOX0C3|BBOX1C3
-UMASK_CONFLICTS             0x03
+UMASK_CONFLICTS             0x00
 
 EVENT_COHQ_BYPASS             0x0E  BBOX0C3|BBOX1C3
-UMASK_COHQ_BYPASS             0x03
+UMASK_COHQ_BYPASS             0x00
 
-EVENT_COHQ_IMT_ALLOC_WAIT             0x0E  BBOX0C3|BBOX1C3
-UMASK_COHQ_IMT_ALLOC_WAIT             0x03
+EVENT_COHQ_IMT_ALLOC_WAIT             0x13  BBOX0C3|BBOX1C3
+UMASK_COHQ_IMT_ALLOC_WAIT             0x00
 
-EVENT_DIRQ_INSERTS             0x17  BBOX0C1|BBOX1C1
-UMASK_DIRQ_INSERTS             0x01
+EVENT_DIRQ_INSERTS                  0x17  BBOX0C1|BBOX1C1
+UMASK_DIRQ_INSERTS                  0x00
 
 EVENT_DIRQ_OCCUPANCY             0x17  BBOX0C0|BBOX1C0
 UMASK_DIRQ_OCCUPANCY             0x00
 
 EVENT_DEMAND_FETCH             0x0F  BBOX0C3|BBOX1C3
-UMASK_DEMAND_FETCH             0x03
+UMASK_DEMAND_FETCH             0x00
 
 EVENT_DRSQ_INSERTS             0x09  BBOX0C1|BBOX1C1
-UMASK_DRSQ_INSERTS             0x01
+UMASK_DRSQ_INSERTS             0x00
 
 EVENT_DRSQ_OCCUPANCY             0x09  BBOX0C0|BBOX1C0
 UMASK_DRSQ_OCCUPANCY             0x00
 
 EVENT_EARLY_ACK             0x02  BBOX0C3|BBOX1C3
-UMASK_EARLY_ACK             0x03
+UMASK_EARLY_ACK             0x00
 
 EVENT_IMPLICIT_WBS             0x12  BBOX0C3|BBOX1C3
-UMASK_IMPLICIT_WBS             0x03
+UMASK_IMPLICIT_WBS             0x00
 
-EVENT_IMT_FULL             0x12  BBOX0C3|BBOX1C3
-UMASK_IMT_FULL             0x03
+EVENT_IMT_FULL             0x16  BBOX0C3|BBOX1C3
+UMASK_IMT_FULL             0x00
 
 EVENT_IMT_INSERTS_ALL             0x07  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_ALL             0x01
+UMASK_IMT_INSERTS_ALL             0x00
 
 EVENT_IMT_INSERTS_INVITOE             0x0F  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_INVITOE             0x01
+UMASK_IMT_INSERTS_INVITOE             0x00
 
 EVENT_IMT_INSERTS_IOH             0x0A  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_IOH             0x01
+UMASK_IMT_INSERTS_IOH             0x00
 
 EVENT_IMT_INSERTS_IOH_INVITOE             0x10  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_IOH_INVITOE             0x01
+UMASK_IMT_INSERTS_IOH_INVITOE             0x00
 
 EVENT_IMT_INSERTS_IOH_WR             0x0D  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_IOH_WR             0x01
+UMASK_IMT_INSERTS_IOH_WR             0x00
 
 EVENT_IMT_INSERTS_NON_IOH             0x0B  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_NON_IOH             0x01
+UMASK_IMT_INSERTS_NON_IOH             0x00
 
 EVENT_IMT_INSERTS_NON_IOH_INVITOE             0x1C  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_NON_IOH_INVITOE             0x01
+UMASK_IMT_INSERTS_NON_IOH_INVITOE             0x00
 
-EVENT_INSERTS_NON_IOH_RD             0x1F  BBOX0C1|BBOX1C1
-UMASK_INSERTS_NON_IOH_RD             0x01
+EVENT_IMT_INSERTS_NON_IOH_RD             0x1F  BBOX0C1|BBOX1C1
+UMASK_IMT_INSERTS_NON_IOH_RD             0x00
 
 EVENT_IMT_INSERTS_NON_IOH_WR             0x0E  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_NON_IOH_WR             0x01
+UMASK_IMT_INSERTS_NON_IOH_WR             0x00
 
 EVENT_IMT_INSERTS_RD             0x1D  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_RD             0x01
+UMASK_IMT_INSERTS_RD             0x00
 
 EVENT_IMT_INSERTS_WR             0x0C  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_WR             0x01
+UMASK_IMT_INSERTS_WR             0x00
 
 EVENT_IMT_NE_CYCLES             0x07  BBOX0C2|BBOX1C2
-UMASK_IMT_NE_CYCLES             0x02
+UMASK_IMT_NE_CYCLES             0x00
 
 EVENT_IMT_PREALLOC             0x06  BBOX0C3|BBOX1C3
-UMASK_IMT_PREALLOC             0x03
+UMASK_IMT_PREALLOC             0x00
 
 EVENT_IMT_VALID_OCCUPANCY             0x07  BBOX0C0|BBOX1C0
 UMASK_IMT_VALID_OCCUPANCY             0x00
 
 EVENT_MSG_ADDR_IN_MATCH             0x01  BBOX0C0|BBOX1C0
+OPTIONS_MSG_ADDR_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
 UMASK_MSG_ADDR_IN_MATCH             0x00
 
-EVENT_MSGS_B_TO_S             0x03  BBOX0C1|BBOX1C1
-UMASK_MSGS_B_TO_S             0x01
-
 EVENT_MSGS_B_TO_S             0x03  BBOX0C2|BBOX1C2
-UMASK_MSGS_B_TO_S             0x02
+UMASK_MSGS_B_TO_S             0x00
+
+EVENT_MSGS_S_TO_B             0x02  BBOX0C2|BBOX1C2
+UMASK_MSGS_S_TO_B             0x00
 
 EVENT_MSG_IN_MATCH             0x01  BBOX0C1|BBOX1C1
-UMASK_MSG_IN_MATCH             0x01
+OPTIONS_MSG_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_IN_MATCH             0x00
 
 EVENT_MSGS_IN_NON_SNP             0x01  BBOX0C2|BBOX1C2
-UMASK_MSGS_IN_NON_SNP             0x02
+UMASK_MSGS_IN_NON_SNP             0x00
 
 EVENT_MSG_OPCODE_ADDR_IN_MATCH             0x03  BBOX0C0|BBOX1C0
+OPTIONS_MSG_OPCODE_ADDR_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
 UMASK_MSG_OPCODE_ADDR_IN_MATCH             0x00
 
 EVENT_MSG_OPCODE_IN_MATCH             0x05  BBOX0C1|BBOX1C1
-UMASK_MSG_OPCODE_IN_MATCH             0x01
+OPTIONS_MSG_OPCODE_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_OPCODE_IN_MATCH             0x00
 
 EVENT_MSG_OPCODE_OUT_MATCH             0x06  BBOX0C1|BBOX1C1
-UMASK_MSG_OPCODE_OUT_MATCH             0x01
+OPTIONS_MSG_OPCODE_OUT_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_OPCODE_OUT_MATCH             0x00
 
 EVENT_MSG_OUT_MATCH             0x02  BBOX0C1|BBOX1C1
-UMASK_MSG_OUT_MATCH             0x01
+OPTIONS_MSG_OUT_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_OUT_MATCH             0x00
+
+EVENT_OPCODE_ADDR_IN_MATCH             0x02  BBOX0C0|BBOX1C0
+OPTIONS_OPCODE_ADDR_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_OPCODE_ADDR_IN_MATCH             0x00
+
+EVENT_OPCODE_IN_MATCH             0x03  BBOX0C1|BBOX1C1
+OPTIONS_OPCODE_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_OPCODE_IN_MATCH             0x00
+
+EVENT_OPCODE_OUT_MATCH             0x04  BBOX0C1|BBOX1C1
+OPTIONS_OPCODE_OUT_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_OPCODE_OUT_MATCH             0x00
+
+EVENT_RBOX_VNA_UNAVAIL              0x15 BBOX0C3|BBOX1C3
+UMASK_RBOX_VNA_UNAVAIL              0x00
+
+EVENT_SBOX_VN0_UNAVAIL              0x14 BBOX0C3|BBOX1C3
+UMASK_SBOX_VN0_UNAVAIL              0x00
+
+EVENT_SNPOQ_INSERTS                 0x12 BBOX0C1|BBOX1C1
+UMASK_SNPOQ_INSERTS                 0x00
+
+EVENT_SNPOQ_OCCUPANCY               0x12 BBOX0C0|BBOX1C0
+UMASK_SNPOQ_OCCUPANCY               0x00
+
+EVENT_TF_ALL                        0x04 BBOX0C0|BBOX1C0
+UMASK_TF_ALL                        0x00
+
+EVENT_TF_INVITOE                    0x06 BBOX0C0|BBOX1C0
+UMASK_TF_INVITOE                    0x00
+
+EVENT_TF_IOH                        0x0B BBOX0C0|BBOX1C0
+UMASK_TF_IOH                        0x00
+
+EVENT_TF_IOH_INVITOE                0x0F BBOX0C0|BBOX1C0
+UMASK_TF_IOH_INVITOE                0x00
+
+EVENT_TF_IOH_NON_INVITOE_RD         0x1C BBOX0C0|BBOX1C0
+UMASK_TF_IOH_NON_INVITOE_RD         0x00
+
+EVENT_TF_IOH_WR                     0x0D BBOX0C0|BBOX1C0
+UMASK_TF_IOH_WR                     0x00
+
+EVENT_TF_WR                         0x05 BBOX0C0|BBOX1C0
+UMASK_TF_WR                         0x00
 
 EVENT_ALLOC_TO_ARB                              0x00  RBOX0
 UMASK_ALLOC_TO_ARB_PORT0_IPERF0_NCB             0x00 0x01  0x09
@@ -974,7 +1034,7 @@ UMASK_ALLOC_TO_ARB_PORT0_IPERF0_NDR             0x00 0x08  0x09
 UMASK_ALLOC_TO_ARB_PORT0_IPERF0_SNP             0x00 0x10  0x09
 UMASK_ALLOC_TO_ARB_PORT0_IPERF0_HOM_VN0         0x00 0x20  0x09
 UMASK_ALLOC_TO_ARB_PORT0_IPERF0_HOM_VN1         0x00 0x40  0x09
-UMASK_ALLOC_TO_ARB_PORT0_IPERF0_ALL             0x00 0xFF  0x09
+UMASK_ALLOC_TO_ARB_PORT0_IPERF0_ALL             0x00 0x7F  0x09
 UMASK_ALLOC_TO_ARB_PORT0_IPERF1_NCB             0x01 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT0_IPERF1_NCS             0x01 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT0_IPERF1_DRS_VN01        0x01 0x04   0x09
@@ -982,7 +1042,7 @@ UMASK_ALLOC_TO_ARB_PORT0_IPERF1_NDR             0x01 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT0_IPERF1_SNP             0x01 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT0_IPERF1_HOM_VN0         0x01 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT0_IPERF1_HOM_VN1         0x01 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT0_IPERF1_ALL             0x01 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT0_IPERF1_ALL             0x01 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF0_NCB             0x06 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF0_NCS             0x06 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF0_DRS_VN01        0x06 0x04   0x09
@@ -990,7 +1050,7 @@ UMASK_ALLOC_TO_ARB_PORT1_IPERF0_NDR             0x06 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF0_SNP             0x06 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF0_HOM_VN0         0x06 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF0_HOM_VN1         0x06 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT1_IPERF0_ALL             0x06 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT1_IPERF0_ALL             0x06 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF1_NCB             0x07 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF1_NCS             0x07 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF1_DRS_VN01        0x07 0x04   0x09
@@ -998,7 +1058,7 @@ UMASK_ALLOC_TO_ARB_PORT1_IPERF1_NDR             0x07 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF1_SNP             0x07 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF1_HOM_VN0         0x07 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF1_HOM_VN1         0x07 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT1_IPERF1_ALL             0x07 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT1_IPERF1_ALL             0x07 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF0_NCB             0x0C 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF0_NCS             0x0C 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF0_DRS_VN01        0x0C 0x04   0x09
@@ -1006,7 +1066,7 @@ UMASK_ALLOC_TO_ARB_PORT2_IPERF0_NDR             0x0C 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF0_SNP             0x0C 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF0_HOM_VN0         0x0C 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF0_HOM_VN1         0x0C 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT2_IPERF0_ALL             0x0C 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT2_IPERF0_ALL             0x0C 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF1_NCB             0x0D 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF1_NCS             0x0D 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF1_DRS_VN01        0x0D 0x04   0x09
@@ -1014,7 +1074,7 @@ UMASK_ALLOC_TO_ARB_PORT2_IPERF1_NDR             0x0D 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF1_SNP             0x0D 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF1_HOM_VN0         0x0D 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF1_HOM_VN1         0x0D 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT2_IPERF1_ALL             0x0D 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT2_IPERF1_ALL             0x0D 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF0_NCB             0x12 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF0_NCS             0x12 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF0_DRS_VN01        0x12 0x04   0x09
@@ -1022,7 +1082,7 @@ UMASK_ALLOC_TO_ARB_PORT3_IPERF0_NDR             0x12 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF0_SNP             0x12 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF0_HOM_VN0         0x12 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF0_HOM_VN1         0x12 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT3_IPERF0_ALL             0x12 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT3_IPERF0_ALL             0x12 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF1_NCB             0x13 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF1_NCS             0x13 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF1_DRS_VN01        0x13 0x04   0x09
@@ -1030,7 +1090,7 @@ UMASK_ALLOC_TO_ARB_PORT3_IPERF1_NDR             0x13 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF1_SNP             0x13 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF1_HOM_VN0         0x13 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF1_HOM_VN1         0x13 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT3_IPERF1_ALL             0x13 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT3_IPERF1_ALL             0x13 0x7F   0x09
 
 EVENT_ALLOC_TO_ARB                              0x00  RBOX1
 UMASK_ALLOC_TO_ARB_PORT4_IPERF0_NCB             0x00 0x01  0x09
@@ -1040,7 +1100,7 @@ UMASK_ALLOC_TO_ARB_PORT4_IPERF0_NDR             0x00 0x08  0x09
 UMASK_ALLOC_TO_ARB_PORT4_IPERF0_SNP             0x00 0x10  0x09
 UMASK_ALLOC_TO_ARB_PORT4_IPERF0_HOM_VN0         0x00 0x20  0x09
 UMASK_ALLOC_TO_ARB_PORT4_IPERF0_HOM_VN1         0x00 0x40  0x09
-UMASK_ALLOC_TO_ARB_PORT4_IPERF0_ALL             0x00 0xFF  0x09
+UMASK_ALLOC_TO_ARB_PORT4_IPERF0_ALL             0x00 0x7F  0x09
 UMASK_ALLOC_TO_ARB_PORT4_IPERF1_NCB             0x01 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT4_IPERF1_NCS             0x01 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT4_IPERF1_DRS_VN01        0x01 0x04   0x09
@@ -1048,7 +1108,7 @@ UMASK_ALLOC_TO_ARB_PORT4_IPERF1_NDR             0x01 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT4_IPERF1_SNP             0x01 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT4_IPERF1_HOM_VN0         0x01 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT4_IPERF1_HOM_VN1         0x01 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT4_IPERF1_ALL             0x01 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT4_IPERF1_ALL             0x01 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF0_NCB             0x06 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF0_NCS             0x06 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF0_DRS_VN01        0x06 0x04   0x09
@@ -1056,7 +1116,7 @@ UMASK_ALLOC_TO_ARB_PORT5_IPERF0_NDR             0x06 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF0_SNP             0x06 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF0_HOM_VN0         0x06 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF0_HOM_VN1         0x06 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT5_IPERF0_ALL             0x06 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT5_IPERF0_ALL             0x06 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF1_NCB             0x07 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF1_NCS             0x07 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF1_DRS_VN01        0x07 0x04   0x09
@@ -1064,7 +1124,7 @@ UMASK_ALLOC_TO_ARB_PORT5_IPERF1_NDR             0x07 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF1_SNP             0x07 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF1_HOM_VN0         0x07 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF1_HOM_VN1         0x07 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT5_IPERF1_ALL             0x07 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT5_IPERF1_ALL             0x07 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF0_NCB             0x0C 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF0_NCS             0x0C 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF0_DRS_VN01        0x0C 0x04   0x09
@@ -1072,7 +1132,7 @@ UMASK_ALLOC_TO_ARB_PORT6_IPERF0_NDR             0x0C 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF0_SNP             0x0C 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF0_HOM_VN0         0x0C 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF0_HOM_VN1         0x0C 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT6_IPERF0_ALL             0x0C 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT6_IPERF0_ALL             0x0C 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF1_NCB             0x0D 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF1_NCS             0x0D 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF1_DRS_VN01        0x0D 0x04   0x09
@@ -1080,7 +1140,7 @@ UMASK_ALLOC_TO_ARB_PORT6_IPERF1_NDR             0x0D 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF1_SNP             0x0D 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF1_HOM_VN0         0x0D 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF1_HOM_VN1         0x0D 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT6_IPERF1_ALL             0x0D 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT6_IPERF1_ALL             0x0D 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF0_NCB             0x12 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF0_NCS             0x12 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF0_DRS_VN01        0x12 0x04   0x09
@@ -1088,7 +1148,7 @@ UMASK_ALLOC_TO_ARB_PORT7_IPERF0_NDR             0x12 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF0_SNP             0x12 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF0_HOM_VN0         0x12 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF0_HOM_VN1         0x12 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT7_IPERF0_ALL             0x12 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT7_IPERF0_ALL             0x12 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF1_NCB             0x13 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF1_NCS             0x13 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF1_DRS_VN01        0x13 0x04   0x09
@@ -1096,7 +1156,7 @@ UMASK_ALLOC_TO_ARB_PORT7_IPERF1_NDR             0x13 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF1_SNP             0x13 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF1_HOM_VN0         0x13 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF1_HOM_VN1         0x13 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT7_IPERF1_ALL             0x13 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT7_IPERF1_ALL             0x13 0x7F   0x09
 
 
 EVENT_EOT_INSERTS                             0x00  RBOX0
@@ -2236,7 +2296,7 @@ UMASK_QUE_ARB_BID_PORT0_QLX0_HOM        0x02 0x00 0x00
 UMASK_QUE_ARB_BID_PORT0_QLX0_SNP        0x02 0x00 0x01
 UMASK_QUE_ARB_BID_PORT0_QLX0_NDR        0x02 0x00 0x02
 UMASK_QUE_ARB_BID_PORT0_QLX0_NCS        0x02 0x00 0x03
-UMASK_QUE_ARB_BID_PORT0_QLX0_DRS        0x02 0x00 0x02
+UMASK_QUE_ARB_BID_PORT0_QLX0_DRS        0x02 0x00 0x04
 UMASK_QUE_ARB_BID_PORT0_QLX0_NCB        0x02 0x00 0x05
 UMASK_QUE_ARB_BID_PORT0_QLX1_HOM        0x03 0x00 0x00
 UMASK_QUE_ARB_BID_PORT0_QLX1_SNP        0x03 0x00 0x01
@@ -3313,6 +3373,7 @@ EVENT_TO_R_NDR_MSGQ_OCCUPANCY                   0x0D SBOX
 UMASK_TO_R_NDR_MSGQ_OCCUPANCY                   0x00
 
 EVENT_TO_R_PROG_EV                              0x00 SBOX
+OPTIONS_TO_R_PROG_EV                            EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
 UMASK_TO_R_PROG_EV                              0x00
 
 EVENT_TO_R_B_REQUESTS                           0x6C SBOX
diff --git a/src/includes/perfmon_nehalemEX_westmereEX_common.h b/src/includes/perfmon_nehalemEX_westmereEX_common.h
new file mode 100644
index 0000000..655d5c0
--- /dev/null
+++ b/src/includes/perfmon_nehalemEX_westmereEX_common.h
@@ -0,0 +1,94 @@
+#ifndef PERFMON_NEX_WEX_COMMON
+#define PERFMON_NEX_WEX_COMMON
+
+#include <registers.h>
+
+enum nex_wex_mbox_reg_ids { /* Second-dimension indices into nex_wex_mbox_regs */
+    ADDR_MATCH = 0,
+    ADDR_MASK,
+    ZDP,
+    DSP,
+    ISS,
+    PGT,
+    MAP,
+    PLD,
+    THR,
+    NUM_MBOX_IDS /* Count of the ids above; sizes the second dimension of nex_wex_mbox_regs */
+};
+
+static uint64_t nex_wex_mbox_regs[2][NUM_MBOX_IDS] = { /* Lookup table: [register set 0/1][nex_wex_mbox_reg_ids] -> MSR_Mx_PMON_* constant */
+    [0] = { /* MSR_M0_PMON_* constants */
+        [ADDR_MATCH] = MSR_M0_PMON_ADDR_MATCH,
+        [ADDR_MASK] = MSR_M0_PMON_ADDR_MASK,
+        [ZDP] = MSR_M0_PMON_ZDP,
+        [DSP] = MSR_M0_PMON_DSP,
+        [ISS] = MSR_M0_PMON_ISS,
+        [PGT] = MSR_M0_PMON_PGT,
+        [MAP] = MSR_M0_PMON_MAP,
+        [PLD] = MSR_M0_PMON_PLD,
+        [THR] = MSR_M0_PMON_MSC_THR, /* THR is the only id whose constant name differs (MSC_THR) */
+    },
+    [1] = { /* MSR_M1_PMON_* constants, same layout as set 0 */
+        [ADDR_MATCH] = MSR_M1_PMON_ADDR_MATCH,
+        [ADDR_MASK] = MSR_M1_PMON_ADDR_MASK,
+        [ZDP] = MSR_M1_PMON_ZDP,
+        [DSP] = MSR_M1_PMON_DSP,
+        [ISS] = MSR_M1_PMON_ISS,
+        [PGT] = MSR_M1_PMON_PGT,
+        [MAP] = MSR_M1_PMON_MAP,
+        [PLD] = MSR_M1_PMON_PLD,
+        [THR] = MSR_M1_PMON_MSC_THR,
+    },
+};
+
+enum nex_wex_rbox_reg_type { /* Second-dimension indices into nex_wex_rbox_regs */
+    IPERF0 = 0,
+    IPERF1,
+    QLX,
+    NUM_RBOX_REG_TYPES /* Count of the types above; sizes the second dimension of nex_wex_rbox_regs */
+};
+
+static uint64_t nex_wex_rbox_regs[2][NUM_RBOX_REG_TYPES][4] = { /* Lookup table: [register set 0/1][nex_wex_rbox_reg_type][P0..P3] -> MSR_Rx_PMON_* constant */
+    [0] = { /* MSR_R0_PMON_* constants */
+        [IPERF0] = {
+            [0] = MSR_R0_PMON_IPERF0_P0,
+            [1] = MSR_R0_PMON_IPERF0_P1,
+            [2] = MSR_R0_PMON_IPERF0_P2,
+            [3] = MSR_R0_PMON_IPERF0_P3,
+        },
+        [IPERF1] = {
+            [0] = MSR_R0_PMON_IPERF1_P0,
+            [1] = MSR_R0_PMON_IPERF1_P1,
+            [2] = MSR_R0_PMON_IPERF1_P2,
+            [3] = MSR_R0_PMON_IPERF1_P3,
+        },
+        [QLX] = {
+            [0] = MSR_R0_PMON_QLX_P0,
+            [1] = MSR_R0_PMON_QLX_P1,
+            [2] = MSR_R0_PMON_QLX_P2,
+            [3] = MSR_R0_PMON_QLX_P3,
+        },
+    },
+    [1] = { /* MSR_R1_PMON_* constants, same layout as set 0 */
+        [IPERF0] = {
+            [0] = MSR_R1_PMON_IPERF0_P0,
+            [1] = MSR_R1_PMON_IPERF0_P1,
+            [2] = MSR_R1_PMON_IPERF0_P2,
+            [3] = MSR_R1_PMON_IPERF0_P3,
+        },
+        [IPERF1] = {
+            [0] = MSR_R1_PMON_IPERF1_P0,
+            [1] = MSR_R1_PMON_IPERF1_P1,
+            [2] = MSR_R1_PMON_IPERF1_P2,
+            [3] = MSR_R1_PMON_IPERF1_P3,
+        },
+        [QLX] = {
+            [0] = MSR_R1_PMON_QLX_P0,
+            [1] = MSR_R1_PMON_QLX_P1,
+            [2] = MSR_R1_PMON_QLX_P2,
+            [3] = MSR_R1_PMON_QLX_P3,
+        },
+    },
+};
+
+#endif
diff --git a/src/includes/perfmon_nehalem_counters.h b/src/includes/perfmon_nehalem_counters.h
index d3831c1..55d0d88 100644
--- a/src/includes/perfmon_nehalem_counters.h
+++ b/src/includes/perfmon_nehalem_counters.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_nehalem_counters.h
  *
- *      Description:  Counter Header File of perfmon module for Nehalem.
+ *      Description:  Counter Header File of perfmon module for Intel Nehalem.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,27 +30,38 @@
  */
 
 #define NUM_COUNTERS_CORE_NEHALEM 7
-#define NUM_COUNTERS_UNCORE_NEHALEM 15
-#define NUM_COUNTERS_NEHALEM 15
+#define NUM_COUNTERS_UNCORE_NEHALEM 16
+#define NUM_COUNTERS_NEHALEM 16
 
-static PerfmonCounterMap nehalem_counter_map[NUM_COUNTERS_NEHALEM] = {
+#define NEH_VALID_OPTIONS_FIXED (EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_ANYTHREAD_MASK) /* option mask for the FIXC* entries of nehalem_counter_map; parenthesised so it stays one operand in larger expressions */
+#define NEH_VALID_OPTIONS_PMC (EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK) /* option mask for the PMC* entries */
+#define NEH_VALID_OPTIONS_UNCORE (EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK) /* option mask for the UPMC0-7 entries */
+
+static RegisterMap nehalem_counter_map[NUM_COUNTERS_NEHALEM] = { /* One entry per counter; the last field is the mask of event options the counter accepts */
     /* Fixed Counters: instructions retired, cycles unhalted core */
-    {"FIXC0",PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
-    {"FIXC1",PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
-    {"FIXC2",PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+    {"FIXC0",PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, NEH_VALID_OPTIONS_FIXED},
+    {"FIXC1",PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, NEH_VALID_OPTIONS_FIXED},
+    {"FIXC2",PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, NEH_VALID_OPTIONS_FIXED},
     /* PMC Counters: 4 48bit wide */
-    {"PMC0",PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
-    {"PMC1",PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0},
-    {"PMC2",PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0},
-    {"PMC3",PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0},
+    {"PMC0",PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, NEH_VALID_OPTIONS_PMC},
+    {"PMC1",PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, NEH_VALID_OPTIONS_PMC},
+    {"PMC2",PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, NEH_VALID_OPTIONS_PMC},
+    {"PMC3",PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, NEH_VALID_OPTIONS_PMC},
     /* Uncore PMC Counters: 8 48bit wide */
-    {"UPMC0",PMC7,  UNCORE, MSR_UNCORE_PERFEVTSEL0, MSR_UNCORE_PMC0, 0, 0},
-    {"UPMC1",PMC8,  UNCORE, MSR_UNCORE_PERFEVTSEL1, MSR_UNCORE_PMC1, 0, 0},
-    {"UPMC2",PMC9,  UNCORE, MSR_UNCORE_PERFEVTSEL2, MSR_UNCORE_PMC2, 0, 0},
-    {"UPMC3",PMC10, UNCORE, MSR_UNCORE_PERFEVTSEL3, MSR_UNCORE_PMC3, 0, 0},
-    {"UPMC4",PMC11, UNCORE, MSR_UNCORE_PERFEVTSEL4, MSR_UNCORE_PMC4, 0, 0},
-    {"UPMC5",PMC12, UNCORE, MSR_UNCORE_PERFEVTSEL5, MSR_UNCORE_PMC5, 0, 0},
-    {"UPMC6",PMC13, UNCORE, MSR_UNCORE_PERFEVTSEL6, MSR_UNCORE_PMC6, 0, 0},
-    {"UPMC7",PMC14, UNCORE, MSR_UNCORE_PERFEVTSEL7, MSR_UNCORE_PMC7, 0, 0}
+    {"UPMC0",PMC7,  UNCORE, MSR_UNCORE_PERFEVTSEL0, MSR_UNCORE_PMC0, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+    {"UPMC1",PMC8,  UNCORE, MSR_UNCORE_PERFEVTSEL1, MSR_UNCORE_PMC1, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+    {"UPMC2",PMC9,  UNCORE, MSR_UNCORE_PERFEVTSEL2, MSR_UNCORE_PMC2, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+    {"UPMC3",PMC10, UNCORE, MSR_UNCORE_PERFEVTSEL3, MSR_UNCORE_PMC3, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+    {"UPMC4",PMC11, UNCORE, MSR_UNCORE_PERFEVTSEL4, MSR_UNCORE_PMC4, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+    {"UPMC5",PMC12, UNCORE, MSR_UNCORE_PERFEVTSEL5, MSR_UNCORE_PMC5, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+    {"UPMC6",PMC13, UNCORE, MSR_UNCORE_PERFEVTSEL6, MSR_UNCORE_PMC6, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+    {"UPMC7",PMC14, UNCORE, MSR_UNCORE_PERFEVTSEL7, MSR_UNCORE_PMC7, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+    {"UPMCFIX",PMC15, UNCORE, MSR_UNCORE_FIXED_CTR_CTRL, MSR_UNCORE_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK} /* 16th entry: uncore fixed counter, accepts no event options */
+};
+
+static BoxMap nehalem_box_map[NUM_UNITS] = { /* Per-register-type entries for the types used in nehalem_counter_map */
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+    [FIXED] =  {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48}, /* same three global MSRs as the PMC entry */
+    [UNCORE] = {MSR_UNCORE_PERF_GLOBAL_CTRL, MSR_UNCORE_PERF_GLOBAL_STATUS, MSR_UNCORE_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48} /* NOTE(review): trailing 48 is presumably the counter width in bits -- confirm against the BoxMap definition */
 };
 
diff --git a/src/includes/perfmon_nehalem_events.txt b/src/includes/perfmon_nehalem_events.txt
index 0eeed50..a17b55e 100644
--- a/src/includes/perfmon_nehalem_events.txt
+++ b/src/includes/perfmon_nehalem_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_nehalem_events.txt
-# 
+#
 #      Description:  Event list for Intel Nehalem
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -61,7 +62,7 @@ UMASK_STORE_BLOCK_ANY            0x0F
 EVENT_PARTIAL_ADDRESS_ALIAS      0x07  PMC
 UMASK_PARTIAL_ADDRESS_ALIAS      0x01
 
-EVENT_DTLB_LOAD_MISSES                0x08  PMC
+EVENT_DTLB_LOAD_MISSES                 0x08  PMC
 UMASK_DTLB_LOAD_MISSES_ANY             0x01
 UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED  0x02
 UMASK_DTLB_LOAD_MISSES_STLB_HIT        0x10
@@ -531,6 +532,13 @@ UMASK_SIMD_INT_64_PACKED_LOGICAL        0x10
 UMASK_SIMD_INT_64_PACKED_ARITH          0x20
 UMASK_SIMD_INT_64_SHUFFLE_MOVE          0x40
 
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+
+EVENT_UNCORE_CLOCK                 0x00 UPMCFIX
+UMASK_UNCORE_CLOCK                 0x00
+
 EVENT_UNC_GQ_CYCLES_FULL                0x00   UPMC
 UMASK_UNC_GQ_CYCLES_FULL_READ_TRACKER         0x01
 UMASK_UNC_GQ_CYCLES_FULL_WRITE_TRACKER        0x02
@@ -720,6 +728,12 @@ UMASK_UNC_QMC_PRIORITY_UPDATES_ANY            0x07
 EVENT_UNC_QHL_FRC_ACK_CNFLTS_LOCAL            0x33   UPMC
 UMASK_UNC_QHL_FRC_ACK_CNFLTS_LOCAL            0x04
 
+EVENT_UNC_ADDR_OPCODE_MATCH                   0x35  UPMC
+OPTIONS_UNC_ADDR_OPCODE_MATCH_OR              EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_OR                0x00 0x06
+OPTIONS_UNC_ADDR_OPCODE_MATCH_AND             EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_AND               0x01 0x01
+
 EVENT_UNC_QPI_TX_STALLED_SINGLE_FLIT            0x40   UPMC
 UMASK_UNC_QPI_TX_STALLED_SINGLE_FLIT_HOME_LINK_0      0x01
 UMASK_UNC_QPI_TX_STALLED_SINGLE_FLIT_SNOOP_LINK_0     0x02
@@ -788,4 +802,3 @@ EVENT_UNC_DRAM_PRE_ALL                  0x66   UPMC
 UMASK_UNC_DRAM_PRE_ALL_CH0              0x01
 UMASK_UNC_DRAM_PRE_ALL_CH1              0x02
 UMASK_UNC_DRAM_PRE_ALL_CH2              0x04
-
diff --git a/src/includes/perfmon_p6_events.txt b/src/includes/perfmon_p6_events.txt
index 0db8338..9ad1cbc 100644
--- a/src/includes/perfmon_p6_events.txt
+++ b/src/includes/perfmon_p6_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_p6_events.txt
-# 
-#      Description:  Event list for Pentium 3
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Description:  Event list for Intel Pentium 3
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/perfmon_perf.h b/src/includes/perfmon_perf.h
new file mode 100644
index 0000000..8927d51
--- /dev/null
+++ b/src/includes/perfmon_perf.h
@@ -0,0 +1,60 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfmon_perf.h
+ *
+ *      Description: Header file of example perfmon module for software events using
+ *                   the perf_event interface
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef PERFMON_PERF_H
+#define PERFMON_PERF_H
+
+#include <perfmon_types.h>
+
+#define MAX_SW_EVENTS 9 /* NOTE(review): presumably the number of software events this module handles -- confirm in the implementation */
+
+
+extern int init_perf_event(int cpu_id); /* All functions here take the CPU id first and return int; NOTE(review): return convention not visible here -- confirm */
+
+extern int setup_perf_event(int cpu_id, PerfmonEvent *event);
+
+extern int read_perf_event(int cpu_id, uint64_t eventID, uint64_t *data); /* result is passed back through *data */
+
+extern int stop_perf_event(int cpu_id, uint64_t eventID);
+extern int stop_all_perf_event(int cpu_id); /* NOTE(review): *_all_* variants take no eventID; presumably they act on every event of cpu_id -- confirm */
+
+extern int clear_perf_event(int cpu_id, uint64_t eventID);
+extern int clear_all_perf_event(int cpu_id);
+
+extern int start_perf_event(int cpu_id, uint64_t eventID);
+extern int start_all_perf_event(int cpu_id);
+
+extern int close_perf_event(int cpu_id, uint64_t eventID);
+
+extern int finalize_perf_event(int cpu_id);
+
+#endif
diff --git a/src/includes/perfmon_phi.h b/src/includes/perfmon_phi.h
index 0f5dd54..ecf31bb 100644
--- a/src/includes/perfmon_phi.h
+++ b/src/includes/perfmon_phi.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_phi.h
  *
- *      Description:  Header File of perfmon module for Xeon Phi.
+ *      Description:  Header File of perfmon module for Intel Xeon Phi.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,127 +30,207 @@
  */
 
 #include <perfmon_phi_events.h>
-#include <perfmon_phi_groups.h>
 #include <perfmon_phi_counters.h>
+#include <error.h>
+#include <affinity.h>
 
 static int perfmon_numCountersPhi = NUM_COUNTERS_PHI;
-static int perfmon_numGroupsPhi = NUM_GROUPS_PHI;
 static int perfmon_numArchEventsPhi = NUM_ARCH_EVENTS_PHI;
 
-void perfmon_init_phi(PerfmonThread *thread)
+int perfmon_init_phi(int cpu_id) /* Now takes the CPU id directly and returns a status code */
 {
-    uint32_t flags = 0x0UL;
-    int cpu_id = thread->processorId;
-
-    msr_write(cpu_id, MSR_MIC_PERFEVTSEL0, 0x0UL);
-    msr_write(cpu_id, MSR_MIC_PERFEVTSEL1, 0x0UL);
-    msr_write(cpu_id, MSR_MIC_PMC0, 0x0ULL);
-    msr_write(cpu_id, MSR_MIC_PMC1, 0x0ULL);
-    msr_write(cpu_id, MSR_MIC_SPFLT_CONTROL, 0x0ULL);
-    msr_write(cpu_id, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_MIC_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
-
-    flags |= (1<<16);  /* user mode flag */
-    flags |= (1<<22);  /* enable flag */
-
-    msr_write(cpu_id, MSR_MIC_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_MIC_PERFEVTSEL1, flags);
+    return 0; /* no MSR is written during init any more; always reports 0 */
 }
 
-void perfmon_setupCounterThread_phi(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+int phi_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* Builds the config value for one PMC counter from event and writes it if it changed; returns 0 at the end */
 {
     uint64_t flags = 0x0ULL;
-    uint64_t reg = phi_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
 
-    if (phi_counter_map[index].type == PMC)
+    flags |= (1ULL<<16)|(1ULL<<22); /* bit 16: user mode flag, bit 22: enable flag */
+    flags |= (event->umask<<8) + event->eventId; /* umask shifted to bit 8 and up, eventId added in the low bits */
+    if (event->numberOfOptions > 0)
     {
-        flags = (1<<22)|(1<<16);
-
-        /* Intel with standard 8 bit event mask: [7:0] */
-        flags |= (event->umask<<8) + event->eventId;
-
-        msr_write(cpu_id, reg , flags);
-
-        if (perfmon_verbose)
+        for(int j=0;j<event->numberOfOptions;j++) /* translate each requested option into its config bit(s) */
         {
-            printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
-                    cpu_id,
-                    LLU_CAST reg,
-                    LLU_CAST flags);
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_ANYTHREAD:
+                    flags |= (1ULL<<21);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) <<24; /* low 8 bits of the option value land in bits 24..31 */
+                    break;
+                default: /* any other option type is ignored */
+                    break;
+            }
         }
     }
+    if (flags != currentConfig[cpu_id][index]) /* skip the register write when the cached value already matches */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags; /* remember what was written for the next comparison */
+    }
+    return 0;
 }
 
-void perfmon_startCountersThread_phi(int thread_id)
+int perfmon_setupCounterThread_phi(int thread_id, PerfmonEventSet* eventSet) /* Configures every PMC-type event of eventSet on the CPU of thread_id; returns 0 */
 {
-    uint64_t flags = 0ULL;
-    int processorId = perfmon_threadData[thread_id].processorId;
-
-    msr_write(processorId, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL);
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    for ( int i=0; i<NUM_COUNTERS_PHI; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE) 
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip events whose register type is not in the set's type mask */
         {
-            msr_write(processorId, phi_counter_map[i].counterRegister , 0x0ULL);
-            flags |= (1<<(i));  /* enable counter */
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        if (type == PMC) /* only PMC-type events are set up here */
+        {
+            phi_pmc_setup(cpu_id, index, event); /* NOTE(review): return value is ignored, so init is set below regardless of the result -- confirm this is intended */
+            eventSet->events[i].threadCounter[thread_id].init = TRUE;
         }
     }
+    return 0;
+}
+
+int perfmon_startCountersThread_phi(int thread_id, PerfmonEventSet* eventSet) /* Zeroes and enables all initialised counters of eventSet on the CPU of thread_id; returns 0 at the end */
+{
+    uint64_t flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if (perfmon_verbose)
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL)); /* write 0 to the global control register before touching the counters */
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_MIC_PERF_GLOBAL_CTRL, LLU_CAST flags);
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) 
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip events whose register type is not in the set's type mask */
+            {
+                continue;
+            }
+            RegisterIndex index = eventSet->events[i].index;
+            eventSet->events[i].threadCounter[thread_id].startData = 0;
+            eventSet->events[i].threadCounter[thread_id].counterData = 0;
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].counterRegister , 0x0ULL)); /* reset the counter register to zero */
+            flags |= (1ULL<<(index));  /* enable counter */
+        }
     }
 
-    msr_write(processorId, MSR_MIC_PERF_GLOBAL_CTRL, flags);
-    flags |= (1ULL<<63);
-    msr_write(processorId, MSR_MIC_SPFLT_CONTROL, flags);
-    msr_write(processorId, MSR_MIC_PERF_GLOBAL_OVF_CTRL, 0x000000003ULL);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, flags)); /* one bit per counter collected in the loop above */
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_SPFLT_CONTROL, flags|(1ULL<<63))); /* same bits plus bit 63 */
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_OVF_CTRL, flags)); /* replaces the former constant 0x3 with the bits of the counters in use */
+    return 0;
 }
 
-void perfmon_stopCountersThread_phi(int thread_id)
+int perfmon_stopCountersThread_phi(int thread_id, PerfmonEventSet* eventSet) /* Disables counting, then stores the value of every initialised counter in counterData; returns 0 at the end */
 {
-    uint64_t flags;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t counter_result = 0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    msr_write(cpu_id, MSR_MIC_SPFLT_CONTROL, 0x0ULL);
-    msr_write(cpu_id, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_SPFLT_CONTROL, 0x0ULL));
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL));
 
-    for ( int i=0; i<NUM_COUNTERS_PHI; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            perfmon_threadData[thread_id].counters[i].counterData =
-                msr_read(cpu_id, phi_counter_map[i].counterRegister);
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip events whose register type is not in the set's type mask */
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, phi_counter_map[index].counterRegister, &counter_result)); /* NOTE(review): uses phi_counter_map while the sibling functions use counter_map -- confirm both refer to the same table */
+            if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* new value below the stored one: check the status register for an overflow */
+            {
+                uint64_t ovf_values = 0x0ULL;
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_STATUS, &ovf_values));
+                if (ovf_values & (1ULL<<index))
+                {
+                    eventSet->events[i].threadCounter[thread_id].overflows++;
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_OVF_CTRL, (1ULL<<index))); /* write this counter's bit to the overflow control register */
+                }
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth); /* stored value goes through field64 with offset 0 and the box's regWidth */
         }
     }
+    return 0;
+}
+
+int perfmon_readCountersThread_phi(int thread_id, PerfmonEventSet* eventSet)
+{
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t counter_result = 0x0ULL;
+    uint64_t core_flags = 0x0ULL;
 
-    flags = msr_read(cpu_id,MSR_MIC_PERF_GLOBAL_STATUS);
-//    printf ("Status: 0x%llX \n", LLU_CAST flags);
+    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, &core_flags));
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL));
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_SPFLT_CONTROL, 0x0ULL));
 
-    if((flags & 0x3))
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        printf ("Overflow occured \n");
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter_map[index].counterRegister, &counter_result)); /* look up by register index, not event position */
+            if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+            {
+                uint64_t ovf_values = 0x0ULL;
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_STATUS, &ovf_values));
+                if (ovf_values & (1ULL<<index))
+                {
+                    eventSet->events[i].threadCounter[thread_id].overflows++;
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_OVF_CTRL, (1ULL<<index)));
+                }
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
+        }
     }
+
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_SPFLT_CONTROL, core_flags|(1ULL<<63)));
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, core_flags));
+    return 0;
 }
 
-void perfmon_readCountersThread_phi(int thread_id)
+
+int perfmon_finalizeCountersThread_phi(int thread_id, PerfmonEventSet* eventSet)
 {
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t ovf_values_core = 0x0ULL;
 
-    for ( int i=0; i<NUM_COUNTERS_PHI; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
         {
-            perfmon_threadData[thread_id].counters[i].counterData =
-                msr_read(cpu_id, phi_counter_map[i].counterRegister);
+            continue;
         }
+        RegisterIndex index = eventSet->events[i].index;
+        ovf_values_core |= (1ULL<<(index));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, 0x0ULL)); /* clear the config register of the counter actually used */
     }
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL));
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_SPFLT_CONTROL, 0x0ULL));
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+    return 0;
 }
-
diff --git a/src/includes/perfmon_phi_counters.h b/src/includes/perfmon_phi_counters.h
index edf0658..5bd8010 100644
--- a/src/includes/perfmon_phi_counters.h
+++ b/src/includes/perfmon_phi_counters.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_phi_counters.h
  *
- *      Description: Counter Header File of perfmon module.
+ *      Description: Counter Header File of perfmon module for Intel Xeon Phi.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -31,8 +32,14 @@
 #define NUM_COUNTERS_PHI 2
 #define NUM_COUNTERS_CORE_PHI 2
 
-static PerfmonCounterMap phi_counter_map[NUM_COUNTERS_PHI] = {
-    {"PMC0", PMC0, PMC, MSR_MIC_PERFEVTSEL0, MSR_MIC_PMC0, 0, 0},
-    {"PMC1", PMC1, PMC, MSR_MIC_PERFEVTSEL1, MSR_MIC_PMC1, 0, 0}
+#define PHI_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK| \
+                              EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_THRESHOLD_MASK
+
+static RegisterMap phi_counter_map[NUM_COUNTERS_PHI] = {
+    {"PMC0", PMC0, PMC, MSR_MIC_PERFEVTSEL0, MSR_MIC_PMC0, 0, 0, PHI_VALID_OPTIONS_PMC},
+    {"PMC1", PMC1, PMC, MSR_MIC_PERFEVTSEL1, MSR_MIC_PMC1, 0, 0, PHI_VALID_OPTIONS_PMC}
 };
 
+static BoxMap phi_box_map[NUM_UNITS] = {
+    [PMC] = {MSR_MIC_PERF_GLOBAL_CTRL, MSR_MIC_PERF_GLOBAL_STATUS, MSR_MIC_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 40}
+};
diff --git a/src/includes/perfmon_phi_events.txt b/src/includes/perfmon_phi_events.txt
index d6393ba..1c5434e 100644
--- a/src/includes/perfmon_phi_events.txt
+++ b/src/includes/perfmon_phi_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_phi_events.txt
-# 
+#
 #      Description:  Event list for Intel Xeon Phi
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/perfmon_pm.h b/src/includes/perfmon_pm.h
index 88346d1..73beaf2 100644
--- a/src/includes/perfmon_pm.h
+++ b/src/includes/perfmon_pm.h
@@ -5,13 +5,14 @@
  *
  *      Description:  Header File of perfmon module Pentium M.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -30,136 +31,202 @@
 
 #include <perfmon_pm_events.h>
 #include <perfmon_pm_counters.h>
+#include <error.h>
+#include <affinity.h>
 
-#define NUM_GROUPS_PM 5
 
 static int perfmon_numCounters_pm = NUM_COUNTERS_PM;
-static int perfmon_numGroups_pm = NUM_GROUPS_PM;
 static int perfmon_numArchEvents_pm = NUM_ARCH_EVENTS_PM;
 
-static PerfmonGroupMap pm_group_map[NUM_GROUPS_PM] = {
-	{"FLOPS_DP",FLOPS_DP,0,"Double Precision MFlops/s",
-        "EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP:PMC0,EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP:PMC1"},
-	{"FLOPS_SP",FLOPS_SP,0,"Single Precision MFlops/s",
-        "EMON_SSE_SSE2_COMP_INST_RETIRED_ALL_SP:PMC0,EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_SP:PMC1"},
-	{"L2",L2,0,"L2 cache bandwidth in MBytes/s",
-        "L2_LINES_IN_ALL_ALL:PMC0,L2_LINES_OUT_ALL_ALL:PMC1"},
-	{"BRANCH",BRANCH,0,"Branch prediction miss rate",
-        "BR_INST_EXEC:PMC0,BR_INST_MISSP_EXEC:PMC1"},
-	{"CPI",CPI,0,"Cycles per instruction","UOPS_RETIRED:PMC0"}
-};
-
-void perfmon_init_pm(PerfmonThread *thread)
-{
-    uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
-
-    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
 
-    /* Preinit of two PMC counters */
-    //flags |= (1<<16);  /* user mode flag */
-    //flags |= (1<<19);  /* pin control flag */
-    //    flags |= (1<<22);  /* enable flag */
-
-    /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, flags);*/
+int perfmon_init_pm(int cpu_id)
+{
+    return 0;
 }
 
-void perfmon_setupCounterThread_pm(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+int pm_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
 {
-    uint64_t flags;
-    uint64_t reg = pm_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
-    flags = (1<<16)|(1<<19);
+    uint64_t flags = 0x0ULL;
 
-    /* Intel with standard 8 bit event mask: [7:0] */
+    flags = (1ULL<<16)|(1ULL<<19);
     flags |= (event->umask<<8) + event->eventId;
 
-    msr_write(cpu_id, reg , flags);
+    if (event->numberOfOptions > 0)
+    {
+        for(int j=0;j<event->numberOfOptions;j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL)<<24;
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int perfmon_setupCounterThread_pm(int thread_id, PerfmonEventSet* eventSet)
+{
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if (perfmon_verbose)
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
-                cpu_id,
-                LLU_CAST reg,
-                LLU_CAST flags);
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
+        pm_pmc_setup(cpu_id, index, event);
     }
+    return 0;
 }
 
 
-void perfmon_startCountersThread_pm(int thread_id)
+int perfmon_startCountersThread_pm(int thread_id, PerfmonEventSet* eventSet)
 {
     uint64_t flags = 0ULL;
-    int processorId = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if (perfmon_threadData[thread_id].counters[0].init == TRUE)
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            RegisterIndex index = eventSet->events[i].index;
+            eventSet->events[i].threadCounter[thread_id].startData = 0;
+            eventSet->events[i].threadCounter[thread_id].counterData = 0;
+            VERBOSEPRINTREG(cpu_id, counter_map[index].counterRegister, LLU_CAST 0x0ULL, SETUP_PMC_CTR);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].counterRegister , 0x0ULL));
+        }
+    }
+    if (eventSet->numberOfEvents > 0)
     {
-        msr_write(processorId, pm_counter_map[0].counterRegister , 0x0ULL);
-        msr_write(processorId, pm_counter_map[1].counterRegister , 0x0ULL);
-
         /* on p6 only MSR_PERFEVTSEL0 has the enable bit
          * it enables both counters as long MSR_PERFEVTSEL1 
          * has a valid configuration */
-        flags = msr_read(processorId, MSR_PERFEVTSEL0);
-        flags |= (1<<22);  /* enable flag */
-
-        if (perfmon_verbose)
-        {
-            printf("perfmon_start_counters: Write Register 0x%X , \
-                    Flags: 0x%llX \n",MSR_PERFEVTSEL0, LLU_CAST flags);
-        }
-
-        msr_write(processorId, MSR_PERFEVTSEL0, flags);
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, &flags));
+        flags |= (1<<22);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, flags));
+        VERBOSEPRINTREG(cpu_id, MSR_PERFEVTSEL0, LLU_CAST flags, UNFREEZE_PMC);
     }
-
+    return 0;
 }
 
-void perfmon_stopCountersThread_pm(int thread_id)
+int perfmon_stopCountersThread_pm(int thread_id, PerfmonEventSet* eventSet)
 {
-    int i;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
+    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, &counter_result));
+    counter_result &= ~(1<<22);
+    VERBOSEPRINTREG(cpu_id, MSR_PERFEVTSEL0, counter_result, FREEZE_PMC);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, counter_result));
 
-    for (i=0;i<NUM_COUNTERS_PM;i++) 
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE) 
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) 
         {
-            perfmon_threadData[thread_id].counters[i].counterData =
-                msr_read(cpu_id, pm_counter_map[i].counterRegister);
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter_map[index].counterRegister, &counter_result));
+            VERBOSEPRINTREG(cpu_id, counter_map[index].counterRegister, counter_result, READ_PMC);
+            if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+            {
+                eventSet->events[i].threadCounter[thread_id].overflows++;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
         }
     }
+    return 0;
 }
 
-void perfmon_printDerivedMetrics_pm(PerfmonGroup group)
+int perfmon_readCountersThread_pm(int thread_id, PerfmonEventSet* eventSet)
 {
+    uint64_t counter_result = 0x0ULL;
+    uint64_t pmc_flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    switch ( group )
-    {
-        case FLOPS_DP:
-
-        case FLOPS_SP:
+    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, &pmc_flags));
+    /* Keep pmc_flags unmodified: the saved value (incl. enable bit 22) is written back below to unfreeze. */
+    VERBOSEPRINTREG(cpu_id, MSR_PERFEVTSEL0, pmc_flags & ~(1<<22), FREEZE_PMC);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, pmc_flags & ~(1<<22)));
 
-        case L2:
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) 
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t counter = counter_map[index].counterRegister;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+            if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+            {
+                eventSet->events[i].threadCounter[thread_id].overflows++;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
+        }
+    }
 
-        case BRANCH:
+    VERBOSEPRINTREG(cpu_id, MSR_PERFEVTSEL0, pmc_flags, UNFREEZE_PMC);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, pmc_flags));
+    return 0;
+}
 
-        case _NOGROUP:
-            fprintf (stderr, "The Pentium M supports only two counters. Therefore derived metrics are not computed due to missing runtime!\n" );
-            break;
+int perfmon_finalizeCountersThread_pm(int thread_id, PerfmonEventSet* eventSet)
+{
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-        default:
-            fprintf (stderr, "perfmon_printDerivedMetricsCore2: Unknown group! Exiting!\n" );
-            exit (EXIT_FAILURE);
-            break;
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        uint32_t reg = counter_map[index].configRegister;
+        if ((reg) && ((type == PMC)||(type == FIXED)))
+        {
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
+            VERBOSEPRINTPCIREG(cpu_id, MSR_DEV, reg, 0x0ULL, CLEAR_CTL);
+        }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
+    return 0;
 }
 
-
diff --git a/src/includes/perfmon_pm_counters.h b/src/includes/perfmon_pm_counters.h
index 9119096..7e0d6da 100644
--- a/src/includes/perfmon_pm_counters.h
+++ b/src/includes/perfmon_pm_counters.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_pm_counters.h
  *
- *      Description: Counter Header File of perfmon module.
+ *      Description: Counter Header File of perfmon module for Intel Pentium M.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -31,8 +32,13 @@
 #define NUM_COUNTERS_PM 2
 #define NUM_COUNTERS_CORE_PM 2
 
-static PerfmonCounterMap pm_counter_map[NUM_COUNTERS_PM] = {
-    {"PMC0",PMC0, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
-    {"PMC1",PMC1, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0}
+#define PM_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK
+
+static RegisterMap pm_counter_map[NUM_COUNTERS_PM] = {
+    {"PMC0", PMC0, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, PM_VALID_OPTIONS_PMC},
+    {"PMC1", PMC1, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, PM_VALID_OPTIONS_PMC}
 };
 
+static BoxMap pm_box_map[NUM_UNITS] = {
+    [PMC] = {0, 0, 0, 0, 0, 0, 48}
+};
diff --git a/src/includes/perfmon_pm_events.txt b/src/includes/perfmon_pm_events.txt
index 9ed83a8..45fd7f4 100644
--- a/src/includes/perfmon_pm_events.txt
+++ b/src/includes/perfmon_pm_events.txt
@@ -1,16 +1,16 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_pm_events.txt
-# 
+#
 #      Description:  Event list for Intel Pentium M
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -262,22 +262,22 @@ UMASK_BUS_HITM_DRV_SELF           0x00
 EVENT_BUS_SNOOP_STALL             0x7E      PMC
 UMASK_BUS_SNOOP_STALL_SELF        0x00
 
-EVENT_FLOPS                       0xC1      PMC
+EVENT_FLOPS                       0xC1      PMC0
 UMASK_FLOPS                       0x00
 
-EVENT_FP_COMP_OPS_EXE             0x10      PMC
+EVENT_FP_COMP_OPS_EXE             0x10      PMC0
 UMASK_FP_COMP_OPS_EXE             0x00
 
-EVENT_FP_ASSIST                   0x11      PMC
+EVENT_FP_ASSIST                   0x11      PMC1
 UMASK_FP_ASSIST                   0x00
 
-EVENT_MUL                         0x12      PMC
+EVENT_MUL                         0x12      PMC1
 UMASK_MUL                         0x00
 
-EVENT_DIV                         0x13      PMC
+EVENT_DIV                         0x13      PMC1
 UMASK_DIV                         0x00
 
-EVENT_CYCLES_DIV_BUSY             0x14      PMC
+EVENT_CYCLES_DIV_BUSY             0x14      PMC0
 UMASK_CYCLES_DIV_BUSY             0x00
 
 EVENT_LD_BLOCKS                   0x03      PMC
@@ -289,13 +289,13 @@ UMASK_SB_DRAINS                   0x00
 EVENT_MISALIGN_MEM_REF            0x05      PMC
 UMASK_MISALIGN_MEM_REF            0x00
 
-EVENT_EMON_KNI_PREF_DISPATCHED       0x07      PMC
+EVENT_EMON_KNI_PREF_DISPATCHED       0x07      PMC0|PMC1
 UMASK_EMON_KNI_PREF_DISPATCHED_NTA   0x00
 UMASK_EMON_KNI_PREF_DISPATCHED_T1    0x01
 UMASK_EMON_KNI_PREF_DISPATCHED_T2    0x02
 UMASK_EMON_KNI_PREF_DISPATCHED_WEAK  0x03
 
-EVENT_EMON_KNI_PREF_MISS        0x4B      PMC
+EVENT_EMON_KNI_PREF_MISS        0x4B      PMC0|PMC1
 UMASK_EMON_KNI_PREF_MISS_NTA    0x00
 UMASK_EMON_KNI_PREF_MISS_T1     0x01
 UMASK_EMON_KNI_PREF_MISS_T2     0x02
@@ -310,13 +310,13 @@ UMASK_UOPS_RETIRED             0x00
 EVENT_INST_DECODED             0xD0      PMC
 UMASK_INST_DECODED             0x00
 
-EVENT_EMON_SSE_SSE2_INST_RETIRED                0xD8      PMC
+EVENT_EMON_SSE_SSE2_INST_RETIRED                0xD8      PMC0|PMC1
 UMASK_EMON_SSE_SSE2_INST_RETIRED_ALL_SP         0x00
 UMASK_EMON_SSE_SSE2_INST_RETIRED_SCALAR_SP      0x01
 UMASK_EMON_SSE_SSE2_INST_RETIRED_PACKED_DP      0x02
 UMASK_EMON_SSE_SSE2_INST_RETIRED_SCALAR_DP      0x03
 
-EVENT_EMON_SSE_SSE2_COMP_INST_RETIRED                0xD9      PMC
+EVENT_EMON_SSE_SSE2_COMP_INST_RETIRED                0xD9      PMC0|PMC1
 UMASK_EMON_SSE_SSE2_COMP_INST_RETIRED_ALL_SP         0x00
 UMASK_EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_SP      0x01
 UMASK_EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP      0x02
diff --git a/src/includes/perfmon_sandybridge.h b/src/includes/perfmon_sandybridge.h
index f11714a..f6f9665 100644
--- a/src/includes/perfmon_sandybridge.h
+++ b/src/includes/perfmon_sandybridge.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_sandybridge.h
  *
- *      Description:  Header File of perfmon module for Sandy Bridge.
+ *      Description:  Header File of perfmon module for Intel Sandy Bridge.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,666 +30,1858 @@
  */
 
 #include <perfmon_sandybridge_events.h>
-#include <perfmon_sandybridge_groups.h>
 #include <perfmon_sandybridge_counters.h>
-
+#include <perfmon_sandybridgeEP_events.h>
+#include <perfmon_sandybridgeEP_counters.h>
+#include <error.h>
+#include <affinity.h>
+
+static int perfmon_numCountersSandybridgeEP = NUM_COUNTERS_SANDYBRIDGEEP;
+static int perfmon_numCoreCountersSandybridgeEP = NUM_COUNTERS_CORE_SANDYBRIDGEEP;
+static int perfmon_numArchEventsSandybridgeEP = NUM_ARCH_EVENTS_SANDYBRIDGEEP;
 static int perfmon_numCountersSandybridge = NUM_COUNTERS_SANDYBRIDGE;
-static int perfmon_numGroupsSandybridge = NUM_GROUPS_SANDYBRIDGE;
+static int perfmon_numCoreCountersSandybridge = NUM_COUNTERS_CORE_SANDYBRIDGE;
 static int perfmon_numArchEventsSandybridge = NUM_ARCH_EVENTS_SANDYBRIDGE;
 
-#define OFFSET_PMC 3
+int snb_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event); /* CBo setup that writes only the counter's config register */
+int snbep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event); /* CBo setup that also writes the box filter register when options yield filter bits */
+int (*sandy_cbox_setup)(int, RegisterIndex, PerfmonEvent*); /* assigned in perfmon_init_sandybridge() to one of the two functions above; not assigned there if neither of its conditions holds */
 
-void perfmon_init_sandybridge(PerfmonThread *thread)
+int perfmon_init_sandybridge(int cpu_id) /*
+ * Per-CPU initialisation for Sandy Bridge.
+ * Calls lock_acquire() on the socket lock and the tile lock belonging to
+ * cpu_id, writes 0 to MSR_PEBS_ENABLE and then chooses which CBo setup
+ * routine is stored in the global function pointer sandy_cbox_setup.
+ *
+ * cpu_id: CPU used for all register accesses and lock lookups.
+ * Returns 0 when the end of the function is reached. CHECK_MSR_WRITE_ERROR
+ * is a macro defined elsewhere and may leave the function earlier -- confirm.
+ *
+ * NOTE(review): sandy_cbox_setup is not assigned when the model is not
+ * SANDYBRIDGE_EP and the CBo accesses below fail or read back non-zero.
+ */
 {
-    uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
-
-    /* Initialize registers */
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC0, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC1, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC2, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC3, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
-
-    /* initialize fixed counters
-     * FIXED 0: Instructions retired
-     * FIXED 1: Clocks unhalted core
-     * FIXED 2: Clocks unhalted ref */
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
-
-    /* Preinit of PERFEVSEL registers */
-    //flags |= (1<<22);  /* enable flag */
-    //flags |= (1<<16);  /* user mode flag */
-
-    /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/
-
-    /* TODO Robust implementation which also works if stuff is not there */
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
-            lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id))
-    {
-        if ( cpuid_info.model == SANDYBRIDGE_EP )
+    int ret;
+    uint64_t data = 0x0ULL;
+    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id); /* result is not checked here; the setup functions later compare socket_lock[...] with cpu_id */
+    lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id); /* same call for the tile lock; result also unchecked */
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    ret = HPMwrite(cpu_id, MSR_DEV, MSR_UNC_CBO_0_PERFEVTSEL0, 0x0ULL); /* presumably a probe for the uncore CBo registers: return codes of the four accesses are summed in ret */
+    ret += HPMread(cpu_id, MSR_DEV, MSR_UNC_PERF_GLOBAL_CTRL, &data);
+    ret += HPMwrite(cpu_id, MSR_DEV, MSR_UNC_PERF_GLOBAL_CTRL, 0x0ULL);
+    ret += HPMread(cpu_id, MSR_DEV, MSR_UNC_CBO_0_PERFEVTSEL0, &data); /* data now holds the value read back from the register zeroed above */
+    if ((cpuid_info.model == SANDYBRIDGE_EP))
+    {
+        sandy_cbox_setup = snbep_cbox_setup; /* EP model: chosen regardless of ret/data */
+    }
+    else if ((ret == 0) && (data == 0x0ULL)) /* all four accesses returned 0 and the register read back as zero */
+    {
+        sandy_cbox_setup = snb_cbox_setup;
+    }
+    
+    return 0;
+}
+
+/*
+ * Compute the control bits that belong to fixed counter 'index'.
+ * Every fixed counter owns a 4-bit field starting at bit index*4:
+ *   bit 1 of the field is always set,
+ *   bit 0 is added for EVENT_OPTION_COUNT_KERNEL,
+ *   bit 2 is added for EVENT_OPTION_ANYTHREAD.
+ * All other option types are ignored. cpu_id is not used.
+ * Returns the assembled bits; no register is written here.
+ */
+uint32_t snb_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int opt;
+    const int field = index*4;    /* lowest bit of this counter's field */
+    uint32_t ctrl = (1ULL<<(field+1));
+
+    for (opt = 0; opt < event->numberOfOptions; opt++)
+    {
+        if (event->options[opt].type == EVENT_OPTION_COUNT_KERNEL)
+        {
+            ctrl |= (1ULL<<field);
+        }
+        else if (event->options[opt].type == EVENT_OPTION_ANYTHREAD)
+        {
+            ctrl |= (1ULL<<(field+2));
+        }
+    }
+    return ctrl;
+}
+
+int snb_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /*
+ * Build and write the config value for one general-purpose (PMC) counter.
+ * The value always has bits 22 and 16 set plus umask/eventId; event options
+ * add further bits. For event ids 0xB7 and 0xBB an additional 64-bit value
+ * is written to MSR_OFFCORE_RESP0 / MSR_OFFCORE_RESP1 respectively.
+ * The config register is only written when the value differs from the one
+ * cached in currentConfig[cpu_id][index].
+ *
+ * cpu_id: CPU whose MSRs are written.
+ * index:  index into counter_map / currentConfig.
+ * event:  supplies eventId, umask, cfgBits, cmask and the option list.
+ * Returns 0 at the end; CHECK_MSR_WRITE_ERROR is defined elsewhere and may
+ * return earlier on a failed write -- confirm.
+ */
+{
+    int j;
+    uint32_t flags = 0x0U;
+    uint64_t offcore_flags = 0x0ULL;
+
+    flags |= (1ULL<<22);  /* enable flag */
+    flags |= (1ULL<<16);  /* user mode flag */
+
+    /* Intel with standard 8 bit event mask: [7:0] */
+    flags |= (event->umask<<8) + event->eventId;
+
+    /* set custom cfg and cmask */
+    if ((event->cfgBits != 0) &&
+        (event->eventId != 0xB7) &&
+        (event->eventId != 0xBB)) /* for 0xB7/0xBB cfgBits and cmask are consumed by the offcore code further down instead */
+    {
+        flags |= ((event->cmask<<8) + event->cfgBits)<<16; /* cfgBits start at bit 16, cmask at bit 24 */
+    }
+    if (event->numberOfOptions > 0)
+    {
+        for(j=0;j<event->numberOfOptions;j++)
         {
-            /* Only root can access pci address space in direct mode */
-            if (accessClient_mode != DAEMON_AM_DIRECT)
+            switch (event->options[j].type)
             {
-                uint32_t  uflags = 0x10100U; /* enable freeze (bit 16), freeze (bit 8) */
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-
-                uflags = 0x0U;
-                uflags |= (1<<22);  /* enable flag */
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTL_3, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTL_3, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTL_3, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTL_3, uflags);
-
-                uflags |= (1<<19);  /* reset fixed counter */
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-
-                /* iMC counters need to be manually reset to zero */
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
-
-                /* FIXME: Not yet tested/ working due to BIOS issues on test
-                 * machines */
-#if 0
-                /* QPI registers can be zeroed with single write */
-                uflags = 0x0113UL; /*enable freeze (bit 16), freeze (bit 8), reset */
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_BOX_CTL, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_BOX_CTL, uflags);
-                uflags = 0x0UL;
-                uflags |= (1UL<<22);  /* enable flag */
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_CTL_3, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_CTL_3, uflags);
-#endif
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_ANYTHREAD:
+                    flags |= (1ULL<<21);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0xFFULL)<<24); /* low 8 bits of the option value go to bits 24..31 */
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    offcore_flags |= (event->options[j].value & 0x8FFFULL); /* collected for the offcore MSR write below, not for the config register */
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    offcore_flags |= (event->options[j].value<<16); /* option value placed from bit 16 upwards, unmasked */
+                    break;
+                default:
+                    break;
             }
         }
     }
-//    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+    if (event->eventId == 0xB7)
+    {
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask); /* cfgBits and cmask are used as bit positions; this assignment replaces anything gathered from MATCH0/MATCH1 */
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
+    }
+    else if (event->eventId == 0xBB)
+    {
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask); /* same handling as for 0xB7, but written to MSR_OFFCORE_RESP1 */
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, offcore_flags));
+    }
+    if (flags != currentConfig[cpu_id][index]) /* skip the register write when the cached config already matches */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
 }
 
-#define BOX_GATE_SNB(channel,label) \
-    if (perfmon_verbose) { \
-        printf("[%d] perfmon_setup_counter (label): Write Register 0x%llX , Flags: 0x%llX \n", \
-                cpu_id, \
-                LLU_CAST reg, \
-                LLU_CAST flags); \
-    } \
-    if(haveLock) { \
-        uflags = (1<<22); \
-        uflags |= (event->umask<<8) + event->eventId;  \
-        pci_write(cpu_id, channel,  reg, uflags);  \
+/*
+ * Program the config register of one MBOX counter through its PCI device.
+ * Nothing is done (return 0) unless socket_lock[] for this CPU's socket
+ * holds cpu_id. Returns -ENODEV when HPMcheck() reports the device taken
+ * from counter_map[index].device as unavailable.
+ * The config value is bit 22 plus umask/eventId; EDGE adds bit 18, INVERT
+ * adds bit 23 and THRESHOLD puts the low 8 bits of its value at bit 24.
+ * The register is written only if the value differs from the cached
+ * currentConfig[cpu_id][index]. Returns 0 at the end of the function.
+ */
+int snb_mbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int opt;
+    uint32_t flags;
+    PciDeviceIndex dev = counter_map[index].device;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!HPMcheck(dev, cpu_id))
+    {
+        return -ENODEV;
+    }
+
+    flags = (1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+    for (opt = 0; opt < event->numberOfOptions; opt++)
+    {
+        if (event->options[opt].type == EVENT_OPTION_EDGE)
+        {
+            flags |= (1ULL<<18);
+        }
+        else if (event->options[opt].type == EVENT_OPTION_INVERT)
+        {
+            flags |= (1ULL<<23);
+        }
+        else if (event->options[opt].type == EVENT_OPTION_THRESHOLD)
+        {
+            flags |= ((event->options[opt].value & 0xFFULL)<<24);
+        }
+    }
+    if (currentConfig[cpu_id][index] != flags)
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, LLU_CAST flags, SETUP_MBOX);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
     }
+    return 0;
+}
 
 
-void perfmon_setupCounterThread_sandybridge(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+uint32_t snb_cbox_filter(PerfmonEvent *event) /*
+ * Translate the OPCODE, STATE, NID and TID options of an event into a
+ * 32-bit filter value; other option types are ignored. Invalid option
+ * values are reported through ERROR_PRINT and contribute no bits.
+ * For event id 0x34 without a valid STATE option a default state field
+ * is added. Returns the filter value; no register is written here.
+ */
 {
-    int haveLock = 0;
-    uint64_t flags;
-    uint32_t uflags;
-    uint64_t reg = sandybridge_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
-    uint64_t orig_fixed_flags = fixed_flags;
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
+    int j;
+    uint32_t ret = 0x0;
+    uint64_t mask = 0x0ULL;
+    int set_state = 0; /* becomes 1 once a non-zero STATE option has been applied */
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    for(j=0;j<event->numberOfOptions;j++)
     {
-        haveLock = 1;
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_OPCODE: /* only the listed opcode values are accepted */
+                if ((event->options[j].value == 0x180) ||
+                    (event->options[j].value == 0x181) ||
+                    (event->options[j].value == 0x182) ||
+                    (event->options[j].value == 0x187) ||
+                    (event->options[j].value == 0x18C) ||
+                    (event->options[j].value == 0x18D) ||
+                    (event->options[j].value == 0x190) ||
+                    (event->options[j].value == 0x191) ||
+                    (event->options[j].value == 0x192) ||
+                    (event->options[j].value == 0x194) ||
+                    (event->options[j].value == 0x195) ||
+                    (event->options[j].value == 0x19C) ||
+                    (event->options[j].value == 0x19E) ||
+                    (event->options[j].value == 0x1C4) ||
+                    (event->options[j].value == 0x1C5) ||
+                    (event->options[j].value == 0x1C8) ||
+                    (event->options[j].value == 0x1E4) ||
+                    (event->options[j].value == 0x1E5) ||
+                    (event->options[j].value == 0x1E6))
+                {
+                    ret |= ((event->options[j].value & 0x1FFULL) << 23); /* 9-bit opcode in bits 23..31 */
+                }
+                else
+                {
+                    ERROR_PRINT(Invalid value 0x%llx for opcode option, LLU_CAST event->options[j].value);
+                }
+                break;
+            case EVENT_OPTION_STATE:
+                if (event->options[j].value & 0x3F) /* at least one of the low 6 bits must be set */
+                {
+                    ret |= ((event->options[j].value & 0x3FULL) << 17); /* NOTE(review): 6 bits from bit 17, while the default below uses 5 bits from bit 18 -- confirm field position */
+                    set_state = 1;
+                }
+                else
+                {
+                    ERROR_PRINT(Invalid value 0x%llx for state option, LLU_CAST event->options[j].value);
+                }
+                break;
+            case EVENT_OPTION_NID:
+                mask = 0x0ULL;
+                for (int i=0; i<affinityDomains.numberOfNumaDomains;i++) /* one mask bit per NUMA domain */
+                    mask |= (1ULL<<i);
+                if (event->options[j].value & mask) /* accepted when at least one requested bit names an existing domain */
+                {
+                    ret |= ((event->options[j].value & 0xFFULL) << 10); /* low 8 bits in bits 10..17 */
+                }
+                else
+                {
+                    ERROR_PRINT(Invalid value 0x%llx for node id option, LLU_CAST event->options[j].value);
+                }
+                break;
+            case EVENT_OPTION_TID:
+                if (event->options[j].value <= 0xF) /* NOTE(review): range check allows 4 bits but the mask below keeps 5 -- confirm intended width */
+                {
+                    ret |= (event->options[j].value & 0x1FULL);
+                }
+                else
+                {
+                    ERROR_PRINT(Invalid value 0x%llx for thread id option, LLU_CAST event->options[j].value);
+                }
+                break;
+            default:
+                break;
+        }
+    }
+    if ((event->eventId == 0x34) && (set_state == 0)) /* event 0x34 with no state selected: set bits 18..22 as default */
+    {
+        ret |= (0x1FULL << 18);
     }
+    return ret;
+}
+
+int snb_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /*
+ * CBo counter setup variant that perfmon_init_sandybridge() selects for
+ * models other than SANDYBRIDGE_EP. Does nothing (return 0) unless
+ * socket_lock[] for this CPU's socket holds cpu_id. Builds the config value
+ * from bits 22 and 20, umask/eventId and the EDGE/INVERT/THRESHOLD options,
+ * and writes it via MSR only when it differs from the cached
+ * currentConfig[cpu_id][index]. Returns 0 at the end of the function.
+ */
+{
+    int j;
+    uint32_t flags = 0x0U;
 
-    switch (sandybridge_counter_map[index].type)
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
     {
-        case PMC:
+        return 0;
+    }
 
-            //flags = msr_read(cpu_id,reg);
-            //flags &= ~(0xFFFFU);   /* clear lower 16bits */
-            flags = (1<<22)|(1<<16);
+    flags |= (1ULL<<22)|(1ULL<<20);
+    flags |= (event->umask<<8) + event->eventId;
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type) /* no default label: other option types are ignored */
+        {
+            case EVENT_OPTION_EDGE:
+                flags |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                flags |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                flags |= (event->options[j].value & 0x1FULL)<<24; /* threshold limited to 5 bits here */
+                break;
+        }
+    }
+    if (flags != currentConfig[cpu_id][index]) /* write only on change */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_CBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
 
-            /* Intel with standard 8 bit event mask: [7:0] */
-            flags |= (event->umask<<8) + event->eventId;
+int snbep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /*
+ * CBo counter setup variant that perfmon_init_sandybridge() selects for
+ * model SANDYBRIDGE_EP. Does nothing (return 0) unless socket_lock[] for
+ * this CPU's socket holds cpu_id. When the event has options, the value
+ * from snb_cbox_filter() is written to the box's filterRegister1 if it is
+ * non-zero. The config value (bit 22, umask/eventId, option bits) is
+ * written only when it differs from currentConfig[cpu_id][index].
+ * Returns 0 at the end of the function.
+ */
+{
+    int j;
+    uint32_t flags = 0x0U;
 
-            if (event->cfgBits != 0) /* set custom cfg and cmask */
-            {
-                flags &= ~(0xFFFFU<<16);  /* clear upper 16bits */
-                flags |= ((event->cmask<<8) + event->cfgBits)<<16;
-            }
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
 
-            if (perfmon_verbose)
-            {
-                printf("[%d] perfmon_setup_counter PMC: Write Register 0x%llX , Flags: 0x%llX \n",
-                        cpu_id,
-                        LLU_CAST reg,
-                        LLU_CAST flags);
-            }
+    flags |= (1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
 
-            msr_write(cpu_id, reg , flags);
-            break;
+    if (event->numberOfOptions > 0)
+    {
+        uint32_t optflags = snb_cbox_filter(event);
+        uint32_t filter_reg = box_map[counter_map[index].type].filterRegister1; /* filter register is looked up per box type, not per counter */
+        if (optflags != 0x0U) /* a zero filter value is never written, so an earlier filter setting is left as is */
+        {
+            VERBOSEPRINTREG(cpu_id, filter_reg, LLU_CAST optflags, SETUP_CBOX_FILTER);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter_reg, optflags));
+        }
+    }
 
-        case FIXED:
-            fixed_flags |= (0x2 << (index*4));
-            break;
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_TID:
+                flags |= (1ULL<<19); /* the TID value itself goes into the filter register via snb_cbox_filter(); here only bit 19 is set */
+                break;
+            case EVENT_OPTION_EDGE:
+                flags |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                flags |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                flags |= (event->options[j].value & 0xFFULL)<<24; /* 8-bit threshold in bits 24..31 */
+                break;
+            default:
+                break;
+        }
+    }
+    if (flags != currentConfig[cpu_id][index]) /* write only on change */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_CBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
 
-        case POWER:
-            break;
 
-        case MBOX0:
-            BOX_GATE_SNB(PCI_IMC_DEVICE_CH_0,MBOX0);
-            break;
+int snb_ubox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /*
+ * Build and write the config value of one UBOX counter via MSR.
+ * Does nothing (return 0) unless socket_lock[] for this CPU's socket holds
+ * cpu_id. The value is bit 17 plus umask/eventId and the bits added by the
+ * EDGE/INVERT/THRESHOLD options; it is written only when it differs from
+ * the cached currentConfig[cpu_id][index].
+ * Returns 0 at the end of the function.
+ */
+{
+    int j;
+    uint32_t flags = 0x0U;
 
-        case MBOX1:
-            BOX_GATE_SNB(PCI_IMC_DEVICE_CH_1,MBOX1);
-            break;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
 
-        case MBOX2:
-            BOX_GATE_SNB(PCI_IMC_DEVICE_CH_2,MBOX2);
-            break;
+    flags |= (1ULL<<17); /* this box sets bit 17 where the other setup functions in this file set bit 22 */
+    flags |= (event->umask<<8) + event->eventId;
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_EDGE:
+                flags |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                flags |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                flags |= ((event->options[j].value & 0x1FULL) << 24); /* threshold limited to 5 bits here */
+                break;
+            default:
+                break;
+        }
+    }
+    if (flags != currentConfig[cpu_id][index]) /* write only on change */
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_UBOX)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
 
-        case MBOX3:
-            BOX_GATE_SNB(PCI_IMC_DEVICE_CH_3,MBOX3);
-            break;
+int snb_bbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /*
+ * Build and write the config value of one BBOX counter through its PCI
+ * device (counter_map[index].device). Does nothing (return 0) unless
+ * socket_lock[] for this CPU's socket holds cpu_id; returns -ENODEV when
+ * HPMcheck() reports the device as unavailable.
+ * The OPCODE and MATCH0 options are written straight to the
+ * PCI_UNC_HA_PMON_OPCODEMATCH / ADDRMATCH0 / ADDRMATCH1 registers while the
+ * options are processed. The config value is written only when it differs
+ * from the cached currentConfig[cpu_id][index].
+ * Returns 0 at the end of the function.
+ */
+{
+    int j;
+    uint32_t flags = 0x0U;
+    uint64_t match = 0x0ULL;
+    PciDeviceIndex dev = counter_map[index].device;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!HPMcheck(dev, cpu_id))
+    {
+        return -ENODEV;
+    }
 
-        case SBOX0:
+    flags = (1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_EDGE:
+                flags |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                flags |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                flags |= ((event->options[j].value & 0xFFULL) << 24); /* 8-bit threshold in bits 24..31 */
+                break;
+            case EVENT_OPTION_OPCODE: /* low 6 bits of the option value go to the opcode match register */
+                VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH,
+                                    LLU_CAST (event->options[j].value & 0x3FULL), SETUP_BBOX_OPCODE);
+                CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH,
+                                    (event->options[j].value & 0x3FULL)));
+                break;
+            case EVENT_OPTION_MATCH0: /* one 64-bit option value is split across two address match registers */
+                match = event->options[j].value & 0xFFFFFFC0ULL; /* bits 6..31 of the value */
+                VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, LLU_CAST match, SETUP_BBOX_MATCH0);
+                CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, match));
+                match = (event->options[j].value >> 32) & 0x3FFFULL; /* bits 32..45 of the value */
+                VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, LLU_CAST match, SETUP_BBOX_MATCH1);
+                CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, match));
+                break;
+            default:
+                break;
+        }
+    }
+    if (flags != currentConfig[cpu_id][index]) /* write only on change */
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, LLU_CAST flags, SETUP_BBOX);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev,  counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
 
-            /* CTO_COUNT event requires programming of MATCH/MASK registers */
-            if (event->eventId == 0x38)
-            {
-                if(haveLock)
+
+/*
+ * Build and write the config value of one WBOX counter via MSR.
+ * Does nothing (return 0) unless socket_lock[] for this CPU's socket holds
+ * cpu_id. The value is bit 22 plus the low 8 bits of eventId (umask is not
+ * used for this box) and the bits added by the options below. The
+ * OCCUPANCY_FILTER option is written directly to
+ * MSR_UNC_PCU_PMON_BOX_FILTER. The config register is written only when
+ * the value differs from the cached currentConfig[cpu_id][index].
+ *
+ * cpu_id: CPU used for the lock check and the MSR writes.
+ * index:  index into counter_map / currentConfig.
+ * event:  supplies eventId and the option list.
+ * Returns 0 at the end; CHECK_MSR_WRITE_ERROR is defined elsewhere and may
+ * return earlier on a failed write -- confirm.
+ */
+int snb_wbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint32_t flags = 0x0U;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+
+    flags = (1ULL<<22);
+    flags |= event->eventId & 0xFF;
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_EDGE:
+                flags |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                flags |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                /* threshold limited to 5 bits here */
+                flags |= ((event->options[j].value & 0x1FULL) << 24);
+                break;
+            case EVENT_OPTION_OCCUPANCY:
+                /* 2-bit occupancy selection in bits 14..15 */
+                flags |= ((event->options[j].value & 0x3ULL) << 14);
+                break;
+            case EVENT_OPTION_OCCUPANCY_EDGE:
+                flags |= (1ULL<<31);
+                break;
+            case EVENT_OPTION_OCCUPANCY_INVERT:
+                flags |= (1ULL<<30);
+                break;
+            case EVENT_OPTION_OCCUPANCY_FILTER:
+                VERBOSEPRINTREG(cpu_id, MSR_UNC_PCU_PMON_BOX_FILTER, LLU_CAST event->options[j].value & 0xFFFFFFFFULL, SETUP_WBOX_FILTER);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_PCU_PMON_BOX_FILTER, event->options[j].value & 0xFFFFFFFFULL));
+                /* explicit break: previously fell through into default */
+                break;
+            default:
+                break;
+        }
+    }
+    /* write only on change */
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_WBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int snb_sbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event, PciDeviceIndex filterdev)
+{
+    int j;
+    uint32_t flags = 0x0U;
+    PciDeviceIndex dev = counter_map[index].device;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!HPMcheck(dev, cpu_id))
+    {
+        return -ENODEV;
+    }
+
+    flags = (1ULL<<22);
+    flags |= event->cfgBits;
+    flags |= (event->umask<<8) + event->eventId;
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_EDGE:
+                flags |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                flags |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                flags |= ((event->options[j].value & 0xFFULL) << 24);
+                break;
+            case EVENT_OPTION_MATCH0:
+                if (HPMcheck(filterdev, cpu_id))
                 {
-                    //uflags = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_0, reg);
-                    //uflags &= ~(0xFFFFU);
-                    uflags = (1<<22);
-                    uflags |= (1UL<<21) + event->eventId; /* Set extension bit */
-                    printf("UFLAGS 0x%x \n",uflags);
-                    pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  reg, uflags);
-
-                    /* program MATCH0 */
-                    uflags = 0x0UL;
-                    uflags = (event->cmask<<13) + (event->umask<<8);
-                    printf("MATCH UFLAGS 0x%x \n",uflags);
-                    pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_0, PCI_UNC_QPI_PMON_MATCH_0, uflags);
-
-                    /* program MASK0 */
-                    uflags = 0x0UL;
-                    uflags = (0x3F<<12) + (event->cfgBits<<4);
-                    printf("MASK UFLAGS 0x%x \n",uflags);
-                    pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_0, PCI_UNC_QPI_PMON_MASK_0, uflags);
+                    VERBOSEPRINTPCIREG(cpu_id, filterdev, PCI_UNC_QPI_PMON_MATCH_0,
+                                    event->options[j].value & 0x8003FFF8ULL, SETUP_SBOX_MATCH0);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, PCI_UNC_QPI_PMON_MATCH_0,
+                                    event->options[j].value & 0x8003FFF8ULL));
                 }
-            }
-            else
-            {
-                BOX_GATE_SNB(PCI_QPI_DEVICE_PORT_0,SBOX0);
-            }
+                else
+                {
+                    DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                }
+                break;
+            case EVENT_OPTION_MATCH1:
+                if (HPMcheck(filterdev, cpu_id))
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, filterdev, PCI_UNC_QPI_PMON_MATCH_1,
+                                    event->options[j].value & 0x000F000FULL, SETUP_SBOX_MATCH1);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, PCI_UNC_QPI_PMON_MATCH_1,
+                                    event->options[j].value & 0x000F000FULL));
+                }
+                else
+                {
+                    DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                }
+                break;
+            case EVENT_OPTION_MASK0:
+                if (HPMcheck(filterdev, cpu_id))
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, filterdev, PCI_UNC_QPI_PMON_MASK_0,
+                                    event->options[j].value & 0x8003FFF8ULL, SETUP_SBOX_MASK0);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, PCI_UNC_QPI_PMON_MASK_0,
+                                    event->options[j].value & 0x8003FFF8ULL));
+                }
+                else
+                {
+                    DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                }
+                break;
+            case EVENT_OPTION_MASK1:
+                if (HPMcheck(filterdev, cpu_id))
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, filterdev, PCI_UNC_QPI_PMON_MASK_1,
+                                    event->options[j].value & 0x000F000FULL, SETUP_SBOX_MASK1);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, PCI_UNC_QPI_PMON_MASK_1,
+                                    event->options[j].value & 0x000F000FULL));
+                }
+                else
+                {
+                    DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                }
+                break;
+            default:
+                break;
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, LLU_CAST flags, SETUP_SBOX);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev,  counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
 
-            break;
 
-        case SBOX1:
 
-            /* CTO_COUNT event requires programming of MATCH/MASK registers */
-            if (event->eventId == 0x38)
-            {
-                if(haveLock)
+int snb_rbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint32_t flags = 0x0U;
+    PciDeviceIndex dev = counter_map[index].device;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!HPMcheck(dev, cpu_id))
+    {
+        return -ENODEV;
+    }
+
+    flags = (1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_EDGE:
+                flags |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                flags |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                flags |= ((event->options[j].value & 0xFFULL) << 24);
+                break;
+            default:
+                break;
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, LLU_CAST flags, SETUP_RBOX)
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int snb_pbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint32_t flags = 0x0U;
+    PciDeviceIndex dev = counter_map[index].device;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!HPMcheck(dev, cpu_id))
+    {
+        return -ENODEV;
+    }
+
+    flags = (1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_EDGE:
+                flags |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                flags |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                flags |= ((event->options[j].value & 0xFFULL) << 24);
+                break;
+            default:
+                break;
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, LLU_CAST flags, SETUP_PBOX)
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+// Macros to stop counting and reset control registers
+// FREEZE(_AND_RESET_CTL) uses the central box register to freeze (bits 8 and 16) and bit 0 to reset the control registers
+#define SNB_FREEZE_AND_RESET_CTL_BOX(id) \
+    if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(id))) \
+    { \
+        VERBOSEPRINTREG(cpu_id, box_map[id].ctrlRegister, 0x10101U, FREEZE_AND_RESET_CTL_BOX_##id) \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, 0x10101ULL)); \
+    }
+
+#define SNB_FREEZE_BOX(id) \
+    if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(id))) \
+    { \
+        VERBOSEPRINTREG(cpu_id, box_map[id].ctrlRegister, 0x10100U, FREEZE_AND_RESET_CTL_BOX_##id) \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, 0x10100ULL)); \
+    }
+
+// FREEZE(_AND_RESET_CTL)_PCI uses the central box register to freeze (bits 8 and 16) and bit 0 to reset the control registers
+// Checks whether PCI device exists, because this is the first operation we do on the devices
+#define SNB_FREEZE_AND_RESET_CTL_PCI_BOX(id) \
+    if (haveLock && \
+        (eventSet->regTypeMask & (REG_TYPE_MASK(id))) && \
+        (HPMcheck(box_map[id].device, cpu_id) == 0)) \
+    { \
+        VERBOSEPRINTPCIREG(cpu_id, box_map[id].device, box_map[id].ctrlRegister, 0x10101ULL, FREEZE_AND_RESET_CTL_PCI_BOX_##id); \
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, box_map[id].device, box_map[id].ctrlRegister, 0x10101ULL)); \
+    }
+
+#define SNB_FREEZE_PCI_BOX(id) \
+    if (haveLock && \
+        (eventSet->regTypeMask & (REG_TYPE_MASK(id))) && \
+        (HPMcheck(box_map[id].device, cpu_id) == 0)) \
+    { \
+        VERBOSEPRINTPCIREG(cpu_id, box_map[id].device, box_map[id].ctrlRegister, 0x10100ULL, FREEZE_PCI_BOX_##id) \
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, box_map[id].device, box_map[id].ctrlRegister, 0x10100ULL)); \
+    }
+
+// MBOX*FIX have a slightly different scheme: setting the whole register to 0 freezes the counter
+#define SNB_FREEZE_MBOXFIX(number) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX##number##FIX))) && \
+                    (HPMcheck(PCI_IMC_DEVICE_0_CH_##number, cpu_id))) \
+    { \
+        VERBOSEPRINTPCIREG(cpu_id, PCI_IMC_DEVICE_0_CH_##number, PCI_UNC_MC_PMON_FIXED_CTL, 0x0ULL, FREEZE_MBOXFIX##number) \
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, PCI_IMC_DEVICE_0_CH_##number,  PCI_UNC_MC_PMON_FIXED_CTL, 0x0ULL)); \
+    }
+
+
+
+int perfmon_setupCounterThread_sandybridge(
+        int thread_id,
+        PerfmonEventSet* eventSet)
+{
+    int i;
+    int haveLock = 0;
+    uint64_t flags = 0x0ULL;
+    uint64_t fixed_flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL));
+    }
+    if (cpuid_info.model == SANDYBRIDGE_EP)
+    {
+        SNB_FREEZE_BOX(CBOX0);
+        SNB_FREEZE_BOX(CBOX1);
+        SNB_FREEZE_BOX(CBOX2);
+        SNB_FREEZE_BOX(CBOX3);
+        SNB_FREEZE_BOX(CBOX4);
+        SNB_FREEZE_BOX(CBOX5);
+        SNB_FREEZE_BOX(CBOX6);
+        SNB_FREEZE_BOX(CBOX7);
+
+        SNB_FREEZE_PCI_BOX(MBOX0);
+        SNB_FREEZE_PCI_BOX(MBOX1);
+        SNB_FREEZE_PCI_BOX(MBOX2);
+        SNB_FREEZE_PCI_BOX(MBOX3);
+
+        SNB_FREEZE_MBOXFIX(0);
+        SNB_FREEZE_MBOXFIX(1);
+        SNB_FREEZE_MBOXFIX(2);
+        SNB_FREEZE_MBOXFIX(3);
+
+        SNB_FREEZE_PCI_BOX(SBOX0);
+        SNB_FREEZE_PCI_BOX(SBOX1);
+
+        SNB_FREEZE_PCI_BOX(RBOX0);
+        SNB_FREEZE_PCI_BOX(RBOX1);
+
+        SNB_FREEZE_PCI_BOX(PBOX);
+
+        SNB_FREEZE_PCI_BOX(BBOX0);
+        SNB_FREEZE_BOX(WBOX);
+    }
+    else
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, LLU_CAST (1ULL<<31), FREEZE_UNCORE)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_PERF_GLOBAL_CTRL, (1ULL<<31)));
+    }
+
+    for (i=0;i < eventSet->numberOfEvents;i++)
+    {
+        flags = 0x0ULL;
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        RegisterIndex index = eventSet->events[i].index;
+        PciDeviceIndex dev = counter_map[index].device;
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
+        switch (type)
+        {
+            case PMC:
+                snb_pmc_setup(cpu_id, index, event);
+                break;
+
+            case FIXED:
+                /* initialize fixed counters
+                 * FIXED 0: Instructions retired
+                 * FIXED 1: Clocks unhalted core
+                 * FIXED 2: Clocks unhalted ref */
+                fixed_flags |= snb_fixed_setup(cpu_id, index,event);
+                /* Written in the end of function for all fixed purpose registers */
+                break;
+
+            case POWER:
+                break;
+
+            case MBOX0:
+            case MBOX1:
+            case MBOX2:
+            case MBOX3:
+                snb_mbox_setup(cpu_id, index, event);
+                break;
+
+            case MBOX0FIX:
+                break;
+            case MBOX1FIX:
+                break;
+            case MBOX2FIX:
+                break;
+            case MBOX3FIX:
+                break;
+
+            case CBOX0:
+            case CBOX1:
+            case CBOX2:
+            case CBOX3:
+            case CBOX4:
+            case CBOX5:
+            case CBOX6:
+            case CBOX7:
+                sandy_cbox_setup(cpu_id, index, event);
+                break;
+
+            case UBOX:
+                snb_ubox_setup(cpu_id, index, event);
+                break;
+                
+            case UBOXFIX:
+                if (cpuid_info.model == SANDYBRIDGE_EP)
                 {
-                    //uflags = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_1, reg);
-                    //uflags &= ~(0xFFFFU);
-                    uflags = (1<<22);
-                    uflags |= (1UL<<21) + event->eventId; /* Set extension bit */
-                    pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  reg, uflags);
-
-                    /* program MATCH0 */
-                    uflags = 0x0UL;
-                    uflags = (event->cmask<<13) + (event->umask<<8);
-                    pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_1, PCI_UNC_QPI_PMON_MATCH_0, uflags);
-
-                    /* program MASK0 */
-                    uflags = 0x0UL;
-                    uflags = (0x3F<<12) + (event->cfgBits<<4);
-                    pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_1, PCI_UNC_QPI_PMON_MASK_0, uflags);
+                    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST (1ULL<<22), SETUP_UBOXFIX)
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, (1ULL<<22)));
                 }
-            }
-            else
-            {
-                BOX_GATE_SNB(PCI_QPI_DEVICE_PORT_0,SBOX0);
-            }
-            break;
+                else
+                {
+                    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST (1ULL<<20)|(1ULL<<22), SETUP_UBOXFIX)
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, (1ULL<<20)|(1ULL<<22)));
+                }
+                break;
+
+            case SBOX0:
+                snb_sbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_0);
+                break;
+            case SBOX1:
+                snb_sbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_1);
+                break;
+
+            case SBOX0FIX:
+            case SBOX1FIX:
+                break;
 
-        default:
-            /* should never be reached */
-            break;
+            case BBOX0:
+                snb_bbox_setup(cpu_id, index, event);
+                break;
+
+            case WBOX:
+                snb_wbox_setup(cpu_id, index, event);
+                break;
+
+            case RBOX0:
+            case RBOX1:
+                snb_rbox_setup(cpu_id, index, event);
+                break;
+
+            case PBOX:
+                snb_pbox_setup(cpu_id, index, event);
+                break;
+
+
+            default:
+                break;
+        }
     }
-    if (fixed_flags != orig_fixed_flags)
+    
+    if (fixed_flags > 0x0)
     {
-        msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
     }
+    return 0;
 }
 
-void perfmon_startCountersThread_sandybridge(int thread_id)
+
+// Macros for MSR HPM counters
+// UNFREEZE(_AND_RESET_CTR) uses the central box registers to unfreeze and reset the counter registers
+#define SNB_UNFREEZE_BOX(id) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id)))) { \
+        VERBOSEPRINTREG(cpu_id, box_map[id].ctrlRegister, LLU_CAST 0x0ULL, UNFREEZE_BOX_##id) \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, 0x0ULL)); \
+    }
+
+#define SNB_UNFREEZE_AND_RESET_CTR_BOX(id) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id)))) { \
+        VERBOSEPRINTREG(cpu_id, box_map[id].ctrlRegister, LLU_CAST 0x2ULL, UNFREEZE_BOX_##id) \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, 0x2ULL)); \
+    }
+
+// ENABLE(_AND_RESET_CTR) uses the control registers to enable (bit 22) and reset the counter registers (bit 17)
+#define SNB_ENABLE_BOX(id, reg) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id)))) { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &tmp)); \
+        tmp |= (1ULL<<22); \
+        VERBOSEPRINTREG(cpu_id, reg, LLU_CAST tmp, ENABLE_BOX_##id) \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, tmp)); \
+    }
+
+#define SNB_ENABLE_AND_RESET_CTR_BOX(id) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id)))) { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, box_map[id].ctrlRegister, &tmp)); \
+        tmp |= (1ULL<<22)|(1ULL<<17); \
+        VERBOSEPRINTREG(cpu_id, box_map[id].ctrlRegister, LLU_CAST tmp, ENABLE_AND_RESET_CTR_BOX_##id) \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, tmp)); \
+    }
+
+// UNFREEZE(_AND_RESET_CTR)_PCI is similar to MSR UNFREEZE but for PCI devices
+#define SNB_UNFREEZE_PCI_BOX(id) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id))) \
+                && (HPMcheck(box_map[id].device, cpu_id))) \
+    { \
+        VERBOSEPRINTPCIREG(cpu_id, box_map[id].device, box_map[id].ctrlRegister, LLU_CAST 0x0ULL, UNFREEZE_PCI_BOX_##id) \
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, box_map[id].device, box_map[id].ctrlRegister, 0x0ULL)); \
+    }
+#define SNB_UNFREEZE_AND_RESET_CTR_PCI_BOX(id) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id))) \
+                && (HPMcheck(box_map[id].device, cpu_id))) \
+    { \
+        VERBOSEPRINTPCIREG(cpu_id, box_map[id].device, box_map[id].ctrlRegister, LLU_CAST 0x2ULL, UNFREEZE_AND_RESET_CTR_PCI_BOX_##id) \
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, box_map[id].device, box_map[id].ctrlRegister, 0x2ULL)); \
+    }
+
+// UNFREEZE(_AND_RESET_CTR)_MBOXFIX is kind of ENABLE for PCI but uses bit 19 for reset
+#define SNB_UNFREEZE_AND_RESET_CTR_MBOXFIX(number) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX##number##FIX))) && \
+                    (HPMcheck(PCI_IMC_DEVICE_0_CH_##number, cpu_id))) \
+    { \
+        VERBOSEPRINTPCIREG(cpu_id, PCI_IMC_DEVICE_0_CH_##number, \
+                PCI_UNC_MC_PMON_FIXED_CTL, LLU_CAST (1ULL<<22)|(1ULL<<19), UNFREEZE_AND_RESET_CTR_MBOX##number##FIX) \
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, PCI_IMC_DEVICE_0_CH_##number, PCI_UNC_MC_PMON_FIXED_CTL, (1ULL<<22)|(1ULL<<19))); \
+    }
+#define SNB_UNFREEZE_MBOXFIX(number) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX##number##FIX))) && \
+                    (HPMcheck(PCI_IMC_DEVICE_0_CH_##number, cpu_id))) \
+    { \
+        VERBOSEPRINTPCIREG(cpu_id, PCI_IMC_DEVICE_0_CH_##number, \
+                PCI_UNC_MC_PMON_FIXED_CTL, LLU_CAST (1ULL<<22), UNFREEZE_MBOXFIX##id) \
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, PCI_IMC_DEVICE_0_CH_##number,  PCI_UNC_MC_PMON_FIXED_CTL, (1ULL<<22))); \
+    }
+
+int perfmon_startCountersThread_sandybridge(int thread_id, PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
+    uint64_t tmp = 0x0ULL;
     uint64_t flags = 0x0ULL;
-    uint32_t uflags = 0x10000UL; /* Clear freeze bit */
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-
-    for ( int i=0; i<perfmon_numCountersSandybridge; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            switch (sandybridge_counter_map[i].type)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            tmp = 0x0ULL;
+            eventSet->events[i].threadCounter[thread_id].startData = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t reg = counter_map[index].configRegister;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            uint64_t counter2 = counter_map[index].counterRegister2;
+            PciDeviceIndex dev = counter_map[index].device;
+            eventSet->events[i].threadCounter[thread_id].startData = 0;
+            eventSet->events[i].threadCounter[thread_id].counterData = 0;
+            switch (type)
             {
                 case PMC:
-                    msr_write(cpu_id, sandybridge_counter_map[i].counterRegister, 0x0ULL);
-                    flags |= (1<<(i-OFFSET_PMC));  /* enable counter */
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+                    flags |= (1<<(index-cpuid_info.perf_num_fixed_ctr));  /* enable counter */
                     break;
 
                 case FIXED:
-                    msr_write(cpu_id, sandybridge_counter_map[i].counterRegister, 0x0ULL);
-                    flags |= (1ULL<<(i+32));  /* enable fixed counter */
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+                    flags |= (1ULL<<(index+32));  /* enable fixed counter */
                     break;
 
                 case POWER:
                     if(haveLock)
                     {
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                            power_read(cpu_id, sandybridge_counter_map[i].counterRegister);
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&tmp));
+                        eventSet->events[i].threadCounter[thread_id].startData = tmp;
                     }
-
                     break;
 
                 case MBOX0:
-                    if(haveLock)
+                case MBOX1:
+                case MBOX2:
+                case MBOX3:
+                    if (haveLock && HPMcheck(dev, cpu_id))
                     {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter2, 0x0ULL));
                     }
                     break;
 
-                case MBOX1:
-                    if(haveLock)
+                case MBOX0FIX:
+                case MBOX1FIX:
+                case MBOX2FIX:
+                case MBOX3FIX:
+                    /*if (haveLock && HPMcheck(dev, cpu_id))
                     {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-                    }
+                        tmp = 0x0ULL;
+                        CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &tmp));
+                        eventSet->events[i].threadCounter[thread_id].startData = tmp;
+                    }*/
                     break;
 
-                case MBOX2:
-                    if(haveLock)
+
+                case SBOX0:
+                case SBOX1:
+                case SBOX0FIX:
+                case SBOX1FIX:
+                case CBOX0:
+                case CBOX1:
+                case CBOX2:
+                case CBOX3:
+                case CBOX4:
+                case CBOX5:
+                case CBOX6:
+                case CBOX7:
+                    if ((haveLock) && (cpuid_info.model == SANDYBRIDGE))
                     {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
                     }
                     break;
 
-                case MBOX3:
-                    if(haveLock)
+                case UBOX:
+                    //SNB_ENABLE_AND_RESET_CTR_BOX(UBOX);
+                    if (haveLock)
                     {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &tmp));
+                        tmp |= (1ULL<<22)|(1ULL<<17);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, tmp));
                     }
                     break;
+                case UBOXFIX:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+                    SNB_ENABLE_BOX(UBOXFIX, reg);
+                    break;
 
-                case MBOXFIX:
-                    if(haveLock)
+                case BBOX0:
+                    if (haveLock && HPMcheck(dev, cpu_id))
                     {
-                        pci_write(cpu_id, counter_map[i].device,  PCI_UNC_MC_PMON_FIXED_CTL, 0x48000UL);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter2, 0x0ULL));
                     }
                     break;
 
-                case SBOX0:
-                    if(haveLock)
+                case WBOX:
+                    if (haveLock)
                     {
-                        pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_BOX_CTL, uflags);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_PCU_PMON_BOX_FILTER, 0x0U));
                     }
                     break;
-
-                case SBOX1:
+                case WBOX0FIX:
+                case WBOX1FIX:
                     if(haveLock)
                     {
-                        pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_BOX_CTL, uflags);
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &tmp));
+                        eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[WBOX0FIX].regWidth);
                     }
                     break;
-
                 default:
-                    /* should never be reached */
                     break;
             }
+            eventSet->events[i].threadCounter[thread_id].counterData = eventSet->events[i].threadCounter[thread_id].startData;
         }
     }
 
-    if (perfmon_verbose)
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
     {
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_PERF_GLOBAL_CTRL, LLU_CAST flags);
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_UNCORE_PERF_GLOBAL_CTRL, LLU_CAST uflags);
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, UNFREEZE_PMC_OR_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
     }
-
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
+    if (cpuid_info.model == SANDYBRIDGE_EP)
+    {
+        SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX0);
+        SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX1);
+        SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX2);
+        SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX3);
+        SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX4);
+        SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX5);
+        SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX6);
+        SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX7);
+        SNB_UNFREEZE_AND_RESET_CTR_PCI_BOX(SBOX0);
+        SNB_UNFREEZE_AND_RESET_CTR_PCI_BOX(SBOX1);
+        SNB_UNFREEZE_PCI_BOX(MBOX0);
+        SNB_UNFREEZE_PCI_BOX(MBOX1);
+        SNB_UNFREEZE_PCI_BOX(MBOX2);
+        SNB_UNFREEZE_PCI_BOX(MBOX3);
+        SNB_UNFREEZE_AND_RESET_CTR_MBOXFIX(0);
+        SNB_UNFREEZE_AND_RESET_CTR_MBOXFIX(1);
+        SNB_UNFREEZE_AND_RESET_CTR_MBOXFIX(2);
+        SNB_UNFREEZE_AND_RESET_CTR_MBOXFIX(3);
+        SNB_UNFREEZE_PCI_BOX(BBOX0);
+        SNB_UNFREEZE_AND_RESET_CTR_BOX(WBOX);
+        SNB_UNFREEZE_AND_RESET_CTR_PCI_BOX(RBOX0);
+        SNB_UNFREEZE_AND_RESET_CTR_PCI_BOX(RBOX1);
+        SNB_UNFREEZE_AND_RESET_CTR_PCI_BOX(PBOX);
+    }
+    else
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_U_PMON_GLOBAL_CTL, LLU_CAST (1ULL<<29), UNFREEZE_UNCORE)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_U_PMON_GLOBAL_CTL, (1ULL<<29)));
+    }
+    return 0;
 }
 
-void perfmon_stopCountersThread_sandybridge(int thread_id)
+// Read MSR counter register
+#define SNB_READ_BOX(id, reg1) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id)))) \
+    { \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg1, &counter_result)); \
+        VERBOSEPRINTREG(cpu_id, reg1, LLU_CAST counter_result, READ_BOX_##id) \
+    }
+
+// Read PCI counter registers and combine them to a single value
+#define SNB_READ_PCI_BOX(id, dev, reg1, reg2) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id))) && HPMcheck(dev, cpu_id)) \
+    { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, reg1, &tmp)); \
+        counter_result = (tmp<<32); \
+        CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, reg2, &tmp)); \
+        counter_result += tmp; \
+        VERBOSEPRINTPCIREG(cpu_id, dev, reg1, LLU_CAST counter_result, READ_PCI_BOX_##id) \
+    }
+
+// Check counter result for overflows. We do not handle overflows directly, that is done in the getResults function in perfmon.c
+// SandyBridge has no bits indicating that overflows occurred, therefore we use this simple check
+#define SNB_CHECK_OVERFLOW \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+    { \
+        eventSet->events[i].threadCounter[thread_id].overflows++; \
+    }
+
+
+int perfmon_stopCountersThread_sandybridge(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t flags;
-    uint32_t uflags = 0x10100UL; /* Set freeze bit */
     uint64_t counter_result = 0x0ULL;
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    if (cpuid_info.model == SANDYBRIDGE_EP)
+    {
+        SNB_FREEZE_BOX(CBOX0);
+        SNB_FREEZE_BOX(CBOX1);
+        SNB_FREEZE_BOX(CBOX2);
+        SNB_FREEZE_BOX(CBOX3);
+        SNB_FREEZE_BOX(CBOX4);
+        SNB_FREEZE_BOX(CBOX5);
+        SNB_FREEZE_BOX(CBOX6);
+        SNB_FREEZE_BOX(CBOX7);
+
+        SNB_FREEZE_PCI_BOX(MBOX0);
+        SNB_FREEZE_PCI_BOX(MBOX1);
+        SNB_FREEZE_PCI_BOX(MBOX2);
+        SNB_FREEZE_PCI_BOX(MBOX3);
+
+        SNB_FREEZE_AND_RESET_CTL_PCI_BOX(SBOX0);
+        SNB_FREEZE_AND_RESET_CTL_PCI_BOX(SBOX1);
+
+        SNB_FREEZE_AND_RESET_CTL_PCI_BOX(RBOX0);
+        SNB_FREEZE_AND_RESET_CTL_PCI_BOX(RBOX1);
+
+        SNB_FREEZE_AND_RESET_CTL_PCI_BOX(PBOX);
+
+        SNB_FREEZE_PCI_BOX(BBOX0);
+        SNB_FREEZE_AND_RESET_CTL_BOX(WBOX);
+    }
+    else
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, LLU_CAST (1ULL<<31), FREEZE_UNCORE)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_PERF_GLOBAL_CTRL, (1ULL<<31)));
+    }
 
-    for ( int i=0; i < perfmon_numCountersSandybridge; i++ ) 
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE) 
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            switch (sandybridge_counter_map[i].type)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t reg = counter_map[index].configRegister;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            uint64_t counter2 = counter_map[index].counterRegister2;
+            switch (type)
             {
                 case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                    {
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index - cpuid_info.perf_num_fixed_ctr)))
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL,
+                                                        (1ULL<<(index - cpuid_info.perf_num_fixed_ctr))));
+                        }
+                    }
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, STOP_PMC);
+                    break;
 
                 case FIXED:
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, sandybridge_counter_map[i].counterRegister);
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                    {
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index+32)))
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<(index+32))));
+                        }
+                    }
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, STOP_FIXED);
                     break;
 
                 case POWER:
-                    if(haveLock)
+                    if (haveLock)
                     {
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                            power_info.energyUnit *
-                            ( power_read(cpu_id, sandybridge_counter_map[i].counterRegister) -
-                              perfmon_threadData[thread_id].counters[i].counterData);
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+                        VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, STOP_POWER);
+                        SNB_CHECK_OVERFLOW;
                     }
                     break;
 
                 case THERMAL:
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                             thermal_read(cpu_id);
+                    CHECK_MSR_READ_ERROR(thermal_read(cpu_id, (uint32_t*)&counter_result));
                     break;
 
                 case MBOX0:
-                    if(haveLock)
-                    {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-                        counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
-                                sandybridge_counter_map[i].counterRegister);
-
-                        counter_result = (counter_result<<32) +
-                            pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
-                                    sandybridge_counter_map[i].counterRegister2);
-
-                        perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                    }
+                    SNB_READ_PCI_BOX(MBOX0, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
                     break;
 
                 case MBOX1:
-                    if(haveLock)
-                    {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-                        counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_1,
-                                sandybridge_counter_map[i].counterRegister);
-
-                        counter_result = (counter_result<<32) +
-                            pci_read(cpu_id, PCI_IMC_DEVICE_CH_1,
-                                    sandybridge_counter_map[i].counterRegister2);
-
-                        perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                    }
+                    SNB_READ_PCI_BOX(MBOX1, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
                     break;
 
                 case MBOX2:
-                    if(haveLock)
-                    {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-
-                        counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_2,
-                                sandybridge_counter_map[i].counterRegister);
-
-                        counter_result = (counter_result<<32) +
-                            pci_read(cpu_id, PCI_IMC_DEVICE_CH_2,
-                                    sandybridge_counter_map[i].counterRegister2);
-
-                        perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                    }
+                    SNB_READ_PCI_BOX(MBOX2, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
                     break;
 
                 case MBOX3:
-                    if(haveLock)
-                    {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
+                    SNB_READ_PCI_BOX(MBOX3, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                        counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_3,
-                                sandybridge_counter_map[i].counterRegister);
+                case MBOX0FIX:
+                    SNB_READ_PCI_BOX(MBOX0FIX, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case MBOX1FIX:
+                    SNB_READ_PCI_BOX(MBOX1FIX, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case MBOX2FIX:
+                    SNB_READ_PCI_BOX(MBOX2FIX, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case MBOX3FIX:
+                    SNB_READ_PCI_BOX(MBOX3FIX, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                        counter_result = (counter_result<<32) +
-                            pci_read(cpu_id, PCI_IMC_DEVICE_CH_3,
-                                    sandybridge_counter_map[i].counterRegister2);
+                case SBOX0:
+                    SNB_READ_PCI_BOX(SBOX0, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                        perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                    }
+                case SBOX1:
+                    SNB_READ_PCI_BOX(SBOX1, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
                     break;
 
-                case MBOXFIX:
-                    if(haveLock)
+                case SBOX0FIX:
+                case SBOX1FIX:
+                    if (haveLock && HPMcheck(dev, cpu_id))
                     {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-
-                        counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
-                                sandybridge_counter_map[i].counterRegister);
-
-                        counter_result = (counter_result<<32) +
-                            pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
-                                    sandybridge_counter_map[i].counterRegister2);
-
-                        perfmon_threadData[thread_id].counters[i].counterData = counter_result;
+                        HPMread(cpu_id, dev, counter1, &counter_result);
+                        if (eventSet->events[i].event.eventId == 0x00)
+                        {
+                            switch(extractBitField(counter_result, 3, 0))
+                            {
+                                case 0x2:
+                                    counter_result = 5.6E9;
+                                    break;
+                                case 0x3:
+                                    counter_result = 6.4E9;
+                                    break;
+                                case 0x4:
+                                    counter_result = 7.2E9;
+                                    break;
+                                case 0x5:
+                                    counter_result = 8.0E9;
+                                    break;
+                                case 0x6:
+                                    counter_result = 8.8E9;
+                                    break;
+                                case 0x7:
+                                    counter_result = 9.6E9;
+                                    break;
+                                default:
+                                    counter_result = 0;
+                                    break;
+                            }
+                        }
+                        else if (eventSet->events[i].event.eventId == 0x01)
+                        {
+                            counter_result = extractBitField(counter_result, 1, 4);
+                        }
+                        VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, STOP_SBOXFIX);
                     }
                     break;
 
-                case SBOX0:
-                    if(haveLock)
-                    {
-                        pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_BOX_CTL, uflags);
-
-                        counter_result = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_0,
-                                sandybridge_counter_map[i].counterRegister);
-
-                        counter_result = (counter_result<<32) +
-                            pci_read(cpu_id, PCI_QPI_DEVICE_PORT_0,
-                                    sandybridge_counter_map[i].counterRegister2);
+                case CBOX0:
+                    SNB_READ_BOX(CBOX0, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX1:
+                    SNB_READ_BOX(CBOX1, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX2:
+                    SNB_READ_BOX(CBOX2, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX3:
+                    SNB_READ_BOX(CBOX3, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX4:
+                    SNB_READ_BOX(CBOX4, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX5:
+                    SNB_READ_BOX(CBOX5, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX6:
+                    SNB_READ_BOX(CBOX6, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX7:
+                    SNB_READ_BOX(CBOX7, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                        perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                    }
+                case UBOX:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, STOP_UBOX);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case UBOXFIX:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, STOP_UBOXFIX);
+                    SNB_CHECK_OVERFLOW;
                     break;
 
-                case SBOX1:
-                    if(haveLock)
-                    {
-                        pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_BOX_CTL, uflags);
-                        counter_result = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_1,
-                                sandybridge_counter_map[i].counterRegister);
+                case BBOX0:
+                    SNB_READ_PCI_BOX(BBOX0, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                        counter_result = (counter_result<<32) +
-                            pci_read(cpu_id, PCI_QPI_DEVICE_PORT_1,
-                                    sandybridge_counter_map[i].counterRegister2);
+                case WBOX:
+                    SNB_READ_BOX(WBOX, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case WBOX0FIX:
+                    SNB_READ_BOX(WBOX0FIX, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case WBOX1FIX:
+                    SNB_READ_BOX(WBOX1FIX, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                        perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                    }
+                case RBOX0:
+                    SNB_READ_PCI_BOX(RBOX0, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case RBOX1:
+                    SNB_READ_PCI_BOX(RBOX1, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
                     break;
 
+                case PBOX:
+                    SNB_READ_PCI_BOX(PBOX, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
                 default:
-                    /* should never be reached */
                     break;
             }
+            eventSet->events[i].threadCounter[thread_id].counterData =
+                    field64(counter_result, 0, box_map[type].regWidth);
         }
     }
 
-    flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
-    //    printf ("Status: 0x%llX \n", LLU_CAST flags);
-    if ( (flags & 0x3) || (flags & (0x3ULL<<32)) ) 
-    {
-        printf ("Overflow occured \n");
-    }
+    return 0;
 }
 
-void perfmon_readCountersThread_sandybridge(int thread_id)
+/* Reads every initialized event of eventSet for thread_id into its counterData.
+ * When PMC or FIXED events are enabled, MSR_PERF_GLOBAL_CTRL is saved, zeroed for
+ * the read and restored at the end; the SNB_FREEZE_xxx and SNB_UNFREEZE_xxx macro
+ * calls bracket the read loop as well. Returns 0 when the end is reached. */
+int perfmon_readCountersThread_sandybridge(int thread_id, PerfmonEventSet* eventSet)
 {
     uint64_t counter_result = 0x0ULL;
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t pmc_flags = 0x0ULL; /* MSR_PERF_GLOBAL_CTRL content, written back at the end */
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    for ( int i=0; i<perfmon_numCountersSandybridge; i++ )
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &pmc_flags));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    if (cpuid_info.model == SANDYBRIDGE_EP)
+    {
+        SNB_FREEZE_BOX(CBOX0);
+        SNB_FREEZE_BOX(CBOX1);
+        SNB_FREEZE_BOX(CBOX2);
+        SNB_FREEZE_BOX(CBOX3);
+        SNB_FREEZE_BOX(CBOX4);
+        SNB_FREEZE_BOX(CBOX5);
+        SNB_FREEZE_BOX(CBOX6);
+        SNB_FREEZE_BOX(CBOX7);
+
+        SNB_FREEZE_PCI_BOX(MBOX0);
+        SNB_FREEZE_PCI_BOX(MBOX1);
+        SNB_FREEZE_PCI_BOX(MBOX2);
+        SNB_FREEZE_PCI_BOX(MBOX3);
+
+        SNB_FREEZE_MBOXFIX(0);
+        SNB_FREEZE_MBOXFIX(1);
+        SNB_FREEZE_MBOXFIX(2);
+        SNB_FREEZE_MBOXFIX(3);
+
+        SNB_FREEZE_PCI_BOX(SBOX0);
+        SNB_FREEZE_PCI_BOX(SBOX1);
+
+        SNB_FREEZE_PCI_BOX(RBOX0);
+        SNB_FREEZE_PCI_BOX(RBOX1);
+        SNB_FREEZE_PCI_BOX(PBOX);
+        SNB_FREEZE_PCI_BOX(BBOX0);
+        SNB_FREEZE_BOX(WBOX);
+    }
+    else
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, LLU_CAST (1ULL<<31), FREEZE_UNCORE)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_PERF_GLOBAL_CTRL, (1ULL<<31)));
+    }
+    /* Read each initialized event whose register type is enabled in regTypeMask. */
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if ((sandybridge_counter_map[i].type == PMC) ||
-                    (sandybridge_counter_map[i].type == FIXED))
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, sandybridge_counter_map[i].counterRegister);
+                continue;
             }
-            else
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t reg = counter_map[index].configRegister;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            uint64_t counter2 = counter_map[index].counterRegister2;
+            switch (type)
             {
-                if(haveLock)
-                {
-                    switch (sandybridge_counter_map[i].type)
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, READ_PMC);
+                    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* below the stored value: check overflow status */
                     {
-                        case POWER:
-                            perfmon_threadData[thread_id].counters[i].counterData =
-                                power_info.energyUnit *
-                                power_read(cpu_id, sandybridge_counter_map[i].counterRegister);
-                            break;
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index - cpuid_info.perf_num_fixed_ctr))) /* PMC bit = index minus number of fixed counters */
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL,
+                                                        (1ULL<<(index - cpuid_info.perf_num_fixed_ctr))));
+                        }
+                    }
+                    break;
 
-                        case MBOX0:
-                            counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
-                                    sandybridge_counter_map[i].counterRegister);
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, READ_FIXED);
+                    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* below the stored value: check overflow status */
+                    {
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index+32))) /* FIXED bits are tested at index+32 */
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<(index+32))));
+                        }
+                    }
+                    break;
 
-                            counter_result = (counter_result<<32) +
-                                pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
-                                        sandybridge_counter_map[i].counterRegister2);
+                case THERMAL:
+                    CHECK_MSR_READ_ERROR(thermal_read(cpu_id, (uint32_t*)&counter_result));
+                    break;
+
+                case POWER:
+                    if (haveLock)
+                    {
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+                        VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, READ_POWER);
+                        SNB_CHECK_OVERFLOW;
+                    }
+                    break;
 
-                            perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                            break;
+                case MBOX0:
+                    SNB_READ_PCI_BOX(MBOX0, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                        case MBOX1:
-                            counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_1,
-                                    sandybridge_counter_map[i].counterRegister);
+                case MBOX1:
+                    SNB_READ_PCI_BOX(MBOX1, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                            counter_result = (counter_result<<32) +
-                                pci_read(cpu_id, PCI_IMC_DEVICE_CH_1,
-                                        sandybridge_counter_map[i].counterRegister2);
+                case MBOX2:
+                    SNB_READ_PCI_BOX(MBOX2, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                            perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                            break;
+                case MBOX3:
+                    SNB_READ_PCI_BOX(MBOX3, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                        case MBOX2:
-                            counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_2,
-                                    sandybridge_counter_map[i].counterRegister);
+                case MBOX0FIX:
+                    SNB_READ_PCI_BOX(MBOX0FIX, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case MBOX1FIX:
+                    SNB_READ_PCI_BOX(MBOX1FIX, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case MBOX2FIX:
+                    SNB_READ_PCI_BOX(MBOX2FIX, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case MBOX3FIX:
+                    SNB_READ_PCI_BOX(MBOX3FIX, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                            counter_result = (counter_result<<32) +
-                                pci_read(cpu_id, PCI_IMC_DEVICE_CH_2,
-                                        sandybridge_counter_map[i].counterRegister2);
+                case UBOX:
+                case UBOXFIX:
+                    if (haveLock)
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                        VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, STOP_UBOX);
+                        SNB_CHECK_OVERFLOW;
+                    }
+                    break; /* must not fall through into the CBOX0 case below */
 
-                            perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                            break;
+                case CBOX0:
+                    SNB_READ_BOX(CBOX0, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX1:
+                    SNB_READ_BOX(CBOX1, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX2:
+                    SNB_READ_BOX(CBOX2, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX3:
+                    SNB_READ_BOX(CBOX3, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX4:
+                    SNB_READ_BOX(CBOX4, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX5:
+                    SNB_READ_BOX(CBOX5, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX6:
+                    SNB_READ_BOX(CBOX6, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX7:
+                    SNB_READ_BOX(CBOX7, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                        case MBOX3:
-                            counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_3,
-                                    sandybridge_counter_map[i].counterRegister);
+                case BBOX0:
+                    SNB_READ_PCI_BOX(BBOX0, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                            counter_result = (counter_result<<32) +
-                                pci_read(cpu_id, PCI_IMC_DEVICE_CH_3,
-                                        sandybridge_counter_map[i].counterRegister2);
+                case SBOX0:
+                    SNB_READ_PCI_BOX(SBOX0, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                            perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                            break;
+                case SBOX1:
+                    SNB_READ_PCI_BOX(SBOX1, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                        default:
-                            /* should never be reached */
-                            break;
+                case SBOX0FIX:
+                case SBOX1FIX:
+                    HPMread(cpu_id, dev, counter1, &counter_result); /* NOTE(review): unlike the stop path, no haveLock/HPMcheck guard and the return value is ignored - confirm */
+                    if (eventSet->events[i].event.eventId == 0x00)
+                    {
+                        switch(extractBitField(counter_result, 3, 0))
+                        {
+                            case 0x2:
+                                counter_result = 5.6E9;
+                                break;
+                            case 0x3:
+                                counter_result = 6.4E9;
+                                break;
+                            case 0x4:
+                                counter_result = 7.2E9;
+                                break;
+                            case 0x5:
+                                counter_result = 8.0E9;
+                                break;
+                            case 0x6:
+                                counter_result = 8.8E9;
+                                break;
+                            case 0x7:
+                                counter_result = 9.6E9;
+                                break;
+                            default:
+                                counter_result = 0;
+                                break;
+                        }
                     }
-                }
+                    else if (eventSet->events[i].event.eventId == 0x01)
+                    {
+                        counter_result = extractBitField(counter_result, 1, 4);
+                    }
+                    eventSet->events[i].threadCounter[thread_id].startData = 0x0ULL;
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, READ_SBOXFIX);
+                    break;
+
+                case WBOX:
+                    SNB_READ_BOX(WBOX, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case WBOX0FIX:
+                    SNB_READ_BOX(WBOX0FIX, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case WBOX1FIX:
+                    SNB_READ_BOX(WBOX1FIX, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+
+                case RBOX0:
+                    SNB_READ_PCI_BOX(RBOX0, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case RBOX1:
+                    SNB_READ_PCI_BOX(RBOX1, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+
+                case PBOX:
+                    SNB_READ_PCI_BOX(PBOX, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+
+                default:
+                    break;
             }
+            eventSet->events[i].threadCounter[thread_id].counterData =
+                    field64(counter_result, 0, box_map[type].regWidth);
         }
     }
+    if (cpuid_info.model == SANDYBRIDGE_EP)
+    {
+        SNB_UNFREEZE_BOX(CBOX0);
+        SNB_UNFREEZE_BOX(CBOX1);
+        SNB_UNFREEZE_BOX(CBOX2);
+        SNB_UNFREEZE_BOX(CBOX3);
+        SNB_UNFREEZE_BOX(CBOX4);
+        SNB_UNFREEZE_BOX(CBOX5);
+        SNB_UNFREEZE_BOX(CBOX6);
+        SNB_UNFREEZE_BOX(CBOX7);
+
+        SNB_UNFREEZE_PCI_BOX(MBOX0);
+        SNB_UNFREEZE_PCI_BOX(MBOX1);
+        SNB_UNFREEZE_PCI_BOX(MBOX2);
+        SNB_UNFREEZE_PCI_BOX(MBOX3);
+
+        SNB_UNFREEZE_MBOXFIX(0);
+        SNB_UNFREEZE_MBOXFIX(1);
+        SNB_UNFREEZE_MBOXFIX(2);
+        SNB_UNFREEZE_MBOXFIX(3);
+
+        SNB_UNFREEZE_PCI_BOX(SBOX0);
+        SNB_UNFREEZE_PCI_BOX(SBOX1);
+
+        SNB_UNFREEZE_PCI_BOX(RBOX0);
+        SNB_UNFREEZE_PCI_BOX(RBOX1);
+        SNB_UNFREEZE_PCI_BOX(PBOX);
+        SNB_UNFREEZE_PCI_BOX(BBOX0);
+        SNB_UNFREEZE_BOX(WBOX);
+    }
+    else
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, LLU_CAST (1ULL<<29), UNFREEZE_UNCORE)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_PERF_GLOBAL_CTRL, (1ULL<<29)));
+    }
+    /* Write back the MSR_PERF_GLOBAL_CTRL value that was saved in pmc_flags before the read. */
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, pmc_flags));
+    }
+
+    return 0;
 }
 
+int perfmon_finalizeCountersThread_sandybridge(int thread_id, PerfmonEventSet* eventSet)
+{ /* Zeroes the registers used by this thread's initialized events and marks those events uninitialized; returns 0 at the end. */
+    int haveLock = 0;
+    int haveTileLock = 0;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62); /* bits 62/63 always set; per-counter bits are OR-ed in below and written to MSR_PERF_GLOBAL_OVF_CTRL */
+    /* haveLock / haveTileLock: set when this CPU is the one recorded in socket_lock / tile_lock for its lookup entry. */
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+    {
+        haveTileLock = 1;
+    }
+    /* Walk all events; only initialized ones whose register type is enabled in regTypeMask are handled. */
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            RegisterIndex index = eventSet->events[i].index;
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t reg = counter_map[index].configRegister;
+            switch(type)
+            {
+                case PMC:
+                    ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr)); /* PMC bit = index minus number of fixed counters */
+                    if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7)) /* event 0xB7: the tile-lock holder also zeroes MSR_OFFCORE_RESP0 */
+                    {
+                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+                    }
+                    else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB)) /* event 0xBB: same for MSR_OFFCORE_RESP1 */
+                    {
+                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+                    }
+                    break;
+                case FIXED:
+                    ovf_values_core |= (1ULL<<(index+32)); /* FIXED bits are placed at index+32 */
+                    break;
+                default:
+                    break;
+            }
+            if ((reg) && /* zero the config register: always for PMC/FIXED; for types >= UNCORE only with the socket lock and a nonzero HPMcheck(dev, cpu_id) */
+                (((type == PMC)||(type == FIXED)) || ((type >= UNCORE) && (haveLock) && (HPMcheck(dev, cpu_id)))))
+            {
+                VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+            }
+            eventSet->events[i].threadCounter[thread_id].init = FALSE;
+        }
+    }
+
+    /* With PMC or FIXED enabled: write the collected bits to MSR_PERF_GLOBAL_OVF_CTRL, then zero MSR_PERF_GLOBAL_CTRL. */
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST ovf_values_core, CLEAR_GLOBAL_OVF)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, CLEAR_GLOBAL_CTRL)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    return 0;
+}
diff --git a/src/includes/perfmon_sandybridgeEP_counters.h b/src/includes/perfmon_sandybridgeEP_counters.h
new file mode 100644
index 0000000..befef53
--- /dev/null
+++ b/src/includes/perfmon_sandybridgeEP_counters.h
@@ -0,0 +1,214 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfmon_sandybridgeEP_counters.h
+ *
+ *      Description: Counter header file of perfmon module for Intel Sandy Bridge EP.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#define NUM_COUNTERS_SANDYBRIDGEEP 97 /* number of entries in sandybridgeEP_counter_map */
+#define NUM_COUNTERS_UNCORE_SANDYBRIDGEEP 53 /* NOTE(review): equals the map index of MBOX0C0, the first PCI-based entry -- confirm intended use */
+#define NUM_COUNTERS_CORE_SANDYBRIDGEEP 8 /* NOTE(review): equals the count of FIXC0-2, PMC0-3 and TMP0 that open the map -- confirm */
+/* Event options accepted per counter type. Each mask is parenthesized so it expands as a single operand, e.g. in (opts & MASK). */
+#define SNBEP_VALID_OPTIONS_FIXED (EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_ANYTHREAD_MASK)
+#define SNBEP_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK| \
+                            EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define SNBEP_VALID_OPTIONS_CBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_TID_MASK| \
+                            EVENT_OPTION_INVERT_MASK|EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_NID_MASK|EVENT_OPTION_STATE_MASK)
+#define SNBEP_VALID_OPTIONS_WBOX  (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK| \
+                            EVENT_OPTION_OCCUPANCY_MASK|EVENT_OPTION_OCCUPANCY_EDGE_MASK| \
+                            EVENT_OPTION_OCCUPANCY_INVERT_MASK|EVENT_OPTION_MATCH0_MASK)
+#define SNBEP_VALID_OPTIONS_UBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK)
+#define SNBEP_VALID_OPTIONS_BBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK| \
+                            EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_MATCH0_MASK)
+#define SNBEP_VALID_OPTIONS_MBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK)
+#define SNBEP_VALID_OPTIONS_SBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK| \
+                            EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK|EVENT_OPTION_MASK0_MASK| \
+                            EVENT_OPTION_MASK1_MASK)
+#define SNBEP_VALID_OPTIONS_RBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK)
+#define SNBEP_VALID_OPTIONS_PBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK)
+
+static RegisterMap sandybridgeEP_counter_map[NUM_COUNTERS_SANDYBRIDGEEP] = { /* NOTE(review): columns appear to be name, index, unit type, config register, counter register, second counter register (PCI split reads), device, valid options -- confirm against RegisterMap */
+    /* Fixed Counters: instructions retired, cycles unhalted core */
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, SNBEP_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, SNBEP_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, SNBEP_VALID_OPTIONS_FIXED},
+    /* PMC Counters: 4 48bit wide */
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, SNBEP_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, SNBEP_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, SNBEP_VALID_OPTIONS_PMC},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, SNBEP_VALID_OPTIONS_PMC},
+    /* Temperature Sensor*/
+    {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* RAPL counters */
+    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* CBOX counters */
+    {"CBOX0C0", PMC12, CBOX0, MSR_UNC_C0_PMON_CTL0, MSR_UNC_C0_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX0C1", PMC13, CBOX0, MSR_UNC_C0_PMON_CTL1, MSR_UNC_C0_PMON_CTR1, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX0C2", PMC14, CBOX0, MSR_UNC_C0_PMON_CTL2, MSR_UNC_C0_PMON_CTR2, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX0C3", PMC15, CBOX0, MSR_UNC_C0_PMON_CTL3, MSR_UNC_C0_PMON_CTR3, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX1C0", PMC16, CBOX1, MSR_UNC_C1_PMON_CTL0, MSR_UNC_C1_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX1C1", PMC17, CBOX1, MSR_UNC_C1_PMON_CTL1, MSR_UNC_C1_PMON_CTR1, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX1C2", PMC18, CBOX1, MSR_UNC_C1_PMON_CTL2, MSR_UNC_C1_PMON_CTR2, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX1C3", PMC19, CBOX1, MSR_UNC_C1_PMON_CTL3, MSR_UNC_C1_PMON_CTR3, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX2C0", PMC20, CBOX2, MSR_UNC_C2_PMON_CTL0, MSR_UNC_C2_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX2C1", PMC21, CBOX2, MSR_UNC_C2_PMON_CTL1, MSR_UNC_C2_PMON_CTR1, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX2C2", PMC22, CBOX2, MSR_UNC_C2_PMON_CTL2, MSR_UNC_C2_PMON_CTR2, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX2C3", PMC23, CBOX2, MSR_UNC_C2_PMON_CTL3, MSR_UNC_C2_PMON_CTR3, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX3C0", PMC24, CBOX3, MSR_UNC_C3_PMON_CTL0, MSR_UNC_C3_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX3C1", PMC25, CBOX3, MSR_UNC_C3_PMON_CTL1, MSR_UNC_C3_PMON_CTR1, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX3C2", PMC26, CBOX3, MSR_UNC_C3_PMON_CTL2, MSR_UNC_C3_PMON_CTR2, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX3C3", PMC27, CBOX3, MSR_UNC_C3_PMON_CTL3, MSR_UNC_C3_PMON_CTR3, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX4C0", PMC28, CBOX4, MSR_UNC_C4_PMON_CTL0, MSR_UNC_C4_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX4C1", PMC29, CBOX4, MSR_UNC_C4_PMON_CTL1, MSR_UNC_C4_PMON_CTR1, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX4C2", PMC30, CBOX4, MSR_UNC_C4_PMON_CTL2, MSR_UNC_C4_PMON_CTR2, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX4C3", PMC31, CBOX4, MSR_UNC_C4_PMON_CTL3, MSR_UNC_C4_PMON_CTR3, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX5C0", PMC32, CBOX5, MSR_UNC_C5_PMON_CTL0, MSR_UNC_C5_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX5C1", PMC33, CBOX5, MSR_UNC_C5_PMON_CTL1, MSR_UNC_C5_PMON_CTR1, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX5C2", PMC34, CBOX5, MSR_UNC_C5_PMON_CTL2, MSR_UNC_C5_PMON_CTR2, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX5C3", PMC35, CBOX5, MSR_UNC_C5_PMON_CTL3, MSR_UNC_C5_PMON_CTR3, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX6C0", PMC36, CBOX6, MSR_UNC_C6_PMON_CTL0, MSR_UNC_C6_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX6C1", PMC37, CBOX6, MSR_UNC_C6_PMON_CTL1, MSR_UNC_C6_PMON_CTR1, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX6C2", PMC38, CBOX6, MSR_UNC_C6_PMON_CTL2, MSR_UNC_C6_PMON_CTR2, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX6C3", PMC39, CBOX6, MSR_UNC_C6_PMON_CTL3, MSR_UNC_C6_PMON_CTR3, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX7C0", PMC40, CBOX7, MSR_UNC_C7_PMON_CTL0, MSR_UNC_C7_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX7C1", PMC41, CBOX7, MSR_UNC_C7_PMON_CTL1, MSR_UNC_C7_PMON_CTR1, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX7C2", PMC42, CBOX7, MSR_UNC_C7_PMON_CTL2, MSR_UNC_C7_PMON_CTR2, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    {"CBOX7C3", PMC43, CBOX7, MSR_UNC_C7_PMON_CTL3, MSR_UNC_C7_PMON_CTR3, 0, 0, SNBEP_VALID_OPTIONS_CBOX},
+    /* UBOX counters: each control register CTLn is paired with its own counter register CTRn */
+    {"UBOX0", PMC44, UBOX, MSR_UNC_U_PMON_CTL0, MSR_UNC_U_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_UBOX},
+    {"UBOX1", PMC45, UBOX, MSR_UNC_U_PMON_CTL1, MSR_UNC_U_PMON_CTR1, 0, 0, SNBEP_VALID_OPTIONS_UBOX},
+    {"UBOXFIX", PMC46, UBOXFIX, MSR_UNC_U_UCLK_FIXED_CTL, MSR_UNC_U_UCLK_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"WBOX0",PMC47, WBOX, MSR_UNC_PCU_PMON_CTL0, MSR_UNC_PCU_PMON_CTR0, 0, 0, SNBEP_VALID_OPTIONS_WBOX},
+    {"WBOX1",PMC48, WBOX, MSR_UNC_PCU_PMON_CTL1, MSR_UNC_PCU_PMON_CTR1, 0, 0, SNBEP_VALID_OPTIONS_WBOX},
+    {"WBOX2",PMC49, WBOX, MSR_UNC_PCU_PMON_CTL2, MSR_UNC_PCU_PMON_CTR2, 0, 0, SNBEP_VALID_OPTIONS_WBOX},
+    {"WBOX3",PMC50, WBOX, MSR_UNC_PCU_PMON_CTL3, MSR_UNC_PCU_PMON_CTR3, 0, 0, SNBEP_VALID_OPTIONS_WBOX},
+    {"WBOXFIX0", PMC51, WBOX0FIX, 0, MSR_UNC_PCU_PMON_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"WBOXFIX1", PMC52, WBOX0FIX, 0, MSR_UNC_PCU_PMON_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* IMC Counters: 4 48bit wide per memory channel, split in two reads */
+    {"MBOX0C0",PMC53, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_0, SNBEP_VALID_OPTIONS_MBOX},
+    {"MBOX0C1",PMC54, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_0, SNBEP_VALID_OPTIONS_MBOX},
+    {"MBOX0C2",PMC55, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_0, SNBEP_VALID_OPTIONS_MBOX},
+    {"MBOX0C3",PMC56, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_0, SNBEP_VALID_OPTIONS_MBOX},
+    {"MBOX0FIX", PMC57, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+    {"MBOX1C0",PMC58, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_1, SNBEP_VALID_OPTIONS_MBOX},
+    {"MBOX1C1",PMC59, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_1, SNBEP_VALID_OPTIONS_MBOX},
+    {"MBOX1C2",PMC60, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_1, SNBEP_VALID_OPTIONS_MBOX},
+    {"MBOX1C3",PMC61, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_1, SNBEP_VALID_OPTIONS_MBOX},
+    {"MBOX1FIX", PMC62, MBOX1FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_1, EVENT_OPTION_NONE_MASK},
+    {"MBOX2C0",PMC63, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_2, SNBEP_VALID_OPTIONS_MBOX},
+    {"MBOX2C1",PMC64, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_2, SNBEP_VALID_OPTIONS_MBOX},
+    {"MBOX2C2",PMC65, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_2, SNBEP_VALID_OPTIONS_MBOX},
+    {"MBOX2C3",PMC66, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_2, SNBEP_VALID_OPTIONS_MBOX},
+    {"MBOX2FIX", PMC67, MBOX2FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_2, EVENT_OPTION_NONE_MASK},
+    {"MBOX3C0",PMC68, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_3, SNBEP_VALID_OPTIONS_MBOX},
+    {"MBOX3C1",PMC69, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_3, SNBEP_VALID_OPTIONS_MBOX},
+    {"MBOX3C2",PMC70, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_3, SNBEP_VALID_OPTIONS_MBOX},
+    {"MBOX3C3",PMC71, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_3, SNBEP_VALID_OPTIONS_MBOX},
+    {"MBOX3FIX", PMC72, MBOX3FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_3, EVENT_OPTION_NONE_MASK},
+    /* QPI counters four 48bit  wide per port, split in two reads */
+    {"SBOX0C0",PMC73, SBOX0, PCI_UNC_QPI_PMON_CTL_0, PCI_UNC_QPI_PMON_CTR_0_A, PCI_UNC_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_0, SNBEP_VALID_OPTIONS_SBOX},
+    {"SBOX0C1",PMC74, SBOX0, PCI_UNC_QPI_PMON_CTL_1, PCI_UNC_QPI_PMON_CTR_1_A, PCI_UNC_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_0, SNBEP_VALID_OPTIONS_SBOX},
+    {"SBOX0C2",PMC75, SBOX0, PCI_UNC_QPI_PMON_CTL_2, PCI_UNC_QPI_PMON_CTR_2_A, PCI_UNC_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_0, SNBEP_VALID_OPTIONS_SBOX},
+    {"SBOX0C3",PMC76, SBOX0, PCI_UNC_QPI_PMON_CTL_3, PCI_UNC_QPI_PMON_CTR_3_A, PCI_UNC_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_0, SNBEP_VALID_OPTIONS_SBOX},
+    {"SBOX0FIX", PMC77, SBOX0FIX, 0, PCI_UNC_QPI_RATE_STATUS, 0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+    {"SBOX1C0",PMC78, SBOX1, PCI_UNC_QPI_PMON_CTL_0, PCI_UNC_QPI_PMON_CTR_0_A, PCI_UNC_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_1, SNBEP_VALID_OPTIONS_SBOX},
+    {"SBOX1C1",PMC79, SBOX1, PCI_UNC_QPI_PMON_CTL_1, PCI_UNC_QPI_PMON_CTR_1_A, PCI_UNC_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_1, SNBEP_VALID_OPTIONS_SBOX},
+    {"SBOX1C2",PMC80, SBOX1, PCI_UNC_QPI_PMON_CTL_2, PCI_UNC_QPI_PMON_CTR_2_A, PCI_UNC_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_1, SNBEP_VALID_OPTIONS_SBOX},
+    {"SBOX1C3",PMC81, SBOX1, PCI_UNC_QPI_PMON_CTL_3, PCI_UNC_QPI_PMON_CTR_3_A, PCI_UNC_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_1, SNBEP_VALID_OPTIONS_SBOX},
+    {"SBOX1FIX", PMC82, SBOX1FIX, 0, PCI_UNC_QPI_RATE_STATUS, 0, PCI_QPI_MISC_DEVICE_PORT_1, EVENT_OPTION_NONE_MASK},
+    /* BBOX or better known as Home Agent (HA) */
+    {"BBOX0",PMC83, BBOX0, PCI_UNC_HA_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_0, SNBEP_VALID_OPTIONS_BBOX},
+    {"BBOX1",PMC84, BBOX0, PCI_UNC_HA_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_0, SNBEP_VALID_OPTIONS_BBOX},
+    {"BBOX2",PMC85, BBOX0, PCI_UNC_HA_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_0, SNBEP_VALID_OPTIONS_BBOX},
+    {"BBOX3",PMC86, BBOX0, PCI_UNC_HA_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_0, SNBEP_VALID_OPTIONS_BBOX},
+    {"RBOX0C0", PMC87, RBOX0, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_0, SNBEP_VALID_OPTIONS_RBOX},
+    {"RBOX0C1", PMC88, RBOX0, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_0, SNBEP_VALID_OPTIONS_RBOX},
+    {"RBOX0C2", PMC89, RBOX0, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_0, SNBEP_VALID_OPTIONS_RBOX},
+    {"RBOX1C0", PMC90, RBOX1, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_1, SNBEP_VALID_OPTIONS_RBOX},
+    {"RBOX1C1", PMC91, RBOX1, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_1, SNBEP_VALID_OPTIONS_RBOX},
+    {"RBOX1C2", PMC92, RBOX1, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_1, SNBEP_VALID_OPTIONS_RBOX},
+    {"PBOX0", PMC93, PBOX, PCI_UNC_R2PCIE_PMON_CTL_0, PCI_UNC_R2PCIE_PMON_CTR_0_A, PCI_UNC_R2PCIE_PMON_CTR_0_B, PCI_R2PCIE_DEVICE, SNBEP_VALID_OPTIONS_PBOX},
+    {"PBOX1", PMC94, PBOX, PCI_UNC_R2PCIE_PMON_CTL_1, PCI_UNC_R2PCIE_PMON_CTR_1_A, PCI_UNC_R2PCIE_PMON_CTR_1_B, PCI_R2PCIE_DEVICE, SNBEP_VALID_OPTIONS_PBOX},
+    {"PBOX2", PMC95, PBOX, PCI_UNC_R2PCIE_PMON_CTL_2, PCI_UNC_R2PCIE_PMON_CTR_2_A, PCI_UNC_R2PCIE_PMON_CTR_2_B, PCI_R2PCIE_DEVICE, SNBEP_VALID_OPTIONS_PBOX},
+    {"PBOX3", PMC96, PBOX, PCI_UNC_R2PCIE_PMON_CTL_3, PCI_UNC_R2PCIE_PMON_CTR_3_A, PCI_UNC_R2PCIE_PMON_CTR_3_B, PCI_R2PCIE_DEVICE, SNBEP_VALID_OPTIONS_PBOX},
+};
+
+static BoxMap sandybridgeEP_box_map[NUM_UNITS] = { /* One entry per unit type via designated initializers; unit types not listed stay zero-initialized. */
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, MSR_DEV, 48}, /* NOTE(review): trailing number (48/44/32/8) looks like the counter width in bits -- confirm against BoxMap */
+    [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, MSR_DEV, 48},
+    [THERMAL] = {0, 0, 0, 0, 0, MSR_DEV, 8},
+    [POWER] = {0, 0, 0, 0, 0, MSR_DEV, 32},
+    [WBOX] = {MSR_UNC_PCU_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 48},
+    [WBOX0FIX] = {0, 0, 0, 0, 0, MSR_DEV, 48},
+    [UBOX] = {0, MSR_UNC_U_PMON_BOX_STATUS, 0, 0, 0, MSR_DEV, 44}, /* only entry here with a status register but no box control register */
+    [UBOXFIX] = {0, 0, 0, 0, 0, MSR_DEV, 44},
+    [CBOX0] = {MSR_UNC_C0_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C0_PMON_BOX_FILTER}, /* CBOX entries carry an extra trailing value: the box filter register */
+    [CBOX1] = {MSR_UNC_C1_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C1_PMON_BOX_FILTER},
+    [CBOX2] = {MSR_UNC_C2_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C2_PMON_BOX_FILTER},
+    [CBOX3] = {MSR_UNC_C3_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C3_PMON_BOX_FILTER},
+    [CBOX4] = {MSR_UNC_C4_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C4_PMON_BOX_FILTER},
+    [CBOX5] = {MSR_UNC_C5_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C5_PMON_BOX_FILTER},
+    [CBOX6] = {MSR_UNC_C6_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C6_PMON_BOX_FILTER},
+    [CBOX7] = {MSR_UNC_C7_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C7_PMON_BOX_FILTER},
+    [MBOX0] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_0, 48}, /* NOTE(review): fifth value is 1 for every PCI-device entry and 0 for MSR_DEV entries; presumably an "is PCI" flag -- confirm */
+    [MBOX1] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+    [MBOX2] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+    [MBOX3] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+    [MBOX0FIX] = {0, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_0, 48},
+    [MBOX1FIX] = {0, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+    [MBOX2FIX] = {0, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+    [MBOX3FIX] = {0, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+    [BBOX0] = {PCI_UNC_HA_PMON_BOX_CTL, 0, 0, 0, 1, PCI_HA_DEVICE_0, 48},
+    [SBOX0] = {PCI_UNC_QPI_PMON_BOX_CTL, 0, 0, 0, 1, PCI_QPI_DEVICE_PORT_0, 48},
+    [SBOX1] = {PCI_UNC_QPI_PMON_BOX_CTL, 0, 0, 0, 1, PCI_QPI_DEVICE_PORT_1, 48},
+    [SBOX0FIX] = {0, 0, 0, 0, 1, PCI_QPI_MISC_DEVICE_PORT_0, 32},
+    [SBOX1FIX] = {0, 0, 0, 0, 1, PCI_QPI_MISC_DEVICE_PORT_1, 32},
+    [RBOX0] = {PCI_UNC_R3QPI_PMON_BOX_CTL, 0, 0, 0, 1, PCI_R3QPI_DEVICE_LINK_0, 44},
+    [RBOX1] = {PCI_UNC_R3QPI_PMON_BOX_CTL, 0, 0, 0, 1, PCI_R3QPI_DEVICE_LINK_1, 44},
+    [PBOX] = {PCI_UNC_R2PCIE_PMON_BOX_CTL, 0, 0, 0, 1, PCI_R2PCIE_DEVICE, 44},
+};
+
+static PciDevice sandybridgeEP_pci_devices[MAX_NUM_PCI_DEVICES] = { /* Device table indexed by device enum. NOTE(review): fields look like device type, PCI "device.function" string, device name, box name, PCI device id -- confirm against PciDevice */
+ [MSR_DEV] = {NODEVTYPE, "", "", ""}, /* placeholder entry: no device id given, empty strings */
+ [PCI_R3QPI_DEVICE_LINK_0] = {R3QPI, "13.5", "PCI_R3QPI_DEVICE_LINK_0", "RBOX0", 0x3c44},
+ [PCI_R3QPI_DEVICE_LINK_1] = {R3QPI, "13.6", "PCI_R3QPI_DEVICE_LINK_1", "RBOX1", 0x3c45},
+ [PCI_R2PCIE_DEVICE] = {R2PCIE, "13.1", "PCI_R2PCIE_DEVICE", "PBOX0", 0x3c43},
+ [PCI_IMC_DEVICE_0_CH_0] = {IMC, "10.0", "PCI_IMC_DEVICE_CH_0", "MBOX0", 0x3cb0},
+ [PCI_IMC_DEVICE_0_CH_1] = {IMC, "10.1", "PCI_IMC_DEVICE_CH_1", "MBOX1", 0x3cb1},
+ [PCI_IMC_DEVICE_0_CH_2] = {IMC, "10.4", "PCI_IMC_DEVICE_CH_2", "MBOX2", 0x3cb4},
+ [PCI_IMC_DEVICE_0_CH_3] = {IMC, "10.5", "PCI_IMC_DEVICE_CH_3", "MBOX3", 0x3cb5},
+ [PCI_HA_DEVICE_0] = {HA, "0e.1", "PCI_HA_DEVICE", "BBOX", 0x3c46},
+ [PCI_QPI_DEVICE_PORT_0] = {QPI, "08.2", "PCI_QPI_DEVICE_PORT_0", "SBOX0", 0x3c41},
+ [PCI_QPI_DEVICE_PORT_1] = {QPI, "09.2", "PCI_QPI_DEVICE_PORT_1", "SBOX1", 0x3c42},
+ [PCI_QPI_MASK_DEVICE_PORT_0] = {QPI, "08.6", "PCI_QPI_MASK_DEVICE_PORT_0", NULL, 0x3c86}, /* the two MASK devices have no box name (NULL) */
+ [PCI_QPI_MASK_DEVICE_PORT_1] = {QPI, "09.6", "PCI_QPI_MASK_DEVICE_PORT_1", NULL, 0x3c96},
+ [PCI_QPI_MISC_DEVICE_PORT_0] = {QPI, "08.0", "PCI_QPI_MISC_DEVICE_PORT_0", "SBOX0FIX",0x3c80},
+ [PCI_QPI_MISC_DEVICE_PORT_1] = {QPI, "09.0", "PCI_QPI_MISC_DEVICE_PORT_1", "SBOX1FIX", 0x3c91}, /* NOTE(review): port 0 uses 0x3c80 and the MASK pair is 0x3c86/0x3c96 -- verify 0x3c91 (vs 0x3c90) against the datasheet */
+};
diff --git a/src/includes/perfmon_sandybridgeEP_events.txt b/src/includes/perfmon_sandybridgeEP_events.txt
new file mode 100644
index 0000000..63198a9
--- /dev/null
+++ b/src/includes/perfmon_sandybridgeEP_events.txt
@@ -0,0 +1,1342 @@
+# =======================================================================================
+#
+#      Filename:  perfmon_sandybridgeEP_events.txt
+#
+#      Description:  Event list for Intel SandyBridge EP
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
+#      Project:  likwid
+#
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+#
+#      This program is free software: you can redistribute it and/or modify it under
+#      the terms of the GNU General Public License as published by the Free Software
+#      Foundation, either version 3 of the License, or (at your option) any later
+#      version.
+#
+#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+#      You should have received a copy of the GNU General Public License along with
+#      this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+EVENT_TEMP_CORE               0x00   TMP0
+UMASK_TEMP_CORE               0x00
+
+EVENT_PWR_PKG_ENERGY          0x00   PWR0
+UMASK_PWR_PKG_ENERGY          0x00
+
+EVENT_PWR_PP0_ENERGY          0x00   PWR1
+UMASK_PWR_PP0_ENERGY          0x00
+
+EVENT_PWR_PP1_ENERGY          0x00   PWR2
+UMASK_PWR_PP1_ENERGY          0x00
+
+EVENT_PWR_DRAM_ENERGY         0x00   PWR3
+UMASK_PWR_DRAM_ENERGY         0x00
+
+EVENT_INSTR_RETIRED           0x00   FIXC0
+UMASK_INSTR_RETIRED_ANY       0x00
+
+EVENT_CPU_CLK_UNHALTED        0x00   FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE   0x00
+
+EVENT_CPU_CLK_UNHALTED        0x00   FIXC2
+UMASK_CPU_CLK_UNHALTED_REF    0x00
+
+EVENT_LOAD_BLOCKS                 0x03  PMC
+UMASK_LOAD_BLOCKS_DATA_UNKNOWN    0x01
+UMASK_LOAD_BLOCKS_STORE_FORWARD   0x02
+UMASK_LOAD_BLOCKS_NO_SR           0x08
+UMASK_LOAD_BLOCKS_ALL_BLOCK       0x10
+
+EVENT_MISALIGN_MEM_REF           0x05  PMC
+UMASK_MISALIGN_MEM_REF_LOAD      0x01
+UMASK_MISALIGN_MEM_REF_STORE     0x02
+UMASK_MISALIGN_MEM_REF_ANY       0x03
+
+EVENT_LD_BLOCKS_PARTIAL                 0x07  PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS   0x01
+UMASK_LD_BLOCKS_PARTIAL_ALL_STA_BLOCK   0x08
+
+EVENT_DTLB_LOAD_MISSES                 0x08  PMC
+UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK   0x01
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED  0x02
+UMASK_DTLB_LOAD_MISSES_WALK_DURATION   0x04
+
+EVENT_INT_MISC                       0x0D  PMC
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_INT_MISC_RECOVERY_CYCLES       0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_INT_MISC_RECOVERY_COUNT        0x03
+UMASK_INT_MISC_RAT_STALL_CYCLES      0x40
+DEFAULT_OPTIONS_INT_MISC_RAT_STALL_COUNT EVENT_OPTION_EDGE=1
+UMASK_INT_MISC_RAT_STALL_COUNT       0x40
+
+EVENT_UOPS_ISSUED                     0x0E  PMC
+UMASK_UOPS_ISSUED_ANY                 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_ACTIVE_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_ACTIVE_CYCLES       0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES        0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES        0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_ANY            0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_ACTIVE_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_ACTIVE_CYCLES  0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_STALL_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_TOTAL_CYCLES   0x01
+
+EVENT_FP_COMP_OPS_EXE                          0x10   PMC
+UMASK_FP_COMP_OPS_EXE_X87                      0x01
+UMASK_FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE     0x10
+UMASK_FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE     0x20
+UMASK_FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE     0x40
+UMASK_FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE     0x80
+
+EVENT_SIMD_FP_256_PACKED            0x11   PMC
+UMASK_SIMD_FP_256_PACKED_SINGLE     0x01
+UMASK_SIMD_FP_256_PACKED_DOUBLE     0x02
+
+EVENT_ARITH                      0x14   PMC
+UMASK_ARITH_FPU_DIV_ACTIVE       0x01
+DEFAULT_OPTIONS_ARITH_NUM_DIV    EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_ARITH_NUM_DIV              0x01
+
+EVENT_INSTS_WRITTEN_TO_IQ            0x17   PMC
+UMASK_INSTS_WRITTEN_TO_IQ_INSTS      0x01
+
+EVENT_L2_RQSTS                          0x24   PMC
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD_HIT  0x01
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD_MISS 0x02
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD      0x03
+UMASK_L2_RQSTS_RFO_HITS                 0x04
+UMASK_L2_RQSTS_RFO_MISS                 0x08
+UMASK_L2_RQSTS_RFO_ANY                  0x0C
+UMASK_L2_RQSTS_CODE_RD_HITS             0x10
+UMASK_L2_RQSTS_CODE_RD_MISS             0x20
+UMASK_L2_RQSTS_ALL_CODE_CODE_RD         0x30
+UMASK_L2_RQSTS_PF_HIT                   0x40
+UMASK_L2_RQSTS_PF_MISS                  0x80
+UMASK_L2_RQSTS_ALL_PF                   0xC0
+UMASK_L2_RQSTS_MISS                     0xAA
+
+EVENT_L2_STORE_LOCK_RQSTS            0x27   PMC
+UMASK_L2_STORE_LOCK_RQSTS_MISS       0x01
+UMASK_L2_STORE_LOCK_RQSTS_HIT_E      0x04
+UMASK_L2_STORE_LOCK_RQSTS_HIT_M      0x08
+UMASK_L2_STORE_LOCK_RQSTS_ALL        0x0F
+
+EVENT_L1D_WB_RQST                  0x28   PMC
+UMASK_L1D_WB_RQST_HIT_E            0x04
+UMASK_L1D_WB_RQST_HIT_M            0x08
+
+EVENT_L3_LAT_CACHE               0x2E   PMC
+UMASK_L3_LAT_CACHE_REFERENCE     0x4F
+UMASK_L3_LAT_CACHE_MISS          0x41
+
+EVENT_CPU_CLOCK_UNHALTED           0x3C   PMC
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P  0x00
+UMASK_CPU_CLOCK_UNHALTED_REF_P     0x01
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x2,EVENT_OPTION_INVERT=0x1
+UMASK_CPU_CLOCK_UNHALTED_TOTAL_CYCLES   0x00
+
+EVENT_L1D_PEND_MISS              0x48   PMC1
+UMASK_L1D_PEND_MISS_PENDING      0x01
+
+EVENT_DTLB_STORE_MISSES                 0x49   PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK   0x01
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED  0x02
+UMASK_DTLB_STORE_MISSES_WALK_DURATION   0x04
+UMASK_DTLB_STORE_MISSES_STLB_HIT        0x10
+
+EVENT_LOAD_HIT_PRE                     0x4C    PMC
+UMASK_LOAD_HIT_PRE_SW_PF               0x01
+UMASK_LOAD_HIT_PRE_HW_PF               0x02
+
+EVENT_HW_PRE_REQ               0x4E    PMC
+UMASK_HW_PRE_REQ_DL1_MISS      0x02
+
+EVENT_L1D                         0x51   PMC
+UMASK_L1D_REPLACEMENT             0x01
+UMASK_L1D_ALLOCATED_IN_M          0x02
+UMASK_L1D_M_EVICT                 0x04
+UMASK_L1D_ALL_M_REPLACEMENT       0x08
+
+EVENT_PARTIAL_RAT_STALLS                   0x59    PMC
+UMASK_PARTIAL_RAT_STALLS_FLAGS_MERGE_UOP   0x20
+UMASK_PARTIAL_RAT_STALLS_SLOW_LEA_WINDOW   0x40
+UMASK_PARTIAL_RAT_STALLS_MUL_SINGLE_UOP    0x80
+
+EVENT_RESOURCE_STALLS2                  0x5B    PMC
+UMASK_RESOURCE_STALLS2_ALL_FL_EMPTY     0x0C
+UMASK_RESOURCE_STALLS2_ALL_PRF_CONTROL  0x0F
+UMASK_RESOURCE_STALLS2_BOB_FULL         0x40
+UMASK_RESOURCE_STALLS2_OOO_RSRC         0x4F
+
+EVENT_CPL_CYCLES               0x5C    PMC
+UMASK_CPL_CYCLES_RING0         0x01
+UMASK_CPL_CYCLES_RING123       0x02
+
+EVENT_RS_EVENTS                 0x5E    PMC
+UMASK_RS_EVENTS_EMPTY_CYCLES    0x01
+
+EVENT_OFFCORE_REQUESTS_OUTSTANDING                  0x60   PMC
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO       0x04
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD      0x08
+
+EVENT_CACHE_LOCK_CYCLES                             0x63   PMC
+UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION 0x01
+UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_DURATION         0x02
+
+EVENT_IDQ               0x79   PMC
+UMASK_IDQ_EMPTY         0x02
+UMASK_IDQ_MITE_UOPS     0x04
+UMASK_IDQ_DSB_UOPS      0x08
+UMASK_IDQ_MS_DSB_UOPS   0x10
+UMASK_IDQ_MS_MITE_UOPS  0x20
+UMASK_IDQ_MS_UOPS       0x30
+
+EVENT_ICACHE                    0x80   PMC
+UMASK_ICACHE_HITS               0x01
+UMASK_ICACHE_MISSES             0x02
+UMASK_ICACHE_ACCESSES           0x03
+UMASK_ICACHE_IFETCH_STALL       0x04
+
+EVENT_ITLB_MISSES                 0x85      PMC
+UMASK_ITLB_MISSES_CAUSES_A_WALK   0x01
+UMASK_ITLB_MISSES_WALK_COMPLETED  0x02
+UMASK_ITLB_MISSES_WALK_DURATION   0x04
+UMASK_ITLB_MISSES_STLB_HIT        0x10
+
+EVENT_ILD_STALL                 0x87      PMC
+UMASK_ILD_STALL_LCP             0x01
+UMASK_ILD_STALL_IQ_FULL         0x04
+
+EVENT_BR_INST_EXEC                                     0x88   PMC
+UMASK_BR_INST_EXEC_COND_TAKEN                          0x81
+UMASK_BR_INST_EXEC_COND_NON_TAKEN                      0x41
+UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN                    0x82
+UMASK_BR_INST_EXEC_DIRECT_JMP_NON_TAKEN                0x42
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN     0x84
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN 0x44
+UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN                   0x88
+UMASK_BR_INST_EXEC_RETURN_NEAR_NON_TAKEN               0x48
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN              0x90
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_NON_TAKEN          0x50
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN            0xA0
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN        0x60
+UMASK_BR_INST_EXEC_ALL_BRANCHES                        0xFF
+
+EVENT_BR_MISP_EXEC                                     0x89   PMC
+UMASK_BR_MISP_EXEC_COND_TAKEN                          0x81
+UMASK_BR_MISP_EXEC_COND_NON_TAKEN                      0x41
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN     0x84
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN 0x44
+UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN                   0x88
+UMASK_BR_MISP_EXEC_RETURN_NEAR_NON_TAKEN               0x48
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN              0x90
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_NON_TAKEN          0x50
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN            0xA0
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN        0x60
+UMASK_BR_MISP_EXEC_ALL_BRANCHES                        0xFF
+
+EVENT_IDQ_UOPS_NOT_DELIVERED                    0x9C   PMC
+UMASK_IDQ_UOPS_NOT_DELIVERED_CORE               0x01
+
+EVENT_UOPS_DISPATCHED_PORT                  0xA1   PMC
+UMASK_UOPS_DISPATCHED_PORT_PORT_0           0x01
+UMASK_UOPS_DISPATCHED_PORT_PORT_1           0x02
+UMASK_UOPS_DISPATCHED_PORT_PORT_2_LD        0x04
+UMASK_UOPS_DISPATCHED_PORT_PORT_2_STA       0x08
+UMASK_UOPS_DISPATCHED_PORT_PORT_2           0x0C
+UMASK_UOPS_DISPATCHED_PORT_PORT_3_LD        0x10
+UMASK_UOPS_DISPATCHED_PORT_PORT_3_STA       0x20
+UMASK_UOPS_DISPATCHED_PORT_PORT_3           0x30
+UMASK_UOPS_DISPATCHED_PORT_PORT_4           0x40
+UMASK_UOPS_DISPATCHED_PORT_PORT_5           0x80
+DEFAULT_OPTIONS_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE 0x83
+UMASK_UOPS_DISPATCHED_PORT_DATA_PORTS       0x7C
+UMASK_UOPS_DISPATCHED_PORT_ALL_PORTS        0xFF
+
+EVENT_RESOURCE_STALLS                 0xA2   PMC
+UMASK_RESOURCE_STALLS_ANY             0x01
+UMASK_RESOURCE_STALLS_LB              0x02
+UMASK_RESOURCE_STALLS_RS              0x04
+UMASK_RESOURCE_STALLS_B               0x08
+UMASK_RESOURCE_STALLS_ROB             0x10
+UMASK_RESOURCE_STALLS_FCSW            0x20
+UMASK_RESOURCE_STALLS_MXCSR           0x40
+UMASK_RESOURCE_STALLS_OTHER           0x80
+
+EVENT_DSB2MITE_SWITCHES                  0xAB   PMC
+UMASK_DSB2MITE_SWITCHES_COUNT            0x01
+UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES   0x02
+
+EVENT_DSB_FILL                         0xAC   PMC
+UMASK_DSB_FILL_OTHER_CANCEL            0x02
+UMASK_DSB_FILL_EXCEED_DSB_LINES        0x08
+UMASK_DSB_FILL_ALL_CANCEL              0x0A
+
+EVENT_ITLB                         0xAE   PMC
+UMASK_ITLB_ITLB_FLUSH              0x01
+
+EVENT_OFFCORE_REQUESTS                  0xB0   PMC
+UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_DEMAND_RFO       0x04
+UMASK_OFFCORE_REQUESTS_ALL_DATA_RD      0x08
+
+EVENT_UOPS_EXECUTED                       0xB1   PMC
+UMASK_UOPS_EXECUTED_THREAD                0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_USED_CYCLES           0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_STALL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_TOTAL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC 0x01
+UMASK_UOPS_EXECUTED_CORE                       0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_USED_CYCLES           0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES          0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_TOTAL_CYCLES          0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC 0x02
+
+EVENT_OFFCORE_REQUESTS_BUFFER             0xB2  PMC
+UMASK_OFFCORE_REQUESTS_BUFFER_SQ_FULL     0x01
+
+EVENT_AGU_BYPASS_CANCEL           0xB6  PMC
+UMASK_AGU_BYPASS_CANCEL_COUNT     0x01
+
+EVENT_TLB_FLUSH                 0xBD  PMC
+UMASK_TLB_FLUSH_DTLB_THREAD     0x01
+UMASK_TLB_FLUSH_STLB_ANY        0x20
+
+EVENT_L1D_BLOCKS                         0xBF  PMC
+DEFAULT_OPTIONS_L1D_BLOCKS_BANK_CONFLICT_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_L1D_BLOCKS_BANK_CONFLICT_CYCLES    0x05
+DEFAULT_OPTIONS_L1D_BLOCKS_BANK_CONFLICT_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_L1D_BLOCKS_BANK_CONFLICT_COUNT     0x05
+
+EVENT_INST_RETIRED                        0xC0  PMC0
+UMASK_INST_RETIRED_ANY_P                  0x00
+UMASK_INST_RETIRED_PREC_DIST              0x01
+
+EVENT_OTHER_ASSISTS                       0xC1  PMC
+UMASK_OTHER_ASSISTS_ITLB_MISS_RETIRED     0x02
+UMASK_OTHER_ASSISTS_AVX_TO_SSE            0x10
+UMASK_OTHER_ASSISTS_SSE_TO_AVX            0x20
+
+EVENT_UOPS_RETIRED                       0xC2  PMC
+UMASK_UOPS_RETIRED_ALL                   0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL              0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS          0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES           0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL              0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_USED_CYCLES      0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_STALL_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES     0x01
+
+EVENT_MACHINE_CLEARS                    0xC3  PMC
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING    0x02
+UMASK_MACHINE_CLEARS_SMC                0x04
+UMASK_MACHINE_CLEARS_MASKMOV            0x20
+
+EVENT_BR_INST_RETIRED               0xC4  PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x00
+UMASK_BR_INST_RETIRED_CONDITIONAL   0x01
+UMASK_BR_INST_RETIRED_NEAR_CALL     0x02
+UMASK_BR_INST_RETIRED_NEAR_RETURN   0x08
+UMASK_BR_INST_RETIRED_NOT_TAKEN     0x10
+UMASK_BR_INST_RETIRED_NEAR_TAKEN    0x20
+UMASK_BR_INST_RETIRED_FAR_BRANCH    0x40
+
+EVENT_BR_MISP_RETIRED               0xC5  PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES  0x00
+UMASK_BR_MISP_RETIRED_CONDITIONAL   0x01
+UMASK_BR_MISP_RETIRED_NEAR_CALL     0x02
+UMASK_BR_MISP_RETIRED_NOT_TAKEN     0x10
+UMASK_BR_MISP_RETIRED_TAKEN         0x20
+
+EVENT_FP_ASSIST               0xCA  PMC
+UMASK_FP_ASSIST_X87_OUTPUT    0x02
+UMASK_FP_ASSIST_X87_INPUT     0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT   0x08
+UMASK_FP_ASSIST_SIMD_INPUT    0x10
+UMASK_FP_ASSIST_ANY           0x1E
+
+EVENT_HW_INTERRUPTS_RECEIVED               0xCB  PMC
+UMASK_HW_INTERRUPTS_RECEIVED               0x01
+
+EVENT_ROB_MISC_EVENT_LBR_INSERTS               0xCC  PMC
+UMASK_ROB_MISC_EVENT_LBR_INSERTS               0x20
+
+EVENT_MEM_UOPS_RETIRED                  0xD0    PMC
+UMASK_MEM_UOPS_RETIRED_LOADS            0x81
+UMASK_MEM_UOPS_RETIRED_STORES           0x82
+UMASK_MEM_UOPS_RETIRED_LOADS_STLB_MISS  0x11
+UMASK_MEM_UOPS_RETIRED_STORES_STLB_MISS 0x12
+UMASK_MEM_UOPS_RETIRED_LOADS_LOCK       0x21
+UMASK_MEM_UOPS_RETIRED_STORES_LOCK      0x22
+UMASK_MEM_UOPS_RETIRED_LOADS_SPLIT      0x41
+UMASK_MEM_UOPS_RETIRED_STORES_SPLIT     0x42
+
+EVENT_MEM_LOAD_UOPS_RETIRED               0xD1   PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT        0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS       0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL        0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT        0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS       0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL        0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT        0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS       0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL        0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB       0x40
+UMASK_MEM_LOAD_UOPS_RETIRED_ALL           0x7F
+
+EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED                   0xD2   PMC
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_MISS         0x01
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT          0x02
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM         0x04
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NONE         0x08
+
+EVENT_MEM_LOAD_UOPS_MISC_RETIRED               0xD4   PMC
+UMASK_MEM_LOAD_UOPS_MISC_RETIRED_LLC_MISS      0x02
+
+EVENT_L2_TRANS                0xF0  PMC
+UMASK_L2_TRANS_DEMAND_DATA_RD 0x01
+UMASK_L2_TRANS_RFO            0x02
+UMASK_L2_TRANS_CODE_RD        0x04
+UMASK_L2_TRANS_ALL_PREF       0x08
+UMASK_L2_TRANS_L1D_WB         0x10
+UMASK_L2_TRANS_L2_FILL        0x20
+UMASK_L2_TRANS_L2_WB          0x40
+UMASK_L2_TRANS_ALL_REQUESTS   0x80
+
+EVENT_L2_LINES_IN             0xF1   PMC
+UMASK_L2_LINES_IN_I           0x01
+UMASK_L2_LINES_IN_S           0x02
+UMASK_L2_LINES_IN_E           0x04
+UMASK_L2_LINES_IN_ALL         0x07
+
+EVENT_L2_LINES_OUT                  0xF2   PMC
+UMASK_L2_LINES_OUT_DEMAND_CLEAN     0x01
+UMASK_L2_LINES_OUT_DEMAND_DIRTY     0x02
+UMASK_L2_LINES_OUT_PF_CLEAN         0x04
+UMASK_L2_LINES_OUT_PF_DIRTY         0x08
+UMASK_L2_LINES_OUT_DIRTY_ALL        0x0A
+UMASK_L2_LINES_OUT_CLEAN_ALL        0x05
+UMASK_L2_LINES_OUT_ALL              0x0F
+
+EVENT_SQ_MISC                         0xF4  PMC
+UMASK_SQ_MISC_SPLIT_LOCK              0x10
+
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM               EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM       EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x78040
+UMASK_OFFCORE_RESPONSE_0_LOCAL_DRAM                 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM              EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM      EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x7FF80
+UMASK_OFFCORE_RESPONSE_0_REMOTE_DRAM                0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1                            0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM               EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM       EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x78040
+UMASK_OFFCORE_RESPONSE_1_LOCAL_DRAM                 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM              EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM      EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x7FF80
+UMASK_OFFCORE_RESPONSE_1_REMOTE_DRAM                0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_CBOX_CLOCKTICKS                         0x00  CBOX
+UMASK_CBOX_CLOCKTICKS                         0x00
+
+EVENT_COUNTER0_OCCUPANCY              0x1F  CBOX0C1|CBOX0C2|CBOX0C3|CBOX1C1|CBOX1C2|CBOX1C3|CBOX2C1|CBOX2C2|CBOX2C3|CBOX3C1|CBOX3C2|CBOX3C3|CBOX4C1|CBOX4C2|CBOX4C3|CBOX5C1|CBOX5C2|CBOX5C3|CBOX6C1|CBOX6C2|CBOX6C3|CBOX7C1|CBOX7C2|CBOX7C3
+UMASK_COUNTER0_OCCUPANCY              0x00
+
+EVENT_ISMQ_DRD_MISS_OCC              0x21  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_ISMQ_DRD_MISS_OCC              0x00
+
+EVENT_LLC_LOOKUP              0x34  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+DEFAULT_OPTIONS_LLC_LOOKUP_ANY       EVENT_OPTION_STATE=0x1
+UMASK_LLC_LOOKUP_ANY                0x11
+DEFAULT_OPTIONS_LLC_LOOKUP_DATA_READ EVENT_OPTION_STATE=0x1
+UMASK_LLC_LOOKUP_DATA_READ          0x03
+DEFAULT_OPTIONS_LLC_LOOKUP_WRITE    EVENT_OPTION_STATE=0x1
+UMASK_LLC_LOOKUP_WRITE              0x05
+DEFAULT_OPTIONS_LLC_LOOKUP_DATA_READ_AND_ALL_WRITE EVENT_OPTION_STATE=0x1
+UMASK_LLC_LOOKUP_DATA_READ_AND_ALL_WRITE 0x07
+DEFAULT_OPTIONS_LLC_LOOKUP_REMOTE_SNOOP EVENT_OPTION_STATE=0x1
+UMASK_LLC_LOOKUP_REMOTE_SNOOP       0x09
+DEFAULT_OPTIONS_LLC_LOOKUP_NID      EVENT_OPTION_STATE=0x1
+UMASK_LLC_LOOKUP_NID                0x41
+
+EVENT_LLC_VICTIMS              0x37  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_LLC_VICTIMS_M_STATE      0x01
+UMASK_LLC_VICTIMS_E_STATE      0x02
+UMASK_LLC_VICTIMS_S_STATE      0x04
+UMASK_LLC_VICTIMS_MISS         0x08
+UMASK_LLC_VICTIMS_ALL_STATES   0x0F
+OPTIONS_LLC_VICTIMS_NID        EVENT_OPTION_NID_MASK
+UMASK_LLC_VICTIMS_NID          0x40
+OPTIONS_LLC_VICTIMS_NID_MISSES EVENT_OPTION_NID_MASK
+UMASK_LLC_VICTIMS_NID_MISSES   0x41
+
+EVENT_CBOX_MISC              0x39  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_CBOX_MISC_RSPI_WAS_FSE      0x01
+UMASK_CBOX_MISC_WC_ALIASING       0x02
+UMASK_CBOX_MISC_STARTED           0x04
+UMASK_CBOX_MISC_RFO_HIT_S         0x08
+
+EVENT_RING_AD_USED              0x1B  CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3
+UMASK_RING_AD_USED_UP_EVEN      0x01
+UMASK_RING_AD_USED_UP_ODD       0x02
+UMASK_RING_AD_USED_DOWN_EVEN    0x04
+UMASK_RING_AD_USED_DOWN_ODD     0x08
+
+EVENT_RING_AK_USED              0x1C  CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3
+UMASK_RING_AK_USED_UP_EVEN      0x01
+UMASK_RING_AK_USED_UP_ODD       0x02
+UMASK_RING_AK_USED_DOWN_EVEN    0x04
+UMASK_RING_AK_USED_DOWN_ODD     0x08
+
+EVENT_RING_BL_USED              0x1D  CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3
+UMASK_RING_BL_USED_UP_EVEN      0x01
+UMASK_RING_BL_USED_UP_ODD       0x02
+UMASK_RING_BL_USED_DOWN_EVEN    0x04
+UMASK_RING_BL_USED_DOWN_ODD     0x08
+
+EVENT_RING_BOUNCES              0x05  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RING_BOUNCES_AK_CORE      0x02
+UMASK_RING_BOUNCES_BL_CORE      0x04
+UMASK_RING_BOUNCES_IV_CORE      0x08
+
+EVENT_RING_IV_USED              0x1E  CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3
+UMASK_RING_IV_USED_ANY          0x0F
+
+EVENT_RING_SRC_THRTL            0x07  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RING_SRC_THRTL            0x00
+
+EVENT_RXR_EXT_STARVED               0x12  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RXR_EXT_STARVED_IRQ           0x01
+UMASK_RXR_EXT_STARVED_IPQ           0x02
+UMASK_RXR_EXT_STARVED_ISMQ          0x04
+UMASK_RXR_EXT_STARVED_ISMQ_BIDS     0x08
+
+EVENT_RXR_INSERTS                0x13  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RXR_INSERTS_IRQ            0x01
+UMASK_RXR_INSERTS_IRQ_REJECTED   0x02
+UMASK_RXR_INSERTS_IPQ            0x04
+UMASK_RXR_INSERTS_VFIFO          0x10
+
+EVENT_RXR_IPQ_RETRY                0x31  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RXR_IPQ_RETRY_ANY            0x01
+UMASK_RXR_IPQ_RETRY_FULL           0x02
+UMASK_RXR_IPQ_RETRY_ADDR_CONFLICT  0x04
+UMASK_RXR_IPQ_RETRY_QPI_CREDITS    0x10
+
+EVENT_RXR_IRQ_RETRY                0x32  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RXR_IRQ_RETRY_ANY            0x01
+UMASK_RXR_IRQ_RETRY_FULL           0x02
+UMASK_RXR_IRQ_RETRY_ADDR_CONFLICT  0x04
+UMASK_RXR_IRQ_RETRY_RTID           0x08
+UMASK_RXR_IRQ_RETRY_QPI_CREDITS    0x10
+
+EVENT_RXR_ISMQ_RETRY                0x33  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RXR_ISMQ_RETRY_ANY            0x01
+UMASK_RXR_ISMQ_RETRY_FULL           0x02
+UMASK_RXR_ISMQ_RETRY_ADDR_CONFLICT  0x04
+UMASK_RXR_ISMQ_RETRY_RTID           0x08
+UMASK_RXR_ISMQ_RETRY_QPI_CREDITS    0x10
+
+EVENT_RXR_OCCUPANCY                0x11  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0
+UMASK_RXR_OCCUPANCY_IRQ            0x01
+UMASK_RXR_OCCUPANCY_IRQ_REJECTED   0x02
+UMASK_RXR_OCCUPANCY_IPQ            0x04
+UMASK_RXR_OCCUPANCY_VFIFO          0x10
+
+EVENT_TOR_INSERTS                    0x35  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+OPTIONS_TOR_INSERTS_OPCODE           EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_OPCODE             0x01
+UMASK_TOR_INSERTS_EVICTION           0x04
+UMASK_TOR_INSERTS_WB                 0x10
+OPTIONS_TOR_INSERTS_MISS_OPCODE      EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_OPCODE        0x03
+UMASK_TOR_INSERTS_MISS_ALL           0x0A
+OPTIONS_TOR_INSERTS_NID_OPCODE       EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_NID_OPCODE         0x41
+OPTIONS_TOR_INSERTS_NID_EVICTION     EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_EVICTION       0x44
+OPTIONS_TOR_INSERTS_NID_ALL          EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_ALL            0x48
+OPTIONS_TOR_INSERTS_NID_WB           EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_WB             0x50
+OPTIONS_TOR_INSERTS_NID_MISS_OPCODE  EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_NID_MISS_OPCODE    0x43
+OPTIONS_TOR_INSERTS_NID_MISS_ALL     EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_MISS_ALL       0x4A
+
+EVENT_TOR_OCCUPANCY                    0x36  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0
+OPTIONS_TOR_OCCUPANCY_OPCODE           EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_OPCODE             0x01
+UMASK_TOR_OCCUPANCY_EVICTION           0x04
+UMASK_TOR_OCCUPANCY_ALL                0x08
+OPTIONS_TOR_OCCUPANCY_MISS_OPCODE      EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_MISS_OPCODE        0x03
+UMASK_TOR_OCCUPANCY_MISS_ALL           0x0A
+OPTIONS_TOR_OCCUPANCY_NID_OPCODE       EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_NID_OPCODE         0x41
+OPTIONS_TOR_OCCUPANCY_NID_EVICTION     EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_EVICTION       0x44
+OPTIONS_TOR_OCCUPANCY_NID_ALL          EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_ALL            0x48
+OPTIONS_TOR_OCCUPANCY_NID_MISS_OPCODE  EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_NID_MISS_OPCODE    0x43
+OPTIONS_TOR_OCCUPANCY_NID_MISS_ALL     EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_MISS_ALL       0x4A
+
+EVENT_TXT_ADS_USED                0x04  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_TXT_ADS_USED            0x00
+
+EVENT_TXT_INSERTS                0x02  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_TXT_INSERTS_AD_CACHE            0x01
+UMASK_TXT_INSERTS_AK_CACHE            0x02
+UMASK_TXT_INSERTS_BL_CACHE            0x04
+UMASK_TXT_INSERTS_IV_CACHE            0x08
+UMASK_TXT_INSERTS_AD_CORE             0x10
+UMASK_TXT_INSERTS_AK_CORE             0x20
+UMASK_TXT_INSERTS_BL_CORE             0x40
+
+EVENT_BBOX_CLOCKTICKS                0x00  BBOX
+UMASK_BBOX_CLOCKTICKS                0x00
+
+EVENT_CONFLICT_CYCLES                0x0B  BBOX
+UMASK_CONFLICT_CYCLES_NO_CONFLICT    0x01
+UMASK_CONFLICT_CYCLES_CONFLICT       0x02
+
+EVENT_DIRECT2CORE_COUNT                0x11  BBOX
+UMASK_DIRECT2CORE_COUNT                0x00
+
+EVENT_DIRECT2CORE_CYCLES_DISABLED      0x12  BBOX
+UMASK_DIRECT2CORE_CYCLES_DISABLED      0x00
+
+EVENT_DIRECT2CORE_TXN_OVERRIDE         0x13  BBOX
+UMASK_DIRECT2CORE_TXN_OVERRIDE         0x00
+
+EVENT_DIRECTORY_LOOKUP             0x0C  BBOX
+UMASK_DIRECTORY_LOOKUP_SNP         0x01
+UMASK_DIRECTORY_LOOKUP_NO_SNP      0x02
+
+EVENT_DIRECTORY_UPDATE             0x0D  BBOX
+UMASK_DIRECTORY_UPDATE_SET         0x01
+UMASK_DIRECTORY_UPDATE_CLEAR       0x02
+UMASK_DIRECTORY_UPDATE_ANY         0x03
+
+EVENT_IGR_NO_CREDIT_CYCLES             0x22  BBOX
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI0     0x01
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI1     0x02
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI0     0x04
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI1     0x08
+
+EVENT_IMC_RETRY     0x1E  BBOX
+UMASK_IMC_RETRY     0x00
+
+EVENT_IMC_WRITES                   0x1A  BBOX
+UMASK_IMC_WRITES_FULL              0x01
+UMASK_IMC_WRITES_PARTIAL           0x02
+UMASK_IMC_WRITES_FULL_ISOCH        0x04
+UMASK_IMC_WRITES_PARTIAL_ISOCH     0x08
+UMASK_IMC_WRITES_ALL               0x0F
+
+EVENT_REQUESTS                   0x01  BBOX
+UMASK_REQUESTS_READS             0x03
+UMASK_REQUESTS_WRITES            0x0C
+
+EVENT_RPQ_CYCLES_NO_REG_CREDITS           0x15  BBOX
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN0      0x01
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN1      0x02
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN2      0x04
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN3      0x08
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_ALL       0x0F
+
+EVENT_TAD_REQUESTS_G0               0x1B  BBOX
+UMASK_TAD_REQUESTS_G0_REGION_0      0x01
+UMASK_TAD_REQUESTS_G0_REGION_1      0x02
+UMASK_TAD_REQUESTS_G0_REGION_2      0x04
+UMASK_TAD_REQUESTS_G0_REGION_3      0x08
+UMASK_TAD_REQUESTS_G0_REGION_4      0x10
+UMASK_TAD_REQUESTS_G0_REGION_5      0x20
+UMASK_TAD_REQUESTS_G0_REGION_6      0x40
+UMASK_TAD_REQUESTS_G0_REGION_7      0x80
+
+EVENT_TAD_REQUESTS_G1               0x1C  BBOX
+UMASK_TAD_REQUESTS_G1_REGION_8      0x01
+UMASK_TAD_REQUESTS_G1_REGION_9      0x02
+UMASK_TAD_REQUESTS_G1_REGION_10      0x04
+UMASK_TAD_REQUESTS_G1_REGION_11      0x08
+
+EVENT_TRACKER_INSERTS                   0x06  BBOX
+UMASK_TRACKER_INSERTS_ALL             0x03
+
+EVENT_TXR_AD                   0x0F  BBOX
+UMASK_TXR_AD_NDR             0x01
+UMASK_TXR_AD_SNP             0x02
+
+EVENT_TXR_AD_CYCLES_FULL                  0x2A  BBOX
+UMASK_TXR_AD_CYCLES_FULL_SCHED0           0x01
+UMASK_TXR_AD_CYCLES_FULL_SCHED1           0x02
+UMASK_TXR_AD_CYCLES_FULL_ALL              0x03
+
+EVENT_TXR_AK_CYCLES_FULL                  0x32  BBOX
+UMASK_TXR_AK_CYCLES_FULL_SCHED0           0x01
+UMASK_TXR_AK_CYCLES_FULL_SCHED1           0x02
+UMASK_TXR_AK_CYCLES_FULL_ALL              0x03
+
+EVENT_TXR_AK_NDR              0x0E  BBOX
+UMASK_TXR_AK_NDR              0x00
+
+EVENT_TXR_BL              0x10  BBOX
+UMASK_TXR_BL_DRS_CACHE    0x01
+UMASK_TXR_BL_DRS_CORE     0x02
+UMASK_TXR_BL_DRS_QPI      0x04
+
+EVENT_TXR_BL_CYCLES_FULL                  0x36  BBOX
+UMASK_TXR_BL_CYCLES_FULL_SCHED0           0x01
+UMASK_TXR_BL_CYCLES_FULL_SCHED1           0x02
+UMASK_TXR_BL_CYCLES_FULL_ALL              0x03
+
+EVENT_WPQ_CYCLES_NO_REG_CREDITS                0x18  BBOX
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0           0x01
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN1           0x02
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN2           0x04
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN3           0x08
+
+EVENT_DRAM_CLOCKTICKS             0x00  MBOX0FIX|MBOX1FIX|MBOX2FIX|MBOX3FIX
+UMASK_DRAM_CLOCKTICKS             0x00
+
+EVENT_ACT_COUNT                  0x01  MBOX
+UMASK_ACT_COUNT                  0x00
+
+EVENT_CAS_COUNT                  0x04  MBOX
+UMASK_CAS_COUNT_RD_REG           0x01
+UMASK_CAS_COUNT_RD_UNDERFILL     0x02
+UMASK_CAS_COUNT_RD               0x03
+UMASK_CAS_COUNT_WR_WMM           0x04
+UMASK_CAS_COUNT_WR_RMM           0x08
+UMASK_CAS_COUNT_WR               0x0C
+UMASK_CAS_COUNT_ALL              0x0F
+
+EVENT_DRAM_PRE_ALL                  0x06  MBOX
+UMASK_DRAM_PRE_ALL                  0x00
+
+EVENT_DRAM_REFRESH                  0x05  MBOX
+UMASK_DRAM_REFRESH_PANIC            0x02
+UMASK_DRAM_REFRESH_HIGH             0x04
+
+EVENT_ECC_CORRECTABLE_ERRORS           0x09  MBOX
+UMASK_ECC_CORRECTABLE_ERRORS           0x00
+
+EVENT_MAJOR_MODES                  0x07  MBOX
+UMASK_MAJOR_MODES_READ             0x01
+UMASK_MAJOR_MODES_WRITE            0x02
+UMASK_MAJOR_MODES_PARTIAL          0x04
+UMASK_MAJOR_MODES_ISOCH            0x08
+
+EVENT_POWER_CHANNEL_DLLOFF           0x84  MBOX
+UMASK_POWER_CHANNEL_DLLOFF           0x00
+
+EVENT_POWER_CHANNEL_PPD           0x85  MBOX
+UMASK_POWER_CHANNEL_PPD           0x00
+
+EVENT_POWER_CKE_CYCLES                  0x83  MBOX
+UMASK_POWER_CKE_CYCLES_RANK0            0x01
+UMASK_POWER_CKE_CYCLES_RANK1            0x02
+UMASK_POWER_CKE_CYCLES_RANK2            0x04
+UMASK_POWER_CKE_CYCLES_RANK3            0x08
+UMASK_POWER_CKE_CYCLES_RANK4            0x10
+UMASK_POWER_CKE_CYCLES_RANK5            0x20
+UMASK_POWER_CKE_CYCLES_RANK6            0x40
+UMASK_POWER_CKE_CYCLES_RANK7            0x80
+
+EVENT_POWER_CRITICAL_THROTTLE_CYCLES           0x86  MBOX
+UMASK_POWER_CRITICAL_THROTTLE_CYCLES           0x00
+
+EVENT_POWER_SELF_REFRESH           0x43  MBOX
+UMASK_POWER_SELF_REFRESH           0x00
+
+EVENT_POWER_THROTTLE_CYCLES                  0x41  MBOX
+UMASK_POWER_THROTTLE_CYCLES_RANK0            0x01
+UMASK_POWER_THROTTLE_CYCLES_RANK1            0x02
+UMASK_POWER_THROTTLE_CYCLES_RANK2            0x04
+UMASK_POWER_THROTTLE_CYCLES_RANK3            0x08
+UMASK_POWER_THROTTLE_CYCLES_RANK4            0x10
+UMASK_POWER_THROTTLE_CYCLES_RANK5            0x20
+UMASK_POWER_THROTTLE_CYCLES_RANK6            0x40
+UMASK_POWER_THROTTLE_CYCLES_RANK7            0x80
+
+EVENT_PREEMPTION           0x08  MBOX
+UMASK_PREEMPTION_RD_PREEMPT_RD           0x01
+UMASK_PREEMPTION_RD_PREEMPT_WR           0x02
+
+EVENT_PRE_COUNT           0x02  MBOX
+UMASK_PRE_COUNT_PAGE_MISS           0x01
+UMASK_PRE_COUNT_PAGE_CLOSE           0x02
+
+EVENT_RPQ_CYCLES_FULL           0x12  MBOX
+UMASK_RPQ_CYCLES_FULL           0x00
+
+EVENT_RPQ_CYCLES_NE           0x11  MBOX
+UMASK_RPQ_CYCLES_NE           0x00
+
+EVENT_RPQ_INSERTS           0x10  MBOX
+UMASK_RPQ_INSERTS           0x00
+
+EVENT_RPQ_OCCUPANCY           0x80  MBOX
+UMASK_RPQ_OCCUPANCY           0x00
+
+EVENT_WPQ_CYCLES_FULL           0x22  MBOX
+UMASK_WPQ_CYCLES_FULL           0x00
+
+EVENT_WPQ_CYCLES_NE           0x21  MBOX
+UMASK_WPQ_CYCLES_NE           0x00
+
+EVENT_WPQ_INSERTS           0x20  MBOX
+UMASK_WPQ_INSERTS           0x00
+
+EVENT_WPQ_OCCUPANCY           0x81  MBOX
+UMASK_WPQ_OCCUPANCY           0x00
+
+EVENT_WPQ_READ_HIT           0x23  MBOX
+UMASK_WPQ_READ_HIT           0x00
+
+EVENT_WPQ_WRITE_HIT           0x24  MBOX
+UMASK_WPQ_WRITE_HIT           0x00
+
+EVENT_WBOX_CLOCKTICKS           0x00  WBOX
+UMASK_WBOX_CLOCKTICKS           0x00
+
+EVENT_CORE0_TRANSITION_CYCLES           0x03  WBOX
+UMASK_CORE0_TRANSITION_CYCLES           0x00
+
+EVENT_CORE1_TRANSITION_CYCLES           0x04  WBOX
+UMASK_CORE1_TRANSITION_CYCLES           0x00
+
+EVENT_CORE2_TRANSITION_CYCLES           0x05  WBOX
+UMASK_CORE2_TRANSITION_CYCLES           0x00
+
+EVENT_CORE3_TRANSITION_CYCLES           0x06  WBOX
+UMASK_CORE3_TRANSITION_CYCLES           0x00
+
+EVENT_CORE4_TRANSITION_CYCLES           0x07  WBOX
+UMASK_CORE4_TRANSITION_CYCLES           0x00
+
+EVENT_CORE5_TRANSITION_CYCLES           0x08  WBOX
+UMASK_CORE5_TRANSITION_CYCLES           0x00
+
+EVENT_CORE6_TRANSITION_CYCLES           0x09  WBOX
+UMASK_CORE6_TRANSITION_CYCLES           0x00
+
+EVENT_CORE7_TRANSITION_CYCLES           0x0A  WBOX
+UMASK_CORE7_TRANSITION_CYCLES           0x00
+
+EVENT_DEMOTIONS_CORE0           0x1E  WBOX
+OPTIONS_DEMOTIONS_CORE0         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE0           0x00
+
+EVENT_DEMOTIONS_CORE1           0x1F  WBOX
+OPTIONS_DEMOTIONS_CORE1         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE1           0x00
+
+EVENT_DEMOTIONS_CORE2           0x20  WBOX
+OPTIONS_DEMOTIONS_CORE2         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE2           0x00
+
+EVENT_DEMOTIONS_CORE3           0x21  WBOX
+OPTIONS_DEMOTIONS_CORE3         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE3           0x00
+
+EVENT_DEMOTIONS_CORE4           0x22  WBOX
+OPTIONS_DEMOTIONS_CORE4         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE4           0x00
+
+EVENT_DEMOTIONS_CORE5           0x23  WBOX
+OPTIONS_DEMOTIONS_CORE5         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE5           0x00
+
+EVENT_DEMOTIONS_CORE6           0x24  WBOX
+OPTIONS_DEMOTIONS_CORE6         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE6           0x00
+
+EVENT_DEMOTIONS_CORE7           0x25  WBOX
+OPTIONS_DEMOTIONS_CORE7         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE7           0x00
+
+EVENT_FREQ_BAND0_CYCLES           0x0B  WBOX
+OPTIONS_FREQ_BAND0_CYCLES         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND0_CYCLES           0x00
+
+EVENT_FREQ_BAND1_CYCLES           0x0C  WBOX
+OPTIONS_FREQ_BAND1_CYCLES         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND1_CYCLES           0x00
+
+EVENT_FREQ_BAND2_CYCLES           0x0D  WBOX
+OPTIONS_FREQ_BAND2_CYCLES         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND2_CYCLES           0x00
+
+EVENT_FREQ_BAND3_CYCLES           0x0E  WBOX
+OPTIONS_FREQ_BAND3_CYCLES         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND3_CYCLES           0x00
+
+EVENT_FREQ_MAX_CURRENT_CYCLES           0x07  WBOX
+UMASK_FREQ_MAX_CURRENT_CYCLES           0x00
+
+EVENT_FREQ_MAX_LIMIT_THERMAL_CYCLES           0x04  WBOX
+UMASK_FREQ_MAX_LIMIT_THERMAL_CYCLES           0x00
+
+EVENT_FREQ_MAX_POWER_CYCLES           0x05  WBOX
+UMASK_FREQ_MAX_POWER_CYCLES           0x00
+
+EVENT_FREQ_MAX_OS_CYCLES           0x06  WBOX
+UMASK_FREQ_MAX_OS_CYCLES           0x00
+
+EVENT_FREQ_MIN_IO_P_CYCLES           0x01  WBOX
+UMASK_FREQ_MIN_IO_P_CYCLES           0x00
+
+EVENT_FREQ_MIN_PERF_P_CYCLES           0x02  WBOX
+UMASK_FREQ_MIN_PERF_P_CYCLES           0x00
+
+EVENT_FREQ_TRANS_CYCLES           0x00  WBOX
+UMASK_FREQ_TRANS_CYCLES           0x00
+
+EVENT_MEMORY_PHASE_SHEDDING_CYCLES           0x2F  WBOX
+UMASK_MEMORY_PHASE_SHEDDING_CYCLES           0x00
+
+EVENT_POWER_STATE_OCCUPANCY           0x80  WBOX
+UMASK_POWER_STATE_OCCUPANCY_CORES_C0           0x40
+UMASK_POWER_STATE_OCCUPANCY_CORES_C3           0x80
+UMASK_POWER_STATE_OCCUPANCY_CORES_C6           0xC0
+
+EVENT_PROCHOT_EXTERNAL_CYCLES           0x0A  WBOX
+UMASK_PROCHOT_EXTERNAL_CYCLES           0x00
+
+EVENT_PROCHOT_INTERNAL_CYCLES           0x09  WBOX
+UMASK_PROCHOT_INTERNAL_CYCLES           0x00
+
+EVENT_TOTAL_TRANSITION_CYCLES           0x0B  WBOX
+UMASK_TOTAL_TRANSITION_CYCLES           0x00
+
+EVENT_VOLT_TRANS_CYCLES_CHANGE           0x03  WBOX
+UMASK_VOLT_TRANS_CYCLES_CHANGE           0x00
+
+EVENT_VOLT_TRANS_CYCLES_DECREASE           0x02  WBOX
+UMASK_VOLT_TRANS_CYCLES_DECREASE           0x00
+
+EVENT_VOLT_TRANS_CYCLES_INCREASE           0x01  WBOX
+UMASK_VOLT_TRANS_CYCLES_INCREASE           0x00
+
+EVENT_VR_HOT_CYCLES           0x32  WBOX
+UMASK_VR_HOT_CYCLES           0x00
+
+EVENT_CORES_IN_C3               0x00 WBOXFIX0
+UMASK_CORES_IN_C3               0x00
+
+EVENT_CORES_IN_C6               0x00 WBOXFIX1
+UMASK_CORES_IN_C6               0x00
+
+EVENT_SBOX_CLOCKTICKS           0x14  SBOX
+UMASK_SBOX_CLOCKTICKS           0x00
+
+EVENT_CTO_COUNT           0x38  SBOX
+UMASK_CTO_COUNT           0x00 0x200000
+
+EVENT_DIRECT2CORE           0x13  SBOX
+UMASK_DIRECT2CORE_SUCCESS             0x01
+UMASK_DIRECT2CORE_FAILURE_CREDITS     0x02
+UMASK_DIRECT2CORE_FAILURE_RBT         0x04
+UMASK_DIRECT2CORE_FAILURE_CREDITS_RBT 0x08
+
+EVENT_L1_POWER_CYCLES           0x12  SBOX
+UMASK_L1_POWER_CYCLES           0x00
+
+EVENT_RXL0P_POWER_CYCLES           0x10  SBOX
+UMASK_RXL0P_POWER_CYCLES           0x00
+
+EVENT_RXL0_POWER_CYCLES           0x0F  SBOX
+UMASK_RXL0_POWER_CYCLES           0x00
+
+EVENT_RXL_BYPASSED           0x09  SBOX
+UMASK_RXL_BYPASSED           0x00
+
+EVENT_RXL_CREDITS_CONSUMED_VN0           0x1E  SBOX
+UMASK_RXL_CREDITS_CONSUMED_VN0_DRS       0x01 0x200000
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCB       0x02 0x200000
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCS       0x04 0x200000
+UMASK_RXL_CREDITS_CONSUMED_VN0_HOM       0x08 0x200000
+UMASK_RXL_CREDITS_CONSUMED_VN0_SNP       0x10 0x200000
+UMASK_RXL_CREDITS_CONSUMED_VN0_NDR       0x20 0x200000
+
+EVENT_RXL_CREDITS_CONSUMED_VNA           0x1D  SBOX
+UMASK_RXL_CREDITS_CONSUMED_VNA           0x00 0x200000
+
+EVENT_RXL_FLITS_G0              0x01  SBOX
+UMASK_RXL_FLITS_G0_IDLE         0x01
+UMASK_RXL_FLITS_G0_DATA         0x02
+UMASK_RXL_FLITS_G0_NON_DATA     0x04
+
+EVENT_RXL_FLITS_G1              0x02  SBOX
+UMASK_RXL_FLITS_G1_SNP          0x01 0x200000
+UMASK_RXL_FLITS_G1_HOM_REQ      0x02 0x200000
+UMASK_RXL_FLITS_G1_HOM_NONREQ   0x04 0x200000
+UMASK_RXL_FLITS_G1_HOM          0x06 0x200000
+UMASK_RXL_FLITS_G1_DRS_DATA     0x08 0x200000
+UMASK_RXL_FLITS_G1_DRS_NONDATA  0x10 0x200000
+UMASK_RXL_FLITS_G1_DRS          0x18 0x200000
+
+EVENT_RXL_FLITS_G2              0x03  SBOX
+UMASK_RXL_FLITS_G2_NDR_AD       0x01 0x200000
+UMASK_RXL_FLITS_G2_NDR_AK       0x02 0x200000
+UMASK_RXL_FLITS_G2_NCB_DATA     0x04 0x200000
+UMASK_RXL_FLITS_G2_NCB_NODATA   0x08 0x200000
+UMASK_RXL_FLITS_G2_NCB          0x0C 0x200000
+UMASK_RXL_FLITS_G2_NCS          0x10 0x200000
+
+EVENT_RXL_INSERTS           0x08  SBOX
+UMASK_RXL_INSERTS           0x00
+
+EVENT_RXL_INSERTS_DRS           0x09  SBOX
+UMASK_RXL_INSERTS_DRS           0x00 0x200000
+
+EVENT_RXL_INSERTS_HOM           0x0C  SBOX
+UMASK_RXL_INSERTS_HOM           0x00 0x200000
+
+EVENT_RXL_INSERTS_NCB           0x0A  SBOX
+UMASK_RXL_INSERTS_NCB           0x00 0x200000
+
+EVENT_RXL_INSERTS_NCS           0x0B  SBOX
+UMASK_RXL_INSERTS_NCS           0x00 0x200000
+
+EVENT_RXL_INSERTS_NDR           0x0E  SBOX
+UMASK_RXL_INSERTS_NDR           0x00 0x200000
+
+EVENT_RXL_INSERTS_SNP           0x0D  SBOX
+UMASK_RXL_INSERTS_SNP           0x00 0x200000
+
+EVENT_RXL_OCCUPANCY           0x0B  SBOX
+UMASK_RXL_OCCUPANCY           0x00
+
+EVENT_RXL_OCCUPANCY_DRS           0x15  SBOX
+UMASK_RXL_OCCUPANCY_DRS           0x00 0x200000
+
+EVENT_RXL_OCCUPANCY_HOM           0x18  SBOX
+UMASK_RXL_OCCUPANCY_HOM           0x00 0x200000
+
+EVENT_RXL_OCCUPANCY_NCB           0x16  SBOX
+UMASK_RXL_OCCUPANCY_NCB           0x00 0x200000
+
+EVENT_RXL_OCCUPANCY_NCS           0x17  SBOX
+UMASK_RXL_OCCUPANCY_NCS           0x00 0x200000
+
+EVENT_RXL_OCCUPANCY_NDR           0x1A  SBOX
+UMASK_RXL_OCCUPANCY_NDR           0x00 0x200000
+
+EVENT_RXL_OCCUPANCY_SNP           0x19  SBOX
+UMASK_RXL_OCCUPANCY_SNP           0x00 0x200000
+
+EVENT_TXL0P_POWER_CYCLES           0x0D  SBOX
+UMASK_TXL0P_POWER_CYCLES           0x00
+
+EVENT_TXL0_POWER_CYCLES           0x0C  SBOX
+UMASK_TXL0_POWER_CYCLES           0x00
+
+EVENT_TXL_BYPASSED           0x05  SBOX
+UMASK_TXL_BYPASSED           0x00
+
+EVENT_TXL_CYCLES_NE           0x06  SBOX
+UMASK_TXL_CYCLES_NE           0x00
+
+EVENT_TXL_FLITS_G0              0x00  SBOX
+UMASK_TXL_FLITS_G0_IDLE         0x01
+UMASK_TXL_FLITS_G0_DATA         0x02
+UMASK_TXL_FLITS_G0_NON_DATA     0x04
+
+EVENT_TXL_FLITS_G1              0x00  SBOX
+UMASK_TXL_FLITS_G1_SNP          0x01 0x200000
+UMASK_TXL_FLITS_G1_HOM_REQ      0x02 0x200000
+UMASK_TXL_FLITS_G1_HOM_NONREQ   0x04 0x200000
+UMASK_TXL_FLITS_G1_HOM          0x06 0x200000
+UMASK_TXL_FLITS_G1_DRS_DATA     0x08 0x200000
+UMASK_TXL_FLITS_G1_DRS_NONDATA  0x10 0x200000
+UMASK_TXL_FLITS_G1_DRS          0x18 0x200000
+
+EVENT_TXL_FLITS_G2              0x01  SBOX
+UMASK_TXL_FLITS_G2_NDR_AD       0x01 0x200000
+UMASK_TXL_FLITS_G2_NDR_AK       0x02 0x200000
+UMASK_TXL_FLITS_G2_NCB_DATA     0x04 0x200000
+UMASK_TXL_FLITS_G2_NCB_NODATA   0x08 0x200000
+UMASK_TXL_FLITS_G2_NCB          0x0C 0x200000
+UMASK_TXL_FLITS_G2_NCS          0x10 0x200000
+
+EVENT_TXL_INSERTS           0x04  SBOX
+UMASK_TXL_INSERTS           0x00
+
+EVENT_TXL_OCCUPANCY           0x07  SBOX
+UMASK_TXL_OCCUPANCY           0x00
+
+EVENT_VNA_CREDIT_RETURNS           0x1C  SBOX
+UMASK_VNA_CREDIT_RETURNS           0x00 0x200000
+
+EVENT_VNA_CREDIT_RETURN_OCCUPANCY           0x1B  SBOX
+UMASK_VNA_CREDIT_RETURN_OCCUPANCY           0x00 0x200000
+
+EVENT_QPI_RATE                  0x00 SBOX0FIX|SBOX1FIX
+UMASK_QPI_RATE                  0x00
+
+EVENT_QPI_SLOW_MODE             0x01 SBOX0FIX|SBOX1FIX
+UMASK_QPI_SLOW_MODE             0x00
+
+EVENT_PBOX_CLOCKTICKS           0x01  PBOX
+UMASK_PBOX_CLOCKTICKS           0x00
+
+EVENT_RING_AD_USED                  0x07  PBOX
+UMASK_RING_AD_USED_CW_EVEN          0x01
+UMASK_RING_AD_USED_CW_ODD           0x02
+UMASK_RING_AD_USED_CCW_EVEN         0x04
+UMASK_RING_AD_USED_CCW_ODD          0x08
+
+EVENT_RING_AK_USED                  0x08  PBOX
+UMASK_RING_AK_USED_CW_EVEN          0x01
+UMASK_RING_AK_USED_CW_ODD           0x02
+UMASK_RING_AK_USED_CCW_EVEN         0x04
+UMASK_RING_AK_USED_CCW_ODD          0x08
+
+EVENT_RING_BL_USED                  0x09  PBOX
+UMASK_RING_BL_USED_CW_EVEN          0x01
+UMASK_RING_BL_USED_CW_ODD           0x02
+UMASK_RING_BL_USED_CCW_EVEN         0x04
+UMASK_RING_BL_USED_CCW_ODD          0x08
+
+EVENT_RING_IV_USED                  0x0A  PBOX
+UMASK_RING_IV_USED_ANY              0x0F
+
+EVENT_RXR_AK_BOUNCES              0x12  PBOX0
+UMASK_RXR_AK_BOUNCES              0x00
+
+EVENT_RXR_CYCLES_NE              0x10  PBOX0|PBOX1
+UMASK_RXR_CYCLES_NE_DRS              0x08
+UMASK_RXR_CYCLES_NE_NCB              0x10
+UMASK_RXR_CYCLES_NE_NCS              0x20
+
+EVENT_TXR_CYCLES_FULL              0x25  PBOX0
+UMASK_TXR_CYCLES_FULL_AD              0x01
+UMASK_TXR_CYCLES_FULL_AK              0x02
+UMASK_TXR_CYCLES_FULL_BL              0x04
+
+EVENT_TXR_CYCLES_NE              0x23  PBOX0
+UMASK_TXR_CYCLES_NE_AD              0x01
+UMASK_TXR_CYCLES_NE_AK              0x02
+UMASK_TXR_CYCLES_NE_BL              0x04
+
+EVENT_TXR_INSERTS         0x24  PBOX0
+UMASK_TXR_INSERTS         0x00
+
+EVENT_RBOX_CLOCKTICKS              0x01  RBOX
+UMASK_RBOX_CLOCKTICKS              0x00
+
+EVENT_IIO_CREDITS_ACQUIRED              0x20  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_IIO_CREDITS_ACQUIRED_DRS          0x08
+UMASK_IIO_CREDITS_ACQUIRED_NCB          0x10
+UMASK_IIO_CREDITS_ACQUIRED_NCS          0x20
+
+EVENT_IIO_CREDITS_REJECT                0x21  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_IIO_CREDITS_REJECT_DRS            0x08
+UMASK_IIO_CREDITS_REJECT_NCB            0x10
+UMASK_IIO_CREDITS_REJECT_NCS            0x20
+
+EVENT_IIO_CREDITS_USED                  0x22  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_IIO_CREDITS_USED_DRS              0x08
+UMASK_IIO_CREDITS_USED_NCB              0x10
+UMASK_IIO_CREDITS_USED_NCS              0x20
+
+EVENT_RING_AD_USED              0x07  RBOX
+UMASK_RING_AD_USED_CW_EVEN      0x01
+UMASK_RING_AD_USED_CW_ODD       0x02
+UMASK_RING_AD_USED_CCW_EVEN     0x04
+UMASK_RING_AD_USED_CCW_ODD      0x08
+
+EVENT_RING_AK_USED              0x08  RBOX
+UMASK_RING_AK_USED_CW_EVEN      0x01
+UMASK_RING_AK_USED_CW_ODD       0x02
+UMASK_RING_AK_USED_CCW_EVEN     0x04
+UMASK_RING_AK_USED_CCW_ODD      0x08
+
+EVENT_RING_BL_USED              0x09  RBOX
+UMASK_RING_BL_USED_CW_EVEN      0x01
+UMASK_RING_BL_USED_CW_ODD       0x02
+UMASK_RING_BL_USED_CCW_EVEN     0x04
+UMASK_RING_BL_USED_CCW_ODD      0x08
+
+EVENT_RING_IV_USED          0x0A  RBOX
+UMASK_RING_IV_USED_ANY      0x0F
+
+EVENT_RXR_BYPASSED          0x12  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_BYPASSED          0x00
+
+EVENT_RXR_CYCLES_NE         0x10  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_CYCLES_NE_HOM     0x01
+UMASK_RXR_CYCLES_NE_SNP     0x02
+UMASK_RXR_CYCLES_NE_NDR     0x04
+UMASK_RXR_CYCLES_NE_DRS     0x08
+UMASK_RXR_CYCLES_NE_NCB     0x10
+UMASK_RXR_CYCLES_NE_NCS     0x20
+
+EVENT_RXR_INSERTS         0x11  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_INSERTS_HOM     0x01
+UMASK_RXR_INSERTS_SNP     0x02
+UMASK_RXR_INSERTS_NDR     0x04
+UMASK_RXR_INSERTS_DRS     0x08
+UMASK_RXR_INSERTS_NCB     0x10
+UMASK_RXR_INSERTS_NCS     0x20
+
+EVENT_RXR_OCCUPANCY         0x13  RBOX0C0|RBOX1C0
+UMASK_RXR_OCCUPANCY_HOM     0x01
+UMASK_RXR_OCCUPANCY_SNP     0x02
+UMASK_RXR_OCCUPANCY_NDR     0x04
+UMASK_RXR_OCCUPANCY_DRS     0x08
+UMASK_RXR_OCCUPANCY_NCB     0x10
+UMASK_RXR_OCCUPANCY_NCS     0x20
+
+EVENT_TXR_CYCLES_FULL       0x25  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_CYCLES_FULL       0x00
+
+EVENT_TXR_CYCLES_NE       0x23  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_CYCLES_NE       0x00
+
+EVENT_TXR_INSERTS         0x24  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1|PBOX0
+UMASK_TXR_INSERTS_HOM     0x01
+UMASK_TXR_INSERTS_SNP     0x02
+UMASK_TXR_INSERTS_NDR     0x04
+UMASK_TXR_INSERTS_DRS     0x08
+UMASK_TXR_INSERTS_NCB     0x10
+UMASK_TXR_INSERTS_NCS     0x20
+
+EVENT_TXR_NACK       0x26  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_NACK       0x00
+
+EVENT_VN0_CREDITS_REJECT      0x37  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_REJECT_HOM     0x01
+UMASK_VN0_CREDITS_REJECT_SNP     0x02
+UMASK_VN0_CREDITS_REJECT_NDR     0x04
+UMASK_VN0_CREDITS_REJECT_DRS     0x08
+UMASK_VN0_CREDITS_REJECT_NCB     0x10
+UMASK_VN0_CREDITS_REJECT_NCS     0x20
+
+EVENT_VN0_CREDITS_USED      0x36  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_USED_HOM     0x01
+UMASK_VN0_CREDITS_USED_SNP     0x02
+UMASK_VN0_CREDITS_USED_NDR     0x04
+UMASK_VN0_CREDITS_USED_DRS     0x08
+UMASK_VN0_CREDITS_USED_NCB     0x10
+UMASK_VN0_CREDITS_USED_NCS     0x20
+
+EVENT_VNA_CREDITS_ACQUIRED      0x33  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_ACQUIRED     0x00
+
+EVENT_VNA_CREDITS_REJECT      0x34  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_REJECT_HOM     0x01
+UMASK_VNA_CREDITS_REJECT_SNP     0x02
+UMASK_VNA_CREDITS_REJECT_NDR     0x04
+UMASK_VNA_CREDITS_REJECT_DRS     0x08
+UMASK_VNA_CREDITS_REJECT_NCB     0x10
+UMASK_VNA_CREDITS_REJECT_NCS     0x20
+
+EVENT_VNA_CREDITS_CYCLES_OUT      0x31  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_CYCLES_OUT     0x00
+
+EVENT_VNA_CREDITS_CYCLES_USED      0x32  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_CYCLES_USED     0x00
+
+EVENT_EVENT_MSG                  0x42 UBOX
+UMASK_EVENT_MSG_VLW_RCVD         0x01
+UMASK_EVENT_MSG_MSI_RCVD         0x02
+UMASK_EVENT_MSG_IPI_RCVD         0x04
+UMASK_EVENT_MSG_DOORBELL_RCVD    0x08
+UMASK_EVENT_MSG_INT_PRIO         0x10
+
+EVENT_LOCK_CYCLES                 0x44 UBOX
+UMASK_LOCK_CYCLES                 0x00
+
+EVENT_UNCORE_CLOCK                 0x0 UBOXFIX
+UMASK_UNCORE_CLOCK                 0x0
diff --git a/src/includes/perfmon_sandybridge_counters.h b/src/includes/perfmon_sandybridge_counters.h
index afe9c04..e8dca5b 100644
--- a/src/includes/perfmon_sandybridge_counters.h
+++ b/src/includes/perfmon_sandybridge_counters.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_sandybridge_counters.h
  *
- *      Description: Counter header file of perfmon module for Sandy Bridge.
+ *      Description: Counter header file of perfmon module for Intel Sandy Bridge.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -28,48 +29,60 @@
  * =======================================================================================
  */
 
-#define NUM_COUNTERS_SANDYBRIDGE 32
-#define NUM_COUNTERS_UNCORE_SANDYBRIDGE 12
+
 #define NUM_COUNTERS_CORE_SANDYBRIDGE 8
+#define NUM_COUNTERS_UNCORE_SANDYBRIDGE 15
+#define NUM_COUNTERS_SANDYBRIDGE 23
+
+#define SNB_VALID_OPTIONS_FIXED EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_ANYTHREAD_MASK
+#define SNB_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK| \
+                            EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define SNB_VALID_OPTIONS_CBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK
+#define SNB_VALID_OPTIONS_UBOX EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK
+
 
-static PerfmonCounterMap sandybridge_counter_map[NUM_COUNTERS_SANDYBRIDGE] = {
+static RegisterMap sandybridge_counter_map[NUM_COUNTERS_SANDYBRIDGE] = {
     /* Fixed Counters: instructions retired, cycles unhalted core */
-    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
-    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
-    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, SNB_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, SNB_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, SNB_VALID_OPTIONS_FIXED},
     /* PMC Counters: 4 48bit wide */
-    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
-    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0},
-    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0},
-    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0},
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, SNB_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, SNB_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, SNB_VALID_OPTIONS_PMC},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, SNB_VALID_OPTIONS_PMC},
     /* Temperature Sensor*/
-    {"TMP0", PMC7, THERMAL, 0, 0, 0, 0},
+    {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
     /* RAPL counters */
-    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
-    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0},
-    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS,  0, 0},
-    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS,  0, 0},
-    /* IMC Counters: 4 48bit wide per memory channel, split in two reads */
-    {"MBOX0C0",PMC12, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX0C1",PMC13, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX0C2",PMC14, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX0C3",PMC15, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX1C0",PMC16, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX1C1",PMC17, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX1C2",PMC18, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX1C3",PMC19, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX2C0",PMC20, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX2C1",PMC21, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX2C2",PMC22, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX2C3",PMC23, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX3C0",PMC24, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX3C1",PMC25, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX3C2",PMC26, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX3C3",PMC27, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX0FIX",PMC28, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX1FIX",PMC29, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX2FIX",PMC30, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX3FIX",PMC31, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_3},
+    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX0C0", PMC12, CBOX0, MSR_UNC_CBO_0_PERFEVTSEL0, MSR_UNC_CBO_0_CTR0, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX0C1", PMC13, CBOX0, MSR_UNC_CBO_0_PERFEVTSEL1, MSR_UNC_CBO_0_CTR1, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX1C0", PMC14, CBOX1, MSR_UNC_CBO_1_PERFEVTSEL0, MSR_UNC_CBO_1_CTR0, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX1C1", PMC15, CBOX1, MSR_UNC_CBO_1_PERFEVTSEL1, MSR_UNC_CBO_1_CTR1, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX2C0", PMC16, CBOX2, MSR_UNC_CBO_2_PERFEVTSEL0, MSR_UNC_CBO_2_CTR0, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX2C1", PMC17, CBOX2, MSR_UNC_CBO_2_PERFEVTSEL1, MSR_UNC_CBO_2_CTR1, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX3C0", PMC18, CBOX3, MSR_UNC_CBO_3_PERFEVTSEL0, MSR_UNC_CBO_3_CTR0, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX3C1", PMC19, CBOX3, MSR_UNC_CBO_3_PERFEVTSEL1, MSR_UNC_CBO_3_CTR1, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"UBOX0", PMC20, UBOX, MSR_UNC_ARB_PERFEVTSEL0, MSR_UNC_ARB_CTR0, 0, 0, SNB_VALID_OPTIONS_UBOX},
+    {"UBOX1", PMC21, UBOX, MSR_UNC_ARB_PERFEVTSEL1, MSR_UNC_ARB_CTR1, 0, 0, SNB_VALID_OPTIONS_UBOX},
+    {"UBOXFIX", PMC22, UBOXFIX, MSR_UNC_PERF_FIXED_CTRL, MSR_UNC_PERF_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
 };
 
+static BoxMap sandybridge_box_map[NUM_UNITS] = {
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, MSR_DEV, 48},
+    [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, MSR_DEV, 48},
+    [THERMAL] = {0, 0, 0, 0, 0, MSR_DEV, 8},
+    [POWER] = {0, 0, 0, 0, 0, MSR_DEV, 32},
+    [CBOX0] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+    [CBOX1] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+    [CBOX2] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+    [CBOX3] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 3, 0, MSR_DEV, 44},
+    [UBOX] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 1, 0, MSR_DEV, 44},
+    [UBOXFIX] = {MSR_UNC_PERF_GLOBAL_CTRL, MSR_UNC_PERF_GLOBAL_STATUS, MSR_UNC_PERF_GLOBAL_OVF_CTRL, 0, 0, MSR_DEV, 44},
+};
+
+
 
diff --git a/src/includes/perfmon_sandybridge_events.txt b/src/includes/perfmon_sandybridge_events.txt
index ec4d397..8bab52b 100644
--- a/src/includes/perfmon_sandybridge_events.txt
+++ b/src/includes/perfmon_sandybridge_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_sandybridge_events.txt
-# 
+#
 #      Description:  Event list for Intel SandyBridge
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -26,8 +27,8 @@
 #
 # =======================================================================================
 
-EVENT_TEMP_CORE          0x00   TMP0
-UMASK_TEMP_CORE          0x00
+EVENT_TEMP_CORE               0x00   TMP0
+UMASK_TEMP_CORE               0x00
 
 EVENT_PWR_PKG_ENERGY          0x00   PWR0
 UMASK_PWR_PKG_ENERGY          0x00
@@ -35,17 +36,20 @@ UMASK_PWR_PKG_ENERGY          0x00
 EVENT_PWR_PP0_ENERGY          0x00   PWR1
 UMASK_PWR_PP0_ENERGY          0x00
 
-EVENT_PWR_DRAM_ENERGY          0x00   PWR3
-UMASK_PWR_DRAM_ENERGY          0x00
+EVENT_PWR_PP1_ENERGY          0x00   PWR2
+UMASK_PWR_PP1_ENERGY          0x00
 
-EVENT_INSTR_RETIRED              0x00   FIXC0
-UMASK_INSTR_RETIRED_ANY          0x00
+EVENT_PWR_DRAM_ENERGY         0x00   PWR3
+UMASK_PWR_DRAM_ENERGY         0x00
 
-EVENT_CPU_CLK_UNHALTED           0x00   FIXC1
-UMASK_CPU_CLK_UNHALTED_CORE      0x00
+EVENT_INSTR_RETIRED           0x00   FIXC0
+UMASK_INSTR_RETIRED_ANY       0x00
 
-EVENT_CPU_CLK_UNHALTED           0x00   FIXC2
-UMASK_CPU_CLK_UNHALTED_REF       0x00
+EVENT_CPU_CLK_UNHALTED        0x00   FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE   0x00
+
+EVENT_CPU_CLK_UNHALTED        0x00   FIXC2
+UMASK_CPU_CLK_UNHALTED_REF    0x00
 
 EVENT_LOAD_BLOCKS                 0x03  PMC
 UMASK_LOAD_BLOCKS_DATA_UNKNOWN    0x01
@@ -58,121 +62,144 @@ UMASK_MISALIGN_MEM_REF_LOAD      0x01
 UMASK_MISALIGN_MEM_REF_STORE     0x02
 UMASK_MISALIGN_MEM_REF_ANY       0x03
 
-EVENT_LD_BLOCKS_PARTIAL      0x07  PMC
-UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS   0x01  PMC
-UMASK_LD_BLOCKS_PARTIAL_ALL_STA_BLOCK   0x08  PMC
+EVENT_LD_BLOCKS_PARTIAL                 0x07  PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS   0x01
+UMASK_LD_BLOCKS_PARTIAL_ALL_STA_BLOCK   0x08
 
-EVENT_DTLB_LOAD_MISSES                0x08  PMC
+EVENT_DTLB_LOAD_MISSES                 0x08  PMC
 UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK   0x01
 UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED  0x02
-UMASK_DTLB_LOAD_MISSES_WALK_DURATION  0x04
-
-EVENT_INT_MISC                  0x0D  PMC
-UMASK_INT_MISC_RECOVERY_CYCLES   0x03 0x41 0x01
-UMASK_INT_MISC_STALL_CYCLES     0x40
-
-EVENT_UOPS_ISSUED                  0x0E  PMC
-UMASK_UOPS_ISSUED_ANY           0x01
-
-EVENT_FP_COMP_OPS_EXE            0x10   PMC
-UMASK_FP_COMP_OPS_EXE_X87       0x01
+UMASK_DTLB_LOAD_MISSES_WALK_DURATION   0x04
+
+EVENT_INT_MISC                       0x0D  PMC
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_INT_MISC_RECOVERY_CYCLES       0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_INT_MISC_RECOVERY_COUNT        0x03
+UMASK_INT_MISC_RAT_STALL_CYCLES      0x40
+DEFAULT_OPTIONS_INT_MISC_RAT_STALL_COUNT EVENT_OPTION_EDGE=1
+UMASK_INT_MISC_RAT_STALL_COUNT       0x40
+
+EVENT_UOPS_ISSUED                     0x0E  PMC
+UMASK_UOPS_ISSUED_ANY                 0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_ACTIVE_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_ACTIVE_CYCLES       0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES        0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES        0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_ANY            0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_ACTIVE_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_ACTIVE_CYCLES  0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_STALL_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_TOTAL_CYCLES   0x01
+
+EVENT_FP_COMP_OPS_EXE                          0x10   PMC
+UMASK_FP_COMP_OPS_EXE_X87                      0x01
 UMASK_FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE     0x10
 UMASK_FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE     0x20
 UMASK_FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE     0x40
 UMASK_FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE     0x80
 
-EVENT_SIMD_FP_256_PACKED       0x11   PMC
+EVENT_SIMD_FP_256_PACKED            0x11   PMC
 UMASK_SIMD_FP_256_PACKED_SINGLE     0x01
 UMASK_SIMD_FP_256_PACKED_DOUBLE     0x02
 
 EVENT_ARITH                      0x14   PMC
 UMASK_ARITH_FPU_DIV_ACTIVE       0x01
-UMASK_ARITH_NUM_DIV              0x01 0xC5 0x01
+DEFAULT_OPTIONS_ARITH_NUM_DIV    EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_ARITH_NUM_DIV              0x01
 
 EVENT_INSTS_WRITTEN_TO_IQ            0x17   PMC
-UMASK_INSTS_WRITTEN_TO_IQ_INSTS        0x01
-
-EVENT_L2_RQSTS                   0x24   PMC
-UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD_HIT 0x01
-UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD 0x03
-UMASK_L2_RQSTS_RFO_HITS           0x04
-UMASK_L2_RQSTS_RFO_MISS          0x08
-UMASK_L2_RQSTS_RFO_ANY           0x0C
-UMASK_L2_RQSTS_CODE_RD_HITS        0x10
-UMASK_L2_RQSTS_CODE_RD_MISS       0x20
-UMASK_L2_RQSTS_ALL_CODE_CODE_RD   0x30
-UMASK_L2_RQSTS_PF_HIT      0x40
-UMASK_L2_RQSTS_PF_MISS     0x80
-UMASK_L2_RQSTS_ALL_PF        0xC0
-UMASK_L2_RQSTS_MISS              0xAA
+UMASK_INSTS_WRITTEN_TO_IQ_INSTS      0x01
+
+EVENT_L2_RQSTS                          0x24   PMC
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD_HIT  0x01
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD_MISS 0x02
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD      0x03
+UMASK_L2_RQSTS_RFO_HITS                 0x04
+UMASK_L2_RQSTS_RFO_MISS                 0x08
+UMASK_L2_RQSTS_RFO_ANY                  0x0C
+UMASK_L2_RQSTS_CODE_RD_HITS             0x10
+UMASK_L2_RQSTS_CODE_RD_MISS             0x20
+UMASK_L2_RQSTS_ALL_CODE_CODE_RD         0x30
+UMASK_L2_RQSTS_PF_HIT                   0x40
+UMASK_L2_RQSTS_PF_MISS                  0x80
+UMASK_L2_RQSTS_ALL_PF                   0xC0
+UMASK_L2_RQSTS_MISS                     0xAA
 
 EVENT_L2_STORE_LOCK_RQSTS            0x27   PMC
 UMASK_L2_STORE_LOCK_RQSTS_MISS       0x01
-UMASK_L2_STORE_LOCK_RQSTS_HIT_E       0x04
-UMASK_L2_STORE_LOCK_RQSTS_HIT_M       0x08
+UMASK_L2_STORE_LOCK_RQSTS_HIT_E      0x04
+UMASK_L2_STORE_LOCK_RQSTS_HIT_M      0x08
 UMASK_L2_STORE_LOCK_RQSTS_ALL        0x0F
 
 EVENT_L1D_WB_RQST                  0x28   PMC
-UMASK_L1D_WB_RQST_HIT_E          0x04
-UMASK_L1D_WB_RQST_HIT_M          0x08
+UMASK_L1D_WB_RQST_HIT_E            0x04
+UMASK_L1D_WB_RQST_HIT_M            0x08
 
 EVENT_L3_LAT_CACHE               0x2E   PMC
 UMASK_L3_LAT_CACHE_REFERENCE     0x4F
 UMASK_L3_LAT_CACHE_MISS          0x41
 
-EVENT_CPU_CLOCK_UNHALTED         0x3C   PMC
+EVENT_CPU_CLOCK_UNHALTED           0x3C   PMC
 UMASK_CPU_CLOCK_UNHALTED_THREAD_P  0x00
 UMASK_CPU_CLOCK_UNHALTED_REF_P     0x01
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x2,EVENT_OPTION_INVERT=0x1
+UMASK_CPU_CLOCK_UNHALTED_TOTAL_CYCLES   0x00
 
 EVENT_L1D_PEND_MISS              0x48   PMC1
 UMASK_L1D_PEND_MISS_PENDING      0x01
 
-EVENT_DTLB_STORE_MISSES                0x49   PMC
-UMASK_DTLB_STORE_MISSES_MISS_CAUSES_A_WALK   0x01
-UMASK_DTLB_STORE_MISSES_WALK_COMPLETED       0x02
-UMASK_DTLB_STORE_MISSES_WALK_DURATION       0x04
-UMASK_DTLB_STORE_MISSES_STLB_HIT             0x10
+EVENT_DTLB_STORE_MISSES                 0x49   PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK   0x01
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED  0x02
+UMASK_DTLB_STORE_MISSES_WALK_DURATION   0x04
+UMASK_DTLB_STORE_MISSES_STLB_HIT        0x10
 
-EVENT_LOAD_HIT_PRE               0x4C    PMC
+EVENT_LOAD_HIT_PRE                     0x4C    PMC
 UMASK_LOAD_HIT_PRE_SW_PF               0x01
 UMASK_LOAD_HIT_PRE_HW_PF               0x02
 
 EVENT_HW_PRE_REQ               0x4E    PMC
 UMASK_HW_PRE_REQ_DL1_MISS      0x02
 
-EVENT_L1D                        0x51   PMC
+EVENT_L1D                         0x51   PMC
 UMASK_L1D_REPLACEMENT             0x01
 UMASK_L1D_ALLOCATED_IN_M          0x02
 UMASK_L1D_M_EVICT                 0x04
 UMASK_L1D_ALL_M_REPLACEMENT       0x08
 
-EVENT_PARTIAL_RAT_STALLS               0x59    PMC
+EVENT_PARTIAL_RAT_STALLS                   0x59    PMC
 UMASK_PARTIAL_RAT_STALLS_FLAGS_MERGE_UOP   0x20
 UMASK_PARTIAL_RAT_STALLS_SLOW_LEA_WINDOW   0x40
-UMASK_PARTIAL_RAT_STALLS_MUL_SINGLE_UOP   0x80
+UMASK_PARTIAL_RAT_STALLS_MUL_SINGLE_UOP    0x80
 
-EVENT_RESOURCE_STALLS2               0x5B    PMC
+EVENT_RESOURCE_STALLS2                  0x5B    PMC
 UMASK_RESOURCE_STALLS2_ALL_FL_EMPTY     0x0C
-UMASK_RESOURCE_STALLS2_ALL_PRF_CONTROL     0x0F
-UMASK_RESOURCE_STALLS2_BOB_FULL     0x40
-UMASK_RESOURCE_STALLS2_OOO_RSRC     0x4F
+UMASK_RESOURCE_STALLS2_ALL_PRF_CONTROL  0x0F
+UMASK_RESOURCE_STALLS2_BOB_FULL         0x40
+UMASK_RESOURCE_STALLS2_OOO_RSRC         0x4F
 
 EVENT_CPL_CYCLES               0x5C    PMC
-UMASK_CPL_CYCLES_RING0             0x01
-UMASK_CPL_CYCLES_RING123             0x02
+UMASK_CPL_CYCLES_RING0         0x01
+UMASK_CPL_CYCLES_RING123       0x02
 
-EVENT_RS_EVENTS               0x5E    PMC
+EVENT_RS_EVENTS                 0x5E    PMC
 UMASK_RS_EVENTS_EMPTY_CYCLES    0x01
 
-EVENT_OFFCORE_REQUESTS_OUTSTANDING          0x60   PMC
+EVENT_OFFCORE_REQUESTS_OUTSTANDING                  0x60   PMC
 UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD   0x01
 UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD   0x02
-UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO   0x04
-UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD   0x08
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO       0x04
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD      0x08
 
-EVENT_CACHE_LOCK_CYCLES          0x63   PMC
-UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION      0x01
-UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_DURATION       0x02
+EVENT_CACHE_LOCK_CYCLES                             0x63   PMC
+UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION 0x01
+UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_DURATION         0x02
 
 EVENT_IDQ               0x79   PMC
 UMASK_IDQ_EMPTY         0x02
@@ -182,8 +209,8 @@ UMASK_IDQ_MS_DSB_UOPS   0x10
 UMASK_IDQ_MS_MITE_UOPS  0x20
 UMASK_IDQ_MS_UOPS       0x30
 
-EVENT_ICACHE                  0x80   PMC
-UMASK_ICACHE_HITS             0x01
+EVENT_ICACHE                    0x80   PMC
+UMASK_ICACHE_HITS               0x01
 UMASK_ICACHE_MISSES             0x02
 UMASK_ICACHE_ACCESSES           0x03
 UMASK_ICACHE_IFETCH_STALL       0x04
@@ -192,54 +219,58 @@ EVENT_ITLB_MISSES                 0x85      PMC
 UMASK_ITLB_MISSES_CAUSES_A_WALK   0x01
 UMASK_ITLB_MISSES_WALK_COMPLETED  0x02
 UMASK_ITLB_MISSES_WALK_DURATION   0x04
-UMASK_ITLB_MISSES_STLB_HIT   0x10
+UMASK_ITLB_MISSES_STLB_HIT        0x10
 
 EVENT_ILD_STALL                 0x87      PMC
 UMASK_ILD_STALL_LCP             0x01
 UMASK_ILD_STALL_IQ_FULL         0x04
 
-EVENT_BR_INST_EXEC               0x88   PMC
-UMASK_BR_INST_EXEC_COND_TAKEN          0x81
-UMASK_BR_INST_EXEC_COND_NON_TAKEN      0x41
-UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN        0x82
-UMASK_BR_INST_EXEC_DIRECT_JMP_NON_TAKEN        0x42
+EVENT_BR_INST_EXEC                                     0x88   PMC
+UMASK_BR_INST_EXEC_COND_TAKEN                          0x81
+UMASK_BR_INST_EXEC_COND_NON_TAKEN                      0x41
+UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN                    0x82
+UMASK_BR_INST_EXEC_DIRECT_JMP_NON_TAKEN                0x42
 UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN     0x84
-UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN     0x44
-UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN           0x88
-UMASK_BR_INST_EXEC_RETURN_NEAR_NON_TAKEN           0x48
-UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN      0x90
-UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_NON_TAKEN      0x50
-UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN    0xA0 
-UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN    0x60 
-UMASK_BR_INST_EXEC_ALL_BRANCHES                   0xFF 
-
-EVENT_BR_MISP_EXEC                    0x89   PMC
-UMASK_BR_MISP_EXEC_COND_TAKEN               0x81
-UMASK_BR_MISP_EXEC_COND_NON_TAKEN               0x41
-UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN  0x84
-UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN  0x44
-UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN        0x88
-UMASK_BR_MISP_EXEC_RETURN_NEAR_NON_TAKEN        0x48
-UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN   0x90
-UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_NON_TAKEN   0x50
-UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
-UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN 0x60
-UMASK_BR_MISP_EXEC_ALL_BRANCHES       0xFF
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN 0x44
+UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN                   0x88
+UMASK_BR_INST_EXEC_RETURN_NEAR_NON_TAKEN               0x48
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN              0x90
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_NON_TAKEN          0x50
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN            0xA0
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN        0x60
+UMASK_BR_INST_EXEC_ALL_BRANCHES                        0xFF
+
+EVENT_BR_MISP_EXEC                                     0x89   PMC
+UMASK_BR_MISP_EXEC_COND_TAKEN                          0x81
+UMASK_BR_MISP_EXEC_COND_NON_TAKEN                      0x41
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN     0x84
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN 0x44
+UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN                   0x88
+UMASK_BR_MISP_EXEC_RETURN_NEAR_NON_TAKEN               0x48
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN              0x90
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_NON_TAKEN          0x50
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN            0xA0
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN        0x60
+UMASK_BR_MISP_EXEC_ALL_BRANCHES                        0xFF
 
 EVENT_IDQ_UOPS_NOT_DELIVERED                    0x9C   PMC
 UMASK_IDQ_UOPS_NOT_DELIVERED_CORE               0x01
 
-EVENT_UOPS_DISPATCHED_PORT                 0xA1   PMC
+EVENT_UOPS_DISPATCHED_PORT                  0xA1   PMC
 UMASK_UOPS_DISPATCHED_PORT_PORT_0           0x01
 UMASK_UOPS_DISPATCHED_PORT_PORT_1           0x02
 UMASK_UOPS_DISPATCHED_PORT_PORT_2_LD        0x04
 UMASK_UOPS_DISPATCHED_PORT_PORT_2_STA       0x08
 UMASK_UOPS_DISPATCHED_PORT_PORT_2           0x0C
-UMASK_UOPS_DISPATCHED_PORT_PORT_3_LD           0x10
-UMASK_UOPS_DISPATCHED_PORT_PORT_3_STA           0x20
+UMASK_UOPS_DISPATCHED_PORT_PORT_3_LD        0x10
+UMASK_UOPS_DISPATCHED_PORT_PORT_3_STA       0x20
 UMASK_UOPS_DISPATCHED_PORT_PORT_3           0x30
 UMASK_UOPS_DISPATCHED_PORT_PORT_4           0x40
 UMASK_UOPS_DISPATCHED_PORT_PORT_5           0x80
+DEFAULT_OPTIONS_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE 0x83
+UMASK_UOPS_DISPATCHED_PORT_DATA_PORTS       0x7C
+UMASK_UOPS_DISPATCHED_PORT_ALL_PORTS        0xFF
 
 EVENT_RESOURCE_STALLS                 0xA2   PMC
 UMASK_RESOURCE_STALLS_ANY             0x01
@@ -258,48 +289,99 @@ UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES   0x02
 EVENT_DSB_FILL                         0xAC   PMC
 UMASK_DSB_FILL_OTHER_CANCEL            0x02
 UMASK_DSB_FILL_EXCEED_DSB_LINES        0x08
-UMASK_DSB_FILL_ALL_CANCEL        0x0A
+UMASK_DSB_FILL_ALL_CANCEL              0x0A
 
 EVENT_ITLB                         0xAE   PMC
-UMASK_ITLB_ITLB_FLUSH            0x01
+UMASK_ITLB_ITLB_FLUSH              0x01
 
-EVENT_OFFCORE_REQUESTS     0xB0   PMC
+EVENT_OFFCORE_REQUESTS                  0xB0   PMC
 UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD   0x01
 UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD   0x02
 UMASK_OFFCORE_REQUESTS_DEMAND_RFO       0x04
 UMASK_OFFCORE_REQUESTS_ALL_DATA_RD      0x08
 
-EVENT_UOPS_DISPATCHED               0xB1   PMC
-UMASK_UOPS_DISPATCHED_THREAD            0x01
-UMASK_UOPS_DISPATCHED_CORE              0x02
-
-EVENT_OFFCORE_REQUESTS_BUFFER     0xB2  PMC
+EVENT_UOPS_EXECUTED                       0xB1   PMC
+UMASK_UOPS_EXECUTED_THREAD                0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_USED_CYCLES           0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_STALL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_TOTAL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC 0x01
+UMASK_UOPS_EXECUTED_CORE                       0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_USED_CYCLES           0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES          0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_TOTAL_CYCLES          0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC 0x02
+
+EVENT_OFFCORE_REQUESTS_BUFFER             0xB2  PMC
 UMASK_OFFCORE_REQUESTS_BUFFER_SQ_FULL     0x01
 
-EVENT_AGU_BYPASS_CANCEL          0xB6  PMC
+EVENT_AGU_BYPASS_CANCEL           0xB6  PMC
 UMASK_AGU_BYPASS_CANCEL_COUNT     0x01
 
-EVENT_TLB_FLUSH          0xBD  PMC
+EVENT_TLB_FLUSH                 0xBD  PMC
 UMASK_TLB_FLUSH_DTLB_THREAD     0x01
 UMASK_TLB_FLUSH_STLB_ANY        0x20
 
-EVENT_L1D_BLOCKS          0xBF  PMC
-UMASK_L1D_BLOCKS_BANK_CONFLICT_CYCLES    0x05 0x41 0x01
+EVENT_L1D_BLOCKS                         0xBF  PMC
+DEFAULT_OPTIONS_L1D_BLOCKS_BANK_CONFLICT_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_L1D_BLOCKS_BANK_CONFLICT_CYCLES    0x05
+DEFAULT_OPTIONS_L1D_BLOCKS_BANK_CONFLICT_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_L1D_BLOCKS_BANK_CONFLICT_COUNT     0x05
 
-EVENT_INST_RETIRED                  0xC0  PMC0
-UMASK_INST_RETIRED_ANY_P            0x00
+EVENT_INST_RETIRED                        0xC0  PMC0
+UMASK_INST_RETIRED_ANY_P                  0x00
 UMASK_INST_RETIRED_PREC_DIST              0x01
 
-EVENT_OTHER_ASSISTS                  0xC1  PMC
+EVENT_OTHER_ASSISTS                       0xC1  PMC
 UMASK_OTHER_ASSISTS_ITLB_MISS_RETIRED     0x02
 UMASK_OTHER_ASSISTS_AVX_TO_SSE            0x10
 UMASK_OTHER_ASSISTS_SSE_TO_AVX            0x20
 
-EVENT_UOPS_RETIRED                  0xC2  PMC
-UMASK_UOPS_RETIRED_ALL              0x01
-UMASK_UOPS_RETIRED_RETIRE_SLOTS     0x02
-
-EVENT_MACHINE_CLEARS              0xC3  PMC
+EVENT_UOPS_RETIRED                       0xC2  PMC
+UMASK_UOPS_RETIRED_ALL                   0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL              0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS          0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES           0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL              0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_USED_CYCLES      0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_STALL_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES     0x01
+
+EVENT_MACHINE_CLEARS                    0xC3  PMC
 UMASK_MACHINE_CLEARS_MEMORY_ORDERING    0x02
 UMASK_MACHINE_CLEARS_SMC                0x04
 UMASK_MACHINE_CLEARS_MASKMOV            0x20
@@ -308,7 +390,6 @@ EVENT_BR_INST_RETIRED               0xC4  PMC
 UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x00
 UMASK_BR_INST_RETIRED_CONDITIONAL   0x01
 UMASK_BR_INST_RETIRED_NEAR_CALL     0x02
-UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x04
 UMASK_BR_INST_RETIRED_NEAR_RETURN   0x08
 UMASK_BR_INST_RETIRED_NOT_TAKEN     0x10
 UMASK_BR_INST_RETIRED_NEAR_TAKEN    0x20
@@ -316,18 +397,17 @@ UMASK_BR_INST_RETIRED_FAR_BRANCH    0x40
 
 EVENT_BR_MISP_RETIRED               0xC5  PMC
 UMASK_BR_MISP_RETIRED_ALL_BRANCHES  0x00
-UMASK_BR_MISP_RETIRED_CONDITIONAL  0x01
+UMASK_BR_MISP_RETIRED_CONDITIONAL   0x01
 UMASK_BR_MISP_RETIRED_NEAR_CALL     0x02
-UMASK_BR_MISP_RETIRED_ALL_BRANCHES     0x04
-UMASK_BR_MISP_RETIRED_NOT_TAKEN      0x10
-UMASK_BR_MISP_RETIRED_TAKEN      0x20
+UMASK_BR_MISP_RETIRED_NOT_TAKEN     0x10
+UMASK_BR_MISP_RETIRED_TAKEN         0x20
 
 EVENT_FP_ASSIST               0xCA  PMC
-UMASK_FP_ASSIST_X87_OUTPUT               0x02
-UMASK_FP_ASSIST_X87_INPUT                0x04
-UMASK_FP_ASSIST_SIMD_OUTPUT               0x08
-UMASK_FP_ASSIST_SIMD_INPUT               0x10
-UMASK_FP_ASSIST_ANY               0x1E
+UMASK_FP_ASSIST_X87_OUTPUT    0x02
+UMASK_FP_ASSIST_X87_INPUT     0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT   0x08
+UMASK_FP_ASSIST_SIMD_INPUT    0x10
+UMASK_FP_ASSIST_ANY           0x1E
 
 EVENT_HW_INTERRUPTS_RECEIVED               0xCB  PMC
 UMASK_HW_INTERRUPTS_RECEIVED               0x01
@@ -335,29 +415,30 @@ UMASK_HW_INTERRUPTS_RECEIVED               0x01
 EVENT_ROB_MISC_EVENT_LBR_INSERTS               0xCC  PMC
 UMASK_ROB_MISC_EVENT_LBR_INSERTS               0x20
 
-EVENT_MEM_UOP_RETIRED            0xD0    PMC
-UMASK_MEM_UOP_RETIRED_LOADS            0x81
-UMASK_MEM_UOP_RETIRED_STORES           0x82
-UMASK_MEM_UOP_RETIRED_LOADS_STLB_MISS         0x11
-UMASK_MEM_UOP_RETIRED_STORES_STLB_MISS        0x12
-UMASK_MEM_UOP_RETIRED_LOADS_LOCK              0x21
-UMASK_MEM_UOP_RETIRED_STORES_LOCK             0x22
-UMASK_MEM_UOP_RETIRED_LOADS_SPLIT             0x41
-UMASK_MEM_UOP_RETIRED_STORES_SPLIT            0x42
+EVENT_MEM_UOPS_RETIRED                  0xD0    PMC
+UMASK_MEM_UOPS_RETIRED_LOADS            0x81
+UMASK_MEM_UOPS_RETIRED_STORES           0x82
+UMASK_MEM_UOPS_RETIRED_LOADS_STLB_MISS  0x11
+UMASK_MEM_UOPS_RETIRED_STORES_STLB_MISS 0x12
+UMASK_MEM_UOPS_RETIRED_LOADS_LOCK       0x21
+UMASK_MEM_UOPS_RETIRED_STORES_LOCK      0x22
+UMASK_MEM_UOPS_RETIRED_LOADS_SPLIT      0x41
+UMASK_MEM_UOPS_RETIRED_STORES_SPLIT     0x42
 
 EVENT_MEM_LOAD_UOPS_RETIRED               0xD1   PMC
-UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT       0x01
-UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS      0x08
-UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL       0x09
-UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT       0x02
-UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS      0x10
-UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL       0x12
-UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT       0x04
-UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS      0x20
-UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL       0x24
-UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB      0x40
-
-EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED               0xD2   PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT        0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS       0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL        0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT        0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS       0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL        0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT        0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS       0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL        0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB       0x40
+UMASK_MEM_LOAD_UOPS_RETIRED_ALL           0x7F
+
+EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED                   0xD2   PMC
 UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_MISS         0x01
 UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT          0x02
 UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM         0x04
@@ -366,152 +447,125 @@ UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NONE         0x08
 EVENT_MEM_LOAD_UOPS_MISC_RETIRED               0xD4   PMC
 UMASK_MEM_LOAD_UOPS_MISC_RETIRED_LLC_MISS      0x02
 
-EVENT_L2_TRANS               0xF0  PMC
-UMASK_L2_TRANS_DEMAND_DATA_RD          0x01
-UMASK_L2_TRANS_RFO           0x02
-UMASK_L2_TRANS_CODE_RD       0x04
-UMASK_L2_TRANS_ALL_PREF      0x08
-UMASK_L2_TRANS_L1D_WB        0x10
-UMASK_L2_TRANS_L2_FILL       0x20
-UMASK_L2_TRANS_L2_WB         0x40
-UMASK_L2_TRANS_ALL_REQUESTS  0x80
-
-EVENT_L2_LINES_IN                   0xF1   PMC
+EVENT_L2_TRANS                0xF0  PMC
+UMASK_L2_TRANS_DEMAND_DATA_RD 0x01
+UMASK_L2_TRANS_RFO            0x02
+UMASK_L2_TRANS_CODE_RD        0x04
+UMASK_L2_TRANS_ALL_PREF       0x08
+UMASK_L2_TRANS_L1D_WB         0x10
+UMASK_L2_TRANS_L2_FILL        0x20
+UMASK_L2_TRANS_L2_WB          0x40
+UMASK_L2_TRANS_ALL_REQUESTS   0x80
+
+EVENT_L2_LINES_IN             0xF1   PMC
 UMASK_L2_LINES_IN_I           0x01
-UMASK_L2_LINES_IN_S            0x02
+UMASK_L2_LINES_IN_S           0x02
 UMASK_L2_LINES_IN_E           0x04
-UMASK_L2_LINES_IN_ALL               0x07
+UMASK_L2_LINES_IN_ALL         0x07
 
 EVENT_L2_LINES_OUT                  0xF2   PMC
 UMASK_L2_LINES_OUT_DEMAND_CLEAN     0x01
 UMASK_L2_LINES_OUT_DEMAND_DIRTY     0x02
-UMASK_L2_LINES_OUT_PF_CLEAN   0x04
-UMASK_L2_LINES_OUT_PF_DIRTY   0x08
-UMASK_L2_LINES_OUT_DIRTY_ALL              0x0A
+UMASK_L2_LINES_OUT_PF_CLEAN         0x04
+UMASK_L2_LINES_OUT_PF_DIRTY         0x08
+UMASK_L2_LINES_OUT_DIRTY_ALL        0x0A
+UMASK_L2_LINES_OUT_CLEAN_ALL        0x05
+UMASK_L2_LINES_OUT_ALL              0x0F
 
 EVENT_SQ_MISC                         0xF4  PMC
 UMASK_SQ_MISC_SPLIT_LOCK              0x10
 
-EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED               0xD2  PMC
-UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_MISS              0x01
-UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT              0x02
-UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM              0x04
-UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NON              0x08
-
-EVENT_MEM_TRANS_RETIRED_LOAD_LATENCY          0xCD  PMC
-UMASK_MEM_TRANS_RETIRED_LOAD_LATENCY           0x01
-
-EVENT_MEM_LOAD_UOPS_RETIRED          0xD1  PMC
-UMASK_MEM_LOAD_UOPS_RETIRED_LLC_HIT           0x04
-UMASK_MEM_LOAD_UOPS_RETIRED_LLC_MISS           0x20
-
-EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED          0xD2  PMC
-UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_MISS           0x01
-UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT            0x02
-UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM           0x04
-UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NONE           0x08
-
-EVENT_MEM_LOAD_UOPS_LLC_MISS_RETIRED          0xD3  PMC
-UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_LOCAL_DRAM           0x01
-UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_DRAM            0x04
-
-EVENT_DRAM_CLOCKTICKS             0x00  MBOX0FIX|MBOX1FIX|MBOX2FIX|MBOX3FIX
-UMASK_DRAM_CLOCKTICKS             0x00
-
-EVENT_ACT_COUNT                  0x01  MBOX
-UMASK_ACT_COUNT                  0x00
-
-EVENT_CAS_COUNT                  0x04  MBOX
-UMASK_CAS_COUNT_RD_REF           0x01
-UMASK_CAS_COUNT_RD_UNDERFILL     0x02
-UMASK_CAS_COUNT_RD               0x03
-UMASK_CAS_COUNT_WR_WMM           0x04
-UMASK_CAS_COUNT_WR_RMM           0x08
-UMASK_CAS_COUNT_WR               0x0C
-UMASK_CAS_COUNT_ALL              0x0F
-
-EVENT_DRAM_PRE_ALL                  0x06  MBOX
-UMASK_DRAM_PRE_ALL                  0x00
-
-EVENT_DRAM_REFRESH                  0x05  MBOX
-UMASK_DRAM_REFRESH_PANIC            0x02
-UMASK_DRAM_REFRESH_HIGH             0x04
-
-EVENT_ECC_CORRECTABLE_ERRORS           0x09  MBOX
-UMASK_ECC_CORRECTABLE_ERRORS           0x00
-
-EVENT_MAJOR_MODES                  0x07  MBOX
-UMASK_MAJOR_MODES_READ             0x01
-UMASK_MAJOR_MODES_WRITE            0x02
-UMASK_MAJOR_MODES_PARTIAL          0x04
-UMASK_MAJOR_MODES_ISOCH            0x08
-
-EVENT_POWER_CHANNEL_DLLOFF           0x84  MBOX
-UMASK_POWER_CHANNEL_DLLOFF           0x00
-
-EVENT_POWER_CHANNEL_PPD           0x85  MBOX
-UMASK_POWER_CHANNEL_PPD           0x00
-
-EVENT_POWER_CKE_CYCLES                  0x83  MBOX
-UMASK_POWER_CKE_CYCLES_RANK0            0x01
-UMASK_POWER_CKE_CYCLES_RANK1            0x02
-UMASK_POWER_CKE_CYCLES_RANK2            0x04
-UMASK_POWER_CKE_CYCLES_RANK3            0x08
-UMASK_POWER_CKE_CYCLES_RANK4            0x10
-UMASK_POWER_CKE_CYCLES_RANK5            0x20
-UMASK_POWER_CKE_CYCLES_RANK6            0x40
-UMASK_POWER_CKE_CYCLES_RANK7            0x80
-
-EVENT_POWER_CRITICAL_THROTTLE_CYCLES           0x86  MBOX
-UMASK_POWER_CRITICAL_THROTTLE_CYCLES           0x00
-
-EVENT_POWER_SELF_REFRESH           0x43  MBOX
-UMASK_POWER_SELF_REFRESH           0x00
-
-EVENT_POWER_THROTTLE_CYCLES                  0x41  MBOX
-UMASK_POWER_THROTTLE_CYCLES_RANK0            0x01
-UMASK_POWER_THROTTLE_CYCLES_RANK1            0x02
-UMASK_POWER_THROTTLE_CYCLES_RANK2            0x04
-UMASK_POWER_THROTTLE_CYCLES_RANK3            0x08
-UMASK_POWER_THROTTLE_CYCLES_RANK4            0x10
-UMASK_POWER_THROTTLE_CYCLES_RANK5            0x20
-UMASK_POWER_THROTTLE_CYCLES_RANK6            0x40
-UMASK_POWER_THROTTLE_CYCLES_RANK7            0x80
-
-EVENT_PREEMPTION           0x08  MBOX
-UMASK_PREEMPTION_RD_PREEMPT_RD           0x01
-UMASK_PREEMPTION_RD_PREEMPT_WR           0x02
-
-EVENT_PRE_COUNT           0x02  MBOX
-UMASK_PRE_COUNT_PAGE_MISS           0x01
-UMASK_PRE_COUNT_PAGE_CLOSE           0x02
-
-EVENT_RPQ_CYCLES_FULL           0x12  MBOX
-UMASK_RPQ_CYCLES_FULL           0x00
-
-EVENT_RPQ_CYCLES_NE           0x11  MBOX
-UMASK_RPQ_CYCLES_NE           0x00
-
-EVENT_RPQ_INSERTS           0x10  MBOX
-UMASK_RPQ_INSERTS           0x00
-
-EVENT_RPQ_OCCUPANCY           0x80  MBOX
-UMASK_RPQ_OCCUPANCY           0x00
-
-EVENT_WPQ_CYCLES_FULL           0x22  MBOX
-UMASK_WPQ_CYCLES_FULL           0x00
-
-EVENT_WPQ_CYCLES_NE           0x21  MBOX
-UMASK_WPQ_CYCLES_NE           0x00
-
-EVENT_WPQ_INSERTS           0x20  MBOX
-UMASK_WPQ_INSERTS           0x00
-
-EVENT_WPQ_OCCUPANCY           0x81  MBOX
-UMASK_WPQ_OCCUPANCY           0x00
-
-EVENT_WPQ_READ_HIT           0x23  MBOX
-UMASK_WPQ_READ_HIT           0x00
-
-EVENT_WPQ_WRITE_HIT           0x24  MBOX
-UMASK_WPQ_WRITE_HIT           0x00
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1                            0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_CACHE_LOOKUP                          0x34 CBOX
+UMASK_CACHE_LOOKUP_M                        0x01
+UMASK_CACHE_LOOKUP_E                        0x02
+UMASK_CACHE_LOOKUP_S                        0x04
+UMASK_CACHE_LOOKUP_I                        0x08
+UMASK_CACHE_LOOKUP_READ_FILTER              0x10
+UMASK_CACHE_LOOKUP_WRITE_FILTER             0x20
+UMASK_CACHE_LOOKUP_EXTSNP_FILTER            0x40
+UMASK_CACHE_LOOKUP_ANY_REQUEST_FILTER       0x80
+UMASK_CACHE_LOOKUP_READ_M                   0x11
+UMASK_CACHE_LOOKUP_WRITE_M                  0x21
+UMASK_CACHE_LOOKUP_EXTSNP_M                 0x41
+UMASK_CACHE_LOOKUP_ANY_M                    0x81
+UMASK_CACHE_LOOKUP_READ_E                   0x12
+UMASK_CACHE_LOOKUP_WRITE_E                  0x22
+UMASK_CACHE_LOOKUP_EXTSNP_E                 0x42
+UMASK_CACHE_LOOKUP_ANY_E                    0x82
+UMASK_CACHE_LOOKUP_READ_S                   0x14
+UMASK_CACHE_LOOKUP_WRITE_S                  0x24
+UMASK_CACHE_LOOKUP_EXTSNP_S                 0x44
+UMASK_CACHE_LOOKUP_ANY_S                    0x84
+UMASK_CACHE_LOOKUP_READ_ES                  0x16
+UMASK_CACHE_LOOKUP_WRITE_ES                 0x26
+UMASK_CACHE_LOOKUP_EXTSNP_ES                0x46
+UMASK_CACHE_LOOKUP_ANY_ES                   0x86
+UMASK_CACHE_LOOKUP_READ_I                   0x18
+UMASK_CACHE_LOOKUP_WRITE_I                  0x28
+UMASK_CACHE_LOOKUP_EXTSNP_I                 0x48
+UMASK_CACHE_LOOKUP_ANY_I                    0x88
+UMASK_CACHE_LOOKUP_READ_MESI                0x1F
+UMASK_CACHE_LOOKUP_WRITE_MESI               0x2F
+UMASK_CACHE_LOOKUP_EXTSNP_MESI              0x4F
+UMASK_CACHE_LOOKUP_ANY_MESI                 0x8F
+
+EVENT_XSNP_RESPONSE                         0x22 CBOX
+UMASK_XSNP_RESPONSE_MISS_EXTERNAL           0x21
+UMASK_XSNP_RESPONSE_MISS_XCORE              0x41
+UMASK_XSNP_RESPONSE_MISS_EVICTION           0x81
+UMASK_XSNP_RESPONSE_HIT_EXTERNAL            0x24
+UMASK_XSNP_RESPONSE_HIT_XCORE               0x44
+UMASK_XSNP_RESPONSE_HIT_EVICTION            0x84
+UMASK_XSNP_RESPONSE_HITM_EXTERNAL           0x28
+UMASK_XSNP_RESPONSE_HITM_XCORE              0x48
+UMASK_XSNP_RESPONSE_HITM_EVICTION           0x88
+
+EVENT_TRK_OCCUPANCY_ALL                     0x80 UBOX0
+UMASK_TRK_OCCUPANCY_ALL                     0x01
+
+EVENT_TRK_REQUESTS                          0x81 UBOX
+UMASK_TRK_REQUESTS_ALL                      0x01
+UMASK_TRK_REQUESTS_WRITES                   0x20
+
+EVENT_COH_TRK_OCCUPANCY                     0x83 UBOX0
+UMASK_COH_TRK_OCCUPANCY                     0x01
+
+EVENT_COH_TRK_REQUESTS                      0x84 UBOX
+UMASK_COH_TRK_REQUESTS_ALL                  0x01
+
+EVENT_UNCORE_CLOCK                          0x00 UBOXFIX
+UMASK_UNCORE_CLOCK                          0x01
diff --git a/src/includes/perfmon_silvermont.h b/src/includes/perfmon_silvermont.h
index 9cfd6f1..980d528 100644
--- a/src/includes/perfmon_silvermont.h
+++ b/src/includes/perfmon_silvermont.h
@@ -3,15 +3,15 @@
  *
  *      Filename:  perfmon_silvermont.h
  *
- *      Description:  Header file of perfmon module for Intel Atom Silvermont
+ *      Description:  Header file of perfmon module for Intel Atom (Silvermont)
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,264 +29,491 @@
  */
  
 #include <perfmon_silvermont_events.h>
-#include <perfmon_silvermont_groups.h>
 #include <perfmon_silvermont_counters.h>
 
 static int perfmon_numCountersSilvermont = NUM_COUNTERS_SILVERMONT;
-static int perfmon_numGroupsSilvermont = NUM_GROUPS_SILVERMONT;
+static int perfmon_numCoreCountersSilvermont = NUM_COUNTERS_SILVERMONT;
 static int perfmon_numArchEventsSilvermont = NUM_ARCH_EVENTS_SILVERMONT;
 
 
-void perfmon_init_silvermont(PerfmonThread *thread)
+int perfmon_init_silvermont(int cpu_id)
 {
-    uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
     lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
-
-    /* Initialize registers */
-    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL);
-    msr_write(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL);
-
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
+    lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    return 0;
 }
 
-void perfmon_setupCounterThread_silvermont(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+uint32_t svm_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
 {
-    int haveLock = 0;
-    uint64_t flags = 0x0ULL;
-    uint32_t uflags;
-    uint64_t reg = silvermont_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
-    uint64_t orig_fixed_flags = fixed_flags;
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
-
-    switch (silvermont_counter_map[index].type)
+    uint32_t flags = (1ULL<<(1+(index*4)));
+    if (event->numberOfOptions > 0)
     {
-        case PMC:
-
-            flags = (1<<16)|(1<<22);
-            flags &= ~(0xFFFFU);   /* clear lower 16bits */
+        for(int i=0;i<event->numberOfOptions;i++)
+        {
+            switch(event->options[i].type)
+            {
+                case EVENT_OPTION_ANYTHREAD:
+                    flags |= (1ULL<<(2+(index*4)));
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<(index*4));
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    return flags;
+}
 
-            /* Intel with standard 8 bit event mask: [7:0] */
-            flags |= (event->umask<<8) + event->eventId;
+int svm_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint64_t flags = 0x0ULL;
+    uint64_t offcore_flags = 0x0ULL;
 
 
+    flags |= (1ULL<<16)|(1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+    /* For event id 0xB7 the cmask must be written in an extra register */
+    if ((event->cmask != 0x00) && (event->eventId != 0xB7))
+    {
+        flags |= (event->cmask << 24);
+    }
+    /* set custom cfgbits */
+    if ((event->cfgBits != 0x00) && (event->eventId != 0xB7))
+    {
+        flags |= (event->cfgBits << 16);
+    }
 
-            if (perfmon_verbose)
+    if (event->numberOfOptions > 0)
+    {
+        for(int i=0;i<event->numberOfOptions;i++)
+        {
+            switch(event->options[i].type)
             {
-                printf("[%d] perfmon_setup_counter PMC: Write Register 0x%llX , Flags: 0x%llX \n",
-                        cpu_id,
-                        LLU_CAST reg,
-                        LLU_CAST flags);
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_ANYTHREAD:
+                    flags |= (1ULL<<21);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[i].value & 0xFFULL)<<24;
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    offcore_flags |= (event->options[i].value & 0xFFFFULL);
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    if ((event->eventId == 0xB7) && (event->umask == 0x01))
+                    {
+                        offcore_flags |= (event->options[i].value & 0x768005ULL)<<16;
+                    }
+                    else if ((event->eventId == 0xB7) && (event->umask == 0x02))
+                    {
+                        offcore_flags |= (event->options[i].value & 0x368005ULL)<<16;
+                    }
+                    break;
+                default:
+                    break;
             }
-            msr_write(cpu_id, reg , flags);
-
-            // Offcore event with additional configuration register
-            // We included the additional register as counterRegister2
-            // to avoid creating a new data structure
-            // cfgBits contain offset of "request type" bit
-            // cmask contain offset of "response type" bit
-            if (event->eventId == 0xB7) 
+        }
+    }
+
+    // Offcore event with additional configuration register
+    // cfgBits contain offset of "request type" bit
+    // cmask contain offset of "response type" bit
+    if (event->eventId == 0xB7)
+    {
+        uint32_t reg = 0x0;
+        if (event->umask == 0x01)
+        {
+            reg = MSR_OFFCORE_RESP0;
+        }
+        else if (event->umask == 0x02)
+        {
+            reg = MSR_OFFCORE_RESP1;
+        }
+        if (reg)
+        {
+            if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
             {
-                if (event->umask == 0x01)
-                {
-                    reg = MSR_OFFCORE_RESP0;
-                }
-                else if (event->umask == 0x02)
-                {
-                    reg = MSR_OFFCORE_RESP1;
-                }
-                flags = 0x0ULL;
-                flags = (1<<event->cfgBits)|(1<<event->cmask);
-                msr_write(cpu_id, reg , flags);
+                offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
             }
+            VERBOSEPRINTREG(cpu_id, reg, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , offcore_flags));
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+int perfmon_setupCountersThread_silvermont(
+        int thread_id,
+        PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    uint64_t flags = 0x0ULL;
+    uint64_t fixed_flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL));
+    }
 
-            break;
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        flags = 0x0ULL;
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
+        switch (type)
+        {
+            case PMC:
+                svm_pmc_setup(cpu_id, index, event);
+                break;
 
-        case FIXED:
-            fixed_flags |= (2ULL<<(index*4));
-            break;
+            case FIXED:
+                fixed_flags |= svm_fixed_setup(cpu_id, index, event);
+                break;
 
-        case POWER:
-            break;
+            case POWER:
+                break;
 
-        default:
-            /* should never be reached */
-            break;
+            default:
+                break;
+        }
     }
-    if (fixed_flags != orig_fixed_flags)
+    if (fixed_flags > 0x0)
     {
-        msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
     }
+    return 0;
 }
 
 
-void perfmon_startCountersThread_silvermont(int thread_id)
+
+
+int perfmon_startCountersThread_silvermont(int thread_id, PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
+    uint64_t tmp;
     uint64_t flags = 0x0ULL;
-    uint32_t uflags = 0x10000UL; /* Clear freeze bit */
-    uint64_t fixed_flags = 0x0ULL;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-
-    for ( int i=0; i<perfmon_numCountersSilvermont; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            switch (silvermont_counter_map[i].type)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            tmp = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            eventSet->events[i].threadCounter[thread_id].startData = 0;
+            eventSet->events[i].threadCounter[thread_id].counterData = 0;
+            switch (type)
             {
                 case PMC:
-                    msr_write(cpu_id, silvermont_counter_map[i].counterRegister, 0x0ULL);
-                    flags |= (1<<(i-OFFSET_PMC));  /* enable counter */
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+                    flags |= (1<<(index-cpuid_info.perf_num_fixed_ctr));  /* enable counter */
                     break;
 
                 case FIXED:
-                    msr_write(cpu_id, silvermont_counter_map[i].counterRegister, 0x0ULL);
-                    flags |= (1ULL<<(i+32));  /* enable fixed counter */
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+                    flags |= (1ULL<<(index+32));  /* enable fixed counter */
                     break;
 
                 case POWER:
                     if(haveLock)
                     {
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                            power_read(cpu_id, silvermont_counter_map[i].counterRegister);
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&tmp));
+                        eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
                     }
-
                     break;
 
                 default:
-                    /* should never be reached */
                     break;
             }
+            eventSet->events[i].threadCounter[thread_id].counterData = eventSet->events[i].threadCounter[thread_id].startData;
         }
     }
 
-    if (perfmon_verbose)
-    {
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_PERF_GLOBAL_CTRL, LLU_CAST flags);
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_UNCORE_PERF_GLOBAL_CTRL, LLU_CAST uflags);
-    }
-    if (flags != 0x0ULL)
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
     {
-        msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, flags));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, UNFREEZE_PMC_OR_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
     }
+
+    return 0;
 }
 
 
-void perfmon_stopCountersThread_silvermont(int thread_id)
+int perfmon_stopCountersThread_silvermont(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t flags;
-    uint32_t uflags = 0x10100UL; /* Set freeze bit */
     uint64_t counter_result = 0x0ULL;
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_OR_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
 
-    for ( int i=0; i < perfmon_numCountersSilvermont; i++ ) 
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE) 
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) 
         {
-            switch (silvermont_counter_map[i].type)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            switch (type)
             {
                 case PMC:
-
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                    {
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index - cpuid_info.perf_num_fixed_ctr)))
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL,
+                                                    (1ULL<<(index - cpuid_info.perf_num_fixed_ctr))));
+                        }
+                    }
+                    break;
                 case FIXED:
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        (double)msr_read(cpu_id, silvermont_counter_map[i].counterRegister);
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                    {
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index + 32)))
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<(index + 32))));
+                        }
+                    }
                     break;
 
                 case POWER:
                     if(haveLock)
                     {
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                            power_info.energyUnit *
-                            ( power_read(cpu_id, silvermont_counter_map[i].counterRegister) -
-                              perfmon_threadData[thread_id].counters[i].counterData);
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+                        if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                        }
                     }
                     break;
 
                 case THERMAL:
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                             thermal_read(cpu_id);
+                    CHECK_TEMP_READ_ERROR(thermal_read(cpu_id, (uint32_t*)&counter_result));
                     break;
 
                 default:
-                    /* should never be reached */
                     break;
             }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
         }
     }
-
-    flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
-    //    printf ("Status: 0x%llX \n", LLU_CAST flags);
-    if ( (flags & 0x3) || (flags & (0x3ULL<<32)) ) 
-    {
-        printf ("Overflow occured \n");
-    }
+    return 0;
 }
 
-void perfmon_readCountersThread_silvermont(int thread_id)
+int perfmon_readCountersThread_silvermont(int thread_id, PerfmonEventSet* eventSet)
 {
     uint64_t counter_result = 0x0ULL;
+    uint64_t pmc_flags = 0x0ULL;
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    for ( int i=0; i<perfmon_numCountersSilvermont; i++ )
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &pmc_flags));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_OR_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if ((silvermont_counter_map[i].type == PMC) ||
-                    (silvermont_counter_map[i].type == FIXED))
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, silvermont_counter_map[i].counterRegister);
+                continue;
             }
-            else
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            switch (type)
             {
-                if(haveLock)
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                    {
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index - cpuid_info.perf_num_fixed_ctr)))
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL,
+                                                    (1ULL<<(index - cpuid_info.perf_num_fixed_ctr))));
+                        }
+                    }
+                    break;
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                    {
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index + 32)))
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<(index + 32))));
+                        }
+                    }
+                    break;
+
+                case POWER:
+                    if(haveLock)
+                    {
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+                        if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                        }
+                    }
+                    break;
+
+                case THERMAL:
+                    CHECK_TEMP_READ_ERROR(thermal_read(cpu_id, (uint32_t*)&counter_result));
+                    break;
+
+                default:
+                    break;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
+        }
+    }
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, pmc_flags));
+    }
+    return 0;
+}
+
+
+int perfmon_finalizeCountersThread_silvermont(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    int haveTileLock = 0;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62);
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+    {
+        haveTileLock = 1;
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        uint64_t reg = counter_map[index].configRegister;
+        PciDeviceIndex dev = counter_map[index].device;
+        switch (type)
+        {
+            case PMC:
+                ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+                if ((haveTileLock) && (event->eventId == 0xB7))
                 {
-                    switch (silvermont_counter_map[i].type)
+                    if (event->umask == 0x1)
                     {
-                        case POWER:
-                            perfmon_threadData[thread_id].counters[i].counterData =
-                                power_info.energyUnit *
-                                power_read(cpu_id, silvermont_counter_map[i].counterRegister);
-                            break;
-
-                        default:
-                            /* should never be reached */
-                            break;
+                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+                    }
+                    else if (event->umask == 0x2)
+                    {
+                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
                     }
                 }
-            }
+                break;
+            case FIXED:
+                ovf_values_core |= (1ULL<<(index+32));
+                break;
+            default:
+                break;
+        }
+        if ((reg) && ((dev == MSR_DEV) || (haveLock)))
+        {
+            VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
         }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+    return 0;
 }
diff --git a/src/includes/perfmon_silvermont_counters.h b/src/includes/perfmon_silvermont_counters.h
index 266ee4b..f04c87b 100644
--- a/src/includes/perfmon_silvermont_counters.h
+++ b/src/includes/perfmon_silvermont_counters.h
@@ -3,15 +3,15 @@
  *
  *      Filename:  perfmon_silvermont_counters.h
  *
- *      Description: Counter header file of perfmon module for Silvermont.
+ *      Description: Counter header file of perfmon module for Intel Atom (Silvermont)
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -27,24 +27,33 @@
  *
  * =======================================================================================
  */
+#include <registers.h>
 
 #define NUM_COUNTERS_CORE_SILVERMONT 6
 #define NUM_COUNTERS_UNCORE_SILVERMONT 0
 #define NUM_COUNTERS_SILVERMONT 8
 
-static PerfmonCounterMap silvermont_counter_map[NUM_COUNTERS_SILVERMONT] = {
+#define SVM_VALID_OPTIONS_FIXED EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_ANYTHREAD_MASK
+#define SVM_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_THRESHOLD_MASK
+
+static RegisterMap silvermont_counter_map[NUM_COUNTERS_SILVERMONT] = {
     /* Fixed Counters: instructions retired, cycles unhalted core */
-    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
-    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
-    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, SVM_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, SVM_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, SVM_VALID_OPTIONS_FIXED},
     /* PMC Counters: 4 48bit wide */
-    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
-    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0},
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, MSR_OFFCORE_RESP0, 0, SVM_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, MSR_OFFCORE_RESP1, 0, SVM_VALID_OPTIONS_PMC},
     /* Temperature Sensor*/
-    {"TMP0", PMC5, THERMAL, 0, 0, 0, 0},
+    {"TMP0", PMC5, THERMAL, 0, IA32_THERM_STATUS, 0, 0},
     /* RAPL counters */
     {"PWR0", PMC6, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
-    {"PWR1", PMC7, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0},
+    {"PWR1", PMC7, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0}
 };
 
-
+static BoxMap silvermont_box_map[NUM_UNITS] = {
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+    [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+    [THERMAL] = {0, 0, 0, 0, 0, MSR_DEV, 8},
+    [POWER] = {0, 0, 0, 0, 0, MSR_DEV, 32}
+};
diff --git a/src/includes/perfmon_silvermont_events.txt b/src/includes/perfmon_silvermont_events.txt
index b8a088d..5b2d1a7 100644
--- a/src/includes/perfmon_silvermont_events.txt
+++ b/src/includes/perfmon_silvermont_events.txt
@@ -1,16 +1,16 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_silvermont_events.txt
-# 
+#
 #      Description:  Event list for Intel Atom (Silvermont)
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Thomas Roehl (tr), thomas.roehl at googlemail.com
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -64,12 +64,16 @@ UMASK_MEM_UOPS_RETIRED_HITM         0x20
 UMASK_MEM_UOPS_RETIRED_ALL_LOADS    0x40
 UMASK_MEM_UOPS_RETIRED_ALL_STORES   0x80
 
-EVENT_PAGE_WALKS                    0x05 PMC
-UMASK_PAGE_WALKS_D_SIDE_CYCLES      0x01
-UMASK_PAGE_WALKS_I_SIDE_CYCLES      0x02
-UMASK_PAGE_WALKS_WALKS              0x03
+EVENT_PAGE_WALKS                    0x05  PMC
+UMASK_PAGE_WALKS_DTLB_COUNT         0x01 0x04 0x00
+UMASK_PAGE_WALKS_DTLB_CYCLES        0x01
+UMASK_PAGE_WALKS_ITLB_COUNT         0x02 0x04 0x00
+UMASK_PAGE_WALKS_ITLB_CYCLES        0x02
+UMASK_PAGE_WALKS_COUNT              0x03 0x04 0x00
+UMASK_PAGE_WALKS_CYCLES             0x03
+
 
-EVENT_LONGEST_LAT_CACHE             0x2E PMC
+EVENT_LONGEST_LAT_CACHE             0x2E  PMC
 UMASK_LONGEST_LAT_CACHE_MISS        0x41
 UMASK_LONGEST_LAT_CACHE_REFERENCE   0x4F
 
@@ -83,305 +87,15 @@ EVENT_CPU_CLK_UNHALTED              0x3C PMC
 UMASK_CPU_CLK_UNHALTED_CORE_P       0x00
 UMASK_CPU_CLK_UNHALTED_REF_P        0x01
 
-EVENT_ICACHE                        0x80 PMC
-UMASK_ICACHE_HIT                    0x01
-UMASK_ICACHE_MISSES                 0x02
-UMASK_ICACHE_ACCESSES               0x03
-UMASK_ICACHE_IFETCH_STALL           0x04
+EVENT_ICACHE                  0x80   PMC
+UMASK_ICACHE_HITS             0x01
+UMASK_ICACHE_MISSES             0x02
+UMASK_ICACHE_ACCESSES           0x03
+UMASK_ICACHE_IFETCH_STALL       0x04
 
 EVENT_NIP_STALL                     0xB6 PMC
 UMASK_NIP_STALL_ICACHE_MISS         0x04
 
-EVENT_OFFCORE_RESPONSE              0xB7 PMC
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_L2_HIT        0x01 0x00 0x12
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_SNP_NONE      0x01 0x00 0x1F
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_SNOOP_MISS    0x01 0x00 0x21
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_SNOOP_HIT     0x01 0x00 0x22
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_HITM          0x01 0x00 0x24
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_NON_DRAM      0x01 0x00 0x25
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_AVG_LAT       0x01 0x00 0x26
-
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY           0x01 0x01 0x10
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_L2_HIT        0x01 0x01 0x12
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_SNP_NONE      0x01 0x01 0x1F
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_SNOOP_MISS    0x01 0x01 0x21
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_SNOOP_HIT     0x01 0x01 0x22
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_HITM          0x01 0x01 0x24
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_NON_DRAM      0x01 0x01 0x25
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_AVG_LAT       0x01 0x01 0x26
-
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_ANY           0x01 0x02 0x10
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_L2_HIT        0x01 0x02 0x12
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_SNP_NONE      0x01 0x02 0x1F
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_SNOOP_MISS    0x01 0x02 0x21
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_SNOOP_HIT     0x01 0x02 0x22
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_HITM          0x01 0x02 0x24
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_NON_DRAM      0x01 0x02 0x25
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_AVG_LAT       0x01 0x02 0x26
-
-UMASK_OFFCORE_RESPONSE_0_WB_ANY           0x01 0x03 0x10
-UMASK_OFFCORE_RESPONSE_0_WB_L2_HIT        0x01 0x03 0x12
-UMASK_OFFCORE_RESPONSE_0_WB_SNP_NONE      0x01 0x03 0x1F
-UMASK_OFFCORE_RESPONSE_0_WB_SNOOP_MISS    0x01 0x03 0x21
-UMASK_OFFCORE_RESPONSE_0_WB_SNOOP_HIT     0x01 0x03 0x22
-UMASK_OFFCORE_RESPONSE_0_WB_HITM          0x01 0x03 0x24
-UMASK_OFFCORE_RESPONSE_0_WB_NON_DRAM      0x01 0x03 0x25
-UMASK_OFFCORE_RESPONSE_0_WB_AVG_LAT       0x01 0x03 0x26
-
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_ANY           0x01 0x04 0x10
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_L2_HIT        0x01 0x04 0x12
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_SNP_NONE      0x01 0x04 0x1F
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_SNOOP_MISS    0x01 0x04 0x21
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_SNOOP_HIT     0x01 0x04 0x22
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_HITM          0x01 0x04 0x24
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_NON_DRAM      0x01 0x04 0x25
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_AVG_LAT       0x01 0x04 0x26
-
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_ANY           0x01 0x05 0x10
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_L2_HIT        0x01 0x05 0x12
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_SNP_NONE      0x01 0x05 0x1F
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_SNOOP_MISS    0x01 0x05 0x21
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_SNOOP_HIT     0x01 0x05 0x22
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_HITM          0x01 0x05 0x24
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_NON_DRAM      0x01 0x05 0x25
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_AVG_LAT       0x01 0x05 0x26
-
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_ANY           0x01 0x06 0x10
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_L2_HIT        0x01 0x06 0x12
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_SNP_NONE      0x01 0x06 0x1F
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_SNOOP_MISS    0x01 0x06 0x21
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_SNOOP_HIT     0x01 0x06 0x22
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_HITM          0x01 0x06 0x24
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_NON_DRAM      0x01 0x06 0x25
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_AVG_LAT       0x01 0x06 0x26
-
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_ANY           0x01 0x07 0x10
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_L2_HIT        0x01 0x07 0x12
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_SNP_NONE      0x01 0x07 0x1F
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_SNOOP_MISS    0x01 0x07 0x21
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_SNOOP_HIT     0x01 0x07 0x22
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_HITM          0x01 0x07 0x24
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_NON_DRAM      0x01 0x07 0x25
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_AVG_LAT       0x01 0x07 0x26
-
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_ANY           0x01 0x08 0x10
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_L2_HIT        0x01 0x08 0x12
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_SNP_NONE      0x01 0x08 0x1F
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_SNOOP_MISS    0x01 0x08 0x21
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_SNOOP_HIT     0x01 0x08 0x22
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_HITM          0x01 0x08 0x24
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_NON_DRAM      0x01 0x08 0x25
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_AVG_LAT       0x01 0x08 0x26
-
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_ANY           0x01 0x09 0x10
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_L2_HIT        0x01 0x09 0x12
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_SNP_NONE      0x01 0x09 0x1F
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_SNOOP_MISS    0x01 0x09 0x21
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_SNOOP_HIT     0x01 0x09 0x22
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_HITM          0x01 0x09 0x24
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_NON_DRAM      0x01 0x09 0x25
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_AVG_LAT       0x01 0x09 0x26
-
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_ANY           0x01 0x0A 0x10
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_L2_HIT        0x01 0x0A 0x12
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_SNP_NONE      0x01 0x0A 0x1F
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_SNOOP_MISS    0x01 0x0A 0x21
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_SNOOP_HIT     0x01 0x0A 0x22
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_HITM          0x01 0x0A 0x24
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_NON_DRAM      0x01 0x0A 0x25
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_AVG_LAT       0x01 0x0A 0x26
-
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_ANY           0x01 0x0B 0x10
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_L2_HIT        0x01 0x0B 0x12
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_SNP_NONE      0x01 0x0B 0x1F
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_SNOOP_MISS    0x01 0x0B 0x21
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_SNOOP_HIT     0x01 0x0B 0x22
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_HITM          0x01 0x0B 0x24
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_NON_DRAM      0x01 0x0B 0x25
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_AVG_LAT       0x01 0x0B 0x26
-
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_ANY           0x01 0x0C 0x10
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_L2_HIT        0x01 0x0C 0x12
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_SNP_NONE      0x01 0x0C 0x1F
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_SNOOP_MISS    0x01 0x0C 0x21
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_SNOOP_HIT     0x01 0x0C 0x22
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_HITM          0x01 0x0C 0x24
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_NON_DRAM      0x01 0x0C 0x25
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_AVG_LAT       0x01 0x0C 0x26
-
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_ANY           0x01 0x0D 0x10
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_L2_HIT        0x01 0x0D 0x12
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_SNP_NONE      0x01 0x0D 0x1F
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_SNOOP_MISS    0x01 0x0D 0x21
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_SNOOP_HIT     0x01 0x0D 0x22
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_HITM          0x01 0x0D 0x24
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_NON_DRAM      0x01 0x0D 0x25
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_AVG_LAT       0x01 0x0D 0x26
-
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_ANY           0x01 0x0E 0x10
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_L2_HIT        0x01 0x0E 0x12
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_SNP_NONE      0x01 0x0E 0x1F
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_SNOOP_MISS    0x01 0x0E 0x21
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_SNOOP_HIT     0x01 0x0E 0x22
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_HITM          0x01 0x0E 0x24
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_NON_DRAM      0x01 0x0E 0x25
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_AVG_LAT       0x01 0x0E 0x26
-
-UMASK_OFFCORE_RESPONSE_0_ANY_ANY           0x01 0x0F 0x10
-UMASK_OFFCORE_RESPONSE_0_ANY_L2_HIT        0x01 0x0F 0x12
-UMASK_OFFCORE_RESPONSE_0_ANY_SNP_NONE      0x01 0x0F 0x1F
-UMASK_OFFCORE_RESPONSE_0_ANY_SNOOP_MISS    0x01 0x0F 0x21
-UMASK_OFFCORE_RESPONSE_0_ANY_SNOOP_HIT     0x01 0x0F 0x22
-UMASK_OFFCORE_RESPONSE_0_ANY_HITM          0x01 0x0F 0x24
-UMASK_OFFCORE_RESPONSE_0_ANY_NON_DRAM      0x01 0x0F 0x25
-UMASK_OFFCORE_RESPONSE_0_ANY_AVG_LAT       0x01 0x0F 0x26
-
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x02 0x00 0x10
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_L2_HIT        0x02 0x00 0x12
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_SNP_NONE      0x02 0x00 0x1F
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_SNOOP_MISS    0x02 0x00 0x21
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_SNOOP_HIT     0x02 0x00 0x22
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_HITM          0x02 0x00 0x24
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_NON_DRAM      0x02 0x00 0x25
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_AVG_LAT       0x02 0x00 0x26
-
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY           0x02 0x01 0x10
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_L2_HIT        0x02 0x01 0x12
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_SNP_NONE      0x02 0x01 0x1F
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_SNOOP_MISS    0x02 0x01 0x21
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_SNOOP_HIT     0x02 0x01 0x22
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_HITM          0x02 0x01 0x24
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_NON_DRAM      0x02 0x01 0x25
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_AVG_LAT       0x02 0x01 0x26
-
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_ANY           0x02 0x02 0x10
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_L2_HIT        0x02 0x02 0x12
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_SNP_NONE      0x02 0x02 0x1F
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_SNOOP_MISS    0x02 0x02 0x21
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_SNOOP_HIT     0x02 0x02 0x22
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_HITM          0x02 0x02 0x24
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_NON_DRAM      0x02 0x02 0x25
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_AVG_LAT       0x02 0x02 0x26
-
-UMASK_OFFCORE_RESPONSE_1_WB_ANY           0x02 0x03 0x10
-UMASK_OFFCORE_RESPONSE_1_WB_L2_HIT        0x02 0x03 0x12
-UMASK_OFFCORE_RESPONSE_1_WB_SNP_NONE      0x02 0x03 0x1F
-UMASK_OFFCORE_RESPONSE_1_WB_SNOOP_MISS    0x02 0x03 0x21
-UMASK_OFFCORE_RESPONSE_1_WB_SNOOP_HIT     0x02 0x03 0x22
-UMASK_OFFCORE_RESPONSE_1_WB_HITM          0x02 0x03 0x24
-UMASK_OFFCORE_RESPONSE_1_WB_NON_DRAM      0x02 0x03 0x25
-UMASK_OFFCORE_RESPONSE_1_WB_AVG_LAT       0x02 0x03 0x26
-
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_ANY           0x02 0x04 0x10
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_L2_HIT        0x02 0x04 0x12
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_SNP_NONE      0x02 0x04 0x1F
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_SNOOP_MISS    0x02 0x04 0x21
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_SNOOP_HIT     0x02 0x04 0x22
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_HITM          0x02 0x04 0x24
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_NON_DRAM      0x02 0x04 0x25
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_AVG_LAT       0x02 0x04 0x26
-
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_ANY           0x02 0x05 0x10
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_L2_HIT        0x02 0x05 0x12
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_SNP_NONE      0x02 0x05 0x1F
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_SNOOP_MISS    0x02 0x05 0x21
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_SNOOP_HIT     0x02 0x05 0x22
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_HITM          0x02 0x05 0x24
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_NON_DRAM      0x02 0x05 0x25
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_AVG_LAT       0x02 0x05 0x26
-
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_ANY           0x02 0x06 0x10
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_L2_HIT        0x02 0x06 0x12
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_SNP_NONE      0x02 0x06 0x1F
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_SNOOP_MISS    0x02 0x06 0x21
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_SNOOP_HIT     0x02 0x06 0x22
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_HITM          0x02 0x06 0x24
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_NON_DRAM      0x02 0x06 0x25
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_AVG_LAT       0x02 0x06 0x26
-
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_ANY           0x02 0x07 0x10
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_L2_HIT        0x02 0x07 0x12
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_SNP_NONE      0x02 0x07 0x1F
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_SNOOP_MISS    0x02 0x07 0x21
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_SNOOP_HIT     0x02 0x07 0x22
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_HITM          0x02 0x07 0x24
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_NON_DRAM      0x02 0x07 0x25
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_AVG_LAT       0x02 0x07 0x26
-
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_ANY           0x02 0x08 0x10
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_L2_HIT        0x02 0x08 0x12
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_SNP_NONE      0x02 0x08 0x1F
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_SNOOP_MISS    0x02 0x08 0x21
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_SNOOP_HIT     0x02 0x08 0x22
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_HITM          0x02 0x08 0x24
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_NON_DRAM      0x02 0x08 0x25
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_AVG_LAT       0x02 0x08 0x26
-
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_ANY           0x02 0x09 0x10
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_L2_HIT        0x02 0x09 0x12
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_SNP_NONE      0x02 0x09 0x1F
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_SNOOP_MISS    0x02 0x09 0x21
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_SNOOP_HIT     0x02 0x09 0x22
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_HITM          0x02 0x09 0x24
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_NON_DRAM      0x02 0x09 0x25
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_AVG_LAT       0x02 0x09 0x26
-
-UMASK_OFFCORE_RESPONSE_1 BUS_LOCKS_ANY           0x02 0x0A 0x10
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_L2_HIT        0x02 0x0A 0x12
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_SNP_NONE      0x02 0x0A 0x1F
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_SNOOP_MISS    0x02 0x0A 0x21
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_SNOOP_HIT     0x02 0x0A 0x22
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_HITM          0x02 0x0A 0x24
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_NON_DRAM      0x02 0x0A 0x25
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_AVG_LAT       0x02 0x0A 0x26
-
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_ANY           0x02 0x0B 0x10
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_L2_HIT        0x02 0x0B 0x12
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_SNP_NONE      0x02 0x0B 0x1F
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_SNOOP_MISS    0x02 0x0B 0x21
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_SNOOP_HIT     0x02 0x0B 0x22
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_HITM          0x02 0x0B 0x24
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_NON_DRAM      0x02 0x0B 0x25
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_AVG_LAT       0x02 0x0B 0x26
-
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_ANY           0x02 0x0C 0x10
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_L2_HIT        0x02 0x0C 0x12
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_SNP_NONE      0x02 0x0C 0x1F
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_SNOOP_MISS    0x02 0x0C 0x21
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_SNOOP_HIT     0x02 0x0C 0x22
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_HITM          0x02 0x0C 0x24
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_NON_DRAM      0x02 0x0C 0x25
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_AVG_LAT       0x02 0x0C 0x26
-
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_ANY           0x02 0x0D 0x10
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_L2_HIT        0x02 0x0D 0x12
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_SNP_NONE      0x02 0x0D 0x1F
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_SNOOP_MISS    0x02 0x0D 0x21
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_SNOOP_HIT     0x02 0x0D 0x22
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_HITM          0x02 0x0D 0x24
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_NON_DRAM      0x02 0x0D 0x25
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_AVG_LAT       0x02 0x0D 0x26
-
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_ANY           0x02 0x0E 0x10
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_L2_HIT        0x02 0x0E 0x12
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_SNP_NONE      0x02 0x0E 0x1F
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_SNOOP_MISS    0x02 0x0E 0x21
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_SNOOP_HIT     0x02 0x0E 0x22
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_HITM          0x02 0x0E 0x24
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_NON_DRAM      0x02 0x0E 0x25
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_AVG_LAT       0x02 0x0E 0x26
-
-UMASK_OFFCORE_RESPONSE_1_ANY_ANY           0x02 0x0F 0x10
-UMASK_OFFCORE_RESPONSE_1_ANY_L2_HIT        0x02 0x0F 0x12
-UMASK_OFFCORE_RESPONSE_1_ANY_SNP_NONE      0x02 0x0F 0x1F
-UMASK_OFFCORE_RESPONSE_1_ANY_SNOOP_MISS    0x02 0x0F 0x21
-UMASK_OFFCORE_RESPONSE_1_ANY_SNOOP_HIT     0x02 0x0F 0x22
-UMASK_OFFCORE_RESPONSE_1_ANY_HITM          0x02 0x0F 0x24
-UMASK_OFFCORE_RESPONSE_1_ANY_NON_DRAM      0x02 0x0F 0x25
-UMASK_OFFCORE_RESPONSE_1_ANY_AVG_LAT       0x02 0x0F 0x26
-
-
 EVENT_INST_RETIRED                  0xC0 PMC
 UMASK_INST_RETIRED_ANY_P            0x00
 
@@ -390,32 +104,33 @@ UMASK_UOPS_RETIRED_MS               0x01
 UMASK_UOPS_RETIRED_ALL              0x10
 
 EVENT_MACHINE_CLEARS                0xC3 PMC
-UMASK_MACHINE_CLEARS_SMC            0x01
-UMASK_MACHINE_CLEARS_MEMORY_ORDERING 0x02
-UMASK_MACHINE_CLEARS_FP_ASSIST      0x04
-UMASK_MACHINE_CLEARS_ALL            0x08
-
-EVENT_BR_INST_RETIRED               0xC4  PMC
-UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x00
-UMASK_BR_INST_RETIRED_JCC           0x7E
-UMASK_BR_INST_RETIRED_FAR_BRANCH    0xBF
+UMASK_MACHINE_CLEARS_SMC               0x01
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING   0x02
+UMASK_MACHINE_CLEARS_FP_ASSIST         0x04
+UMASK_MACHINE_CLEARS_ALL               0x08
+
+
+EVENT_BR_INST_RETIRED                0xC4  PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES   0x00
+UMASK_BR_INST_RETIRED_JCC            0x7E
+UMASK_BR_INST_RETIRED_TAKEN_JCC      0xFE
+UMASK_BR_INST_RETIRED_FAR_BRANCH     0xBF
 UMASK_BR_INST_RETIRED_NON_RETURN_IND 0xEB
-UMASK_BR_INST_RETIRED_RETURN        0xF7
-UMASK_BR_INST_RETIRED_CALL          0xF9
-UMASK_BR_INST_RETIRED_IND_CALL      0xFB
-UMASK_BR_INST_RETIRED_REL_CALL      0xFD
-UMASK_BR_INST_RETIRED_TAKEN_JCC     0xFE
-
-EVENT_BR_MISP_RETIRED               0xC5  PMC
-UMASK_BR_MISP_RETIRED_ALL_BRANCHES  0x00
-UMASK_BR_MISP_RETIRED_JCC           0x7E
-UMASK_BR_MISP_RETIRED_FAR_BRANCH    0xBF
+UMASK_BR_INST_RETIRED_RETURN         0xF7
+UMASK_BR_INST_RETIRED_CALL           0xF9
+UMASK_BR_INST_RETIRED_IND_CALL       0xFB
+UMASK_BR_INST_RETIRED_REL_CALL       0xFD
+
+EVENT_BR_MISP_RETIRED                0xC5  PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES   0x00
+UMASK_BR_MISP_RETIRED_JCC            0x7E
+UMASK_BR_MISP_RETIRED_TAKEN_JCC      0xFE
+UMASK_BR_MISP_RETIRED_FAR_BRANCH     0xBF
 UMASK_BR_MISP_RETIRED_NON_RETURN_IND 0xEB
-UMASK_BR_MISP_RETIRED_RETURN        0xF7
-UMASK_BR_MISP_RETIRED_CALL          0xF9
-UMASK_BR_MISP_RETIRED_IND_CALL      0xFB
-UMASK_BR_MISP_RETIRED_REL_CALL      0xFD
-UMASK_BR_MISP_RETIRED_TAKEN_JCC     0xFE
+UMASK_BR_MISP_RETIRED_RETURN         0xF7
+UMASK_BR_MISP_RETIRED_CALL           0xF9
+UMASK_BR_MISP_RETIRED_IND_CALL       0xFB
+UMASK_BR_MISP_RETIRED_REL_CALL       0xFD
 
 EVENT_NO_ALLOC_CYCLES               0xCA PMC
 UMASK_NO_ALLOC_CYCLES_ROB_FULL      0x01
@@ -430,7 +145,7 @@ UMASK_RS_FULL_STALL_ALL             0x1F
 EVENT_CYCLES_DIV_BUSY               0xCD PMC
 UMASK_CYCLES_DIV_BUSY_ANY           0x01
 
-EVENT_BACLEARS                      0xE6 PMC
+EVENT_BACLEARS                      0xE6  PMC
 UMASK_BACLEARS_ALL                  0x01
 UMASK_BACLEARS_RETURN               0x08
 UMASK_BACLEARS_COND                 0x10
@@ -438,3 +153,46 @@ UMASK_BACLEARS_COND                 0x10
 EVENT_MS_DECODED                    0xE7 PMC
 UMASK_MS_DECODED_MS_ENTRY           0x01
 
+EVENT_OFFCORE_RESPONSE_0              0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_ANY           0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_ANY          0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_UC_CODE_RD_ANY             0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_ANY              0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_ANY            0x01 0x0C 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L1_DATA_RD_ANY          0x01 0x0D 0x10
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_STREAMING_STORES_ANY 0x01 0x0E 0x10
+UMASK_OFFCORE_RESPONSE_0_ANY_ANY                    0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1              0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x02 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x02 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY               0x02 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY           0x02 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY                     0x02 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY          0x02 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY              0x02 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY          0x02 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_ANY           0x02 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_ANY          0x02 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_UC_CODE_RD_ANY             0x02 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_ANY              0x02 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY       0x02 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_ANY            0x02 0x0C 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L1_DATA_RD_ANY          0x02 0x0D 0x10
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_STREAMING_STORES_ANY 0x02 0x0E 0x10
+UMASK_OFFCORE_RESPONSE_1_ANY_ANY                    0x02 0x0F 0x10
+
+
+
+
diff --git a/src/includes/perfmon_skylake.h b/src/includes/perfmon_skylake.h
new file mode 100644
index 0000000..1a10dc4
--- /dev/null
+++ b/src/includes/perfmon_skylake.h
@@ -0,0 +1,753 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfmon_skylake.h
+ *
+ *      Description:  Header File of perfmon module for Intel Skylake.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <perfmon_skylake_events.h>
+#include <perfmon_skylake_counters.h>
+#include <error.h>
+#include <affinity.h>
+#include <limits.h>
+#include <topology.h>
+#include <access.h>
+
+static int perfmon_numCountersSkylake = NUM_COUNTERS_SKYLAKE; /* initialised from NUM_COUNTERS_SKYLAKE; presumably the length of the Skylake counter map - confirm */
+static int perfmon_numCoreCountersSkylake = NUM_COUNTERS_CORE_SKYLAKE; /* initialised from NUM_COUNTERS_CORE_SKYLAKE; presumably the core-local subset of the counters - confirm */
+static int perfmon_numArchEventsSkylake = NUM_ARCH_EVENTS_SKYLAKE; /* initialised from NUM_ARCH_EVENTS_SKYLAKE; presumably the length of the Skylake event table - confirm */
+
+/*
+ * Per-CPU initialisation of the Skylake backend.
+ *
+ * Calls lock_acquire() with cpu_id on the tile_lock entry selected through
+ * affinity_thread2tile_lookup and on the socket_lock entry selected through
+ * affinity_core2node_lookup, then writes 0 to MSR_PEBS_ENABLE on that CPU.
+ *
+ * Returns 0 once the write has been issued; the reaction to a failed write is
+ * whatever CHECK_MSR_WRITE_ERROR (defined elsewhere) does.
+ */
+int perfmon_init_skylake(int cpu_id)
+{
+    int *tileLock = (int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]];
+    int *socketLock = (int*) &socket_lock[affinity_core2node_lookup[cpu_id]];
+
+    lock_acquire(tileLock, cpu_id);
+    lock_acquire(socketLock, cpu_id);
+
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    return 0;
+}
+
+/*
+ * Build the control bits for one fixed-purpose counter.
+ *
+ * Every counter owns a 4-bit field starting at bit index*4:
+ *   field bit 0 - set when the event carries EVENT_OPTION_COUNT_KERNEL
+ *   field bit 1 - always set
+ *   field bit 2 - set when the event carries EVENT_OPTION_ANYTHREAD
+ * All other option types are ignored.
+ *
+ * cpu_id is not used by this function. Nothing is written to hardware here;
+ * the bits are only returned to the caller.
+ */
+uint32_t skl_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint32_t flags = (1ULL<<(1+(index*4)));
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_COUNT_KERNEL:
+                flags |= (1ULL<<(index*4));
+                break;
+            case EVENT_OPTION_ANYTHREAD:
+                flags |= (1ULL<<(2+(index*4)));
+                /* explicit break: do not rely on falling into default */
+                break;
+            default:
+                break;
+        }
+    }
+    return flags;
+}
+
+/*
+ * Configure one general-purpose (PMC) counter for the given event.
+ *
+ * The config word starts with bits 22 and 16 set, the event id in [7:0] and
+ * the umask in [15:8]. For events other than 0xB7/0xBB a non-zero cfgBits
+ * additionally folds cfgBits and cmask into the word. Event options then set
+ * further bits: EDGE -> 18, COUNT_KERNEL -> 17, INVERT -> 23, ANYTHREAD -> 21,
+ * THRESHOLD -> low 8 bits of the option value at [31:24], IN_TRANS -> 32,
+ * IN_TRANS_ABORT -> 33.
+ *
+ * MATCH0/MATCH1 options do not touch the config word; they build the offcore
+ * response value (MATCH0 masked with 0x8FFF, MATCH1 shifted left by 16). That
+ * value is written to MSR_OFFCORE_RESP0 for event 0xB7 and to
+ * MSR_OFFCORE_RESP1 for event 0xBB. Unless cfgBits or cmask equals 0xFF, the
+ * value is replaced by (1 << cfgBits) | (1 << cmask) before the write.
+ *
+ * The config register is rewritten only when the word differs from the value
+ * cached in currentConfig[cpu_id][index].
+ *
+ * Returns 0; failed MSR writes are handled by CHECK_MSR_WRITE_ERROR.
+ */
+int skl_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int opt;
+    const int isOffcoreEvent = (event->eventId == 0xB7) || (event->eventId == 0xBB);
+    uint64_t offcore_flags = 0x0ULL;
+    uint64_t flags = (1ULL<<22)|(1ULL<<16);
+
+    /* Intel with standard 8 bit event mask: [7:0] */
+    flags |= (event->umask<<8) + event->eventId;
+
+    /* custom cfg and cmask; the offcore events use these fields differently */
+    if (!isOffcoreEvent && (event->cfgBits != 0))
+    {
+        flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+    }
+
+    for (opt = 0; opt < event->numberOfOptions; opt++)
+    {
+        switch (event->options[opt].type)
+        {
+            case EVENT_OPTION_EDGE:
+                flags |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_COUNT_KERNEL:
+                flags |= (1ULL<<17);
+                break;
+            case EVENT_OPTION_INVERT:
+                flags |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_ANYTHREAD:
+                flags |= (1ULL<<21);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                flags |= (event->options[opt].value & 0xFFULL) << 24;
+                break;
+            case EVENT_OPTION_IN_TRANS:
+                flags |= (1ULL<<32);
+                break;
+            case EVENT_OPTION_IN_TRANS_ABORT:
+                flags |= (1ULL<<33);
+                break;
+            case EVENT_OPTION_MATCH0:
+                offcore_flags |= (event->options[opt].value & 0x8FFFULL);
+                break;
+            case EVENT_OPTION_MATCH1:
+                offcore_flags |= (event->options[opt].value<< 16);
+                break;
+            default:
+                break;
+        }
+    }
+
+    /* the offcore response register is written on every call, it is not cached */
+    switch (event->eventId)
+    {
+        case 0xB7:
+            if (!((event->cfgBits == 0xFF) || (event->cmask == 0xFF)))
+            {
+                offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+            }
+            VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
+            break;
+        case 0xBB:
+            if (!((event->cfgBits == 0xFF) || (event->cmask == 0xFF)))
+            {
+                offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+            }
+            VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, offcore_flags));
+            break;
+        default:
+            break;
+    }
+
+    /* nothing to do when the register already holds this configuration */
+    if (flags == currentConfig[cpu_id][index])
+    {
+        return 0;
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC)
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister , flags));
+    currentConfig[cpu_id][index] = flags;
+    return 0;
+}
+
+/*
+ * Configure one CBOX counter for the given event.
+ *
+ * Returns 0 without touching hardware unless cpu_id is the value stored in
+ * the socket_lock entry selected through affinity_core2node_lookup.
+ *
+ * The config word starts with bits 22 and 20 set, the event id in [7:0] and
+ * the umask in [15:8]. Options add: EDGE -> bit 18, INVERT -> bit 23,
+ * THRESHOLD -> low 5 bits of the option value at [28:24]. Other option types
+ * are ignored.
+ *
+ * The config register is rewritten only when the word differs from the value
+ * cached in currentConfig[cpu_id][index].
+ *
+ * Returns 0; failed MSR writes are handled by CHECK_MSR_WRITE_ERROR.
+ */
+int skl_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint64_t flags = 0x0ULL;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    flags = (1ULL<<22)|(1ULL<<20);
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0x1FULL) << 24;
+                    break;
+                /* explicit default, matching the other skl_*_setup helpers */
+                default:
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        /* LLU_CAST as in skl_pmc_setup, so the value matches the debug format */
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_CBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+/*
+ * Program all counters of an event set for one measurement thread.
+ *
+ * thread_id indexes groupSet->threads; the CPU is taken from its processorId.
+ * eventSet supplies the events and the regTypeMask that selects which
+ * register types are handled.
+ *
+ * Steps:
+ *  1. If FIXED or PMC registers are selected, write 0 to MSR_PERF_GLOBAL_CTRL,
+ *     write 0xC00000070000000F to MSR_PERF_GLOBAL_OVF_CTRL and write 0 to
+ *     MSR_PEBS_ENABLE.
+ *  2. If this CPU is the one stored in its socket_lock entry and any
+ *     regTypeMask bit above the lowest four is set, write 0 to
+ *     MSR_V4_UNC_PERF_GLOBAL_CTRL and MSR_V4_UNC_PERF_GLOBAL_STATUS.
+ *  3. For every selected event, mark its per-thread counter as initialised and
+ *     configure it according to its register type. FIXED events only
+ *     accumulate bits; POWER needs no setup.
+ *  4. If any FIXED bits were collected, write them to MSR_PERF_FIXED_CTR_CTRL.
+ *
+ * Returns 0; failed MSR writes wrapped in CHECK_MSR_WRITE_ERROR are handled
+ * by that macro.
+ */
+int perfmon_setupCounterThread_skylake(
+        int thread_id,
+        PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    uint64_t fixed_flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0xC00000070000000F));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    }
+    /* parentheses spell out the existing precedence: '&' binds before '&&' */
+    if (haveLock && (eventSet->regTypeMask & ~(0xFULL)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_UBOXFIX)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, 0x0ULL));
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
+        switch (type)
+        {
+            case PMC:
+                /* NOTE(review): return value is ignored here - confirm intended */
+                skl_pmc_setup(cpu_id, index, event);
+                break;
+
+            case FIXED:
+                /* only collected here; written once after the loop */
+                fixed_flags |= skl_fixed_setup(cpu_id, index, event);
+                break;
+
+            case POWER:
+                break;
+            case UBOXFIX:
+                if (haveLock)
+                {
+                    uint64_t uflags = 0x0ULL;
+                    uflags |= (1ULL<<20)|(1ULL<<22);
+                    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, uflags, SETUP_UBOXFIX)
+                    /* NOTE(review): unlike the UBOX case below, this write is not
+                     * wrapped in CHECK_MSR_WRITE_ERROR - confirm whether the
+                     * best-effort behaviour is deliberate */
+                    HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, uflags);
+                }
+                break;
+            case UBOX:
+                if (haveLock)
+                {
+                    uint64_t uflags = 0x0ULL;
+                    uflags |= (1ULL<<20)|(1ULL<<22);
+                    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, uflags, CLEAR_UBOX)
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, uflags));
+                }
+                break;
+            case CBOX0:
+            case CBOX1:
+            case CBOX2:
+            case CBOX3:
+                /* NOTE(review): return value is ignored here - confirm intended */
+                skl_cbox_setup(cpu_id, index, event);
+                break;
+            default:
+                break;
+        }
+    }
+    if (fixed_flags > 0x0ULL)
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
+    }
+    return 0;
+}
+/* Start phase: reset (POWER: snapshot) the counters of this thread's initialized events, then enable them. */
+int perfmon_startCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    uint64_t flags = 0x0ULL;    /* enable bits collected for MSR_PERF_GLOBAL_CTRL */
+    uint64_t uflags = 0x0ULL;   /* CBOX bits collected for MSR_V4_UNC_PERF_GLOBAL_CTRL */
+    uint64_t tmp = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    /* POWER, UBOXFIX, UBOX and CBOX events are only handled when this CPU holds the socket lock. */
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+    /* Walk over all events that are marked as initialized for this thread. */
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;   /* unit type is not selected in regTypeMask */
+            }
+            tmp = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            /* Both stored values start at zero; only the POWER case overwrites startData below. */
+            PciDeviceIndex dev = counter_map[index].device;
+            eventSet->events[i].threadCounter[thread_id].startData = 0;
+            eventSet->events[i].threadCounter[thread_id].counterData = 0;
+            switch (type)
+            {
+                case PMC:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+                    flags |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));  /* enable counter */
+                    break;
+                /* Fixed counters are enabled through bit (index+32) of the global control value. */
+                case FIXED:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+                    flags |= (1ULL<<(index+32));  /* enable fixed counter */
+                    break;
+                /* Energy counters are not written; the value read now is kept as startData. */
+                case POWER:
+                    if (haveLock)
+                    {
+                        tmp = 0x0ULL;
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1,(uint32_t*)&tmp));
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST tmp, START_POWER)
+                        eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
+                    }
+                    break;
+                case UBOXFIX:
+                    if (haveLock)
+                    {
+                        VERBOSEPRINTREG(cpu_id, counter1, 0x0ULL, CLEAR_UBOXFIX)
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+                    }
+                    break;
+                case UBOX:
+                    if (haveLock)
+                    {
+                        VERBOSEPRINTREG(cpu_id, counter1, 0x0ULL, CLEAR_UBOX)
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+                    }
+                    break;
+                case CBOX0:
+                case CBOX1:
+                case CBOX2:
+                case CBOX3:
+                    if (haveLock)
+                    {
+                        uflags |= (1ULL<<(type-CBOX0));   /* one bit per CBOX unit; the counter itself is not written here */
+                    }
+                    break;
+                default:
+                    break;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = eventSet->events[i].threadCounter[thread_id].startData;   /* the current value starts at the start value */
+        }
+    }
+    /* Uncore: write the CBOX bits plus bit 29 (NOTE(review): presumably the global enable bit - confirm in the SDM). */
+    if ((haveLock) && (eventSet->regTypeMask & ~(0xFULL)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, uflags|(1ULL<<29), UNFREEZE_UBOXFIX)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, uflags|(1ULL<<29)));
+    }
+    /* Core: write bits 63, 62 and the used counter bits to the overflow control, then enable the counters. */
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST (1ULL<<63)|(1ULL<<62)|flags, CLEAR_PMC_AND_FIXED_OVERFLOW)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|flags));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, UNFREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
+    }
+
+    return 0;   /* NOTE(review): the CHECK_* macros are defined elsewhere and may leave the function earlier - confirm */
+}
+
+/* Core wrap check; needs counter_result, eventSet, i, thread_id and cpu_id in the expanding scope. */
+#define SKL_CHECK_CORE_OVERFLOW(offset) \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* new value below the stored one */ \
+    { \
+        uint64_t ovf_values = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values)); \
+        if (ovf_values & (1ULL<<(offset))) /* (offset) is parenthesized: callers pass expressions like index+32 */ \
+        { \
+            eventSet->events[i].threadCounter[thread_id].overflows++; \
+        } \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<(offset)))); /* written even when the status bit was clear */ \
+    }
+/* Uncore wrap check; needs the same names in scope as the core variant. The status register is only read here. */
+#define SKL_CHECK_UNCORE_OVERFLOW(offset) \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* new value below the stored one */ \
+    { \
+        uint64_t ovf_values = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, &ovf_values)); \
+        if (ovf_values & (1ULL<<(offset))) /* (offset) is parenthesized so any argument expression stays one operand */ \
+        { \
+            eventSet->events[i].threadCounter[thread_id].overflows++; \
+        } \
+    }
+/* Wrap check against the unit's own status register from box_map; needs counter_result, eventSet, i, thread_id, cpu_id in scope. */
+#define SKL_CHECK_LOCAL_OVERFLOW \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* new value below the stored one */ \
+    { \
+        uint64_t ovf_values = 0x0ULL; \
+        uint64_t offset = getCounterTypeOffset(eventSet->events[i].index); /* helper defined elsewhere; its result is used as the bit position */ \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, box_map[eventSet->events[i].type].statusRegister, &ovf_values)); \
+        if (ovf_values & (1ULL<<offset)) \
+        { \
+            eventSet->events[i].threadCounter[thread_id].overflows++; \
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[eventSet->events[i].type].statusRegister, (1ULL<<offset))); /* write the same bit back to the status register */ \
+        } \
+    }
+/* Stop phase: freeze the counters, store the final values of this thread's initialized events, clear their init flag. */
+int perfmon_stopCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    /* POWER, UBOXFIX, UBOX and CBOX events are only read when this CPU holds the socket lock. */
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+    /* Freeze core and uncore counting before any counter is read. */
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    if ((haveLock) && (eventSet->regTypeMask & ~(0xFULL)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_UBOXFIX)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    /* Read every initialized event; the overflow macros compare against the previously stored counterData. */
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;   /* unit type not selected; note that the init flag is left untouched on this path */
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            PerfmonEvent *event = &(eventSet->events[i].event);
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
+            int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+            switch (type)
+            {
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    SKL_CHECK_CORE_OVERFLOW(index-cpuid_info.perf_num_fixed_ctr);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
+                    *current = field64(counter_result, 0, box_map[type].regWidth);
+                    break;
+                /* Fixed counters are checked at status bit (index+32). */
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    SKL_CHECK_CORE_OVERFLOW(index+32);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
+                    *current = field64(counter_result, 0, box_map[type].regWidth);
+                    break;
+                /* Energy counters have no status bit here; a value below the stored one counts as one overflow. */
+                case POWER:
+                    if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
+                    {
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_POWER)
+                        if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                        }
+                        *current = field64(counter_result, 0, box_map[type].regWidth);
+                    }
+                    break;
+                /* The temperature value is read on every CPU, without the socket lock. */
+                case THERMAL:
+                    CHECK_TEMP_READ_ERROR(thermal_read(cpu_id,(uint32_t*)&counter_result));
+                    *current = field64(counter_result, 0, box_map[type].regWidth);
+                    break;
+                /* Uncore units: width comes from box_map, as in perfmon_readCountersThread_skylake. */
+                case UBOXFIX:
+                    if (haveLock)
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                        SKL_CHECK_UNCORE_OVERFLOW(box_map[type].ovflOffset);
+                        *current = field64(counter_result, 0, box_map[type].regWidth);
+                    }
+                    break;
+                case UBOX:
+                    if (haveLock)
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                        SKL_CHECK_UNCORE_OVERFLOW(box_map[type].ovflOffset);
+                        *current = field64(counter_result, 0, box_map[type].regWidth);
+                    }
+                    break;
+                case CBOX0:
+                case CBOX1:
+                case CBOX2:
+                case CBOX3:
+                    if (haveLock)
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                        SKL_CHECK_UNCORE_OVERFLOW(box_map[type].ovflOffset);
+                        *current = field64(counter_result, 0, box_map[type].regWidth);
+                    }
+                    break;
+
+                default:
+                    break;
+            }
+        }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE;   /* reached for every event that did not hit the continue above */
+    }
+    if ((haveLock) && (eventSet->regTypeMask & ~(0xFULL)))
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, &counter_result));
+        if (counter_result != 0x0ULL)
+        {
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, counter_result));   /* write the set status bits back */
+        }
+    }
+
+
+    return 0;
+}
+/* Intermediate read: save and clear the global control registers, update counterData of this thread's */
+/* initialized events, then write the saved control values back. */
+int perfmon_readCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
+{
+    uint64_t flags = 0x0ULL;    /* saved content of MSR_PERF_GLOBAL_CTRL */
+    uint64_t uflags = 0x0ULL;   /* saved content of MSR_V4_UNC_PERF_GLOBAL_CTRL */
+    int haveLock = 0;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    /* POWER, UBOXFIX, UBOX and CBOX events are only read when this CPU holds the socket lock. */
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+    /* Core: remember the current enable bits and stop counting while the values are read. */
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &flags));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, SAFE_PMC_FLAGS)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, RESET_PMC_FLAGS)
+    }
+    /* Uncore: same save-and-clear, done by the socket-lock owner only. */
+    if ((haveLock) && (eventSet->regTypeMask & ~(0xFULL)))
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, &uflags));
+        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, LLU_CAST uflags, SAFE_UBOXFIX_FLAGS)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL));
+        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL, RESET_UBOXFIX_FLAGS)
+    }
+    /* Read every initialized event; the init flag stays set so later reads and the stop phase still see it. */
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            counter_result= 0x0ULL;
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;   /* unit type is not selected in regTypeMask */
+            }
+            RegisterIndex index = eventSet->events[i].index;
+            PerfmonEvent *event = &(eventSet->events[i].event);
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);   /* same location the PMC/FIXED/POWER/THERMAL cases write through the full path */
+            int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+            switch (type)
+            {
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    SKL_CHECK_CORE_OVERFLOW(index-cpuid_info.perf_num_fixed_ctr);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
+                    eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
+                    break;
+                /* Fixed counters are checked at status bit (index+32). */
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    SKL_CHECK_CORE_OVERFLOW(index+32);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
+                    eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
+                    break;
+                /* Energy counters: a value below the stored one is counted as one overflow. */
+                case POWER:
+                    if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
+                    {
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_POWER)
+                        if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                        }
+                        eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
+                    }
+                    break;
+                /* The temperature value is read on every CPU, without the socket lock. */
+                case THERMAL:
+                    CHECK_TEMP_READ_ERROR(thermal_read(cpu_id,(uint32_t*)&counter_result));
+                    eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
+                    break;
+                /* Uncore units: read by the socket-lock owner, overflow bit position taken from box_map. */
+                case UBOXFIX:
+                    if (haveLock)
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                        SKL_CHECK_UNCORE_OVERFLOW(box_map[type].ovflOffset);
+                        *current = field64(counter_result, 0, box_map[type].regWidth);
+                    }
+                    break;
+                case UBOX:
+                    if (haveLock)
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                        SKL_CHECK_UNCORE_OVERFLOW(box_map[type].ovflOffset);
+                        *current = field64(counter_result, 0, box_map[type].regWidth);
+                    }
+                    break;
+                case CBOX0:
+                case CBOX1:
+                case CBOX2:
+                case CBOX3:
+                    if (haveLock)
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                        SKL_CHECK_UNCORE_OVERFLOW(box_map[type].ovflOffset);
+                        *current = field64(counter_result, 0, box_map[type].regWidth);
+                    }
+                    break;
+
+                default:
+                    break;
+            }
+        }
+    }
+    if ((haveLock) && (eventSet->regTypeMask & ~(0xFULL)))
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, &counter_result));
+        if (counter_result != 0x0ULL)
+        {
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, counter_result));   /* write the set status bits back */
+        }
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, uflags));   /* restore the saved uncore control value */
+        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, uflags, RESET_UBOXFIX_FLAGS)   /* NOTE(review): label says RESET although the saved flags are written back here */
+    }
+    /* Core: restore the enable bits saved at the top so counting continues. */
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, RESTORE_PMC_FLAGS)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
+    }
+
+    return 0;
+}
+/* Finalize phase: zero the config registers of this thread's selected events and clear the global control/overflow state. */
+int perfmon_finalizeCountersThread_skylake(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    int haveTileLock = 0;
+    int clearPBS = 0;
+    uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62);   /* grows by one bit per used PMC/FIXED counter */
+    uint64_t ovf_values_UBOXFIX = 0x0ULL;               /* scratch for the config read-back; zero again when written below */
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    /* The socket lock gates the uncore registers, the tile lock gates the offcore response registers. */
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+    {
+        haveTileLock = 1;
+    }
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;   /* unit type is not selected in regTypeMask */
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PciDeviceIndex dev = counter_map[index].device;
+        uint64_t reg = counter_map[index].configRegister;
+        switch (type)
+        {
+            case PMC:
+                ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+                if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7))   /* event 0xB7 is paired with MSR_OFFCORE_RESP0 */
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+                }
+                else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB))   /* event 0xBB is paired with MSR_OFFCORE_RESP1 */
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+                }
+                break;
+            case FIXED:
+                ovf_values_core |= (1ULL<<(index+32));
+                break;
+            default:
+                /*if (counter_map[index].type > UBOXFIX)
+                {
+                    if (box_map[counter_map[index].type].ovflOffset >= 0)
+                    {
+                        ovf_values_UBOXFIX |= (1ULL<<box_map[counter_map[index].type].ovflOffset);
+                    }
+                }*/
+                break;
+        }
+        if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UBOXFIX) && (haveLock))))   /* skips entries without a config register */
+        {
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, reg, &ovf_values_UBOXFIX));   /* read back only for the debug print */
+            VERBOSEPRINTPCIREG(cpu_id, dev, reg, ovf_values_UBOXFIX, SHOW_CTL);
+            ovf_values_UBOXFIX = 0x0ULL;
+            VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+            if ((type >= SBOX0) && (type <= SBOX3))   /* NOTE(review): no SBOX entries in the Skylake counter map of this patch - looks unreachable, confirm */
+            {
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+            }
+        }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE;
+    }
+    if ((haveLock) && (eventSet->regTypeMask & ~(0xFULL)))   /* uncore: same V4 global registers as the start/stop/read phases */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_STATUS, LLU_CAST ovf_values_UBOXFIX, CLEAR_UBOXFIX_OVF)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_STATUS, ovf_values_UBOXFIX));
+        VERBOSEPRINTREG(cpu_id, MSR_V4_UNC_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, CLEAR_UBOXFIX_CTRL)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_V4_UNC_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    /* Core: write the collected overflow-control bits, then disable all core counters. */
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST ovf_values_core, CLEAR_GLOBAL_OVF)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, CLEAR_GLOBAL_CTRL)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    return 0;
+}
diff --git a/src/includes/perfmon_skylake_counters.h b/src/includes/perfmon_skylake_counters.h
new file mode 100644
index 0000000..9b0e2c7
--- /dev/null
+++ b/src/includes/perfmon_skylake_counters.h
@@ -0,0 +1,84 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfmon_skylake_counters.h
+ *
+ *      Description:  Counter Header File of perfmon module for Intel Skylake.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#define NUM_COUNTERS_SKYLAKE 24          /* number of entries in skylake_counter_map */
+#define NUM_COUNTERS_CORE_SKYLAKE 8      /* NOTE(review): presumably the leading FIXC0..TMP0 entries - confirm */
+#define NUM_COUNTERS_UNCORE_SKYLAKE 24
+/* Event options accepted by FIXED and PMC counters; parenthesized so each mask expands as a single operand. */
+#define SKL_VALID_OPTIONS_FIXED (EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK)
+#define SKL_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK| \
+            EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_IN_TRANS_MASK|EVENT_OPTION_THRESHOLD_MASK)
+/* Counter table for Skylake; the option masks come from the SKL_VALID_OPTIONS_* macros of this header. */
+static RegisterMap skylake_counter_map[NUM_COUNTERS_SKYLAKE] = {
+    /* Fixed Counters: instructions retired, cycles unhalted core, cycles unhalted ref */
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, SKL_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, SKL_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, SKL_VALID_OPTIONS_FIXED},
+    /* PMC Counters: 4 48bit wide */
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, SKL_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, SKL_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, SKL_VALID_OPTIONS_PMC|EVENT_OPTION_IN_TRANS_ABORT_MASK},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, SKL_VALID_OPTIONS_PMC},
+    /* Temperature Sensor*/
+    {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* RAPL counters */
+    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR4", PMC12, POWER, 0, MSR_PLATFORM_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+    /* Uncore counters: fixed uncore counter, two ARB counters (UBOX0/1), two counters per CBOX0-3 */
+    {"UBOXFIX", PMC13, UBOXFIX, MSR_UNC_PERF_FIXED_CTRL, MSR_UNC_PERF_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"UBOX0", PMC14, UBOX, MSR_V4_ARB_PERF_FIXED_CTRL0, MSR_V4_ARB_PERF_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"UBOX1", PMC15, UBOX, MSR_V4_ARB_PERF_FIXED_CTRL1, MSR_V4_ARB_PERF_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX0C0", PMC16, CBOX0, MSR_V4_C0_PERF_FIXED_CTRL0, MSR_V4_C0_PERF_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX0C1", PMC17, CBOX0, MSR_V4_C0_PERF_FIXED_CTRL1, MSR_V4_C0_PERF_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX1C0", PMC18, CBOX1, MSR_V4_C1_PERF_FIXED_CTRL0, MSR_V4_C1_PERF_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX1C1", PMC19, CBOX1, MSR_V4_C1_PERF_FIXED_CTRL1, MSR_V4_C1_PERF_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX2C0", PMC20, CBOX2, MSR_V4_C2_PERF_FIXED_CTRL0, MSR_V4_C2_PERF_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX2C1", PMC21, CBOX2, MSR_V4_C2_PERF_FIXED_CTRL1, MSR_V4_C2_PERF_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX3C0", PMC22, CBOX3, MSR_V4_C3_PERF_FIXED_CTRL0, MSR_V4_C3_PERF_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX3C1", PMC23, CBOX3, MSR_V4_C3_PERF_FIXED_CTRL1, MSR_V4_C3_PERF_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+};
+/* Per-unit register description for Skylake, indexed by RegisterType. NOTE(review): BoxMap is declared elsewhere; */
+/* the entries presumably hold {control, status, overflow register, overflow bit offset, ?, ?, counter width} - confirm. */
+static BoxMap skylake_box_map[NUM_UNITS] = {
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_V4_PERF_GLOBAL_STATUS, MSR_V4_PERF_GLOBAL_STATUS_RESET, 0, 0, 0, 48},
+    [THERMAL] = {0, 0, 0, 0, 0, 0, 8},
+    [FIXED] =  {MSR_PERF_GLOBAL_CTRL, MSR_V4_PERF_GLOBAL_STATUS, MSR_V4_PERF_GLOBAL_STATUS_RESET, 0, 0, 0, 48},
+    [POWER] = {0, 0, 0, 0, 0, 0, 32},
+    [UBOXFIX] = {MSR_V4_UNC_PERF_GLOBAL_CTRL, MSR_V4_UNC_PERF_GLOBAL_STATUS, MSR_V4_UNC_PERF_GLOBAL_STATUS, 0, 0, 0, 44},
+    [UBOX] = {MSR_V4_UNC_PERF_GLOBAL_CTRL, MSR_V4_UNC_PERF_GLOBAL_STATUS, MSR_V4_UNC_PERF_GLOBAL_STATUS, 1, 0, 0, 44},
+    [CBOX0] = {MSR_V4_UNC_PERF_GLOBAL_CTRL, MSR_V4_UNC_PERF_GLOBAL_STATUS, MSR_V4_UNC_PERF_GLOBAL_STATUS, 3, 0, 0, 44},   /* all four CBOX units carry the same fourth value, 3 */
+    [CBOX1] = {MSR_V4_UNC_PERF_GLOBAL_CTRL, MSR_V4_UNC_PERF_GLOBAL_STATUS, MSR_V4_UNC_PERF_GLOBAL_STATUS, 3, 0, 0, 44},
+    [CBOX2] = {MSR_V4_UNC_PERF_GLOBAL_CTRL, MSR_V4_UNC_PERF_GLOBAL_STATUS, MSR_V4_UNC_PERF_GLOBAL_STATUS, 3, 0, 0, 44},
+    [CBOX3] = {MSR_V4_UNC_PERF_GLOBAL_CTRL, MSR_V4_UNC_PERF_GLOBAL_STATUS, MSR_V4_UNC_PERF_GLOBAL_STATUS, 3, 0, 0, 44},
+};
diff --git a/src/includes/perfmon_skylake_events.txt b/src/includes/perfmon_skylake_events.txt
new file mode 100644
index 0000000..9ce3b9a
--- /dev/null
+++ b/src/includes/perfmon_skylake_events.txt
@@ -0,0 +1,599 @@
+# =======================================================================================
+#
+#      Filename:  perfmon_skylake_events.txt
+#
+#      Description:  Event list for Intel Skylake
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
+#      Project:  likwid
+#
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+#
+#      This program is free software: you can redistribute it and/or modify it under
+#      the terms of the GNU General Public License as published by the Free Software
+#      Foundation, either version 3 of the License, or (at your option) any later
+#      version.
+#
+#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+#      You should have received a copy of the GNU General Public License along with
+#      this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+EVENT_TEMP_CORE          0x00   TMP0
+UMASK_TEMP_CORE          0x00
+
+EVENT_PWR_PKG_ENERGY          0x00   PWR0
+UMASK_PWR_PKG_ENERGY          0x00
+
+EVENT_PWR_PP0_ENERGY          0x00   PWR1
+UMASK_PWR_PP0_ENERGY          0x00
+
+EVENT_PWR_PP1_ENERGY          0x00   PWR2
+UMASK_PWR_PP1_ENERGY          0x00
+
+EVENT_PWR_DRAM_ENERGY          0x00   PWR3
+UMASK_PWR_DRAM_ENERGY          0x00
+
+EVENT_PWR_PLATFORM_ENERGY          0x00   PWR4
+UMASK_PWR_PLATFORM_ENERGY          0x00
+
+EVENT_INSTR_RETIRED              0x00   FIXC0
+UMASK_INSTR_RETIRED_ANY          0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE      0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC2
+UMASK_CPU_CLK_UNHALTED_REF       0x00
+
+EVENT_ICACHE_16B_IFDATA_STALL    0x80 PMC
+UMASK_ICACHE_16B_IFDATA_STALL    0x04
+
+EVENT_ICACHE_64B_IFTAG           0x83 PMC
+UMASK_ICACHE_64B_IFTAG_HIT       0x01
+UMASK_ICACHE_64B_IFTAG_MISS      0x02
+UMASK_ICACHE_64B_IFTAG_ALL       0x03
+UMASK_ICACHE_64B_IFTAG_STALL     0x04
+
+EVENT_CPU_CLOCK_UNHALTED         0x3C   PMC
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P  0x00
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_THREAD_P_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P_ANY  0x00
+UMASK_CPU_CLOCK_UNHALTED_REF_XCLK     0x01
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_REF_XCLK_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_CPU_CLOCK_UNHALTED_REF_XCLK_ANY     0x01
+UMASK_CPU_CLOCK_THREAD_UNHALTED_ONE_THREAD_ACTIVE 0x02
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x2,EVENT_OPTION_INVERT=0x1
+UMASK_CPU_CLOCK_UNHALTED_TOTAL_CYCLES   0x00
+
+EVENT_BACLEARS                      0xE6 PMC
+UMASK_BACLEARS_ANY                  0x01
+
+EVENT_ITLB_FLUSH                    0xAE PMC
+UMASK_ITLB_FLUSH                    0x01
+
+EVENT_LSD_UOPS                      0xA8 PMC
+UMASK_LSD_UOPS                      0x01
+
+EVENT_ILD_STALL_LCP                 0x87 PMC
+UMASK_ILD_STALL_LCP                 0x01
+
+EVENT_IDQ                           0x79 PMC
+UMASK_IDQ_MITE_UOPS                 0x04
+UMASK_IDQ_DSB_UOPS                  0x08
+UMASK_IDQ_MS_MITE_UOPS              0x20
+
+EVENT_IDQ                           0x79 PMC
+DEFAULT_OPTIONS_IDQ_MS_CYCLES       EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MS_CYCLES                 0x30
+DEFAULT_OPTIONS_IDQ_MITE_CYCLES     EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MITE_CYCLES               0x04
+DEFAULT_OPTIONS_IDQ_DSB_CYCLES      EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_DSB_CYCLES                0x08
+DEFAULT_OPTIONS_IDQ_MS_DSB_CYCLES   EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_MS_DSB_CYCLES             0x10
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_4_UOPS   EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS     0x18
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_ANY_UOPS   EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS   0x18
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_4_UOPS   EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS    0x24
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_ANY_UOPS   EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS  0x24
+
+EVENT_IDQ_UOPS_NOT_DELIVERED            0x9C PMC
+UMASK_IDQ_UOPS_NOT_DELIVERED_CORE       0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE EVENT_OPTION_THRESHOLD=0x4
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_0_UOPS_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x3
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_1_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x2
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_2_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE EVENT_OPTION_THRESHOLD=0x1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_LE_3_UOP_DELIV_CORE 0x01
+DEFAULT_OPTIONS_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=0x1
+UMASK_IDQ_UOPS_NOT_DELIVERED_CYCLES_FE_WAS_OK 0x01
+
+EVENT_DSB2MITE_SWITCHES_PENALTY_CYCLES 0xAB PMC
+UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES 0x02
+
+EVENT_INT_MISC                          0x0D PMC
+UMASK_INT_MISC_RECOVERY_CYCLES          0x01
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT EVENT_OPTION_EDGE=1
+UMASK_INT_MISC_RECOVERY_COUNT           0x01
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES_ANY EVENT_OPTION_ANYTHREAD=0x1
+UMASK_INT_MISC_RECOVERY_CYCLES_ANY      0x01
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_COUNT_ANY EVENT_OPTION_ANYTHREAD=0x1,EVENT_OPTION_EDGE=1
+UMASK_INT_MISC_RECOVERY_COUNT_ANY       0x01
+UMASK_INT_MISC_CLEAR_RESTEER_CYCLES     0x80
+DEFAULT_OPTIONS_INT_MISC_CLEAR_RESTEER_COUNT EVENT_OPTION_EDGE=1
+UMASK_INT_MISC_CLEAR_RESTEER_COUNT      0x80
+
+
+EVENT_RESOURCE_STALLS                   0xA2 PMC
+UMASK_RESOURCE_STALLS_ANY               0x01
+UMASK_RESOURCE_STALLS_SB                0x08
+
+EVENT_UOPS_ISSUED                0x0E  PMC
+UMASK_UOPS_ISSUED_ANY            0x01
+UMASK_UOPS_ISSUED_VECTOR_WIDTH_MISMATCH 0x02
+UMASK_UOPS_ISSUED_SLOW_LEA       0x20
+DEFAULT_OPTIONS_UOPS_ISSUED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_USED_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_USED_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_STALL_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_ISSUED_CORE_TOTAL_CYCLES   0x01
+
+
+EVENT_TX_EXEC                           0x5D PMC
+UMASK_TX_EXEC_MISC1                     0x01
+UMASK_TX_EXEC_MISC2                     0x02
+UMASK_TX_EXEC_MISC3                     0x04
+UMASK_TX_EXEC_MISC4                     0x08
+UMASK_TX_EXEC_MISC5                     0x10
+
+EVENT_RS_EVENTS_EMPTY                   0x5E PMC
+UMASK_RS_EVENTS_EMPTY_CYCLES            0x01
+DEFAULT_OPTIONS_RS_EVENTS_EMPTY_END     EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=0x1,EVENT_OPTION_EDGE=0x1
+UMASK_RS_EVENTS_EMPTY_END               0x01
+
+EVENT_HLE_RETIRED                       0xC8 PMC
+UMASK_HLE_RETIRED_START                 0x01
+UMASK_HLE_RETIRED_COMMIT                0x02
+UMASK_HLE_RETIRED_ABORTED               0x04
+UMASK_HLE_RETIRED_ABORTED_MEM           0x08
+UMASK_HLE_RETIRED_ABORTED_TIMER         0x10
+UMASK_HLE_RETIRED_ABORTED_UNFRIENDLY    0x20
+UMASK_HLE_RETIRED_ABORTED_MEMTYPE       0x40
+UMASK_HLE_RETIRED_ABORTED_EVENTS        0x80
+
+EVENT_RTM_RETIRED                       0xC9 PMC
+UMASK_RTM_RETIRED_START                 0x01
+UMASK_RTM_RETIRED_COMMIT                0x02
+UMASK_RTM_RETIRED_ABORTED               0x04
+UMASK_RTM_RETIRED_ABORTED_MEM           0x08
+UMASK_RTM_RETIRED_ABORTED_TIMER         0x10
+UMASK_RTM_RETIRED_ABORTED_UNFRIENDLY    0x20
+UMASK_RTM_RETIRED_ABORTED_MEMTYPE       0x40
+UMASK_RTM_RETIRED_ABORTED_EVENTS        0x80
+
+EVENT_MACHINE_CLEARS                    0xC3 PMC
+DEFAULT_OPTIONS_MACHINE_CLEARS_COUNT    EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=0x1
+UMASK_MACHINE_CLEARS_COUNT              0x01
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING    0x02
+UMASK_MACHINE_CLEARS_SMC                0x04
+
+EVENT_HW_INTERRUPTS_RECEIVED            0xCB PMC
+UMASK_HW_INTERRUPTS_RECEIVED            0x01
+
+EVENT_INST_RETIRED                      0xC0 PMC
+UMASK_INST_RETIRED_ANY                  0x00
+
+EVENT_UOPS_RETIRED                  0xC2  PMC
+UMASK_UOPS_RETIRED_ALL              0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_ALL         0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS     0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_USED_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_USED_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_STALL_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_RETIRED_CORE_TOTAL_CYCLES     0x01
+
+EVENT_BR_INST_RETIRED                   0xC4 PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES      0x00
+UMASK_BR_INST_RETIRED_CONDITIONAL       0x01
+UMASK_BR_INST_RETIRED_NEAR_CALL         0x02
+UMASK_BR_INST_RETIRED_NEAR_RETURN       0x08
+UMASK_BR_INST_RETIRED_NOT_TAKEN         0x10
+UMASK_BR_INST_RETIRED_NEAR_TAKEN        0x20
+UMASK_BR_INST_RETIRED_FAR_BRANCH        0x40
+
+EVENT_BR_MISP_RETIRED                   0xC5 PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES      0x00
+UMASK_BR_MISP_RETIRED_CONDITIONAL       0x01
+UMASK_BR_MISP_RETIRED_NEAR_TAKEN        0x20
+
+EVENT_FP_ARITH_INST_RETIRED                     0xC7 PMC
+UMASK_FP_ARITH_INST_RETIRED_SCALAR_DOUBLE       0x01
+UMASK_FP_ARITH_INST_RETIRED_SCALAR_SINGLE       0x02
+UMASK_FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE  0x04
+UMASK_FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE  0x08
+UMASK_FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE  0x10
+UMASK_FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE  0x20
+
+EVENT_FP_ASSIST_ANY                     0xCA PMC
+DEFAULT_OPTIONS_FP_ASSIST_ANY           EVENT_OPTION_THRESHOLD=0x1
+UMASK_FP_ASSIST_ANY                     0x1E
+
+EVENT_MEM_INST_RETIRED                  0xD0 PMC
+UMASK_MEM_INST_RETIRED_STLB_MISS_LOADS  0x11
+UMASK_MEM_INST_RETIRED_STLB_MISS_STORES 0x12
+UMASK_MEM_INST_RETIRED_LOCK_LOADS       0x21
+UMASK_MEM_INST_RETIRED_SPLIT_LOADS      0x41
+UMASK_MEM_INST_RETIRED_SPLIT_STORES     0x42
+UMASK_MEM_INST_RETIRED_ALL_LOADS        0x81
+UMASK_MEM_INST_RETIRED_ALL_STORES       0x82
+UMASK_MEM_INST_RETIRED_ALL              0x83
+
+EVENT_MEM_LOAD_RETIRED                  0xD1 PMC
+UMASK_MEM_LOAD_RETIRED_L1_HIT           0x01
+UMASK_MEM_LOAD_RETIRED_L2_HIT           0x02
+UMASK_MEM_LOAD_RETIRED_L3_HIT           0x04
+UMASK_MEM_LOAD_RETIRED_L1_MISS          0x08
+UMASK_MEM_LOAD_RETIRED_L2_MISS          0x10
+UMASK_MEM_LOAD_RETIRED_L3_MISS          0x20
+UMASK_MEM_LOAD_RETIRED_FB_HIT           0x40
+
+EVENT_MEM_LOAD_L3_HIT_RETIRED           0xD2 PMC
+UMASK_MEM_LOAD_L3_HIT_RETIRED_XSNP_MISS 0x01
+UMASK_MEM_LOAD_L3_HIT_RETIRED_XSNP_HIT  0x02
+UMASK_MEM_LOAD_L3_HIT_RETIRED_XSNP_HITM 0x04
+UMASK_MEM_LOAD_L3_HIT_RETIRED_XSNP_NONE 0x08
+
+EVENT_FRONTEND_RETIRED                  0xC6 PMC
+UMASK_FRONTEND_RETIRED_DSB_MISS         0x01 0x00 0x11
+UMASK_FRONTEND_RETIRED_L1I_MISS         0x01 0x00 0x12
+UMASK_FRONTEND_RETIRED_L2_MISS          0x01 0x00 0x13
+UMASK_FRONTEND_RETIRED_ITLB_MISS        0x01 0x00 0x14
+UMASK_FRONTEND_RETIRED_STLB_MISS        0x01 0x00 0x15
+UMASK_FRONTEND_RETIRED_LATENCY_GE_2     0x01 0x00 0x400206
+UMASK_FRONTEND_RETIRED_LATENCY_GE_2_BUBBLES_GE_2 0x01 0x00 0x200206
+UMASK_FRONTEND_RETIRED_LATENCY_GE_4     0x01 0x00 0x400406
+
+EVENT_UOPS_EXECUTED                       0xB1   PMC
+UMASK_UOPS_EXECUTED_THREAD                0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_USED_CYCLES           0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_STALL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_TOTAL_CYCLES          0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_NONE_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CYCLES_NONE_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CYCLES_GE_1_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CYCLES_GE_2_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CYCLES_GE_3_UOPS_EXEC 0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CYCLES_GE_4_UOPS_EXEC 0x01
+UMASK_UOPS_EXECUTED_CORE                  0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_USED_CYCLES           0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES          0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_TOTAL_CYCLES          0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_NONE_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_NONE_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_1_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x2
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_2_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x3
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_3_UOPS_EXEC 0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x4
+UMASK_UOPS_EXECUTED_CORE_CYCLES_GE_4_UOPS_EXEC 0x02
+UMASK_UOPS_EXECUTED_X87                 0x10
+
+
+EVENT_EXE_ACTIVITY                      0xA6 PMC
+UMASK_EXE_ACTIVITY_EXE_BOUND_0_PORTS    0x01
+UMASK_EXE_ACTIVITY_1_PORTS_UTIL         0x02
+UMASK_EXE_ACTIVITY_2_PORTS_UTIL         0x04
+UMASK_EXE_ACTIVITY_3_PORTS_UTIL         0x08
+UMASK_EXE_ACTIVITY_4_PORTS_UTIL         0x10
+UMASK_EXE_ACTIVITY_BOUND_ON_STORES      0x40
+
+EVENT_UOPS_DISPATCHED_PORT              0xA1 PMC
+UMASK_UOPS_DISPATCHED_PORT_PORT_0       0x01
+UMASK_UOPS_DISPATCHED_PORT_PORT_1       0x02
+UMASK_UOPS_DISPATCHED_PORT_PORT_2       0x04
+UMASK_UOPS_DISPATCHED_PORT_PORT_3       0x08
+UMASK_UOPS_DISPATCHED_PORT_PORT_4       0x10
+UMASK_UOPS_DISPATCHED_PORT_PORT_5       0x20
+UMASK_UOPS_DISPATCHED_PORT_PORT_6       0x40
+UMASK_UOPS_DISPATCHED_PORT_PORT_7       0x80
+UMASK_UOPS_DISPATCHED_PORT_ARITH_PORTS      0x63
+DEFAULT_OPTIONS_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_DISPATCHED_PORT_ARITH_PORTS_CORE 0x63
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT_DATA_PORTS    EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_DISPATCHED_PORT_DATA_PORTS       0x9C
+
+EVENT_CYCLE_ACTIVITY_STALLS_TOTAL       0xA3 PMC
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_TOTAL EVENT_OPTION_THRESHOLD=0x4
+UMASK_CYCLE_ACTIVITY_STALLS_TOTAL       0x04
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L1D_MISS EVENT_OPTION_THRESHOLD=0x8
+UMASK_CYCLE_ACTIVITY_CYCLES_L1D_MISS    0x08
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L1D_MISS EVENT_OPTION_THRESHOLD=0xC
+UMASK_CYCLE_ACTIVITY_STALLS_L1D_MISS    0x0C
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L2_MISS EVENT_OPTION_THRESHOLD=0x1
+UMASK_CYCLE_ACTIVITY_CYCLES_L2_MISS     0x01
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L2_MISS EVENT_OPTION_THRESHOLD=0x5
+UMASK_CYCLE_ACTIVITY_STALLS_L2_MISS     0x05
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_L3_MISS EVENT_OPTION_THRESHOLD=0x2
+UMASK_CYCLE_ACTIVITY_CYCLES_L3_MISS     0x02
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_L3_MISS EVENT_OPTION_THRESHOLD=0x6
+UMASK_CYCLE_ACTIVITY_STALLS_L3_MISS     0x06
+
+
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_CYCLES_MEM_ANY EVENT_OPTION_THRESHOLD=0x10
+UMASK_CYCLE_ACTIVITY_CYCLES_MEM_ANY     0x10
+DEFAULT_OPTIONS_CYCLE_ACTIVITY_STALLS_MEM_ANY EVENT_OPTION_THRESHOLD=0x14
+UMASK_CYCLE_ACTIVITY_STALLS_MEM_ANY     0x14
+
+
+EVENT_EPT_WALK_PENDING                  0x4F PMC
+UMASK_EPT_WALK_PENDING                  0x10
+
+EVENT_ITLB_MISSES                       0x85 PMC
+UMASK_ITLB_MISSES_CAUSES_A_WALK         0x01
+UMASK_ITLB_MISSES_WALK_PENDING          0x10
+UMASK_ITLB_MISSES_STLB_HIT              0x20
+UMASK_ITLB_MISSES_WALK_COMPLETED        0x0E
+DEFAULT_OPTIONS_ITLB_MISSES_WALK_ACTIVE EVENT_OPTION_THRESHOLD=0x1
+UMASK_ITLB_MISSES_WALK_ACTIVE           0x10
+
+EVENT_DTLB_LOAD_MISSES                  0x08 PMC
+UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK    0x01
+UMASK_DTLB_LOAD_MISSES_WALK_PENDING     0x10
+UMASK_DTLB_LOAD_MISSES_STLB_HIT         0x20
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED   0x0E
+DEFAULT_OPTIONS_DTLB_LOAD_MISSES_WALK_ACTIVE EVENT_OPTION_THRESHOLD=0x1
+UMASK_DTLB_LOAD_MISSES_WALK_ACTIVE      0x10
+
+EVENT_DTLB_STORE_MISSES                 0x49 PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK   0x01
+UMASK_DTLB_STORE_MISSES_WALK_PENDING    0x10
+UMASK_DTLB_STORE_MISSES_STLB_HIT        0x20
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED  0x0E
+DEFAULT_OPTIONS_DTLB_STORE_MISSES_WALK_ACTIVE EVENT_OPTION_THRESHOLD=0x1
+UMASK_DTLB_STORE_MISSES_WALK_ACTIVE     0x10
+
+EVENT_TLB_FLUSH                         0xBD PMC
+UMASK_TLB_FLUSH_DTLB_THREAD             0x01
+UMASK_TLB_FLUSH_STLB_ANY                0x20
+
+EVENT_L1D                               0x51 PMC
+UMASK_L1D_REPLACEMENT                   0x01
+UMASK_L1D_M_EVICT                       0x04
+
+EVENT_TX_MEM                            0x54 PMC
+UMASK_TX_MEM_ABORT_CONFLICT             0x01
+UMASK_TX_MEM_ABORT_CAPACITY             0x02
+UMASK_TX_MEM_ABORT_HLE_STORE_TO_ELIDED_LOCK 0x04
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_NOT_EMPTY 0x08
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_MISMATCH 0x10
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_UNSUPPORTED_ALIGNMENT 0x20
+UMASK_TX_MEM_HLE_ELISION_BUFFER_FULL    0x40
+
+EVENT_L1D_PEND_MISS                     0x48 PMC
+UMASK_L1D_PEND_MISS_PENDING             0x01
+UMASK_L1D_PEND_MISS_FB_FULL             0x02
+DEFAULT_OPTIONS_L1D_PEND_MISS_PENDING_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_L1D_PEND_MISS_PENDING_CYCLES      0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_PENDING_CYCLES_ANY EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=0x1
+UMASK_L1D_PEND_MISS_PENDING_CYCLES_ANY  0x01
+
+EVENT_LOAD_HIT_PRE_SW_PF                0x4C PMC
+UMASK_LOAD_HIT_PRE_SW_PF                0x01
+
+EVENT_LOCK_CYCLES_CACHE_LOCK_DURATION   0x63 PMC
+UMASK_LOCK_CYCLES_CACHE_LOCK_DURATION   0x02
+DEFAULT_OPTIONS_LOCK_CYCLES_CACHE_LOCK_COUNT EVENT_OPTION_EDGE=1
+UMASK_LOCK_CYCLES_CACHE_LOCK_COUNT      0x02
+
+EVENT_LD_BLOCKS                         0x03 PMC
+UMASK_LD_BLOCKS_STORE_FORWARD           0x02
+UMASK_LD_BLOCKS_NO_SR                   0x08
+
+EVENT_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS   0x07 PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS   0x01
+
+EVENT_OFFCORE_REQUESTS                  0xB0 PMC
+UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_DEMAND_RFO       0x04
+UMASK_OFFCORE_REQUESTS_ALL_DATA_RD      0x08
+UMASK_OFFCORE_REQUESTS_L3_MISS_DEMAND_DATA_RD 0x10
+UMASK_OFFCORE_REQUESTS_ALL_REQUESTS     0x80
+
+EVENT_OFFCORE_REQUESTS_OUTSTANDING      0x60 PMC
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD 0x01
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD_GE_6 EVENT_OPTION_THRESHOLD=0x6
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD_GE_6 0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD 0x02
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO 0x04
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD 0x08
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_L3_MISS_DEMAND_DATA_RD 0x10
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DEMAND_DATA_RD EVENT_OPTION_THRESHOLD=0x1
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DEMAND_DATA_RD 0x01
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DATA_RD EVENT_OPTION_THRESHOLD=0x1
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DATA_RD 0x08
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DEMAND_CODE_RD EVENT_OPTION_THRESHOLD=0x1
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DEMAND_CODE_RD 0x02
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DEMAND_RFO EVENT_OPTION_THRESHOLD=0x1
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_DEMAND_RFO 0x04
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_L3_MISS_DEMAND_DATA_RD EVENT_OPTION_THRESHOLD=0x1
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_L3_MISS_DEMAND_DATA_RD 0x10
+DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_L3_MISS_DEMAND_DATA_RD_GE_6 EVENT_OPTION_THRESHOLD=0x6
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_L3_MISS_DEMAND_DATA_RD_GE_6 0x10
+
+
+EVENT_OFFCORE_REQUESTS_BUFFER_SQ_FULL   0xB2 PMC
+UMASK_OFFCORE_REQUESTS_BUFFER_SQ_FULL   0x01
+
+EVENT_L2_TRANS                          0xF0 PMC
+UMASK_L2_TRANS_L2_WB                    0x40
+UMASK_L2_TRANS_ALL_REQUESTS             0x80
+
+EVENT_LONGEST_LAT_CACHE                 0x2E PMC
+UMASK_LONGEST_LAT_CACHE_MISS            0x41
+UMASK_LONGEST_LAT_CACHE_REFERENCE       0x4F
+
+
+EVENT_L2_RQSTS                          0x24 PMC
+UMASK_L2_RQSTS_DEMAND_DATA_RD_MISS      0x21
+UMASK_L2_RQSTS_DEMAND_DATA_RD_HIT       0x41
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD       0xE1
+UMASK_L2_RQSTS_ALL_RFO                  0xE2
+UMASK_L2_RQSTS_ALL_CODE_RD              0xE4
+UMASK_L2_RQSTS_ALL_PF                   0xF8
+UMASK_L2_RQSTS_PF_MISS                  0x38
+UMASK_L2_RQSTS_PF_HIT                   0xD8
+UMASK_L2_RQSTS_RFO_HIT                  0x42
+UMASK_L2_RQSTS_RFO_MISS                 0x22
+UMASK_L2_RQSTS_CODE_RD_HIT              0x44
+UMASK_L2_RQSTS_CODE_RD_MISS             0x24
+UMASK_L2_RQSTS_ALL_DEMAND_MISS          0x27
+UMASK_L2_RQSTS_ALL_DEMAND_REFERENCES    0xE7
+UMASK_L2_RQSTS_MISS                     0x3F
+UMASK_L2_RQSTS_REFERENCES               0xFF
+
+EVENT_IDQ_MS                            0x79 PMC
+UMASK_IDQ_MS_UOPS                       0x30
+DEFAULT_OPTIONS_IDQ_MS_SWITCHES         EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=0x1
+UMASK_IDQ_MS_SWITCHES                   0x30
+
+EVENT_L2_LINES                          0xF1 PMC
+UMASK_L2_LINES_IN_ALL                   0x07
+
+EVENT_ARITH_DIVIDER_ACTIVE              0x14 PMC
+UMASK_ARITH_DIVIDER_ACTIVE              0x01
+DEFAULT_OPTIONS_ARITH_DIVIDER_COUNT     EVENT_OPTION_EDGE=0x1
+UMASK_ARITH_DIVIDER_COUNT               0x01
+
+EVENT_LSD_CYCLES                        0xA8 PMC
+DEFAULT_OPTIONS_LSD_CYCLES_ACTIVE       EVENT_OPTION_THRESHOLD=0x1
+UMASK_LSD_CYCLES_ACTIVE                 0x01
+DEFAULT_OPTIONS_LSD_CYCLES_4_UOPS       EVENT_OPTION_THRESHOLD=0x4
+UMASK_LSD_CYCLES_4_UOPS                 0x01
+
+EVENT_OTHER_ASSISTS_ANY                 0xC1 PMC
+UMASK_OTHER_ASSISTS_ANY                 0x3F
+
+EVENT_FRONTEND_RETIRED_LATENCY          0xC6 PMC
+UMASK_FRONTEND_RETIRED_LATENCY_GE_8     0x01 0x00 0x400806
+UMASK_FRONTEND_RETIRED_LATENCY_GE_16    0x01 0x00 0x401006
+UMASK_FRONTEND_RETIRED_LATENCY_GE_32    0x01 0x00 0x402006
+UMASK_FRONTEND_RETIRED_LATENCY_GE_64    0x01 0x00 0x404006
+UMASK_FRONTEND_RETIRED_LATENCY_GE_128   0x01 0x00 0x408006
+UMASK_FRONTEND_RETIRED_LATENCY_GE_256   0x01 0x00 0x410006
+UMASK_FRONTEND_RETIRED_LATENCY_GE_512   0x01 0x00 0x420006
+UMASK_FRONTEND_RETIRED_LATENCY_GE_2_BUBBLES_GE_1 0x01 0x00 0x100206
+UMASK_FRONTEND_RETIRED_LATENCY_GE_2_BUBBLES_GE_3 0x01 0x00 0x300206
+
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+
+EVENT_OFFCORE_RESPONSE_1                            0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x01 0xFF 0xFF
+
+EVENT_CACHE_LOOKUP                          0x34 CBOX
+UMASK_CACHE_LOOKUP_M                        0x01
+UMASK_CACHE_LOOKUP_E                        0x02
+UMASK_CACHE_LOOKUP_S                        0x04
+UMASK_CACHE_LOOKUP_I                        0x08
+UMASK_CACHE_LOOKUP_READ_FILTER              0x10
+UMASK_CACHE_LOOKUP_WRITE_FILTER             0x20
+UMASK_CACHE_LOOKUP_EXTSNP_FILTER            0x40
+UMASK_CACHE_LOOKUP_ANY_REQUEST_FILTER       0x80
+UMASK_CACHE_LOOKUP_READ_M                   0x11
+UMASK_CACHE_LOOKUP_WRITE_M                  0x21
+UMASK_CACHE_LOOKUP_EXTSNP_M                 0x41
+UMASK_CACHE_LOOKUP_ANY_M                    0x81
+UMASK_CACHE_LOOKUP_READ_E                   0x12
+UMASK_CACHE_LOOKUP_WRITE_E                  0x22
+UMASK_CACHE_LOOKUP_EXTSNP_E                 0x42
+UMASK_CACHE_LOOKUP_ANY_E                    0x82
+UMASK_CACHE_LOOKUP_READ_S                   0x14
+UMASK_CACHE_LOOKUP_WRITE_S                  0x24
+UMASK_CACHE_LOOKUP_EXTSNP_S                 0x44
+UMASK_CACHE_LOOKUP_ANY_S                    0x84
+UMASK_CACHE_LOOKUP_READ_ES                  0x16
+UMASK_CACHE_LOOKUP_WRITE_ES                 0x26
+UMASK_CACHE_LOOKUP_EXTSNP_ES                0x46
+UMASK_CACHE_LOOKUP_ANY_ES                   0x86
+UMASK_CACHE_LOOKUP_READ_I                   0x18
+UMASK_CACHE_LOOKUP_WRITE_I                  0x28
+UMASK_CACHE_LOOKUP_EXTSNP_I                 0x48
+UMASK_CACHE_LOOKUP_ANY_I                    0x88
+UMASK_CACHE_LOOKUP_READ_MESI                0x1F
+UMASK_CACHE_LOOKUP_WRITE_MESI               0x2F
+UMASK_CACHE_LOOKUP_EXTSNP_MESI              0x4F
+UMASK_CACHE_LOOKUP_ANY_MESI                 0x8F
+
+EVENT_XSNP_RESPONSE                         0x22 CBOX
+UMASK_XSNP_RESPONSE_MISS_EXTERNAL           0x21
+UMASK_XSNP_RESPONSE_MISS_XCORE              0x41
+UMASK_XSNP_RESPONSE_MISS_EVICTION           0x81
+UMASK_XSNP_RESPONSE_HIT_EXTERNAL            0x24
+UMASK_XSNP_RESPONSE_HIT_XCORE               0x44
+UMASK_XSNP_RESPONSE_HIT_EVICTION            0x84
+UMASK_XSNP_RESPONSE_HITM_EXTERNAL           0x28
+UMASK_XSNP_RESPONSE_HITM_XCORE              0x48
+UMASK_XSNP_RESPONSE_HITM_EVICTION           0x88
+
+EVENT_TRK_OCCUPANCY_ALL                     0x80 UBOX0
+UMASK_TRK_OCCUPANCY_ALL                     0x01
+
+EVENT_TRK_REQUESTS                          0x81 UBOX
+UMASK_TRK_REQUESTS_ALL                      0x01
+UMASK_TRK_REQUESTS_WRITES                   0x20
+
+EVENT_COH_TRK_OCCUPANCY                     0x83 UBOX0
+UMASK_COH_TRK_OCCUPANCY                     0x01
+
+EVENT_COH_TRK_REQUESTS                      0x84 UBOX
+UMASK_COH_TRK_REQUESTS_ALL                  0x01
+
+EVENT_UNCORE_CLOCK                          0x00 UBOXFIX
+UMASK_UNCORE_CLOCK                          0x01
diff --git a/src/includes/perfmon_types.h b/src/includes/perfmon_types.h
index 1f0663a..c93874e 100644
--- a/src/includes/perfmon_types.h
+++ b/src/includes/perfmon_types.h
@@ -7,13 +7,14 @@
  *                    Configures and reads out performance counters
  *                    on x86 based architectures. Supports multi threading.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -34,145 +35,228 @@
 #define PERFMON_TYPES_H
 
 #include <bstrlib.h>
-#include <perfmon_group_types.h>
+#include <timer.h>
+#include <inttypes.h>
+#include <perfgroup.h>
+
+#define MAX_EVENT_OPTIONS NUM_EVENT_OPTIONS
 
 /* #####   EXPORTED TYPE DEFINITIONS   #################################### */
 
+/** \addtogroup PerfMon
+ *  @{
+ */
+/////////////////////////////////////////////
+
+/*! \brief Enum of possible event and counter options
+
+List of internally used IDs for all event and counter options that are supported
+by LIKWID.
+\extends PerfmonEventOption
+*/
 typedef enum {
-    PMC0 = 0,
-    PMC1, PMC2, PMC3, PMC4, PMC5, PMC6,
-    PMC7, PMC8, PMC9, PMC10, PMC11, PMC12,
-    PMC13, PMC14, PMC15, PMC16, PMC17, PMC18,
-    PMC19, PMC20, PMC21, PMC22, PMC23, PMC24,
-    PMC25, PMC26, PMC27, PMC28, PMC29, PMC30,
-    PMC31, PMC32, PMC33, PMC34, PMC35, PMC36,
-    PMC37, PMC38, PMC39, PMC40, PMC41, PMC42,
-    PMC43, PMC44, PMC45, PMC46, PMC47, PMC48,
-    PMC49, PMC50, PMC51, PMC52, PMC53, PMC54,
-    PMC55, PMC56, PMC57, PMC58, PMC59, PMC60,
-    PMC61, PMC62, PMC63, PMC64, PMC65, PMC66,
-    PMC67, PMC68, PMC69, PMC70, PMC71, PMC72,
-    PMC73, PMC74, PMC75, PMC76, PMC77, PMC78,
-    PMC79, PMC80, PMC81, PMC82, PMC83, PMC84,
-    PMC85, PMC86, PMC87, PMC88, PMC89, PMC90,
-    PMC91, PMC92, PMC93, PMC94, PMC95, PMC96,
-    PMC97, PMC98, PMC99, PMC100, PMC101, PMC102,
-    PMC103, PMC104, PMC105, PMC106, PMC107, PMC108,
-    NUM_PMC} PerfmonCounterIndex;
+    EVENT_OPTION_NONE = 0, /*!< \brief No option, used as False value */
+    EVENT_OPTION_OPCODE, /*!< \brief Match opcode */
+    EVENT_OPTION_MATCH0, /*!< \brief Match0 register */
+    EVENT_OPTION_MATCH1, /*!< \brief Match1 register */
+    EVENT_OPTION_MATCH2, /*!< \brief Match2 register */
+    EVENT_OPTION_MATCH3, /*!< \brief Match3 register */
+    EVENT_OPTION_MASK0, /*!< \brief Mask0 register */
+    EVENT_OPTION_MASK1, /*!< \brief Mask1 register */
+    EVENT_OPTION_MASK2, /*!< \brief Mask2 register */
+    EVENT_OPTION_MASK3, /*!< \brief Mask3 register */
+    EVENT_OPTION_NID, /*!< \brief Set NUMA node ID */
+    EVENT_OPTION_TID, /*!< \brief Set Thread ID */
+    EVENT_OPTION_STATE, /*!< \brief Match for state */
+    EVENT_OPTION_EDGE, /*!< \brief Increment counter at each edge */
+    EVENT_OPTION_THRESHOLD, /*!< \brief Increment only if exceeding threshold */
+    EVENT_OPTION_INVERT, /*!< \brief Invert behavior of EVENT_OPTION_THRESHOLD, hence increment only below threshold */
+    EVENT_OPTION_COUNT_KERNEL, /*!< \brief Also count events when in kernel space */
+    EVENT_OPTION_ANYTHREAD, /*!< \brief Increment counter at events of all HW threads in the core */
+    EVENT_OPTION_OCCUPANCY, /*!< \brief Count occupancy not occurrences */
+    EVENT_OPTION_OCCUPANCY_FILTER, /*!< \brief Filter for occupancy counting */
+    EVENT_OPTION_OCCUPANCY_EDGE, /*!< \brief Increment occupancy counter at detection of an edge */
+    EVENT_OPTION_OCCUPANCY_INVERT, /*!< \brief Invert filter for occupancy counting */
+    EVENT_OPTION_IN_TRANS, /*!< \brief Count events during transactions */
+    EVENT_OPTION_IN_TRANS_ABORT, /*!< \brief Count events that aborted during transactions */
+    NUM_EVENT_OPTIONS /*!< \brief Amount of defined options */
+} EventOptionType;
+
+/*! \brief Enum of possible states of an event group
 
+List of states for event groups
+*/
 typedef enum {
-    PMC = 0,
-    FIXED,
-    THERMAL,
-    UNCORE,
-    MBOX0,
-    MBOX1,
-    MBOX2,
-    MBOX3,
-    MBOXFIX,
-    BBOX0,
-    BBOX1,
-    RBOX0,
-    RBOX1,
-    WBOX,
-    SBOX0,
-    SBOX1,
-    SBOX2,
-    CBOX0,
-    CBOX1,
-    CBOX2,
-    CBOX3,
-    CBOX4,
-    CBOX5,
-    CBOX6,
-    CBOX7,
-    CBOX8,
-    CBOX9,
-    CBOX10,
-    CBOX11,
-    CBOX12,
-    CBOX13,
-    CBOX14,
-    PBOX,
-    POWER,
-    UBOX,
-    NUM_UNITS} PerfmonType;
+    STATE_NONE = 0, /*!< \brief Not configured, not started and not stopped */
+    STATE_SETUP, /*!< \brief The event set held by the group is configured */
+    STATE_START, /*!< \brief The event set held by the group is currently running */
+} GroupState;
 
-typedef struct {
-    char* key;
-    PerfmonCounterIndex index;
-    PerfmonType type;
-    uint64_t configRegister;
-    uint64_t counterRegister;
-    uint64_t counterRegister2;
-    PciDeviceIndex device;
-} PerfmonCounterMap;
+/*! \brief List of option names
 
+List of strings for all event and counter options used for matching and output
+*/
+extern char* eventOptionTypeName[NUM_EVENT_OPTIONS];
+
+/** \brief Bitmask with no event/counter option set */
+#define EVENT_OPTION_NONE_MASK 0x0ULL
+/** \brief Define for easily creating a bitmask of all configured event/counter options */
+#define OPTIONS_TYPE_MASK(type) \
+        (((type == EVENT_OPTION_NONE)||(type >= NUM_EVENT_OPTIONS)) ? \
+        EVENT_OPTION_NONE_MASK : \
+        (1ULL<<type))
+
+
+/** @cond */ 
+#define EVENT_OPTION_OPCODE_MASK (1ULL<<EVENT_OPTION_OPCODE)
+#define EVENT_OPTION_MATCH0_MASK (1ULL<<EVENT_OPTION_MATCH0)
+#define EVENT_OPTION_MATCH1_MASK (1ULL<<EVENT_OPTION_MATCH1)
+#define EVENT_OPTION_MATCH2_MASK (1ULL<<EVENT_OPTION_MATCH2)
+#define EVENT_OPTION_MATCH3_MASK (1ULL<<EVENT_OPTION_MATCH3)
+#define EVENT_OPTION_MASK0_MASK (1ULL<<EVENT_OPTION_MASK0)
+#define EVENT_OPTION_MASK1_MASK (1ULL<<EVENT_OPTION_MASK1)
+#define EVENT_OPTION_MASK2_MASK (1ULL<<EVENT_OPTION_MASK2)
+#define EVENT_OPTION_MASK3_MASK (1ULL<<EVENT_OPTION_MASK3)
+#define EVENT_OPTION_NID_MASK (1ULL<<EVENT_OPTION_NID)
+#define EVENT_OPTION_TID_MASK (1ULL<<EVENT_OPTION_TID)
+#define EVENT_OPTION_STATE_MASK (1ULL<<EVENT_OPTION_STATE)
+#define EVENT_OPTION_EDGE_MASK (1ULL<<EVENT_OPTION_EDGE)
+#define EVENT_OPTION_THRESHOLD_MASK (1ULL<<EVENT_OPTION_THRESHOLD)
+#define EVENT_OPTION_INVERT_MASK (1ULL<<EVENT_OPTION_INVERT)
+#define EVENT_OPTION_COUNT_KERNEL_MASK (1ULL<<EVENT_OPTION_COUNT_KERNEL)
+#define EVENT_OPTION_ANYTHREAD_MASK (1ULL<<EVENT_OPTION_ANYTHREAD)
+#define EVENT_OPTION_OCCUPANCY_MASK (1ULL<<EVENT_OPTION_OCCUPANCY)
+#define EVENT_OPTION_OCCUPANCY_FILTER_MASK (1ULL<<EVENT_OPTION_OCCUPANCY_FILTER)
+#define EVENT_OPTION_OCCUPANCY_EDGE_MASK (1ULL<<EVENT_OPTION_OCCUPANCY_EDGE)
+#define EVENT_OPTION_OCCUPANCY_INVERT_MASK (1ULL<<EVENT_OPTION_OCCUPANCY_INVERT)
+#define EVENT_OPTION_IN_TRANS_MASK (1ULL<<EVENT_OPTION_IN_TRANS)
+#define EVENT_OPTION_IN_TRANS_ABORT_MASK (1ULL<<EVENT_OPTION_IN_TRANS_ABORT)
+/** @endcond */
+
+/*! \brief Structure specifying thread to CPU relation
+
+Threads are always numbered incrementally. This structure is used in order to 
+resolve the real HW thread ID.
+\extends PerfmonGroupSet
+*/
 typedef struct {
-    const char* key;
-    PerfmonGroup index;
-    int isUncore;
-    const char* info;
-    const char* config;
-    int derivedCounters;
-    const char ** derivedCounterNames;
-} PerfmonGroupMap;
+    int             thread_id; /*!< \brief Thread ID as it is used internally */
+    int             processorId; /*!< \brief Real HW thread ID */
+} PerfmonThread;
 
+/*! \brief Structure specifying event/counter options and their value
+
+Most options set a bitfield in registers and their values are stored in this structure.
+If an option is a binary option, the value is set to 1.
+\extends PerfmonEvent
+*/
 typedef struct {
-    char* key;
-    char* msg;
-} PerfmonGroupHelp;
+    EventOptionType      type; /*!< \brief Type of the option */
+    uint64_t             value; /*!< \brief Value of the option */
+} PerfmonEventOption;
+
+/*! \brief Structure specifying a performance monitoring event
 
-/* only used in westmereEX at the moment */
+This structure holds the configuration data for an event. It groups the name,
+the allowed counters and internally used values like event ID and masks. Moreover
+the event options are held here.
+\extends PerfmonEventSetEntry
+*/
 typedef struct {
-    uint32_t ctrlRegister;
-    uint32_t statusRegister;
-    uint32_t ovflRegister;
-} PerfmonUnit;
+    const char*     name; /*!< \brief Name of the event */
+    const char*     limit; /*!< \brief Valid counters for the event */
+    uint16_t        eventId; /*!< \brief ID of the event */
+    uint8_t         umask; /*!< \brief Most events need to specify a mask to limit counting */
+    uint8_t         cfgBits; /*!< \brief Misc configuration bits */
+    uint64_t        cmask; /*!< \brief Misc mask bits */
+    uint8_t         numberOfOptions; /*!< \brief Number of options for the event */
+    uint64_t        optionMask; /*!< \brief Bitmask for fast check of set options */
+    PerfmonEventOption options[NUM_EVENT_OPTIONS]; /*!< \brief List of options */
+} PerfmonEvent;
 
+/*! \brief Structure describing performance monitoring counter data
+
+Each event holds one of these structures for each thread to store the counter
+data, whether it is configured, and the number of overflows that occurred.
+\extends PerfmonEventSetEntry
+*/
 typedef struct {
-    int init;
-    int id;  /* TODO id is only used for EX type processors */
-    double counterData;
+    int         init; /*!< \brief Flag if corresponding control register is set up properly */
+    int         id; /*!< \brief Offset in higher level control register, e.g. position of enable bit */
+    int         overflows; /*!< \brief Amount of overflows */
+    uint64_t    startData; /*!< \brief Start data from the counter */
+    uint64_t    counterData; /*!< \brief Intermediate data from the counters */
+    double      lastResult; /*!< \brief Last measurement result*/
+    double      fullResult; /*!< \brief Aggregated measurement result */
 } PerfmonCounter;
 
-typedef struct {
-    int processorId;
-    PerfmonCounter counters[NUM_PMC];
-} PerfmonThread;
 
-typedef struct {
-    const char* name;
-    const char* limit;
-    uint16_t eventId;
-    uint8_t umask;
-    uint8_t cfgBits;
-    uint8_t cmask;
-} PerfmonEvent;
+/*! \brief Structure specifying a performance monitoring event
 
+An eventSet consists of an event and a counter and the read counter values.
+\extends PerfmonEventSet
+*/
 typedef struct {
-    PerfmonEvent event;
-    PerfmonCounterIndex index;
-    double* result;
+    PerfmonEvent        event; /*!< \brief Event configuration */
+    RegisterIndex       index; /*!< \brief Index of the counter register in the counter map */
+    RegisterType        type; /*!< \brief Type of the counter register and event */
+    PerfmonCounter*     threadCounter; /*!< \brief List of counter data for each thread, list length is \a numberOfThreads in PerfmonGroupSet */
 } PerfmonEventSetEntry;
 
+/*! \brief Structure specifying a performance monitoring event group
+
+A PerfmonEventSet holds a set of event and counter combinations and some global information about all eventSet entries
+\extends PerfmonGroupSet
+*/
 typedef struct {
-    int numberOfEvents;
-    PerfmonEventSetEntry* events;
+    int                   numberOfEvents; /*!< \brief Number of eventSets in \a events */
+    PerfmonEventSetEntry* events; /*!< \brief List of eventSets */
+    TimerData             timer; /*!< \brief Time information how long the counters were running */
+    double                rdtscTime; /*!< \brief Evaluation of the Time information in seconds */
+    double                runTime; /*!< \brief Sum of all time information in seconds that the group was running */
+#ifdef __x86_64
+    __uint128_t           regTypeMask; /*!< \brief Bitmask for easy checks which types are included in the eventSet */
+#else
+    uint64_t              regTypeMask; /*!< \brief Bitmask for easy checks which types are included in the eventSet */
+#endif
+    GroupState            state; /*!< \brief Current state of the event group (configured, started, none) */
+    GroupInfo             group; /*!< \brief Structure holding the performance group information */
 } PerfmonEventSet;
 
+/*! \brief Structure specifying all performance monitoring event groups
 
+The global PerfmonGroupSet structure holds all eventSets and threads that are
+configured to measure. Only one eventSet can be measured at a time but the groups
+can be switched to perform some kind of multiplexing.
+*/
 typedef struct {
-    bstring label;
-    double* value;
-} PerfmonResult;
+    int              numberOfGroups; /*!< \brief List length of \a groups*/
+    int              numberOfActiveGroups; /*!< \brief Amount of added eventSets. Only those eventSets can be accessed in \a groups. */
+    int              activeGroup; /*!< \brief Currently active eventSet */
+    PerfmonEventSet* groups; /*!< \brief List of eventSets */
+    int              numberOfThreads; /*!< \brief Amount of threads in \a threads */
+    PerfmonThread*   threads; /*!< \brief List of threads */
+} PerfmonGroupSet;
 
-typedef struct {
-    bstrList* header;
-    int numRows;
-    int numColumns;
-    PerfmonResult* rows;
-} PerfmonResultTable;
+/** \brief List of counters with name, config register, counter registers and
+if needed PCI device */
+extern RegisterMap* counter_map;
+/** \brief List of boxes with name, config register, counter registers and if
+needed PCI device. Mainly used in Uncore handling but also core-local counters
+are defined as a box. */
+extern BoxMap* box_map;
+/** \brief List of events available for the current architecture */
+extern PerfmonEvent* eventHash;
+/** \brief List of PCI devices available for the current architecture */
+extern PciDevice* pci_devices;
+/** @}*/
+
+/* perfmon datatypes */
+extern PerfmonGroupSet *groupSet;
+extern int perfmon_numCounters;
+extern int perfmon_numCoreCounters;
+extern int perfmon_numUncoreCounters;
+extern int perfmon_numArchEvents;
 
 
 #endif /*PERFMON_TYPES_H*/
diff --git a/src/includes/perfmon_westmere.h b/src/includes/perfmon_westmere.h
index c469766..056a2a7 100644
--- a/src/includes/perfmon_westmere.h
+++ b/src/includes/perfmon_westmere.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_westmere.h
  *
- *      Description:  Header File of perfmon module for Westmere.
+ *      Description:  Header File of perfmon module for Intel Westmere.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,8 +30,6 @@
  */
 
 #include <perfmon_westmere_events.h>
-#include <perfmon_westmere_groups.h>
 
-static int perfmon_numGroupsWestmere = NUM_GROUPS_WESTMERE;
 static int perfmon_numArchEventsWestmere = NUM_ARCH_EVENTS_WESTMERE;
 
diff --git a/src/includes/perfmon_westmereEX.h b/src/includes/perfmon_westmereEX.h
index 8cbc921..a0c52ac 100644
--- a/src/includes/perfmon_westmereEX.h
+++ b/src/includes/perfmon_westmereEX.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_westmereEX.h
  *
- *      Description:  Header File of perfmon module for Westmere EX.
+ *      Description:  Header File of perfmon module for Intel Westmere EX.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,736 +30,982 @@
  */
 
 #include <perfmon_westmereEX_events.h>
-#include <perfmon_westmereEX_groups.h>
 #include <perfmon_westmereEX_counters.h>
+#include <perfmon_nehalemEX_westmereEX_common.h>
+#include <error.h>
+#include <affinity.h>
 
 
 static int perfmon_numCountersWestmereEX = NUM_COUNTERS_WESTMEREEX;
-static int perfmon_numGroupsWestmereEX = NUM_GROUPS_WESTMEREEX;
 static int perfmon_numArchEventsWestmereEX = NUM_ARCH_EVENTS_WESTMEREEX;
 
-static PerfmonUnit westmereEX_PMunits[NUM_UNITS];
 
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
-void perfmon_init_westmereEX(PerfmonThread *thread)
+int perfmon_init_westmereEX(int cpu_id)
 {
+    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+    lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    return 0;
+}
+
+uint32_t wex_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint32_t flags = (1ULL<<(1+(index*4)));
+    for(j = 0; j < event->numberOfOptions; j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_COUNT_KERNEL:
+                flags |= (1ULL<<(index*4));
+                break;
+            case EVENT_OPTION_ANYTHREAD:
+                flags |= (1ULL<<(2+(index*4)));
+            default:
+                break;
+        }
+    }
+    return flags;
+}
+
+int wex_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
     uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
-
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC0, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC1, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC2, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC3, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
-
-    /* initialize fixed counters
-     * FIXED 0: Instructions retired
-     * FIXED 1: Clocks unhalted core
-     * FIXED 2: Clocks unhalted ref */
-    //msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
-
-    /* Preinit of PERFEVSEL registers */
-    //flags |= (1<<22);  /* enable flag */
-    //flags |= (1<<16);  /* user mode flag */
-
-    /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/
-
-    /* Initialize uncore */
-    /* MBOX */
-    thread->counters[PMC7].id  = 0;
-    thread->counters[PMC8].id  = 1;
-    thread->counters[PMC9].id  = 2;
-    thread->counters[PMC10].id = 3;
-    thread->counters[PMC11].id = 4;
-    thread->counters[PMC12].id = 5;
-    westmereEX_PMunits[MBOX0].ctrlRegister = MSR_M0_PMON_BOX_CTRL;
-    westmereEX_PMunits[MBOX0].statusRegister = MSR_M0_PMON_BOX_STATUS;
-    westmereEX_PMunits[MBOX0].ovflRegister = MSR_M0_PMON_BOX_OVF_CTRL;
-
-    thread->counters[PMC13].id = 0;
-    thread->counters[PMC14].id = 1;
-    thread->counters[PMC15].id = 2;
-    thread->counters[PMC16].id = 3;
-    thread->counters[PMC17].id = 4;
-    thread->counters[PMC18].id = 5;
-    westmereEX_PMunits[MBOX1].ctrlRegister = MSR_M1_PMON_BOX_CTRL;
-    westmereEX_PMunits[MBOX1].statusRegister = MSR_M1_PMON_BOX_STATUS;
-    westmereEX_PMunits[MBOX1].ovflRegister = MSR_M1_PMON_BOX_OVF_CTRL;
-
-    /* BBOX */
-    thread->counters[PMC19].id = 0;
-    thread->counters[PMC20].id = 1;
-    thread->counters[PMC21].id = 2;
-    thread->counters[PMC22].id = 3;
-    westmereEX_PMunits[BBOX0].ctrlRegister = MSR_B0_PMON_BOX_CTRL;
-    westmereEX_PMunits[BBOX0].statusRegister =  MSR_B0_PMON_BOX_STATUS;
-    westmereEX_PMunits[BBOX0].ovflRegister = MSR_B0_PMON_BOX_OVF_CTRL;
-
-    thread->counters[PMC23].id = 0;
-    thread->counters[PMC24].id = 1;
-    thread->counters[PMC25].id = 2;
-    thread->counters[PMC26].id = 3;
-    westmereEX_PMunits[BBOX1].ctrlRegister = MSR_B1_PMON_BOX_CTRL;
-    westmereEX_PMunits[BBOX1].statusRegister =  MSR_B1_PMON_BOX_STATUS;
-    westmereEX_PMunits[BBOX1].ovflRegister = MSR_B1_PMON_BOX_OVF_CTRL;
-
-    /* RBOX */
-    thread->counters[PMC27].id = 0;
-    thread->counters[PMC28].id = 1;
-    thread->counters[PMC29].id = 2;
-    thread->counters[PMC30].id = 3;
-    thread->counters[PMC31].id = 4;
-    thread->counters[PMC32].id = 5;
-    thread->counters[PMC33].id = 6;
-    thread->counters[PMC34].id = 7;
-    westmereEX_PMunits[RBOX0].ctrlRegister = MSR_R0_PMON_BOX_CTRL;
-    westmereEX_PMunits[RBOX0].statusRegister =  MSR_R0_PMON_BOX_STATUS;
-    westmereEX_PMunits[RBOX0].ovflRegister = MSR_R0_PMON_BOX_OVF_CTRL;
-
-    thread->counters[PMC35].id = 0;
-    thread->counters[PMC36].id = 1;
-    thread->counters[PMC37].id = 2;
-    thread->counters[PMC38].id = 3;
-    thread->counters[PMC39].id = 4;
-    thread->counters[PMC40].id = 5;
-    thread->counters[PMC41].id = 6;
-    thread->counters[PMC42].id = 7;
-    westmereEX_PMunits[RBOX1].ctrlRegister = MSR_R1_PMON_BOX_CTRL;
-    westmereEX_PMunits[RBOX1].statusRegister =  MSR_R1_PMON_BOX_STATUS;
-    westmereEX_PMunits[RBOX1].ovflRegister = MSR_R1_PMON_BOX_OVF_CTRL;
-
-    /* WBOX */
-    thread->counters[PMC43].id = 0;
-    thread->counters[PMC44].id = 1;
-    thread->counters[PMC45].id = 2;
-    thread->counters[PMC46].id = 3;
-    thread->counters[PMC47].id = 31;
-    westmereEX_PMunits[WBOX].ctrlRegister   = MSR_W_PMON_BOX_CTRL;
-    westmereEX_PMunits[WBOX].statusRegister = MSR_W_PMON_BOX_STATUS;
-    westmereEX_PMunits[WBOX].ovflRegister   = MSR_W_PMON_BOX_OVF_CTRL;
-
-    thread->counters[PMC48].id = 0;
-    westmereEX_PMunits[UBOX].ctrlRegister   = MSR_U_PMON_GLOBAL_CTRL;
-    westmereEX_PMunits[UBOX].statusRegister = MSR_U_PMON_GLOBAL_STATUS;
-    westmereEX_PMunits[UBOX].ovflRegister   = MSR_U_PMON_GLOBAL_OVF_CTRL;
-
-    /* Set IDs for all CBOXes */
-    int walker = 0;
-    for (int i=PMC49; i<=PMC98; i++)
-    {
-        thread->counters[i].id = walker;
-        walker = (walker == 4 ? 0 : walker + 1);
-    }
-    westmereEX_PMunits[CBOX0].ctrlRegister   = MSR_C0_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX0].statusRegister = MSR_C0_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX0].ovflRegister   = MSR_C0_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX1].ctrlRegister   = MSR_C1_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX1].statusRegister = MSR_C1_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX1].ovflRegister   = MSR_C1_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX2].ctrlRegister   = MSR_C2_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX2].statusRegister = MSR_C2_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX2].ovflRegister   = MSR_C2_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX3].ctrlRegister   = MSR_C3_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX3].statusRegister = MSR_C3_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX3].ovflRegister   = MSR_C3_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX4].ctrlRegister   = MSR_C4_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX4].statusRegister = MSR_C4_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX4].ovflRegister   = MSR_C4_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX5].ctrlRegister   = MSR_C5_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX5].statusRegister = MSR_C5_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX5].ovflRegister   = MSR_C5_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX6].ctrlRegister   = MSR_C6_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX6].statusRegister = MSR_C6_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX6].ovflRegister   = MSR_C6_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX7].ctrlRegister   = MSR_C7_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX7].statusRegister = MSR_C7_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX7].ovflRegister   = MSR_C7_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX8].ctrlRegister   = MSR_C8_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX8].statusRegister = MSR_C8_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX8].ovflRegister   = MSR_C8_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX9].ctrlRegister   = MSR_C9_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX9].statusRegister = MSR_C9_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX9].ovflRegister   = MSR_C9_PMON_BOX_OVF_CTRL;
-
-    thread->counters[PMC99].id = 0;
-    thread->counters[PMC100].id = 1;
-    thread->counters[PMC101].id = 2;
-    thread->counters[PMC102].id = 3;
-    westmereEX_PMunits[SBOX0].ctrlRegister   = MSR_S0_PMON_BOX_CTRL;
-    westmereEX_PMunits[SBOX0].statusRegister = MSR_S0_PMON_BOX_STATUS;
-    westmereEX_PMunits[SBOX0].ovflRegister   = MSR_S0_PMON_BOX_OVF_CTRL;
-    thread->counters[PMC103].id = 0;
-    thread->counters[PMC104].id = 1;
-    thread->counters[PMC105].id = 2;
-    thread->counters[PMC106].id = 3;
-    westmereEX_PMunits[SBOX1].ctrlRegister   = MSR_S1_PMON_BOX_CTRL;
-    westmereEX_PMunits[SBOX1].statusRegister = MSR_S1_PMON_BOX_STATUS;
-    westmereEX_PMunits[SBOX1].ovflRegister   = MSR_S1_PMON_BOX_OVF_CTRL;
-
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
-            lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id))
-    {
-        msr_write(cpu_id, MSR_W_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_W_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_W_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_W_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_W_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_W_PMON_FIXED_CTR, 0x0ULL);
-
-        msr_write(cpu_id, MSR_M0_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL4, 0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL5, 0x0ULL);
-
-        msr_write(cpu_id, MSR_M1_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL4, 0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL5, 0x0ULL);
-
-        msr_write(cpu_id, MSR_B0_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL3, 0x0ULL);
-
-        msr_write(cpu_id, MSR_B1_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL3, 0x0ULL);
-
-        msr_write(cpu_id, MSR_R0_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL4, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL5, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL6, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL7, 0x0ULL);
-
-        msr_write(cpu_id, MSR_R1_PMON_BOX_CTRL,   0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL8,  0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL9,  0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL10, 0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL11, 0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL12, 0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL13, 0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL14, 0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL15, 0x0ULL);
-
-        msr_write(cpu_id, MSR_U_PMON_GLOBAL_EVNT_SEL, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C8_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C8_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C8_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C8_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C8_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C9_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C9_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C9_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C9_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C9_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL3, 0x0ULL);
-
-        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL3, 0x0ULL);
+    uint64_t offcore_flags = 0x0ULL;
 
+    flags = (1ULL<<22)|(1ULL<<16);
+    /* Intel with standard 8 bit event mask: [7:0] */
+    flags |= (event->umask<<8) + event->eventId;
+
+    /* set custom cfg and cmask */
+    if ((event->cfgBits != 0) &&
+        (event->eventId != 0xB7) &&
+        (event->eventId != 0xBB))
+    {
+        flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+    }
+
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL)<<24;
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    offcore_flags |= (event->options[j].value & 0xFFULL);
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    offcore_flags |= (event->options[j].value & 0xF7ULL)<<8;
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    if (event->eventId == 0xB7)
+    {
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
         {
-            uint32_t ubflags = 0x0UL;
-            ubflags |= (1<<29); /* reset all */
-            msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
         }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
     }
+    else if (event->eventId == 0xBB)
+    {
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, offcore_flags));
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister , flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
 }
 
-#define MBOX_GATE(NUM)  \
-    flags = 0x41ULL; \
-switch (event->cfgBits)  \
-{  \
-    case 0x00:   /* primary Event */  \
-        flags |= (event->eventId<<9);  \
-        break;  \
-    case 0x01: /* secondary Events */  \
-        /* TODO fvid index is missing defaults to 0 */   \
-        flags |= (1<<7); /* toggle flag mode */   \
-        flags |= (event->eventId<<19);   \
-        switch (event->eventId)   \
-        {   \
-            case 0x00: /* CYCLES_DSP_FILL: DSP */   \
-                {   \
-                    uint64_t dsp_flags = 0x0ULL;   \
-                    dsp_flags |= (event->umask<<7);  \
-                    msr_write(cpu_id, MSR_M##NUM##_PMON_DSP, dsp_flags);   \
-                }   \
-                break;   \
-            case 0x01: /* CYCLES_SCHED_MODE: ISS */   \
-                {   \
-                    uint32_t iss_flags = 0x0UL;   \
-                    iss_flags |= (event->umask<<4);   \
-                    msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
-                }    \
-                break;   \
-            case 0x05: /* CYCLES_PGT_STATE: PGT */   \
-                {   \
-                    uint32_t pgt_flags = 0x0UL;   \
-                    pgt_flags |= (event->umask<<6);   \
-                    msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags);   \
-                }    \
-                break;   \
-            case 0x06: /* BCMD_SCHEDQ_OCCUPANCY: MAP */   \
-                {   \
-                    uint32_t map_flags = 0x0UL;   \
-                    map_flags |= (event->umask<<6);   \
-                    msr_write(cpu_id, MSR_M##NUM##_PMON_MAP, map_flags);   \
-                }   \
-                break;   \
-        }    \
-        break;   \
-    case 0x02: /* DRAM_CMD: PLD/ISS */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t pld_flags = 0x0UL;   \
-            uint32_t iss_flags = 0x0UL;   \
-            pld_flags |= (event->umask<<8);   \
-            if (event->cmask != 0)   \
-            {   \
-                iss_flags |= (event->cmask<<7);   \
-                pld_flags |= 1; /* toggle cmd flag */   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_PLD, pld_flags);   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
-        }   \
-        break;   \
-    case 0x03: /* DSP_FILL: DSP */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint64_t dsp_flags = 0x0ULL;   \
-            dsp_flags |= (event->umask<<7);   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_DSP, dsp_flags);   \
-        }   \
-        break;   \
-    case 0x04: /* DRAM_MISC: PLD */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint64_t pld_flags = 0x0ULL;   \
-            switch (event->cmask)   \
-            {   \
-                case 0x0:   \
-                            pld_flags |= (1<<16);   \
-                pld_flags |= (event->umask<<19);   \
-                break;   \
-                case 0x1:   \
-                            pld_flags |= (event->umask<<18);   \
-                break;   \
-                case 0x2:   \
-                            pld_flags |= (event->umask<<17);   \
-                break;   \
-                case 0x3:   \
-                            pld_flags |= (event->umask<<7);   \
-                break;   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_PLD, pld_flags);   \
-        }   \
-        break;   \
-    case 0x05: /* FRM_TYPE: ISS */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t iss_flags = 0x0UL;   \
-            iss_flags |= event->umask;   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
-        }   \
-        break;   \
-    case 0x06: /* FVC_EV0: FVC */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t fvc_flags = 0x0UL;   \
-            fvc_flags |= (event->umask<<12);   \
-            if (event->umask == 0x5)   \
-            {   \
-                fvc_flags |= (event->cmask<<6);   \
-            }   \
-            else   \
-            {   \
-                fvc_flags |= (event->cmask<<9);   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
-            VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV0) \
-        }   \
-        break;   \
-    case 0x07: /* FVC_EV1: FVC */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t fvc_flags = 0x0UL;   \
-            fvc_flags |= (event->umask<<15);   \
-            if (event->umask == 0x5)   \
-            {   \
-                fvc_flags |= (event->cmask<<6);   \
-            }   \
-            else   \
-            {   \
-                fvc_flags |= (event->cmask<<9);   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
-            VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV1) \
-        }   \
-        break;   \
-    case 0x08: /* FVC_EV2: FVC */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t fvc_flags = 0x0UL;   \
-            fvc_flags |= (event->umask<<18);   \
-            if (event->umask == 0x5)   \
-            {   \
-                fvc_flags |= (event->cmask<<6);   \
-            }   \
-            else   \
-            {   \
-                fvc_flags |= (event->cmask<<9);   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
-            VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV2) \
-        }   \
-        break;   \
-    case 0x09: /* FVC_EV3: FVC(ZDP) */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t fvc_flags = 0x0UL;   \
-            fvc_flags |= (event->umask<<21);   \
-            if (event->umask == 0x5)   \
-            {   \
-                fvc_flags |= (event->cmask<<6);   \
-            }   \
-            else   \
-            {   \
-                fvc_flags |= (event->cmask<<9);   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
-        }   \
-        break;   \
-    case 0x0A: /* ISS_SCHED: ISS */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t iss_flags = 0x0UL;   \
-            iss_flags |= (event->umask<<10);   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
-        }   \
-        break;   \
-    case 0x0B: /* PGT_PAGE_EV: PGT */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t pgt_flags = 0x0UL;   \
-            pgt_flags |= event->umask;   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags);   \
-        }   \
-        break;   \
-    case 0x0C: /* PGT_PAGE_EV2: PGT */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t pgt_flags = 0x0UL;   \
-            pgt_flags |= (event->umask<<11);   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags);   \
-        }   \
-        break;   \
-    case 0x0D: /* THERM_TRP_DN: THR */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t thr_flags = 0x0UL;   \
-            thr_flags |= (1<<3);   \
-            thr_flags |= (event->umask<<9);   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, thr_flags);   \
-        }   \
-        break;   \
+int wex_bbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint64_t flags = 0x0ULL;
+    RegisterType type = counter_map[index].type;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+
+    flags = 0x1ULL;
+    flags |=  (event->eventId<<1);
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_MATCH0:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, event->options[j].value));
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, event->options[j].value, SETUP_BBOX_MATCH);
+                    break;
+                case EVENT_OPTION_MASK0:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, event->options[j].value));
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, event->options[j].value, SETUP_BBOX_MASK);
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_BBOX);
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
 }
 
-/* RBOX macros */
-#define RBOX_GATE(NUM)  \
-    flags = 0x01ULL; /* set local enable flag */ \
-switch (event->eventId) {  \
-    case 0x00:  \
-                flags |= (event->umask<<1); /* configure sub register */   \
-    {  \
-        uint32_t iperf_flags = 0x0UL;   \
-        iperf_flags |= (event->cfgBits<<event->cmask); /* configure event */  \
-        switch (event->umask) { /* pick correct iperf register */  \
-            case 0x00: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF0_P0, iperf_flags);   \
-            break; \
-            case 0x01: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF1_P0, iperf_flags);   \
-            break; \
-            case 0x06: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF0_P1, iperf_flags);   \
-            break; \
-            case 0x07: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF1_P1, iperf_flags);   \
-            break; \
-            case 0x0C: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF0_P2, iperf_flags);   \
-            break; \
-            case 0x0D: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF1_P2, iperf_flags);   \
-            break; \
-            case 0x12: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF0_P3, iperf_flags);   \
-            break; \
-            case 0x13: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF1_P3, iperf_flags);   \
-            break; \
-        } } \
-    break; \
-    case 0x01: \
-               flags |= (event->umask<<1); /* configure sub register */   \
-    { \
-        uint32_t qlx_flags = 0x0UL;   \
-        qlx_flags |= (event->cfgBits); /* configure event */  \
-        if (event->cmask) qlx_flags |= (event->cmask<<4);  \
-        switch (event->umask) { /* pick correct qlx register */  \
-            case 0x02: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P0, qlx_flags);   \
-            break; \
-            case 0x03: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P0, (qlx_flags<<8));   \
-            break; \
-            case 0x08: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P0, qlx_flags);   \
-            break; \
-            case 0x09: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P1, (qlx_flags<<8));   \
-            break; \
-            case 0x0E: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P0, qlx_flags);   \
-            break; \
-            case 0x0F: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P2, (qlx_flags<<8));   \
-            break; \
-            case 0x14: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P0, qlx_flags);   \
-            break; \
-            case 0x15: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P3, (qlx_flags<<8));   \
-            break; \
-        } } \
-    break; \
+int wex_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint64_t flags = 0x0ULL;
+    uint64_t reg = counter_map[index].configRegister;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+
+    flags = (1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0x1FULL) << 24);
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+        VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_CBOX);
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
 }
 
+int wex_wbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint64_t flags = 0x0ULL;
+    uint64_t reg = counter_map[index].configRegister;
+    int j;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    flags |= (1ULL<<22); /* set enable bit */
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        for (j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0xFFULL) << 24);
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags));
+        VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_WBOX);
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
 
 
-void perfmon_setupCounterThread_westmereEX(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+int wex_sbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
 {
-    int haveLock = 0;
+    int j;
     uint64_t flags = 0x0ULL;
-    uint64_t reg = westmereEX_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
+    int write_mm_cfg = 0;
+    RegisterType type = counter_map[index].type;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
     {
-        haveLock = 1;
+        return 0;
     }
 
-    switch (westmereEX_counter_map[index].type)
+    flags = (1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
     {
-        case PMC:
-            flags = (1<<22)|(1<<16);
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0xFFULL) << 24);
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    if (event->eventId == 0x0)
+                    {
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1,event->options[j].value));
+                        VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, event->options[j].value, SETUP_SBOX_MATCH);
+                        write_mm_cfg = 1;
+                    }
+                    break;
+                case EVENT_OPTION_MASK0:
+                    if (event->eventId == 0x0)
+                    {
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2,event->options[j].value));
+                        VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, event->options[j].value, SETUP_SBOX_MASK);
+                        write_mm_cfg = 1;
+                    }
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    if (write_mm_cfg && event->eventId == 0x0)
+    {
+        if (type == SBOX0)
+        {
+            VERBOSEPRINTREG(cpu_id, MSR_S0_PMON_MM_CFG, (1ULL<<63), SETUP_SBOX_MATCH_CTRL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S0_PMON_MM_CFG ,(1ULL<<63)));
+        }
+        else if (type == SBOX1)
+        {
+            VERBOSEPRINTREG(cpu_id, MSR_S1_PMON_MM_CFG, (1ULL<<63), SETUP_SBOX_MATCH_CTRL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S1_PMON_MM_CFG ,(1ULL<<63)));
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_SBOX);
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
 
-            /* Intel with standard 8 bit event mask: [7:0] */
-            flags |= (event->umask<<8) + event->eventId;
+int wex_ubox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint64_t flags = 0x0ULL;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
 
-            if (event->cfgBits != 0) /* set custom cfg and cmask */
+    flags = (1ULL<<22);
+    flags |= (event->eventId & 0xFF);
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
             {
-                flags &= ~(0xFFFFU<<16);  /* clear upper 16bits */
-                flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                default:
+                    break;
             }
+        }
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, UBOX_CTRL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister , flags));
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
 
-            msr_write(cpu_id, reg , flags);
-            VERBOSEPRINTREG(cpu_id, reg, flags, PMC_EV_SEL)
-                break;
 
-        case FIXED:
-            fixed_flags |= (0x2 <<(index*4));
-            msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
-            break;
 
-        case MBOX0:
-            if (haveLock)
+
+int wex_mbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint64_t flags = 0x41ULL;
+    uint64_t subflags1 = 0x0ULL;
+    uint64_t subflags2 = 0x0ULL;
+    int number;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (((counter_map[index].configRegister& 0xFF0) == 0xCA0) ||
+       ((counter_map[index].configRegister& 0xFF0) == 0xCB0))
+        number = 0;
+    else
+        number = 1;
+
+    if (event->numberOfOptions > 0 && (event->cfgBits == 0x02 || event->cfgBits == 0x04))
+    {
+        for (int j=0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_MATCH0:
+                    subflags2 = (event->options[j].value & 0x3FFFFFFFFULL);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ADDR_MATCH], subflags2));
+                    VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ADDR_MATCH], subflags2, SETUP_MBOX_ADDR_MATCH);
+                    break;
+                case EVENT_OPTION_MASK0:
+                    subflags2 = ((event->options[j].value & 0x1FFFFFFC0ULL)>>6);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ADDR_MASK], subflags2));
+                    VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ADDR_MASK], subflags2, SETUP_MBOX_ADDR_MASK);
+                    break;
+                default:
+                    break;
+            }
+        }
+        subflags2 = 0x0ULL;
+    }
+    switch (event->cfgBits)
+    {
+        case 0x00:
+            flags |= (event->eventId & 0x1FULL)<<9; 
+            break;
+        case 0x01:
+            flags |= (1ULL<<7);
+            flags |= (event->eventId & 0x7ULL)<<19;
+            switch (event->eventId)
             {
-                MBOX_GATE(0);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, MBOX0_CTRL)
+                case 0x00:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][DSP], &subflags1));
+                    subflags1 |= (event->umask & 0xFULL)<<7;
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][DSP], subflags1));
+                    VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][DSP], subflags1, SETUP_MBOX_DSP);
+                    break;
+                case 0x01:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], &subflags1));
+                    subflags1 |= (event->umask & 0x7ULL)<<4;
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], subflags1));
+                    VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ISS], subflags1, SETUP_MBOX_ISS);
+                    break;
+                case 0x05:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], &subflags1));
+                    subflags1 |= (event->umask & 0x1ULL)<<6;
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], subflags1));
+                    VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][PGT], subflags1, SETUP_MBOX_PGT);
+                    break;
+                case 0x06:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][MAP], &subflags1));
+                    subflags1 |= (event->umask & 0x7ULL)<<6;
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][MAP], subflags1));
+                    VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][MAP], subflags1, SETUP_MBOX_MAP);
+                    break;
             }
             break;
-
-        case MBOX1:
-            if (haveLock)
+        case 0x02:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PLD], &subflags1));
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], &subflags2));
+            subflags1 |= (event->umask & 0x1FULL)<<8;
+            if ((event->cmask & 0xF0ULL) != 0)
+            {
+                subflags1 |= (1ULL<<0);
+            }
+            if ((event->cmask & 0xFULL) != 0)
             {
-                MBOX_GATE(1);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, MBOX1_CTRL)
+                subflags2 |= (event->cmask & 0x7ULL)<<7;
             }
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PLD], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][PLD], subflags1, SETUP_MBOX_PLD);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], subflags2));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ISS], subflags2, SETUP_MBOX_ISS);
             break;
-
-        case BBOX0:
-
-        case BBOX1:
-            if (haveLock)
+        case 0x03:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][DSP], &subflags1));
+            subflags1 |= (event->umask & 0xFULL)<<7;
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][DSP], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][DSP], subflags1, SETUP_MBOX_DSP);
+            break;
+        case 0x04:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PLD], &subflags1));
+            switch (event->cmask)
             {
-                flags = 0x1ULL; /* set enable bit */
-                flags |=  (event->eventId<<1);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, BBOX_CTRL)
+                case 0x0:
+                    subflags1 |= (1ULL<<16);
+                    subflags1 |= (event->umask & 0x1FULL)<<19;
+                    break;
+                case 0x1:
+                    subflags1 |= (event->umask & 0x1ULL)<<18;
+                    break;
+                case 0x2:
+                    subflags1 |= (event->umask & 0x1ULL)<<17;
+                    break;
+                case 0x3:
+                    subflags1 |= (event->umask & 0x1ULL)<<7;
+                    break;
             }
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PLD], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][PLD], subflags1, SETUP_MBOX_PLD);
             break;
-
-        case RBOX0:
-            if (haveLock)
+        case 0x05:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], &subflags1));
+            subflags1 |= (event->umask & 0xFULL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ISS], subflags1, SETUP_MBOX_ISS);
+            break;
+        case 0x06:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], &subflags1));
+            subflags1 |= (event->umask & 0x7ULL)<<12;
+            if (event->umask == 0x5)
+            {
+                subflags1 |= (event->cmask & 0x7ULL)<<6;
+            }
+            else
+            {
+                subflags1 |= (event->cmask & 0x7ULL)<<9;
+            }
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ZDP], subflags1, SETUP_MBOX_ZDP);
+            break;
+        case 0x07:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], &subflags1));
+            subflags1 |= (event->umask & 0x7ULL)<<15;
+            if (event->umask == 0x5)
+            {
+                subflags1 |= (event->cmask & 0x7ULL)<<6;
+            }
+            else
+            {
+                subflags1 |= (event->cmask & 0x7ULL)<<9;
+            }
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ZDP], subflags1, SETUP_MBOX_ZDP);
+            break;
+        case 0x08:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], &subflags1));
+            subflags1 |= (event->umask & 0x7ULL)<<18;
+            if (event->umask == 0x5)
+            {
+                subflags1 |= (event->cmask & 0x7ULL)<<6;
+            }
+            else
+            {
+                subflags1 |= (event->cmask & 0x7ULL)<<9;
+            }
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ZDP], subflags1, SETUP_MBOX_ZDP);
+            break;
+        case 0x09:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], &subflags1));
+            subflags1 |= (event->umask & 0x7ULL)<<21;
+            if (event->umask == 0x5)
+            {
+                subflags1 |= (event->cmask & 0x7ULL)<<6;
+            }
+            else
+            {
+                subflags1 |= (event->cmask & 0x7ULL)<<9;
+            }
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ZDP], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ZDP], subflags1, SETUP_MBOX_ZDP);
+            break;
+        case 0x0A:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], &subflags1));
+            subflags1 |= (event->umask & 0x1ULL)<<10;
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][ISS], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][ISS], subflags1, SETUP_MBOX_ISS);
+            break;
+        case 0x0B:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], &subflags1));
+            subflags1 |= (event->umask & 0x1ULL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][PGT], subflags1, SETUP_MBOX_PGT);
+            break;
+        case 0x0C:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], &subflags1));
+            subflags1 |= (event->umask & 0x1ULL)<<11;
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][PGT], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][PGT], subflags1, SETUP_MBOX_PGT);
+            break;
+        case 0x0D:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][THR], &subflags1));
+            subflags1 |= (event->umask & 0x3ULL)<<9;
+            if (event->cmask == 0x0)
+            {
+                subflags1 |= (1ULL<<3);
+            }
+            else
+            {
+                subflags1 &= ~(1ULL<<3);
+                subflags1 |= (event->cmask & 0x7ULL)<<4;
+            }
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][THR], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][THR], subflags1, SETUP_MBOX_THR);
+            break;
+        case 0x0E:
+            flags |= (event->eventId & 0x1FULL)<<9;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][THR], &subflags1));
+            subflags1 |= (event->umask & 0x3ULL)<<7;
+            if (event->cmask == 0x0)
+            {
+                subflags1 |= (1ULL<<3);
+            }
+            else
             {
-                RBOX_GATE(0);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, RBOX0_CTRL)
+                subflags1 &= ~(1ULL<<3);
+                subflags1 |= (event->cmask & 0x7ULL)<<4;
             }
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_mbox_regs[number][THR], subflags1));
+            VERBOSEPRINTREG(cpu_id, nex_wex_mbox_regs[number][THR], subflags1, SETUP_MBOX_THR);
             break;
+    }
+    if (flags != currentConfig[cpu_id][index])
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_MBOX)
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
 
-        case RBOX1:
-            if (haveLock)
+/* Program one RBOX counter: write the IPERF (eventId 0x00) or QLX (eventId 0x01) sub-register chosen by event->umask, then the counter's config register if its value changed. Returns 0, or -1 if the config register is not in a known RBOX range. */
+int wex_rbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint64_t flags = 0x01ULL;
+    uint64_t subflags = 0x0ULL;
+    int number;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU recorded in socket_lock for this socket continues */
+    {
+        return 0;
+    }
+    if ((counter_map[index].configRegister & 0xFF0) == 0xE10)
+        number = 0;
+    else if ((counter_map[index].configRegister & 0xFF0) == 0xE30)
+        number = 1;
+    else return -1; /* neither 0xE1x nor 0xE3x: never index the register table with an unset box number */
+    switch (event->eventId) {
+        case 0x00:
+            flags |= (event->umask & 0x1FULL)<<1;
+            subflags |= (event->cfgBits<<event->cmask);
+            switch (event->umask)
             {
-                RBOX_GATE(1);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, RBOX1_CTRL)
+                case 0x00:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF0][0], subflags));
+                    break;
+                case 0x01:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF1][0], subflags));
+                    break;
+                case 0x06:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF0][1], subflags));
+                    break;
+                case 0x07:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF1][1], subflags));
+                    break;
+                case 0x0C:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF0][2], subflags));
+                    break;
+                case 0x0D:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF1][2], subflags));
+                    break;
+                case 0x12:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF0][3], subflags));
+                    break;
+                case 0x13:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][IPERF1][3], subflags));
+                    break;
             }
             break;
+        case 0x01:
+            flags |= (event->umask & 0x1FULL)<<1;
+            subflags |= (event->cfgBits & 0xFULL);
+            if (event->cmask != 0x0)
+            {
+                subflags |= (event->cmask & 0xFULL)<<4;
+            }
+            switch (event->umask) /* each umask pair shares one QLX register; the odd umask writes the value shifted up by 8 bits */
+            {
+                case 0x02:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][0], subflags));
+                    break;
+                case 0x03:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][0], (subflags<<8)));
+                    break;
+                case 0x08:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][1], subflags));
+                    break;
+                case 0x09:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][1], (subflags<<8)));
+                    break;
+                case 0x0E:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][2], subflags));
+                    break;
+                case 0x0F:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][2], (subflags<<8)));
+                    break;
+                case 0x14:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][3], subflags));
+                    break;
+                case 0x15:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, nex_wex_rbox_regs[number][QLX][3], (subflags<<8)));
+                    break;
+            }
+            break;
+    }
+    if (flags != currentConfig[cpu_id][index]) /* skip the MSR write when the cached config already matches */
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_RBOX)
+        currentConfig[cpu_id][index] = flags;
+    }
+    return 0;
+}
+
+/* Clear bit 28 of MSR_U_PMON_GLOBAL_CTRL when the event set uses register types above the low four, then act on the FREEZE_FLAG_CLEAR_CTR / FREEZE_FLAG_CLEAR_CTL request in flags. Does nothing unless cpu_id is the CPU recorded in socket_lock. */
+int wex_uncore_freeze(int cpu_id, PerfmonEventSet* eventSet, int flags)
+{
+    uint64_t freeze_flags = 0x0ULL;
 
-        case WBOX:
-            if (haveLock)
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (eventSet->regTypeMask & ~(0xF)) /* any register-type bit other than the lowest four */
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &freeze_flags));
+        freeze_flags &= ~(1ULL<<28); /* read-modify-write: only bit 28 is cleared */
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST freeze_flags, FREEZE_UNCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, freeze_flags));
+    }
+    if (flags != FREEZE_FLAG_ONLYFREEZE)
+    {
+        if (flags & FREEZE_FLAG_CLEAR_CTR)
+        {
+            uint64_t clear_flags = 0x0ULL;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &clear_flags));
+            clear_flags |= 29; /* NOTE(review): ORs in decimal 29 (0x1D), not (1ULL<<29) -- confirm which was intended */
+            VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST freeze_flags, CLEAR_UNCORE_CTR);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, freeze_flags)); /* NOTE(review): writes freeze_flags, so clear_flags computed above is never used -- confirm */
+        }
+        else if (flags & FREEZE_FLAG_CLEAR_CTL)
+        {
+            int ret = 0;
+            for (int i=0;i < eventSet->numberOfEvents;i++) /* zero the config register of every event in the set */
             {
-                if (event->eventId == 0xFF)  /* Fixed Counter */
+                uint32_t reg = counter_map[eventSet->events[i].index].configRegister;
+                if (reg != 0x0ULL)
                 {
-                    flags = 0x1ULL; /* set enable bit */
+                    ret = HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL);
+                    if (ret != 0) /* failed write: skip the verbose print and move on to the next event */
+                        continue;
+                    VERBOSEPRINTREG(cpu_id, reg, 0x0ULL, CLEAR_UNCORE_CTL);
                 }
-                else
+            }
+        }
+
+    }
+    return 0;
+}
+/* Act on the FREEZE_FLAG_CLEAR_CTR / FREEZE_FLAG_CLEAR_CTL request in flags, then set bit 28 of MSR_U_PMON_GLOBAL_CTRL when the event set uses register types above the low four. Does nothing unless cpu_id is the CPU recorded in socket_lock. */
+int wex_uncore_unfreeze(int cpu_id, PerfmonEventSet* eventSet, int flags)
+{
+    uint64_t unfreeze_flags = 0x0ULL;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (flags != FREEZE_FLAG_ONLYFREEZE)
+    {
+        if (flags & FREEZE_FLAG_CLEAR_CTR)
+        {
+            uint64_t clear_flags = 0x0ULL;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &clear_flags));
+            clear_flags |= 29; /* NOTE(review): ORs in decimal 29 (0x1D), not (1ULL<<29) -- confirm which was intended */
+            VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST clear_flags, CLEAR_UNCORE_CTR);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, clear_flags));
+        }
+        else if (flags & FREEZE_FLAG_CLEAR_CTL)
+        {
+            for (int i=0;i < eventSet->numberOfEvents;i++) /* zero the config register of every event in the set */
+            {
+                uint32_t reg = counter_map[eventSet->events[i].index].configRegister;
+                if (reg != 0x0ULL)
                 {
-                    flags |= (1<<22); /* set enable bit */
-                    flags |= (event->umask<<8) + event->eventId;
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL)); /* unlike the freeze variant, a failed write returns from the function via the macro */
+                    VERBOSEPRINTREG(cpu_id, reg, 0x0ULL, CLEAR_UNCORE_CTL);
                 }
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, WBOX_CTRL)
             }
-            break;
+        }
+    }
+    if (eventSet->regTypeMask & ~(0xF)) /* any register-type bit other than the lowest four */
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &unfreeze_flags));
+        unfreeze_flags |= (1ULL<<28); /* read-modify-write: only bit 28 is set */
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST unfreeze_flags, UNFREEZE_UNCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, unfreeze_flags));
+    }
+    return 0;
+}
+/* Writes 0xFFFFFFFF to box `id`'s overflow register when haveLock is set and the event set uses that box. Expands to a bare if-block that takes haveLock, eventSet and cpu_id from the expansion site. */
+#define WEX_RESET_OVF_BOX(id) \
+    if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(id))) \
+    { \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ovflRegister, 0xFFFFFFFF)); \
+    }
+
+/* Configure every event of eventSet on the CPU of thread_id: zero the global control registers and the listed MBOX/RBOX sub-registers, run the per-box setup helper for each event, then write the collected per-box, fixed-counter and UBOX bits. Returns 0 unless an MSR macro returns early. */
+int perfmon_setupCounterThread_westmereEX(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    uint64_t flags = 0x0ULL;
+    uint64_t fixed_flags = 0x0ULL;
+    uint64_t ubox_flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint32_t uflags[NUM_UNITS] = { [0 ... NUM_UNITS-1] = 0x0U }; /* per-box bitmask of the counters used by this event set */
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL));
+    }
+
+    if (haveLock && (eventSet->regTypeMask & ~(0xF)))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL));
+    }
+
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX0))))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_TIMESTAMP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_DSP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_ISS, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_MAP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_MSC_THR, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_PGT, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_PLD, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_ZDP, 0x0ULL));
+    }
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX1))))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_TIMESTAMP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_DSP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_ISS, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_MAP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_MSC_THR, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_PGT, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_PLD, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_ZDP, 0x0ULL));
+    }
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(RBOX0))))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P3, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P3, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P3, 0x0ULL));
+    }
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(RBOX1))))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P3, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P3, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P3, 0x0ULL));
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        uint64_t reg = counter_map[index].configRegister;
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
+        flags = 0x0ULL;
+        switch (type)
+        {
+            case PMC:
+                wex_pmc_setup(cpu_id, index, event);
+                break;
+
+            case FIXED:
+                fixed_flags |= wex_fixed_setup(cpu_id, index, event);
+                break;
+
+            case MBOX0:
+            case MBOX1:
+                wex_mbox_setup(cpu_id, index, event);
+                break;
+
+            case BBOX0:
+            case BBOX1:
+                wex_bbox_setup(cpu_id, index, event);
+                break;
 
-        case UBOX:
-            if (haveLock)
-            {
-                flags = 0x0ULL;
-                flags |= (1<<22);
-                flags |= (event->eventId);
-                msr_write(cpu_id, reg , flags);
-            }
-
-        case CBOX0:
-        case CBOX1:
-        case CBOX2:
-        case CBOX3:
-        case CBOX4:
-        case CBOX5:
-        case CBOX6:
-        case CBOX7:
-        case CBOX8:
-        case CBOX9:
-        case SBOX0:
-        case SBOX1:
-            if (haveLock)
-            {
-                flags = 0x0ULL;
-                flags |= (1<<22);
-                flags |= (event->umask<<8);
-                flags |= (event->eventId);
-                msr_write(cpu_id, reg , flags);
+            case RBOX0:
+            case RBOX1:
+                wex_rbox_setup(cpu_id, index, event);
+                break;
+
+            case WBOX:
+                wex_wbox_setup(cpu_id, index, event);
+                break;
+
+            case CBOX0:
+            case CBOX1:
+            case CBOX2:
+            case CBOX3:
+            case CBOX4:
+            case CBOX5:
+            case CBOX6:
+            case CBOX7:
+            case CBOX8:
+            case CBOX9:
+                wex_cbox_setup(cpu_id, index, event);
+                break;
+
+            case WBOX0FIX:
+                if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(WBOX0FIX)))
+                {
+                    flags = 0x1;
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags));
+                    VERBOSEPRINTREG(cpu_id, reg, LLU_CAST flags, WBOX0FIX_CTRL);
+                    eventSet->regTypeMask |= REG_TYPE_MASK(WBOX); /* the fixed counter is enabled through the WBOX entry of uflags below */
+                }
+                break;
+
+            case UBOX:
+                wex_ubox_setup(cpu_id, index, event);
+                ubox_flags = 0x1ULL;
+                break; /* do not fall through: the SBOX helper below must not run for a UBOX event */
+            case SBOX0:
+            case SBOX1:
+                wex_sbox_setup(cpu_id, index, event);
+                break;
+            default:
+                break;
+        }
+        if (type != WBOX0FIX)
+        {
+            uflags[type] |= (1U<<getCounterTypeOffset(index));
+        }
+        else
+        {
+            uflags[WBOX] |= (1U<<31); /* unsigned literal: 1<<31 overflows a signed int */
+        }
+    }
+
+    if (haveLock && (eventSet->regTypeMask & ~(0xF)))
+    {
+        for ( int i=0; i<NUM_UNITS; i++ )
+        {
+            if ((uflags[i] != 0x0ULL) && (i != WBOX0FIX))
+            {
+                VERBOSEPRINTPCIREG(cpu_id, MSR_DEV, box_map[i].ctrlRegister, uflags[i], CLEAR_CTL);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[i].ctrlRegister, uflags[i]));
+                VERBOSEPRINTPCIREG(cpu_id, MSR_DEV, box_map[i].ovflRegister, uflags[i], CLEAR_OVF_CTL);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[i].ovflRegister, uflags[i]));
             }
-            break;
+        }
+    }
 
-        default:
-            /* should never be reached */
-            break;
+    if (fixed_flags != 0x0ULL)
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
     }
+    if (ubox_flags != 0x0ULL)
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST ubox_flags, ACTIVATE_UBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, ubox_flags));
+    }
+    return 0;
 }
 
 /* Actions for Performance Monitoring Session:
@@ -777,167 +1024,323 @@ void perfmon_setupCounterThread_westmereEX(
  * 3) Set enable bit in global U Box control register
  * */
 
-void perfmon_startCountersThread_westmereEX(int thread_id)
+/* Zero the PMC/FIXED counter registers of this thread's initialised events, call wex_uncore_unfreeze(), then write the collected enable bits to MSR_PERF_GLOBAL_CTRL. Returns 0 unless an MSR macro returns early. */
+int perfmon_startCountersThread_westmereEX(int thread_id, PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
-    uint64_t flags = 0x0ULL;
-    uint32_t uflags[NUM_UNITS];
-    int enable_ubox = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    uint64_t core_ctrl_flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
     if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
-        uint32_t ubflags = 0x0UL;
-        ubflags |= (1<<29); /* reset all */
         haveLock = 1;
-        //        msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
-        //       VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags, UBOX_GLOBAL_CTRL)
     }
 
-    for ( int i=0; i<NUM_UNITS; i++ )
-    {
-        uflags[i] = 0x0UL;
-    }
+    //wex_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_CLEAR_CTR);
 
-    for ( int i=0; i<NUM_PMC; i++ ) 
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE) {
-            if (westmereEX_counter_map[i].type == PMC)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) 
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
-                flags |= (1<<(i-OFFSET_PMC));  /* enable counter */
+                continue;
             }
-            else if (westmereEX_counter_map[i].type == FIXED)
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            eventSet->events[i].threadCounter[thread_id].startData = 0;
+            eventSet->events[i].threadCounter[thread_id].counterData = 0;
+            switch (type) /* only core counters are zeroed here; other types take the default branch */
             {
-                msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
-                flags |= (1ULL<<(i+32));  /* enable fixed counter */
-            }
-            else if (westmereEX_counter_map[i].type > UNCORE)
-            {
-                if(haveLock)
-                {
-                    msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
-                    uflags[westmereEX_counter_map[i].type] |=
-                        (1<<(perfmon_threadData[thread_id].counters[i].id));  /* enable uncore counter */
-                    if (westmereEX_counter_map[i].type == UBOX)
-                    {
-                        enable_ubox = 1;
-                    }
-                }
+                case PMC:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+                    core_ctrl_flags |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr)); /* bit position is index minus cpuid_info.perf_num_fixed_ctr */
+                    break;
+                case FIXED:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+                    core_ctrl_flags |= (1ULL<<(index+32)); /* fixed counters use bits 32 and up */
+                    break;
+                default:
+                    break;
             }
         }
     }
 
-    VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, GLOBAL_CTRL);
 
-    if (haveLock)
+    wex_uncore_unfreeze(cpu_id, eventSet, FREEZE_FLAG_CLEAR_CTR);
+
+    /* Finally enable counters */
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST core_ctrl_flags, GLOBAL_CTRL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, core_ctrl_flags));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|core_ctrl_flags)); /* same counter bits plus bits 62 and 63 */
+    }
+    return 0;
+}
+/* If the value just read is below the stored counterData, read box `id`'s status register and, when bit `offset` is set, count an overflow and write that bit to the overflow register. Uses counter_result, eventSet, i, thread_id and cpu_id from the expansion site. */
+#define WEX_CHECK_OVERFLOW(id, offset) \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+    { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, box_map[id].statusRegister, &tmp)); \
+        if (tmp & (1ULL<<offset)) \
+        { \
+            eventSet->events[i].threadCounter[thread_id].overflows++; \
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ovflRegister, (1ULL<<offset))); \
+        } \
+    }
+/* Writes (1<<offset) to box `id`'s control register; the expansion already ends in ';'. NOTE(review): the name says overflow but the target is ctrlRegister, not ovflRegister -- confirm. */
+#define WEX_CLEAR_OVERFLOW(id, offset) \
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, (1<<offset)));
+
+/* Uncore variant of the overflow check: SBOX0/SBOX1/WBOX/UBOX first test and acknowledge their bit in the global status register; every other box goes straight to its own status register. Uses counter_result, eventSet, i, thread_id and cpu_id from the expansion site. */
+#define WEX_CHECK_UNCORE_OVERFLOW(id, offset) \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+    { \
+        uint64_t tmp = 0x0ULL; \
+        int check_local = 0; \
+        if ((id == SBOX0) || (id == SBOX1) || (id == WBOX) || (id == UBOX)) \
+        { \
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_STATUS, &tmp)); \
+            int gl_offset = -1; \
+            switch (id) \
+            { \
+                case UBOX: \
+                    gl_offset = 0; \
+                    break; \
+                case WBOX: \
+                    gl_offset = 1; \
+                    break; \
+                case SBOX1: \
+                    gl_offset = 2; \
+                    break; \
+                case SBOX0: \
+                    gl_offset = 3; \
+                    break; \
+                default: \
+                    break; \
+            } \
+            if ((gl_offset != -1) && (tmp & (1ULL<<gl_offset))) \
+            { \
+                check_local = 1; \
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_OVF_CTRL, (1ULL<<gl_offset))); \
+            } \
+        } \
+        else \
+        { \
+            check_local = 1; \
+        } \
+        if (check_local) \
+        { \
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, box_map[id].statusRegister, &tmp)); \
+            if (tmp & (1ULL<<offset)) \
+            { \
+                eventSet->events[i].threadCounter[thread_id].overflows++; \
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ovflRegister, (1ULL<<offset))); \
+            } \
+        } \
+    }
+/* Write 0 to MSR_PERF_GLOBAL_CTRL, call wex_uncore_freeze(), then read every initialised event's counter into counterData (uncore counters only when this CPU is the one in socket_lock), bumping overflows when a wrap is detected. Returns 0 unless an MSR macro returns early. */
+int perfmon_stopCountersThread_westmereEX(int thread_id, PerfmonEventSet* eventSet)
+{
+    int i;
+    int haveLock = 0;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
-        for ( int i=0; i<NUM_UNITS; i++ )
+        haveLock = 1;
+    }
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, GLOBAL_CTRL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    wex_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_CLEAR_CTL);
+
+    for (i = 0; i < eventSet->numberOfEvents; i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            /* if counters are enabled write the according box ctrl register */
-            if (uflags[i]) 
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                msr_write(cpu_id, westmereEX_PMunits[i].ctrlRegister, uflags[i]);
-                VERBOSEPRINTREG(cpu_id, westmereEX_PMunits[i].ctrlRegister, LLU_CAST uflags[i], BOXCTRL);
+                continue;
             }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            switch (type)
+            {
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    WEX_CHECK_OVERFLOW(PMC, index-cpuid_info.perf_num_fixed_ctr);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC);
+                    break;
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    WEX_CHECK_OVERFLOW(PMC, index+32);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED);
+                    break;
+                default:
+                    if(haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(type)))
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                        WEX_CHECK_UNCORE_OVERFLOW(type, getCounterTypeOffset(index)); /* bit position within the box, not the global register index */
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_UNCORE);
+                    }
+                    break;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth); /* keep only the low regWidth bits of the raw value */
         }
-
-        /* set global enable flag in U BOX ctrl register */
-        uint32_t ubflags = 0x0UL;
-        ubflags |= (1<<28); /* enable all */
-        if (enable_ubox)
-        {
-            ubflags |= (1<<0);
-        }
-        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST ubflags, UBOX_GLOBAL_CTRL);
-        msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
     }
-    /* Finally enable counters */
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
+
+    return 0;
 }
 
-void perfmon_stopCountersThread_westmereEX(int thread_id)
+int perfmon_readCountersThread_westmereEX(int thread_id, PerfmonEventSet* eventSet) /* Read every initialized counter of eventSet for thread_id into its counterData; returns 0 at the end. */
 {
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t counter_result = 0x0ULL; /* raw value of the counter read in the current loop iteration */
+    uint64_t core_ctrl_flags = 0x0ULL; /* MSR_PERF_GLOBAL_CTRL as found on entry; written back before returning */
 
     if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
-        uint32_t ubflags = 0x0UL;
         haveLock = 1;
-        //        ubflags |= (1<<29); /* reset all */
-        msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
     }
 
-    for ( int i=0; i<NUM_COUNTERS_WESTMEREEX; i++ ) 
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED))) /* core control is only touched when the set contains PMC or FIXED events */
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE) 
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &core_ctrl_flags)); /* remember the current enable flags */
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST core_ctrl_flags, SAFE_PMC_FLAGS)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL)); /* clear the global control register while the counters are read */
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, RESET_PMC_FLAGS)
+    }
+    wex_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_ONLYFREEZE); /* NOTE(review): presumably halts the uncore counters for the read -- helper is defined outside this hunk, confirm */
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if (westmereEX_counter_map[i].type > UNCORE)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip events whose unit type is not enabled in the set's mask */
+            {
+                continue;
+            }
+            counter_result = 0x0ULL; /* reset so a skipped read (no socket lock) stores 0 below */
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            if (type > UNCORE) /* unit types above UNCORE are read only by the thread that holds the socket lock */
             {
                 if(haveLock)
                 {
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
-
-                    VERBOSEPRINTREG(cpu_id, westmereEX_counter_map[i].counterRegister,
-                            LLU_CAST perfmon_threadData[thread_id].counters[i].counterData, READ_UNCORE);
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    WEX_CHECK_UNCORE_OVERFLOW(counter_map[index].type, getCounterTypeOffset(index));
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_UNCORE);
                 }
             }
-            else
+            else if (type == FIXED)
             {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
-
-                VERBOSEPRINTREG(cpu_id, westmereEX_counter_map[i].counterRegister,
-                        LLU_CAST perfmon_threadData[thread_id].counters[i].counterData, READ_CORE);
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                WEX_CHECK_OVERFLOW(PMC, index+32); /* fixed counters are checked at bit offset index+32 */
+                VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED);
+            }
+            else if (type == PMC)
+            {
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                WEX_CHECK_OVERFLOW(PMC, index-cpuid_info.perf_num_fixed_ctr); /* PMC bit offset is the index with the fixed counters subtracted */
+                VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC);
             }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth); /* NOTE(review): presumably keeps only the low regWidth bits -- confirm field64 semantics */
         }
     }
 
-#if 0
-    flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
-    printf ("Status: 0x%llX \n", LLU_CAST flags);
-    if((flags & 0x3) || (flags & (0x3ULL<<32)) ) 
+    wex_uncore_unfreeze(cpu_id, eventSet, FREEZE_FLAG_ONLYFREEZE); /* counterpart of the wex_uncore_freeze call above */
+    if ((eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED))) && (core_ctrl_flags != 0x0ULL)) /* nothing to restore if the register was already 0 on entry */
     {
-        printf ("Overflow occured \n");
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST core_ctrl_flags, RESTORE_PMC_FLAGS)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, core_ctrl_flags)); /* write back the flags saved on entry */
     }
-#endif
+    return 0;
 }
 
-void perfmon_readCountersThread_westmereEX(int thread_id)
+/* Clear the configuration registers of every event in eventSet for thread_id, mark the events uninitialized and reset the global control/overflow registers; returns 0 at the end. */
+int perfmon_finalizeCountersThread_westmereEX(int thread_id, PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int haveTileLock = 0;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62); /* bits 62/63 are always written to MSR_PERF_GLOBAL_OVF_CTRL; NOTE(review): presumably the OvfBuf/CondChgd clear bits -- confirm against the SDM */
 
     if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id) /* only the tile-lock holder clears the OFFCORE_RESP registers below */
+    {
+        haveTileLock = 1;
+    }
 
-    for ( int i=0; i<NUM_COUNTERS_WESTMEREEX; i++ ) 
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE) 
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip events whose unit type is not enabled in the set's mask */
         {
-            if (westmereEX_counter_map[i].type > UNCORE)
-            {
-                if(haveLock)
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        uint64_t reg = counter_map[index].configRegister;
+        PciDeviceIndex dev = counter_map[index].device;
+        switch (type)
+        {
+            case PMC:
+                ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr)); /* collect this PMC's bit for the overflow-control write after the loop */
+                if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7)) /* event 0xB7 is paired with MSR_OFFCORE_RESP0 */
                 {
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
                 }
-            }
-            else
-            {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
-            }
+                else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB)) /* event 0xBB is paired with MSR_OFFCORE_RESP1 */
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+                }
+                break;
+            case FIXED:
+                ovf_values_core |= (1ULL<<(index+32)); /* fixed counters occupy bit index+32 of the overflow-control value */
+                break;
+            default:
+                if (((haveLock) && (type > UNCORE))) /* unit types above UNCORE are cleared only by the socket-lock holder */
+                {
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
+                }
+                break;
+        }
+        if ((reg) && (((dev == MSR_DEV) && (type < UNCORE)) || (((haveLock) && (type > UNCORE))))) /* NOTE(review): for uncore units this zeroes reg a second time (default case above already wrote it) -- confirm whether intentional */
+        {
+            VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
         }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE; /* done for every event that passed the mask check, regardless of lock ownership */
     }
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED))) /* core part: clear global control, then write the collected overflow bits */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, CLEAR_PMC_AND_FIXED_CTL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core, CLEAR_PMC_AND_FIXED_OVERFLOW);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+    }
+    if (haveLock && (eventSet->regTypeMask & ~(0xF))) /* NOTE(review): ~(0xF) selects every type bit above the lowest four -- presumably "any uncore unit"; confirm against the RegisterType enum */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL, CLEAR_UNCORE_CTL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL));
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_OVF_CTRL, 0x0ULL, CLEAR_UNCORE_OVERFLOW);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_OVF_CTRL, 0x0ULL));
+    }
+    return 0;
 }
-
diff --git a/src/includes/perfmon_westmereEX_counters.h b/src/includes/perfmon_westmereEX_counters.h
index fd65746..85e4c6d 100644
--- a/src/includes/perfmon_westmereEX_counters.h
+++ b/src/includes/perfmon_westmereEX_counters.h
@@ -5,13 +5,14 @@
  *
  *      Description: Counter Header File of perfmon module for Westmere EX.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,125 +30,170 @@
  */
 
 #define NUM_COUNTERS_CORE_WESTMEREEX 7
-#define NUM_COUNTERS_UNCORE_WESTMEREEX 107
-#define NUM_COUNTERS_WESTMEREEX 107
+#define NUM_COUNTERS_UNCORE_WESTMEREEX 117
+#define NUM_COUNTERS_WESTMEREEX 117
 
-static PerfmonCounterMap westmereEX_counter_map[NUM_COUNTERS_WESTMEREEX] = {
+#define WEX_VALID_OPTIONS_FIXED (EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_ANYTHREAD_MASK) /* Per-unit masks of accepted event options; parenthesized so they expand as one value inside larger expressions. */
+#define WEX_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_THRESHOLD_MASK)
+
+#define WEX_VALID_OPTIONS_MBOX (EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK)
+#define WEX_VALID_OPTIONS_BBOX (EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK)
+#define WEX_VALID_OPTIONS_CBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_EDGE_MASK)
+#define WEX_VALID_OPTIONS_SBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK)
+#define WEX_VALID_OPTIONS_WBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_EDGE_MASK)
+
+static RegisterMap westmereEX_counter_map[NUM_COUNTERS_WESTMEREEX] = { /* 117 entries (PMC0..PMC116); NOTE(review): fields look like {name, index, unit type, config register, counter register, 0, 0, valid-option mask} -- confirm order against RegisterMap */
     /* Fixed Counters: instructions retired, cycles unhalted core */
-    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
-    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
-    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, WEX_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, WEX_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, WEX_VALID_OPTIONS_FIXED},
     /* PMC Counters: 4 48bit wide */
-    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
-    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0},
-    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0},
-    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0},
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, WEX_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, WEX_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, WEX_VALID_OPTIONS_PMC},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, WEX_VALID_OPTIONS_PMC},
     /* MBOX */
-    {"MBOX0C0",PMC7, MBOX0, MSR_M0_PMON_EVNT_SEL0, MSR_M0_PMON_CTR0, 0, 0},
-    {"MBOX0C1",PMC8, MBOX0, MSR_M0_PMON_EVNT_SEL1, MSR_M0_PMON_CTR1, 0, 0},
-    {"MBOX0C2",PMC9, MBOX0, MSR_M0_PMON_EVNT_SEL2, MSR_M0_PMON_CTR2, 0, 0},
-    {"MBOX0C3",PMC10, MBOX0, MSR_M0_PMON_EVNT_SEL3, MSR_M0_PMON_CTR3, 0, 0},
-    {"MBOX0C4",PMC11, MBOX0, MSR_M0_PMON_EVNT_SEL4, MSR_M0_PMON_CTR4, 0, 0},
-    {"MBOX0C5",PMC12, MBOX0, MSR_M0_PMON_EVNT_SEL5, MSR_M0_PMON_CTR5, 0, 0},
-    {"MBOX1C0",PMC13, MBOX1, MSR_M1_PMON_EVNT_SEL0, MSR_M1_PMON_CTR0, 0, 0},
-    {"MBOX1C1",PMC14, MBOX1, MSR_M1_PMON_EVNT_SEL1, MSR_M1_PMON_CTR1, 0, 0},
-    {"MBOX1C2",PMC15, MBOX1, MSR_M1_PMON_EVNT_SEL2, MSR_M1_PMON_CTR2, 0, 0},
-    {"MBOX1C3",PMC16, MBOX1, MSR_M1_PMON_EVNT_SEL3, MSR_M1_PMON_CTR3, 0, 0},
-    {"MBOX1C4",PMC17, MBOX1, MSR_M1_PMON_EVNT_SEL4, MSR_M1_PMON_CTR4, 0, 0},
-    {"MBOX1C5",PMC18, MBOX1, MSR_M1_PMON_EVNT_SEL5, MSR_M1_PMON_CTR5, 0, 0},
+    {"MBOX0C0",PMC7, MBOX0, MSR_M0_PMON_EVNT_SEL0, MSR_M0_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX0C1",PMC8, MBOX0, MSR_M0_PMON_EVNT_SEL1, MSR_M0_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX0C2",PMC9, MBOX0, MSR_M0_PMON_EVNT_SEL2, MSR_M0_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX0C3",PMC10, MBOX0, MSR_M0_PMON_EVNT_SEL3, MSR_M0_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX0C4",PMC11, MBOX0, MSR_M0_PMON_EVNT_SEL4, MSR_M0_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX0C5",PMC12, MBOX0, MSR_M0_PMON_EVNT_SEL5, MSR_M0_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C0",PMC13, MBOX1, MSR_M1_PMON_EVNT_SEL0, MSR_M1_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C1",PMC14, MBOX1, MSR_M1_PMON_EVNT_SEL1, MSR_M1_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C2",PMC15, MBOX1, MSR_M1_PMON_EVNT_SEL2, MSR_M1_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C3",PMC16, MBOX1, MSR_M1_PMON_EVNT_SEL3, MSR_M1_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C4",PMC17, MBOX1, MSR_M1_PMON_EVNT_SEL4, MSR_M1_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C5",PMC18, MBOX1, MSR_M1_PMON_EVNT_SEL5, MSR_M1_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_MBOX},
     /* BBOX */
-    {"BBOX0C0",PMC19, BBOX0, MSR_B0_PMON_EVNT_SEL0, MSR_B0_PMON_CTR0, 0, 0},
-    {"BBOX0C1",PMC20, BBOX0, MSR_B0_PMON_EVNT_SEL1, MSR_B0_PMON_CTR1, 0, 0},
-    {"BBOX0C2",PMC21, BBOX0, MSR_B0_PMON_EVNT_SEL2, MSR_B0_PMON_CTR2, 0, 0},
-    {"BBOX0C3",PMC22, BBOX0, MSR_B0_PMON_EVNT_SEL3, MSR_B0_PMON_CTR3, 0, 0},
-    {"BBOX1C0",PMC23, BBOX1, MSR_B1_PMON_EVNT_SEL0, MSR_B1_PMON_CTR0, 0, 0},
-    {"BBOX1C1",PMC24, BBOX1, MSR_B1_PMON_EVNT_SEL1, MSR_B1_PMON_CTR1, 0, 0},
-    {"BBOX1C2",PMC25, BBOX1, MSR_B1_PMON_EVNT_SEL2, MSR_B1_PMON_CTR2, 0, 0},
-    {"BBOX1C3",PMC26, BBOX1, MSR_B1_PMON_EVNT_SEL3, MSR_B1_PMON_CTR3, 0, 0},
+    {"BBOX0C0",PMC19, BBOX0, MSR_B0_PMON_EVNT_SEL0, MSR_B0_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_BBOX},
+    {"BBOX0C1",PMC20, BBOX0, MSR_B0_PMON_EVNT_SEL1, MSR_B0_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_BBOX},
+    {"BBOX0C2",PMC21, BBOX0, MSR_B0_PMON_EVNT_SEL2, MSR_B0_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_BBOX},
+    {"BBOX0C3",PMC22, BBOX0, MSR_B0_PMON_EVNT_SEL3, MSR_B0_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_BBOX},
+    {"BBOX1C0",PMC23, BBOX1, MSR_B1_PMON_EVNT_SEL0, MSR_B1_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_BBOX},
+    {"BBOX1C1",PMC24, BBOX1, MSR_B1_PMON_EVNT_SEL1, MSR_B1_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_BBOX},
+    {"BBOX1C2",PMC25, BBOX1, MSR_B1_PMON_EVNT_SEL2, MSR_B1_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_BBOX},
+    {"BBOX1C3",PMC26, BBOX1, MSR_B1_PMON_EVNT_SEL3, MSR_B1_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_BBOX},
     /* RBOX */
-    {"RBOX0C0",PMC27, RBOX0, MSR_R0_PMON_EVNT_SEL0, MSR_R0_PMON_CTR0, 0, 0},
-    {"RBOX0C1",PMC28, RBOX0, MSR_R0_PMON_EVNT_SEL1, MSR_R0_PMON_CTR1, 0, 0},
-    {"RBOX0C2",PMC29, RBOX0, MSR_R0_PMON_EVNT_SEL2, MSR_R0_PMON_CTR2, 0, 0},
-    {"RBOX0C3",PMC30, RBOX0, MSR_R0_PMON_EVNT_SEL3, MSR_R0_PMON_CTR3, 0, 0},
-    {"RBOX0C4",PMC31, RBOX0, MSR_R0_PMON_EVNT_SEL4, MSR_R0_PMON_CTR4, 0, 0},
-    {"RBOX0C5",PMC32, RBOX0, MSR_R0_PMON_EVNT_SEL5, MSR_R0_PMON_CTR5, 0, 0},
-    {"RBOX0C6",PMC33, RBOX0, MSR_R0_PMON_EVNT_SEL6, MSR_R0_PMON_CTR6, 0, 0},
-    {"RBOX0C7",PMC34, RBOX0, MSR_R0_PMON_EVNT_SEL7, MSR_R0_PMON_CTR7, 0, 0},
-    {"RBOX1C0",PMC35, RBOX1, MSR_R1_PMON_EVNT_SEL8, MSR_R1_PMON_CTR8, 0, 0},
-    {"RBOX1C1",PMC36, RBOX1, MSR_R1_PMON_EVNT_SEL9, MSR_R1_PMON_CTR9, 0, 0},
-    {"RBOX1C2",PMC37, RBOX1, MSR_R1_PMON_EVNT_SEL10, MSR_R1_PMON_CTR10, 0, 0},
-    {"RBOX1C3",PMC38, RBOX1, MSR_R1_PMON_EVNT_SEL11, MSR_R1_PMON_CTR11, 0, 0},
-    {"RBOX1C4",PMC39, RBOX1, MSR_R1_PMON_EVNT_SEL12, MSR_R1_PMON_CTR12, 0, 0},
-    {"RBOX1C5",PMC40, RBOX1, MSR_R1_PMON_EVNT_SEL13, MSR_R1_PMON_CTR13, 0, 0},
-    {"RBOX1C6",PMC41, RBOX1, MSR_R1_PMON_EVNT_SEL14, MSR_R1_PMON_CTR14, 0, 0},
-    {"RBOX1C7",PMC42, RBOX1, MSR_R1_PMON_EVNT_SEL15, MSR_R1_PMON_CTR15, 0, 0},
+    {"RBOX0C0",PMC27, RBOX0, MSR_R0_PMON_EVNT_SEL0, MSR_R0_PMON_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C1",PMC28, RBOX0, MSR_R0_PMON_EVNT_SEL1, MSR_R0_PMON_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C2",PMC29, RBOX0, MSR_R0_PMON_EVNT_SEL2, MSR_R0_PMON_CTR2, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C3",PMC30, RBOX0, MSR_R0_PMON_EVNT_SEL3, MSR_R0_PMON_CTR3, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C4",PMC31, RBOX0, MSR_R0_PMON_EVNT_SEL4, MSR_R0_PMON_CTR4, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C5",PMC32, RBOX0, MSR_R0_PMON_EVNT_SEL5, MSR_R0_PMON_CTR5, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C6",PMC33, RBOX0, MSR_R0_PMON_EVNT_SEL6, MSR_R0_PMON_CTR6, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C7",PMC34, RBOX0, MSR_R0_PMON_EVNT_SEL7, MSR_R0_PMON_CTR7, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C0",PMC35, RBOX1, MSR_R1_PMON_EVNT_SEL8, MSR_R1_PMON_CTR8, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C1",PMC36, RBOX1, MSR_R1_PMON_EVNT_SEL9, MSR_R1_PMON_CTR9, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C2",PMC37, RBOX1, MSR_R1_PMON_EVNT_SEL10, MSR_R1_PMON_CTR10, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C3",PMC38, RBOX1, MSR_R1_PMON_EVNT_SEL11, MSR_R1_PMON_CTR11, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C4",PMC39, RBOX1, MSR_R1_PMON_EVNT_SEL12, MSR_R1_PMON_CTR12, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C5",PMC40, RBOX1, MSR_R1_PMON_EVNT_SEL13, MSR_R1_PMON_CTR13, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C6",PMC41, RBOX1, MSR_R1_PMON_EVNT_SEL14, MSR_R1_PMON_CTR14, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C7",PMC42, RBOX1, MSR_R1_PMON_EVNT_SEL15, MSR_R1_PMON_CTR15, 0, 0, EVENT_OPTION_NONE_MASK},
     /* WBOX */
-    {"WBOX0",PMC43, WBOX, MSR_W_PMON_EVNT_SEL0, MSR_W_PMON_CTR0, 0, 0},
-    {"WBOX1",PMC44, WBOX, MSR_W_PMON_EVNT_SEL1, MSR_W_PMON_CTR1, 0, 0},
-    {"WBOX2",PMC45, WBOX, MSR_W_PMON_EVNT_SEL2, MSR_W_PMON_CTR2, 0, 0},
-    {"WBOX3",PMC46, WBOX, MSR_W_PMON_EVNT_SEL3, MSR_W_PMON_CTR3, 0, 0},
-    {"WBOX4",PMC47, WBOX, MSR_W_PMON_FIXED_CTR_CTL, MSR_W_PMON_FIXED_CTR, 0, 0},
+    {"WBOX0",PMC43, WBOX, MSR_W_PMON_EVNT_SEL0, MSR_W_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_WBOX},
+    {"WBOX1",PMC44, WBOX, MSR_W_PMON_EVNT_SEL1, MSR_W_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_WBOX},
+    {"WBOX2",PMC45, WBOX, MSR_W_PMON_EVNT_SEL2, MSR_W_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_WBOX},
+    {"WBOX3",PMC46, WBOX, MSR_W_PMON_EVNT_SEL3, MSR_W_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_WBOX},
+    {"WBOXFIX",PMC47, WBOX0FIX, MSR_W_PMON_FIXED_CTR_CTL, MSR_W_PMON_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK}, /* formerly "WBOX4" of type WBOX; the fixed WBOX counter now has its own unit type */
     /* UBOX */
-    {"UBOX0",PMC48, UBOX, MSR_U_PMON_GLOBAL_EVNT_SEL, MSR_U_PMON_GLOBAL_CTR, 0, 0},
+    {"UBOX0",PMC48, UBOX, MSR_U_PMON_GLOBAL_EVNT_SEL, MSR_U_PMON_GLOBAL_CTR, 0, 0, EVENT_OPTION_EDGE_MASK},
     /* CBOXes */
-    {"CBOX0C0",PMC49, CBOX0, MSR_C0_PMON_EVNT_SEL0, MSR_C0_PMON_CTR0, 0, 0},
-    {"CBOX0C1",PMC50, CBOX0, MSR_C0_PMON_EVNT_SEL1, MSR_C0_PMON_CTR1, 0, 0},
-    {"CBOX0C2",PMC51, CBOX0, MSR_C0_PMON_EVNT_SEL2, MSR_C0_PMON_CTR2, 0, 0},
-    {"CBOX0C3",PMC52, CBOX0, MSR_C0_PMON_EVNT_SEL3, MSR_C0_PMON_CTR3, 0, 0},
-    {"CBOX0C4",PMC53, CBOX0, MSR_C0_PMON_EVNT_SEL4, MSR_C0_PMON_CTR4, 0, 0},
-    {"CBOX1C0",PMC54, CBOX1, MSR_C1_PMON_EVNT_SEL0, MSR_C1_PMON_CTR0, 0, 0},
-    {"CBOX1C1",PMC55, CBOX1, MSR_C1_PMON_EVNT_SEL1, MSR_C1_PMON_CTR1, 0, 0},
-    {"CBOX1C2",PMC56, CBOX1, MSR_C1_PMON_EVNT_SEL2, MSR_C1_PMON_CTR2, 0, 0},
-    {"CBOX1C3",PMC57, CBOX1, MSR_C1_PMON_EVNT_SEL3, MSR_C1_PMON_CTR3, 0, 0},
-    {"CBOX1C4",PMC58, CBOX1, MSR_C1_PMON_EVNT_SEL4, MSR_C1_PMON_CTR4, 0, 0},
-    {"CBOX2C0",PMC59, CBOX2, MSR_C2_PMON_EVNT_SEL0, MSR_C2_PMON_CTR0, 0, 0},
-    {"CBOX2C1",PMC60, CBOX2, MSR_C2_PMON_EVNT_SEL1, MSR_C2_PMON_CTR1, 0, 0},
-    {"CBOX2C2",PMC61, CBOX2, MSR_C2_PMON_EVNT_SEL2, MSR_C2_PMON_CTR2, 0, 0},
-    {"CBOX2C3",PMC62, CBOX2, MSR_C2_PMON_EVNT_SEL3, MSR_C2_PMON_CTR3, 0, 0},
-    {"CBOX2C4",PMC63, CBOX2, MSR_C2_PMON_EVNT_SEL4, MSR_C2_PMON_CTR4, 0, 0},
-    {"CBOX3C0",PMC64, CBOX3, MSR_C3_PMON_EVNT_SEL0, MSR_C3_PMON_CTR0, 0, 0},
-    {"CBOX3C1",PMC65, CBOX3, MSR_C3_PMON_EVNT_SEL1, MSR_C3_PMON_CTR1, 0, 0},
-    {"CBOX3C2",PMC66, CBOX3, MSR_C3_PMON_EVNT_SEL2, MSR_C3_PMON_CTR2, 0, 0},
-    {"CBOX3C3",PMC67, CBOX3, MSR_C3_PMON_EVNT_SEL3, MSR_C3_PMON_CTR3, 0, 0},
-    {"CBOX3C4",PMC68, CBOX3, MSR_C3_PMON_EVNT_SEL4, MSR_C3_PMON_CTR4, 0, 0},
-    {"CBOX4C0",PMC69, CBOX4, MSR_C4_PMON_EVNT_SEL0, MSR_C4_PMON_CTR0, 0, 0},
-    {"CBOX4C1",PMC70, CBOX4, MSR_C4_PMON_EVNT_SEL1, MSR_C4_PMON_CTR1, 0, 0},
-    {"CBOX4C2",PMC71, CBOX4, MSR_C4_PMON_EVNT_SEL2, MSR_C4_PMON_CTR2, 0, 0},
-    {"CBOX4C3",PMC72, CBOX4, MSR_C4_PMON_EVNT_SEL3, MSR_C4_PMON_CTR3, 0, 0},
-    {"CBOX4C4",PMC73, CBOX4, MSR_C4_PMON_EVNT_SEL4, MSR_C4_PMON_CTR4, 0, 0},
-    {"CBOX5C0",PMC74, CBOX5, MSR_C5_PMON_EVNT_SEL0, MSR_C5_PMON_CTR0, 0, 0},
-    {"CBOX5C1",PMC75, CBOX5, MSR_C5_PMON_EVNT_SEL1, MSR_C5_PMON_CTR1, 0, 0},
-    {"CBOX5C2",PMC76, CBOX5, MSR_C5_PMON_EVNT_SEL2, MSR_C5_PMON_CTR2, 0, 0},
-    {"CBOX5C3",PMC77, CBOX5, MSR_C5_PMON_EVNT_SEL3, MSR_C5_PMON_CTR3, 0, 0},
-    {"CBOX5C4",PMC78, CBOX5, MSR_C5_PMON_EVNT_SEL4, MSR_C5_PMON_CTR4, 0, 0},
-    {"CBOX6C0",PMC79, CBOX6, MSR_C6_PMON_EVNT_SEL0, MSR_C6_PMON_CTR0, 0, 0},
-    {"CBOX6C1",PMC80, CBOX6, MSR_C6_PMON_EVNT_SEL1, MSR_C6_PMON_CTR1, 0, 0},
-    {"CBOX6C2",PMC81, CBOX6, MSR_C6_PMON_EVNT_SEL2, MSR_C6_PMON_CTR2, 0, 0},
-    {"CBOX6C3",PMC82, CBOX6, MSR_C6_PMON_EVNT_SEL3, MSR_C6_PMON_CTR3, 0, 0},
-    {"CBOX6C4",PMC83, CBOX6, MSR_C6_PMON_EVNT_SEL4, MSR_C6_PMON_CTR4, 0, 0},
-    {"CBOX7C0",PMC84, CBOX7, MSR_C7_PMON_EVNT_SEL0, MSR_C7_PMON_CTR0, 0, 0},
-    {"CBOX7C1",PMC85, CBOX7, MSR_C7_PMON_EVNT_SEL1, MSR_C7_PMON_CTR1, 0, 0},
-    {"CBOX7C2",PMC86, CBOX7, MSR_C7_PMON_EVNT_SEL2, MSR_C7_PMON_CTR2, 0, 0},
-    {"CBOX7C3",PMC87, CBOX7, MSR_C7_PMON_EVNT_SEL3, MSR_C7_PMON_CTR3, 0, 0},
-    {"CBOX7C4",PMC88, CBOX7, MSR_C7_PMON_EVNT_SEL4, MSR_C7_PMON_CTR4, 0, 0},
-    {"CBOX8C0",PMC89, CBOX8, MSR_C8_PMON_EVNT_SEL0, MSR_C8_PMON_CTR0, 0, 0},
-    {"CBOX8C1",PMC90, CBOX8, MSR_C8_PMON_EVNT_SEL1, MSR_C8_PMON_CTR1, 0, 0},
-    {"CBOX8C2",PMC91, CBOX8, MSR_C8_PMON_EVNT_SEL2, MSR_C8_PMON_CTR2, 0, 0},
-    {"CBOX8C3",PMC92, CBOX8, MSR_C8_PMON_EVNT_SEL3, MSR_C8_PMON_CTR3, 0, 0},
-    {"CBOX8C4",PMC93, CBOX8, MSR_C8_PMON_EVNT_SEL4, MSR_C8_PMON_CTR4, 0, 0},
-    {"CBOX9C0",PMC94, CBOX9, MSR_C9_PMON_EVNT_SEL0, MSR_C9_PMON_CTR0, 0, 0},
-    {"CBOX9C1",PMC95, CBOX9, MSR_C9_PMON_EVNT_SEL1, MSR_C9_PMON_CTR1, 0, 0},
-    {"CBOX9C2",PMC96, CBOX9, MSR_C9_PMON_EVNT_SEL2, MSR_C9_PMON_CTR2, 0, 0},
-    {"CBOX9C3",PMC97, CBOX9, MSR_C9_PMON_EVNT_SEL3, MSR_C9_PMON_CTR3, 0, 0},
-    {"CBOX9C4",PMC98, CBOX9, MSR_C9_PMON_EVNT_SEL4, MSR_C9_PMON_CTR4, 0, 0},
+    {"CBOX0C0",PMC49, CBOX0, MSR_C0_PMON_EVNT_SEL0, MSR_C0_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX0C1",PMC50, CBOX0, MSR_C0_PMON_EVNT_SEL1, MSR_C0_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX0C2",PMC51, CBOX0, MSR_C0_PMON_EVNT_SEL2, MSR_C0_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX0C3",PMC52, CBOX0, MSR_C0_PMON_EVNT_SEL3, MSR_C0_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX0C4",PMC53, CBOX0, MSR_C0_PMON_EVNT_SEL4, MSR_C0_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX0C5",PMC54, CBOX0, MSR_C0_PMON_EVNT_SEL5, MSR_C0_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX}, /* new sixth counter per CBOX (C0..C5); this shifts the index of every following entry */
+    {"CBOX1C0",PMC55, CBOX1, MSR_C1_PMON_EVNT_SEL0, MSR_C1_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C1",PMC56, CBOX1, MSR_C1_PMON_EVNT_SEL1, MSR_C1_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C2",PMC57, CBOX1, MSR_C1_PMON_EVNT_SEL2, MSR_C1_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C3",PMC58, CBOX1, MSR_C1_PMON_EVNT_SEL3, MSR_C1_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C4",PMC59, CBOX1, MSR_C1_PMON_EVNT_SEL4, MSR_C1_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C5",PMC60, CBOX1, MSR_C1_PMON_EVNT_SEL5, MSR_C1_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C0",PMC61, CBOX2, MSR_C2_PMON_EVNT_SEL0, MSR_C2_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C1",PMC62, CBOX2, MSR_C2_PMON_EVNT_SEL1, MSR_C2_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C2",PMC63, CBOX2, MSR_C2_PMON_EVNT_SEL2, MSR_C2_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C3",PMC64, CBOX2, MSR_C2_PMON_EVNT_SEL3, MSR_C2_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C4",PMC65, CBOX2, MSR_C2_PMON_EVNT_SEL4, MSR_C2_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C5",PMC66, CBOX2, MSR_C2_PMON_EVNT_SEL5, MSR_C2_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C0",PMC67, CBOX3, MSR_C3_PMON_EVNT_SEL0, MSR_C3_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C1",PMC68, CBOX3, MSR_C3_PMON_EVNT_SEL1, MSR_C3_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C2",PMC69, CBOX3, MSR_C3_PMON_EVNT_SEL2, MSR_C3_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C3",PMC70, CBOX3, MSR_C3_PMON_EVNT_SEL3, MSR_C3_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C4",PMC71, CBOX3, MSR_C3_PMON_EVNT_SEL4, MSR_C3_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C5",PMC72, CBOX3, MSR_C3_PMON_EVNT_SEL5, MSR_C3_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C0",PMC73, CBOX4, MSR_C4_PMON_EVNT_SEL0, MSR_C4_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C1",PMC74, CBOX4, MSR_C4_PMON_EVNT_SEL1, MSR_C4_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C2",PMC75, CBOX4, MSR_C4_PMON_EVNT_SEL2, MSR_C4_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C3",PMC76, CBOX4, MSR_C4_PMON_EVNT_SEL3, MSR_C4_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C4",PMC77, CBOX4, MSR_C4_PMON_EVNT_SEL4, MSR_C4_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C5",PMC78, CBOX4, MSR_C4_PMON_EVNT_SEL5, MSR_C4_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C0",PMC79, CBOX5, MSR_C5_PMON_EVNT_SEL0, MSR_C5_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C1",PMC80, CBOX5, MSR_C5_PMON_EVNT_SEL1, MSR_C5_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C2",PMC81, CBOX5, MSR_C5_PMON_EVNT_SEL2, MSR_C5_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C3",PMC82, CBOX5, MSR_C5_PMON_EVNT_SEL3, MSR_C5_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C4",PMC83, CBOX5, MSR_C5_PMON_EVNT_SEL4, MSR_C5_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C5",PMC84, CBOX5, MSR_C5_PMON_EVNT_SEL5, MSR_C5_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C0",PMC85, CBOX6, MSR_C6_PMON_EVNT_SEL0, MSR_C6_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C1",PMC86, CBOX6, MSR_C6_PMON_EVNT_SEL1, MSR_C6_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C2",PMC87, CBOX6, MSR_C6_PMON_EVNT_SEL2, MSR_C6_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C3",PMC88, CBOX6, MSR_C6_PMON_EVNT_SEL3, MSR_C6_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C4",PMC89, CBOX6, MSR_C6_PMON_EVNT_SEL4, MSR_C6_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C5",PMC90, CBOX6, MSR_C6_PMON_EVNT_SEL5, MSR_C6_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C0",PMC91, CBOX7, MSR_C7_PMON_EVNT_SEL0, MSR_C7_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C1",PMC92, CBOX7, MSR_C7_PMON_EVNT_SEL1, MSR_C7_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C2",PMC93, CBOX7, MSR_C7_PMON_EVNT_SEL2, MSR_C7_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C3",PMC94, CBOX7, MSR_C7_PMON_EVNT_SEL3, MSR_C7_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C4",PMC95, CBOX7, MSR_C7_PMON_EVNT_SEL4, MSR_C7_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C5",PMC96, CBOX7, MSR_C7_PMON_EVNT_SEL5, MSR_C7_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX8C0",PMC97, CBOX8, MSR_C8_PMON_EVNT_SEL0, MSR_C8_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX8C1",PMC98, CBOX8, MSR_C8_PMON_EVNT_SEL1, MSR_C8_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX8C2",PMC99, CBOX8, MSR_C8_PMON_EVNT_SEL2, MSR_C8_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX8C3",PMC100, CBOX8, MSR_C8_PMON_EVNT_SEL3, MSR_C8_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX8C4",PMC101, CBOX8, MSR_C8_PMON_EVNT_SEL4, MSR_C8_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX8C5",PMC102, CBOX8, MSR_C8_PMON_EVNT_SEL5, MSR_C8_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX9C0",PMC103, CBOX9, MSR_C9_PMON_EVNT_SEL0, MSR_C9_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX9C1",PMC104, CBOX9, MSR_C9_PMON_EVNT_SEL1, MSR_C9_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX9C2",PMC105, CBOX9, MSR_C9_PMON_EVNT_SEL2, MSR_C9_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX9C3",PMC106, CBOX9, MSR_C9_PMON_EVNT_SEL3, MSR_C9_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX9C4",PMC107, CBOX9, MSR_C9_PMON_EVNT_SEL4, MSR_C9_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX9C5",PMC108, CBOX9, MSR_C9_PMON_EVNT_SEL5, MSR_C9_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
     /* SBOXes */
-    {"SBOX0C0",PMC99 , SBOX0, MSR_S0_PMON_EVNT_SEL0, MSR_S0_PMON_CTR0, 0, 0},
-    {"SBOX0C1",PMC100, SBOX0, MSR_S0_PMON_EVNT_SEL1, MSR_S0_PMON_CTR1, 0, 0},
-    {"SBOX0C2",PMC101, SBOX0, MSR_S0_PMON_EVNT_SEL2, MSR_S0_PMON_CTR2, 0, 0},
-    {"SBOX0C3",PMC102, SBOX0, MSR_S0_PMON_EVNT_SEL3, MSR_S0_PMON_CTR3, 0, 0},
-    {"SBOX1C0",PMC103, SBOX1, MSR_S1_PMON_EVNT_SEL0, MSR_S1_PMON_CTR0, 0, 0},
-    {"SBOX1C1",PMC104, SBOX1, MSR_S1_PMON_EVNT_SEL1, MSR_S1_PMON_CTR1, 0, 0},
-    {"SBOX1C2",PMC105, SBOX1, MSR_S1_PMON_EVNT_SEL2, MSR_S1_PMON_CTR2, 0, 0},
-    {"SBOX1C3",PMC106, SBOX1, MSR_S1_PMON_EVNT_SEL3, MSR_S1_PMON_CTR3, 0, 0}
+    {"SBOX0C0",PMC109 , SBOX0, MSR_S0_PMON_EVNT_SEL0, MSR_S0_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_SBOX},
+    {"SBOX0C1",PMC110, SBOX0, MSR_S0_PMON_EVNT_SEL1, MSR_S0_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_SBOX},
+    {"SBOX0C2",PMC111, SBOX0, MSR_S0_PMON_EVNT_SEL2, MSR_S0_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_SBOX},
+    {"SBOX0C3",PMC112, SBOX0, MSR_S0_PMON_EVNT_SEL3, MSR_S0_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_SBOX},
+    {"SBOX1C0",PMC113, SBOX1, MSR_S1_PMON_EVNT_SEL0, MSR_S1_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_SBOX},
+    {"SBOX1C1",PMC114, SBOX1, MSR_S1_PMON_EVNT_SEL1, MSR_S1_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_SBOX},
+    {"SBOX1C2",PMC115, SBOX1, MSR_S1_PMON_EVNT_SEL2, MSR_S1_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_SBOX},
+    {"SBOX1C3",PMC116, SBOX1, MSR_S1_PMON_EVNT_SEL3, MSR_S1_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_SBOX}
 };
 
+
+static BoxMap westmereEX_box_map[NUM_UNITS] = { /* one entry per unit type, selected by designated initializer; NOTE(review): fields look like {ctrl reg, status reg, overflow-ctrl reg, 0, 0, 0, counter width in bits[, match reg, mask reg]} -- confirm order against BoxMap */
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+    [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48}, /* shares the global core registers with PMC */
+    [MBOX0] = {MSR_M0_PMON_BOX_CTRL, MSR_M0_PMON_BOX_STATUS, MSR_M0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_M0_PMON_ADDR_MATCH, MSR_M0_PMON_ADDR_MASK}, /* MBOX/BBOX/SBOX entries additionally name a match and a mask register */
+    [MBOX1] = {MSR_M1_PMON_BOX_CTRL, MSR_M1_PMON_BOX_STATUS, MSR_M1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_M1_PMON_ADDR_MATCH, MSR_M1_PMON_ADDR_MASK},
+    [BBOX0] = {MSR_B0_PMON_BOX_CTRL, MSR_B0_PMON_BOX_STATUS, MSR_B0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_B0_PMON_MATCH,MSR_B0_PMON_MASK},
+    [BBOX1] = {MSR_B1_PMON_BOX_CTRL, MSR_B1_PMON_BOX_STATUS, MSR_B1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_B1_PMON_MATCH,MSR_B1_PMON_MASK},
+    [RBOX0] = {MSR_R0_PMON_BOX_CTRL, MSR_R0_PMON_BOX_STATUS, MSR_R0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [RBOX1] = {MSR_R1_PMON_BOX_CTRL, MSR_R1_PMON_BOX_STATUS, MSR_R1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [SBOX0] = {MSR_S0_PMON_BOX_CTRL, MSR_S0_PMON_BOX_STATUS, MSR_S0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_S0_PMON_MATCH, MSR_S0_PMON_MASK},
+    [SBOX1] = {MSR_S1_PMON_BOX_CTRL, MSR_S1_PMON_BOX_STATUS, MSR_S1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_S1_PMON_MATCH, MSR_S1_PMON_MASK},
+    [CBOX0] = {MSR_C0_PMON_BOX_CTRL, MSR_C0_PMON_BOX_STATUS, MSR_C0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX1] = {MSR_C1_PMON_BOX_CTRL, MSR_C1_PMON_BOX_STATUS, MSR_C1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX2] = {MSR_C2_PMON_BOX_CTRL, MSR_C2_PMON_BOX_STATUS, MSR_C2_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX3] = {MSR_C3_PMON_BOX_CTRL, MSR_C3_PMON_BOX_STATUS, MSR_C3_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX4] = {MSR_C4_PMON_BOX_CTRL, MSR_C4_PMON_BOX_STATUS, MSR_C4_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX5] = {MSR_C5_PMON_BOX_CTRL, MSR_C5_PMON_BOX_STATUS, MSR_C5_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX6] = {MSR_C6_PMON_BOX_CTRL, MSR_C6_PMON_BOX_STATUS, MSR_C6_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX7] = {MSR_C7_PMON_BOX_CTRL, MSR_C7_PMON_BOX_STATUS, MSR_C7_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX8] = {MSR_C8_PMON_BOX_CTRL, MSR_C8_PMON_BOX_STATUS, MSR_C8_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX9] = {MSR_C9_PMON_BOX_CTRL, MSR_C9_PMON_BOX_STATUS, MSR_C9_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [WBOX] = {MSR_W_PMON_BOX_CTRL, MSR_W_PMON_BOX_STATUS, MSR_W_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [WBOX0FIX] = {MSR_W_PMON_BOX_CTRL, MSR_W_PMON_BOX_STATUS, MSR_W_PMON_BOX_OVF_CTRL, 0, 0, 0, 48}, /* same box registers as WBOX */
+    [UBOX] = {MSR_U_PMON_GLOBAL_CTRL, MSR_U_PMON_GLOBAL_STATUS, MSR_U_PMON_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+};
diff --git a/src/includes/perfmon_westmereEX_events.txt b/src/includes/perfmon_westmereEX_events.txt
index 2aabf8d..014dfa6 100644
--- a/src/includes/perfmon_westmereEX_events.txt
+++ b/src/includes/perfmon_westmereEX_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_westmereEX_events.txt
-# 
+#
 #      Description:  Event list for Intel WestmereEX
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -68,16 +69,16 @@ EVENT_MEM_STORE_RETIRED_DTLB        0x0C  PMC
 UMASK_MEM_STORE_RETIRED_DTLB_MISS   0x01
 
 EVENT_UOPS_ISSUED                0x0E   PMC
-UMASK_UOPS_ISSUED_ANY            0x01 
+UMASK_UOPS_ISSUED_ANY            0x01
 UMASK_UOPS_ISSUED_STALLED_CYCLES 0x01 0xC1  0x01
-UMASK_UOPS_ISSUED_FUSED          0x02 
+UMASK_UOPS_ISSUED_FUSED          0x02
 
 EVENT_MEM_UNCORE_RETIRED         0x0F    PMC
-UMASK_MEM_UNCORE_RETIRED_LOCAL_HITM                        0x02 
-UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM_AND_REMOTE_CACHE_HIT   0x08 
-UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM                        0x10 
-UMASK_MEM_UNCORE_RETIRED_REMOTE_DRAM                       0x20 
-UMASK_MEM_UNCORE_RETIRED_UNCACHEABLE                       0x80 
+UMASK_MEM_UNCORE_RETIRED_LOCAL_HITM                        0x02
+UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM_AND_REMOTE_CACHE_HIT   0x08
+UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM                        0x10
+UMASK_MEM_UNCORE_RETIRED_REMOTE_DRAM                       0x20
+UMASK_MEM_UNCORE_RETIRED_UNCACHEABLE                       0x80
 
 EVENT_FP_COMP_OPS_EXE            0x10   PMC
 UMASK_FP_COMP_OPS_EXE_X87        0x01
@@ -253,10 +254,10 @@ UMASK_BR_INST_EXEC_INDIRECT_NON_CALL     0x04
 UMASK_BR_INST_EXEC_NON_CALLS             0x07
 UMASK_BR_INST_EXEC_RETURN_NEAR           0x08
 UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL      0x10
-UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL    0x20 
-UMASK_BR_INST_EXEC_NEAR_CALLS            0x30 
-UMASK_BR_INST_EXEC_TAKEN                 0x40 
-UMASK_BR_INST_EXEC_ANY                   0x7F 
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL    0x20
+UMASK_BR_INST_EXEC_NEAR_CALLS            0x30
+UMASK_BR_INST_EXEC_TAKEN                 0x40
+UMASK_BR_INST_EXEC_ANY                   0x7F
 
 EVENT_BR_MISP_EXEC                    0x89   PMC
 UMASK_BR_MISP_EXEC_COND               0x01
@@ -473,8 +474,66 @@ UMASK_SIMD_INT_64_PACKED_LOGICAL        0x10
 UMASK_SIMD_INT_64_PACKED_ARITH          0x20
 UMASK_SIMD_INT_64_SHUFFLE_MOVE          0x40
 
-EVENT_UNCORE_CYCLES                  0xFF  WBOX4
-UMASK_UNCORE_CYCLES                  0x00
+EVENT_OFFCORE_RESPONSE_0                           0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                 EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                   0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM              EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM      EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x40
+UMASK_OFFCORE_RESPONSE_0_LOCAL_DRAM                0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_LOCAL_CACHE             EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_LOCAL_CACHE     EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x7
+UMASK_OFFCORE_RESPONSE_0_LOCAL_CACHE               0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM             EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM     EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x20
+UMASK_OFFCORE_RESPONSE_0_REMOTE_DRAM               0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_REMOTE_CACHE            EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_REMOTE_CACHE    EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x18
+UMASK_OFFCORE_RESPONSE_0_REMOTE_CACHE              0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY          0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY              0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY          0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY                    0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY         0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY             0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY         0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY         0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY             0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY         0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_ANY             0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY      0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY                 0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1                           0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                 EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                   0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM              EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM      EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x40
+UMASK_OFFCORE_RESPONSE_1_LOCAL_DRAM                0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_LOCAL_CACHE             EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_LOCAL_CACHE     EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x7
+UMASK_OFFCORE_RESPONSE_1_LOCAL_CACHE               0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM             EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM     EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x20
+UMASK_OFFCORE_RESPONSE_1_REMOTE_DRAM               0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_REMOTE_CACHE            EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_REMOTE_CACHE    EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x18
+UMASK_OFFCORE_RESPONSE_1_REMOTE_CACHE              0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY          0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY              0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY          0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY                    0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY         0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY             0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY         0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY         0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY             0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY         0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_ANY             0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY      0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY                 0x01 0x0F 0x10
+
+EVENT_UNCORE_CLOCK                  0xFF  WBOXFIX
+UMASK_UNCORE_CLOCK                  0x00
 
 EVENT_C_CYCLES_TURBO                  0x04  WBOX
 UMASK_C_CYCLES_TURBO_C0               0x01
@@ -488,26 +547,26 @@ UMASK_C_CYCLES_TURBO_C7               0x80
 UMASK_C_CYCLES_TURBO_C_ALL            0xFF
 
 EVENT_C_C0_THROTTLE_DIE              0x01  WBOX
-UMASK_C_C0_THROTTLE_DIE_C0               0x01              
-UMASK_C_C0_THROTTLE_DIE_C1               0x02              
-UMASK_C_C0_THROTTLE_DIE_C2               0x04              
-UMASK_C_C0_THROTTLE_DIE_C3               0x08              
-UMASK_C_C0_THROTTLE_DIE_C4               0x10              
-UMASK_C_C0_THROTTLE_DIE_C5               0x20              
-UMASK_C_C0_THROTTLE_DIE_C6               0x40              
-UMASK_C_C0_THROTTLE_DIE_C7               0x80              
-UMASK_C_C0_THROTTLE_DIE_C_ALL            0xFF              
+UMASK_C_C0_THROTTLE_DIE_C0               0x01
+UMASK_C_C0_THROTTLE_DIE_C1               0x02
+UMASK_C_C0_THROTTLE_DIE_C2               0x04
+UMASK_C_C0_THROTTLE_DIE_C3               0x08
+UMASK_C_C0_THROTTLE_DIE_C4               0x10
+UMASK_C_C0_THROTTLE_DIE_C5               0x20
+UMASK_C_C0_THROTTLE_DIE_C6               0x40
+UMASK_C_C0_THROTTLE_DIE_C7               0x80
+UMASK_C_C0_THROTTLE_DIE_C_ALL            0xFF
 
 EVENT_C_C0_THROTTLE_PROCHOT          0x03  WBOX
-UMASK_C_C0_THROTTLE_PROCHOT_C0               0x01          
-UMASK_C_C0_THROTTLE_PROCHOT_C1               0x02          
-UMASK_C_C0_THROTTLE_PROCHOT_C2               0x04          
-UMASK_C_C0_THROTTLE_PROCHOT_C3               0x08          
-UMASK_C_C0_THROTTLE_PROCHOT_C4               0x10          
-UMASK_C_C0_THROTTLE_PROCHOT_C5               0x20          
-UMASK_C_C0_THROTTLE_PROCHOT_C6               0x40          
-UMASK_C_C0_THROTTLE_PROCHOT_C7               0x80          
-UMASK_C_C0_THROTTLE_PROCHOT_C_ALL            0xFF          
+UMASK_C_C0_THROTTLE_PROCHOT_C0               0x01
+UMASK_C_C0_THROTTLE_PROCHOT_C1               0x02
+UMASK_C_C0_THROTTLE_PROCHOT_C2               0x04
+UMASK_C_C0_THROTTLE_PROCHOT_C3               0x08
+UMASK_C_C0_THROTTLE_PROCHOT_C4               0x10
+UMASK_C_C0_THROTTLE_PROCHOT_C5               0x20
+UMASK_C_C0_THROTTLE_PROCHOT_C6               0x40
+UMASK_C_C0_THROTTLE_PROCHOT_C7               0x80
+UMASK_C_C0_THROTTLE_PROCHOT_C_ALL            0xFF
 
 EVENT_C_C0_THROTTLE_TMP              0x00  WBOX
 UMASK_C_C0_THROTTLE_TMP_C0               0x01
@@ -559,8 +618,8 @@ UMASK_BCMD_SCHEDQ_OCCUPANCY_F2B       0x06 0x01 0x00
 UMASK_BCMD_SCHEDQ_OCCUPANCY_SPRWR     0x07 0x01 0x00
 UMASK_BCMD_SCHEDQ_OCCUPANCY_ALL       0x08 0x01 0x00
 
-EVENT_BBOX_CYCLES                  0x1B  MBOX
-UMASK_BBOX_CYCLES                  0xFF
+EVENT_MBOX_CLOCKTICKS                  0x1B  MBOX0C0|MBOX1C0
+UMASK_MBOX_CLOCKTICKS                  0xFF
 
 EVENT_CYCLES_DSP_FILL                  0x00  MBOX
 UMASK_CYCLES_DSP_FILL_RDQ_FULL         0x01 0x01 0x00
@@ -588,34 +647,35 @@ UMASK_CYCLES_SCHED_MODE_WRPRIO           0x02 0x01 0x00
 UMASK_CYCLES_SCHED_MODE_ADAPTIVE         0x03 0x01 0x00
 
 EVENT_DRAM_CMD                              0x0A  MBOX
+OPTIONS_DRAM_CMD_ALL                        EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
 UMASK_DRAM_CMD_ALL                          0x00 0x02 0x00
-UMASK_DRAM_CMD_ILLEGAL                      0x01 0x02 0x00
+UMASK_DRAM_CMD_ILLEGAL                      0x00 0x02 0x00
 UMASK_DRAM_CMD_PREALL                       0x01 0x02 0x00
-UMASK_DRAM_CMD_PREALL_TRDOFF                0x01 0x02 0x00
-UMASK_DRAM_CMD_PREALL_RDPRIO                0x01 0x02 0x01
-UMASK_DRAM_CMD_PREALL_WRPRIO                0x01 0x02 0x02
-UMASK_DRAM_CMD_PREALL_ADAPTIVE              0x01 0x02 0x02
+UMASK_DRAM_CMD_PREALL_TRDOFF                0x01 0x02 0x10
+UMASK_DRAM_CMD_PREALL_RDPRIO                0x01 0x02 0x11
+UMASK_DRAM_CMD_PREALL_WRPRIO                0x01 0x02 0x12
+UMASK_DRAM_CMD_PREALL_ADAPTIVE              0x01 0x02 0x13
 UMASK_DRAM_CMD_RAS                          0x02 0x02 0x00
-UMASK_DRAM_CMD_RAS_TRDOFF                   0x02 0x02 0x00
-UMASK_DRAM_CMD_RAS_RDPRIO                   0x02 0x02 0x01
-UMASK_DRAM_CMD_RAS_WRPRIO                   0x02 0x02 0x02
-UMASK_DRAM_CMD_RAS_ADAPTIVE                 0x02 0x02 0x02
-UMASK_DRAM_CMD_CAS_RD_OPN                   0x02 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_OPN                   0x02 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_OPN_TRDOFF            0x02 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_OPN_RDPRIO            0x04 0x02 0x01
-UMASK_DRAM_CMD_CAS_WR_OPN_WRPRIO            0x04 0x02 0x02
-UMASK_DRAM_CMD_CAS_WR_OPN_ADAPTIVE          0x04 0x02 0x03
+UMASK_DRAM_CMD_RAS_TRDOFF                   0x02 0x02 0x10
+UMASK_DRAM_CMD_RAS_RDPRIO                   0x02 0x02 0x11
+UMASK_DRAM_CMD_RAS_WRPRIO                   0x02 0x02 0x12
+UMASK_DRAM_CMD_RAS_ADAPTIVE                 0x02 0x02 0x13
+UMASK_DRAM_CMD_CAS_RD_OPN                   0x03 0x02 0x00
+UMASK_DRAM_CMD_CAS_WR_OPN                   0x04 0x02 0x00
+UMASK_DRAM_CMD_CAS_WR_OPN_TRDOFF            0x04 0x02 0x10
+UMASK_DRAM_CMD_CAS_WR_OPN_RDPRIO            0x04 0x02 0x11
+UMASK_DRAM_CMD_CAS_WR_OPN_WRPRIO            0x04 0x02 0x12
+UMASK_DRAM_CMD_CAS_WR_OPN_ADAPTIVE          0x04 0x02 0x13
 UMASK_DRAM_CMD_CAS_RD_CLS                   0x05 0x02 0x00
-UMASK_DRAM_CMD_CAS_RD_CLS_TRDOFF            0x05 0x02 0x00
-UMASK_DRAM_CMD_CAS_RD_CLS_RDPRIO            0x05 0x02 0x01
-UMASK_DRAM_CMD_CAS_RD_CLS_WRPRIO            0x05 0x02 0x02
-UMASK_DRAM_CMD_CAS_RD_CLS_ADAPTIVE          0x05 0x02 0x03
+UMASK_DRAM_CMD_CAS_RD_CLS_TRDOFF            0x05 0x02 0x10
+UMASK_DRAM_CMD_CAS_RD_CLS_RDPRIO            0x05 0x02 0x11
+UMASK_DRAM_CMD_CAS_RD_CLS_WRPRIO            0x05 0x02 0x12
+UMASK_DRAM_CMD_CAS_RD_CLS_ADAPTIVE          0x05 0x02 0x13
 UMASK_DRAM_CMD_CAS_WR_CLS                   0x06 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_CLS_TRDOFF            0x06 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_CLS_RDPRIO            0x06 0x02 0x01
-UMASK_DRAM_CMD_CAS_WR_CLS_WRPRIO            0x06 0x02 0x02
-UMASK_DRAM_CMD_CAS_WR_CLS_ADAPTIVE          0x06 0x02 0x03
+UMASK_DRAM_CMD_CAS_WR_CLS_TRDOFF            0x06 0x02 0x10
+UMASK_DRAM_CMD_CAS_WR_CLS_RDPRIO            0x06 0x02 0x11
+UMASK_DRAM_CMD_CAS_WR_CLS_WRPRIO            0x06 0x02 0x12
+UMASK_DRAM_CMD_CAS_WR_CLS_ADAPTIVE          0x06 0x02 0x13
 UMASK_DRAM_CMD_MRS                          0x07 0x02 0x00
 UMASK_DRAM_CMD_RFR                          0x09 0x02 0x00
 UMASK_DRAM_CMD_ENSR                         0x0A 0x02 0x00
@@ -647,7 +707,6 @@ UMASK_DRAM_MISC_RETRIES_ALL              0x00 0x04 0x03
 UMASK_DRAM_MISC_RETRIES_FVID             0x01 0x04 0x03
 UMASK_DRAM_MISC_VALID                    0x01 0x04 0x02
 UMASK_DRAM_MISC_NON_NOP_TRKL             0x01 0x04 0x01
-
 UMASK_DRAM_MISC_ILLEGAL                  0x00 0x04 0x00
 UMASK_DRAM_MISC_PREALL                   0x01 0x04 0x00
 UMASK_DRAM_MISC_RAS                      0x02 0x04 0x00
@@ -704,12 +763,12 @@ UMASK_FVC_EV1_FAST_RESET              0x04 0x07 0x00
 UMASK_FVC_EV1_BBOX_CMDS_READS         0x05 0x07 0x00
 UMASK_FVC_EV1_BBOX_CMDS_WRITES        0x05 0x07 0x01
 UMASK_FVC_EV1_BBOX_RSP_ACK            0x06 0x07 0x00
-UMASK_FVC_EV1_BBOX_RSP_RETRY          0x06 0x07 0x10
-UMASK_FVC_EV1_BBOX_RSP_COR            0x06 0x07 0x20
-UMASK_FVC_EV1_BBOX_RSP_UNCOR          0x06 0x07 0x30
-UMASK_FVC_EV1_BBOX_RSP_SPEC_ACK       0x06 0x07 0x40
-UMASK_FVC_EV1_BBOX_RSP_SPR_ACK        0x06 0x07 0x50
-UMASK_FVC_EV1_BBOX_RSP_SPR_UNCORE     0x06 0x07 0x70
+UMASK_FVC_EV1_BBOX_RSP_RETRY          0x06 0x07 0x01
+UMASK_FVC_EV1_BBOX_RSP_COR            0x06 0x07 0x02
+UMASK_FVC_EV1_BBOX_RSP_UNCOR          0x06 0x07 0x03
+UMASK_FVC_EV1_BBOX_RSP_SPEC_ACK       0x06 0x07 0x04
+UMASK_FVC_EV1_BBOX_RSP_SPR_ACK        0x06 0x07 0x05
+UMASK_FVC_EV1_BBOX_RSP_SPR_UNCORE     0x06 0x07 0x07
 UMASK_FVC_EV1_SMI_NB_TRIG             0x07 0x07 0x00
 
 EVENT_FVC_EV2                         0x0F  MBOX
@@ -721,30 +780,30 @@ UMASK_FVC_EV2_FAST_RESET              0x04 0x08 0x00
 UMASK_FVC_EV2_BBOX_CMDS_READS         0x05 0x08 0x00
 UMASK_FVC_EV2_BBOX_CMDS_WRITES        0x05 0x08 0x01
 UMASK_FVC_EV2_BBOX_RSP_ACK            0x06 0x08 0x00
-UMASK_FVC_EV2_BBOX_RSP_RETRY          0x06 0x08 0x10
-UMASK_FVC_EV2_BBOX_RSP_COR            0x06 0x08 0x20
-UMASK_FVC_EV2_BBOX_RSP_UNCOR          0x06 0x08 0x30
-UMASK_FVC_EV2_BBOX_RSP_SPEC_ACK       0x06 0x08 0x40
-UMASK_FVC_EV2_BBOX_RSP_SPR_ACK        0x06 0x08 0x50
-UMASK_FVC_EV2_BBOX_RSP_SPR_UNCORE     0x06 0x08 0x70
+UMASK_FVC_EV2_BBOX_RSP_RETRY          0x06 0x08 0x01
+UMASK_FVC_EV2_BBOX_RSP_COR            0x06 0x08 0x02
+UMASK_FVC_EV2_BBOX_RSP_UNCOR          0x06 0x08 0x03
+UMASK_FVC_EV2_BBOX_RSP_SPEC_ACK       0x06 0x08 0x04
+UMASK_FVC_EV2_BBOX_RSP_SPR_ACK        0x06 0x08 0x05
+UMASK_FVC_EV2_BBOX_RSP_SPR_UNCORE     0x06 0x08 0x07
 UMASK_FVC_EV2_SMI_NB_TRIG             0x07 0x08 0x00
 
 EVENT_FVC_EV3                         0x10  MBOX
 UMASK_FVC_EV3_SMI_CRC_ERR             0x00 0x09 0x00
-UMASK_FVC_EV3_MEM_ECC_ERR             0x00 0x09 0x00
-UMASK_FVC_EV3_POISON_TXN              0x00 0x09 0x00
-UMASK_FVC_EV3_ALERT_FRAMES            0x00 0x09 0x00
-UMASK_FVC_EV3_FAST_RESET              0x00 0x09 0x00
-UMASK_FVC_EV3_BBOX_CMDS_READS         0x00 0x09 0x00
-UMASK_FVC_EV3_BBOX_CMDS_WRITES        0x00 0x09 0x01
-UMASK_FVC_EV3_BBOX_RSP_ACK            0x00 0x09 0x00
-UMASK_FVC_EV3_BBOX_RSP_RETRY          0x00 0x09 0x10
-UMASK_FVC_EV3_BBOX_RSP_COR            0x00 0x09 0x20
-UMASK_FVC_EV3_BBOX_RSP_UNCOR          0x00 0x09 0x30
-UMASK_FVC_EV3_BBOX_RSP_SPEC_ACK       0x00 0x09 0x40
-UMASK_FVC_EV3_BBOX_RSP_SPR_ACK        0x00 0x09 0x50
-UMASK_FVC_EV3_BBOX_RSP_SPR_UNCORE     0x00 0x09 0x70
-UMASK_FVC_EV3_SMI_NB_TRIG             0x00 0x09 0x00
+UMASK_FVC_EV3_MEM_ECC_ERR             0x01 0x09 0x00
+UMASK_FVC_EV3_POISON_TXN              0x02 0x09 0x00
+UMASK_FVC_EV3_ALERT_FRAMES            0x03 0x09 0x00
+UMASK_FVC_EV3_FAST_RESET              0x04 0x09 0x00
+UMASK_FVC_EV3_BBOX_CMDS_READS         0x05 0x09 0x00
+UMASK_FVC_EV3_BBOX_CMDS_WRITES        0x05 0x09 0x01
+UMASK_FVC_EV3_BBOX_RSP_ACK            0x06 0x09 0x00
+UMASK_FVC_EV3_BBOX_RSP_RETRY          0x06 0x09 0x01
+UMASK_FVC_EV3_BBOX_RSP_COR            0x06 0x09 0x02
+UMASK_FVC_EV3_BBOX_RSP_UNCOR          0x06 0x09 0x03
+UMASK_FVC_EV3_BBOX_RSP_SPEC_ACK       0x06 0x09 0x04
+UMASK_FVC_EV3_BBOX_RSP_SPR_ACK        0x06 0x09 0x05
+UMASK_FVC_EV3_BBOX_RSP_SPR_UNCORE     0x06 0x09 0x07
+UMASK_FVC_EV3_SMI_NB_TRIG             0x07 0x09 0x00
 
 EVENT_FVID_RACE                       0x18  MBOX
 UMASK_FVID_RACE                       0x00 0x00 0x00
@@ -799,12 +858,44 @@ UMASK_THERM_TRP_DN_ALL_GT_MID_RISE    0x03 0x0D 0x00
 UMASK_THERM_TRP_DN_ALL_GT_MID_FALL    0x02 0x0D 0x00
 UMASK_THERM_TRP_DN_ALL_GT_LO          0x01 0x0D 0x00
 UMASK_THERM_TRP_DN_ALL_LT_LO          0x00 0x0D 0x00
+UMASK_THERM_TRP_DN_DIMM0_GT_MID_RISE  0x03 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_GT_MID_RISE  0x03 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_GT_MID_RISE  0x03 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_GT_MID_RISE  0x03 0x0D 0x04
+UMASK_THERM_TRP_DN_DIMM0_GT_MID_FALL  0x02 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_GT_MID_FALL  0x02 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_GT_MID_FALL  0x02 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_GT_MID_FALL  0x02 0x0D 0x04
+UMASK_THERM_TRP_DN_DIMM0_GT_LO        0x01 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_GT_LO        0x01 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_GT_LO        0x01 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_GT_LO        0x01 0x0D 0x04
+UMASK_THERM_TRP_DN_DIMM0_LT_LO        0x00 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_LT_LO        0x00 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_LT_LO        0x00 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_LT_LO        0x00 0x0D 0x04
 
 EVENT_THERM_TRP_UP                    0x04  MBOX
 UMASK_THERM_TRP_UP_ALL_GT_MID_RISE    0x03 0x0E 0x00
 UMASK_THERM_TRP_UP_ALL_GT_MID_FALL    0x02 0x0E 0x00
 UMASK_THERM_TRP_UP_ALL_GT_LO          0x01 0x0E 0x00
 UMASK_THERM_TRP_UP_ALL_LT_LO          0x00 0x0E 0x00
+UMASK_THERM_TRP_UP_DIMM0_GT_MID_RISE  0x03 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_GT_MID_RISE  0x03 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_GT_MID_RISE  0x03 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_GT_MID_RISE  0x03 0x0E 0x04
+UMASK_THERM_TRP_UP_DIMM0_GT_MID_FALL  0x02 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_GT_MID_FALL  0x02 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_GT_MID_FALL  0x02 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_GT_MID_FALL  0x02 0x0E 0x04
+UMASK_THERM_TRP_UP_DIMM0_GT_LO        0x01 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_GT_LO        0x01 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_GT_LO        0x01 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_GT_LO        0x01 0x0E 0x04
+UMASK_THERM_TRP_UP_DIMM0_LT_LO        0x00 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_LT_LO        0x00 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_LT_LO        0x00 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_LT_LO        0x00 0x0E 0x04
 
 EVENT_TRANS_CMDS                      0x12  MBOX
 UMASK_TRANS_CMDS                      0x00 0x00 0x00
@@ -813,112 +904,165 @@ EVENT_TT_CMD_CONFLICT                 0x19  MBOX
 UMASK_TT_CMD_CONFLICT                 0x00 0x00 0x00
 
 EVENT_ACK_BEFORE_LAST_SNP             0x19  BBOX0C3|BBOX1C3
-UMASK_ACK_BEFORE_LAST_SNP             0x03
-
-EVENT_ADDR_IN_MATCH             0x04  BBOX0C2|BBOX1C2
-UMASK_ADDR_IN_MATCH             0x02
+UMASK_ACK_BEFORE_LAST_SNP             0x00
 
 EVENT_CONFLICTS             0x17  BBOX0C3|BBOX1C3
-UMASK_CONFLICTS             0x03
+UMASK_CONFLICTS             0x00
 
 EVENT_COHQ_BYPASS             0x0E  BBOX0C3|BBOX1C3
-UMASK_COHQ_BYPASS             0x03
+UMASK_COHQ_BYPASS             0x00
 
 EVENT_COHQ_IMT_ALLOC_WAIT             0x0E  BBOX0C3|BBOX1C3
-UMASK_COHQ_IMT_ALLOC_WAIT             0x03
+UMASK_COHQ_IMT_ALLOC_WAIT             0x00
 
 EVENT_DIRQ_INSERTS             0x17  BBOX0C1|BBOX1C1
-UMASK_DIRQ_INSERTS             0x01
+UMASK_DIRQ_INSERTS             0x00
 
 EVENT_DIRQ_OCCUPANCY             0x17  BBOX0C0|BBOX1C0
 UMASK_DIRQ_OCCUPANCY             0x00
 
 EVENT_DEMAND_FETCH             0x0F  BBOX0C3|BBOX1C3
-UMASK_DEMAND_FETCH             0x03
+UMASK_DEMAND_FETCH             0x00
 
 EVENT_DRSQ_INSERTS             0x09  BBOX0C1|BBOX1C1
-UMASK_DRSQ_INSERTS             0x01
+UMASK_DRSQ_INSERTS             0x00
 
 EVENT_DRSQ_OCCUPANCY             0x09  BBOX0C0|BBOX1C0
 UMASK_DRSQ_OCCUPANCY             0x00
 
 EVENT_EARLY_ACK             0x02  BBOX0C3|BBOX1C3
-UMASK_EARLY_ACK             0x03
+UMASK_EARLY_ACK             0x00
 
 EVENT_IMPLICIT_WBS             0x12  BBOX0C3|BBOX1C3
-UMASK_IMPLICIT_WBS             0x03
+UMASK_IMPLICIT_WBS             0x00
 
 EVENT_IMT_FULL             0x12  BBOX0C3|BBOX1C3
-UMASK_IMT_FULL             0x03
+UMASK_IMT_FULL             0x00
 
 EVENT_IMT_INSERTS_ALL             0x07  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_ALL             0x01
+UMASK_IMT_INSERTS_ALL             0x00
 
 EVENT_IMT_INSERTS_INVITOE             0x0F  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_INVITOE             0x01
+UMASK_IMT_INSERTS_INVITOE             0x00
 
 EVENT_IMT_INSERTS_IOH             0x0A  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_IOH             0x01
+UMASK_IMT_INSERTS_IOH             0x00
 
 EVENT_IMT_INSERTS_IOH_INVITOE             0x10  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_IOH_INVITOE             0x01
+UMASK_IMT_INSERTS_IOH_INVITOE             0x00
 
 EVENT_IMT_INSERTS_IOH_WR             0x0D  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_IOH_WR             0x01
+UMASK_IMT_INSERTS_IOH_WR             0x00
 
 EVENT_IMT_INSERTS_NON_IOH             0x0B  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_NON_IOH             0x01
+UMASK_IMT_INSERTS_NON_IOH             0x00
 
 EVENT_IMT_INSERTS_NON_IOH_INVITOE             0x1C  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_NON_IOH_INVITOE             0x01
+UMASK_IMT_INSERTS_NON_IOH_INVITOE             0x00
 
-EVENT_INSERTS_NON_IOH_RD             0x1F  BBOX0C1|BBOX1C1
-UMASK_INSERTS_NON_IOH_RD             0x01
+EVENT_IMT_INSERTS_NON_IOH_RD             0x1F  BBOX0C1|BBOX1C1
+UMASK_IMT_INSERTS_NON_IOH_RD             0x00
 
 EVENT_IMT_INSERTS_NON_IOH_WR             0x0E  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_NON_IOH_WR             0x01
+UMASK_IMT_INSERTS_NON_IOH_WR             0x00
 
 EVENT_IMT_INSERTS_RD             0x1D  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_RD             0x01
+UMASK_IMT_INSERTS_RD             0x00
 
 EVENT_IMT_INSERTS_WR             0x0C  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_WR             0x01
+UMASK_IMT_INSERTS_WR             0x00
 
 EVENT_IMT_NE_CYCLES             0x07  BBOX0C2|BBOX1C2
-UMASK_IMT_NE_CYCLES             0x02
+UMASK_IMT_NE_CYCLES             0x00
 
 EVENT_IMT_PREALLOC             0x06  BBOX0C3|BBOX1C3
-UMASK_IMT_PREALLOC             0x03
+UMASK_IMT_PREALLOC             0x00
 
 EVENT_IMT_VALID_OCCUPANCY             0x07  BBOX0C0|BBOX1C0
 UMASK_IMT_VALID_OCCUPANCY             0x00
 
-EVENT_MSG_ADDR_IN_MATCH             0x01  BBOX0C0|BBOX1C0
-UMASK_MSG_ADDR_IN_MATCH             0x00
+EVENT_MSGS_B_TO_S             0x03  BBOX0C2|BBOX1C2
+UMASK_MSGS_B_TO_S             0x00
 
-EVENT_MSGS_B_TO_S             0x03  BBOX0C1|BBOX1C1
-UMASK_MSGS_B_TO_S             0x01
+EVENT_MSGS_S_TO_B             0x02  BBOX0C2|BBOX1C2
+UMASK_MSGS_S_TO_B             0x00
 
-EVENT_MSGS_B_TO_S             0x03  BBOX0C2|BBOX1C2
-UMASK_MSGS_B_TO_S             0x02
+EVENT_MSGS_IN_NON_SNP             0x01  BBOX0C2|BBOX1C2
+UMASK_MSGS_IN_NON_SNP             0x00
 
 EVENT_MSG_IN_MATCH             0x01  BBOX0C1|BBOX1C1
-UMASK_MSG_IN_MATCH             0x01
+OPTIONS_MSG_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_IN_MATCH             0x00
 
-EVENT_MSGS_IN_NON_SNP             0x01  BBOX0C2|BBOX1C2
-UMASK_MSGS_IN_NON_SNP             0x02
+EVENT_MSG_ADDR_IN_MATCH             0x01  BBOX0C0|BBOX1C0
+OPTIONS_MSG_ADDR_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_ADDR_IN_MATCH             0x00
 
 EVENT_MSG_OPCODE_ADDR_IN_MATCH             0x03  BBOX0C0|BBOX1C0
+OPTIONS_MSG_OPCODE_ADDR_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
 UMASK_MSG_OPCODE_ADDR_IN_MATCH             0x00
 
 EVENT_MSG_OPCODE_IN_MATCH             0x05  BBOX0C1|BBOX1C1
-UMASK_MSG_OPCODE_IN_MATCH             0x01
+OPTIONS_MSG_OPCODE_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_OPCODE_IN_MATCH             0x00
 
 EVENT_MSG_OPCODE_OUT_MATCH             0x06  BBOX0C1|BBOX1C1
-UMASK_MSG_OPCODE_OUT_MATCH             0x01
+OPTIONS_MSG_OPCODE_OUT_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_OPCODE_OUT_MATCH             0x00
 
 EVENT_MSG_OUT_MATCH             0x02  BBOX0C1|BBOX1C1
-UMASK_MSG_OUT_MATCH             0x01
+OPTIONS_MSG_OUT_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_OUT_MATCH             0x00
+
+EVENT_OPCODE_ADDR_IN_MATCH             0x02  BBOX0C0|BBOX1C0
+OPTIONS_OPCODE_ADDR_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_OPCODE_ADDR_IN_MATCH             0x00
+
+EVENT_OPCODE_IN_MATCH             0x03  BBOX0C1|BBOX1C1
+OPTIONS_OPCODE_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_OPCODE_IN_MATCH             0x00
+
+EVENT_OPCODE_OUT_MATCH             0x04  BBOX0C1|BBOX1C1
+OPTIONS_OPCODE_OUT_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_OPCODE_OUT_MATCH             0x00
+
+EVENT_ADDR_IN_MATCH             0x04  BBOX0C2|BBOX1C2
+OPTIONS_ADDR_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_ADDR_IN_MATCH             0x00
+
+EVENT_RBOX_VNA_UNAVAIL             0x15  BBOX0C3|BBOX1C3
+UMASK_RBOX_VNA_UNAVAIL             0x00
+
+EVENT_SBOX_VN0_UNAVAIL             0x14  BBOX0C3|BBOX1C3
+UMASK_SBOX_VN0_UNAVAIL             0x00
+
+EVENT_SNPOQ_INSERTS             0x12  BBOX0C1|BBOX1C1
+UMASK_SNPOQ_INSERTS             0x00
+
+EVENT_SNPOQ_OCCUPANCY             0x12  BBOX0C0|BBOX1C0
+UMASK_SNPOQ_OCCUPANCY             0x00
+
+EVENT_TF_ALL             0x04  BBOX0C0|BBOX1C0
+UMASK_TF_ALL             0x00
+
+EVENT_TF_INVITOE             0x06  BBOX0C0|BBOX1C0
+UMASK_TF_INVITOE             0x00
+
+EVENT_TF_IOH             0x0B  BBOX0C0|BBOX1C0
+UMASK_TF_IOH             0x00
+
+EVENT_TF_IOH_INVITOE             0x0F  BBOX0C0|BBOX1C0
+UMASK_TF_IOH_INVITOE             0x00
+
+EVENT_TF_IOH_NON_INVITOE_RD             0x1C  BBOX0C0|BBOX1C0
+UMASK_TF_IOH_NON_INVITOE_RD             0x00
+
+EVENT_TF_IOH_WR             0x0D  BBOX0C0|BBOX1C0
+UMASK_TF_IOH_WR             0x00
+
+EVENT_TF_WR             0x05  BBOX0C0|BBOX1C0
+UMASK_TF_WR             0x00
+
 
 EVENT_ALLOC_TO_ARB                              0x00  RBOX0
 UMASK_ALLOC_TO_ARB_PORT0_IPERF0_NCB             0x00 0x01  0x09
@@ -3012,6 +3156,7 @@ EVENT_TRANS_VIQ                                 0x1D CBOX
 UMASK_TRANS_VIQ                                 0x00
 
 EVENT_TO_R_PROG_EV                              0x00 SBOX
+OPTIONS_TO_R_PROG_EV                            EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MATCH0_MASK
 UMASK_TO_R_PROG_EV                              0x00
 
 EVENT_TO_R_B_HOM_MSGQ_CYCLES_FULL               0x03 SBOX
diff --git a/src/includes/perfmon_westmere_events.txt b/src/includes/perfmon_westmere_events.txt
index 3c3e66f..7032ae3 100644
--- a/src/includes/perfmon_westmere_events.txt
+++ b/src/includes/perfmon_westmere_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_westmere_events.txt
-# 
+#
 #      Description:  Event list for Intel Westmere
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -68,16 +69,21 @@ EVENT_MEM_STORE_RETIRED_DTLB        0x0C  PMC
 UMASK_MEM_STORE_RETIRED_DTLB_MISS   0x01
 
 EVENT_UOPS_ISSUED                0x0E   PMC
-UMASK_UOPS_ISSUED_ANY            0x01 
-UMASK_UOPS_ISSUED_STALLED_CYCLES 0x01 0xC1  0x01
-UMASK_UOPS_ISSUED_FUSED          0x02 
+UMASK_UOPS_ISSUED_ANY            0x01
+UMASK_UOPS_ISSUED_FUSED          0x02
+DEFAULT_OPTIONS_UOPS_ISSUED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xF,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_TOTAL_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_ACTIVE_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_ISSUED_ACTIVE_CYCLES   0x01
+DEFAULT_OPTIONS_UOPS_ISSUED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_ISSUED_STALL_CYCLES   0x01
 
 EVENT_MEM_UNCORE_RETIRED         0x0F    PMC
-UMASK_MEM_UNCORE_RETIRED_LOCAL_HITM                        0x02 
-UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM_AND_REMOTE_CACHE_HIT   0x08 
-UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM                        0x10 
-UMASK_MEM_UNCORE_RETIRED_REMOTE_DRAM                       0x20 
-UMASK_MEM_UNCORE_RETIRED_UNCACHEABLE                       0x80 
+UMASK_MEM_UNCORE_RETIRED_LOCAL_HITM                        0x02
+UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM_AND_REMOTE_CACHE_HIT   0x08
+UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM                        0x10
+UMASK_MEM_UNCORE_RETIRED_REMOTE_DRAM                       0x20
+UMASK_MEM_UNCORE_RETIRED_UNCACHEABLE                       0x80
 
 EVENT_FP_COMP_OPS_EXE            0x10   PMC
 UMASK_FP_COMP_OPS_EXE_X87        0x01
@@ -106,7 +112,8 @@ UMASK_LOAD_DISPATCH_ANY           0x07
 
 EVENT_ARITH                      0x14   PMC
 UMASK_ARITH_CYCLES_DIV_BUSY      0x01
-UMASK_ARITH_NUM_DIV              0x01 0xC5 0x01
+DEFAULT_OPTIONS_ARITH_NUM_DIV    EVENT_OPTION_EDGE=1,EVENT_OPTION_INVERT=1,EVENT_OPTION_THRESHOLD=0x1
+UMASK_ARITH_NUM_DIV              0x01
 UMASK_ARITH_MUL                  0x02
 
 EVENT_INST_QUEUE                  0x17   PMC
@@ -177,9 +184,15 @@ EVENT_L3_LAT_CACHE               0x2E   PMC
 UMASK_L3_LAT_CACHE_REFERENCE     0x4F
 UMASK_L3_LAT_CACHE_MISS          0x41
 
-EVENT_CPU_CLOCK_UNHALTED         0x3C   PMC
+EVENT_CPU_CLOCK_UNHALTED           0x3C   PMC
 UMASK_CPU_CLOCK_UNHALTED_THREAD_P  0x00
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_THREAD_P_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P_ANY  0x00
 UMASK_CPU_CLOCK_UNHALTED_REF_P     0x01
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_REF_XCLK_ANY EVENT_OPTION_ANYTHREAD=1
+UMASK_CPU_CLOCK_UNHALTED_REF_XCLK_ANY     0x01
+DEFAULT_OPTIONS_CPU_CLOCK_UNHALTED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0x2,EVENT_OPTION_INVERT=0x1
+UMASK_CPU_CLOCK_UNHALTED_TOTAL_CYCLES   0x00
 
 EVENT_DTLB_MISSES                0x49   PMC
 UMASK_DTLB_MISSES_ANY            0x01
@@ -217,9 +230,13 @@ UMASK_OFFCORE_EVENTS_OUTSTANDING_DEMAND_READ_CODE   0x02
 UMASK_OFFCORE_EVENTS_OUTSTANDING_DEMAND_RFO   0x04
 UMASK_OFFCORE_EVENTS_OUTSTANDING_ANY_READ   0x08
 
-EVENT_CACHE_LOCK_CYCLES          0x63   PMC0|PMC1
-UMASK_CACHE_LOCK_CYCLES_L1D_L2      0x01
+EVENT_CACHE_LOCK                  0x63   PMC0|PMC1
+UMASK_CACHE_LOCK_CYCLES_L1D_L2    0x01
+DEFAULT_OPTIONS_CACHE_LOCK_COUNT_L1D_L2 EVENT_OPTION_EDGE=1
+UMASK_CACHE_LOCK_COUNT_L1D_L2     0x01
 UMASK_CACHE_LOCK_CYCLES_L1D       0x02
+DEFAULT_OPTIONS_CACHE_LOCK_COUNT_L1D EVENT_OPTION_EDGE=1
+UMASK_CACHE_LOCK_COUNT_L1D        0x02
 
 EVENT_IO_TRANSACTIONS            0x6C   PMC
 UMASK_IO_TRANSACTIONS            0x01
@@ -305,15 +322,38 @@ UMASK_OFFCORE_REQUESTS_ANY               0x80
 EVENT_UOPS_EXECUTED                 0xB1   PMC
 UMASK_UOPS_EXECUTED_PORT0           0x01
 UMASK_UOPS_EXECUTED_PORT1           0x02
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT2_CORE EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_EXECUTED_PORT2_CORE      0x04
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT3_CORE EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_EXECUTED_PORT3_CORE      0x08
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT4_CORE EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_EXECUTED_PORT4_CORE      0x10
-UMASK_UOPS_EXECUTED_CORE_ACTIVE_CYCLES_NO_PORT5 0x1F
 UMASK_UOPS_EXECUTED_PORT5           0x20
-UMASK_UOPS_EXECUTED_CORE_ACTIVE_CYCLES 0x3F
 UMASK_UOPS_EXECUTED_PORT015         0x40
-UMASK_UOPS_EXECUTED_PORT015_STALL_CYCLES   0x40 0xC1  0x01
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT234_CORE EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_EXECUTED_PORT234         0x80
+UMASK_UOPS_EXECUTED_THREAD          0xC0
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_ACTIVE_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_CORE_ACTIVE_CYCLES 0x3F
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_ACTIVE_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_EDGE=1
+UMASK_UOPS_EXECUTED_CORE_ACTIVE_COUNT 0x3F
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES 0x3F
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_INVERT=1,EVENT_OPTION_EDGE=1
+UMASK_UOPS_EXECUTED_CORE_STALL_COUNT 0x3F
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_ACTIVE_CYCLES_NO_PORT5 EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+UMASK_UOPS_EXECUTED_CORE_ACTIVE_CYCLES_NO_PORT5 0x1F
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_ACTIVE_COUNT_NO_PORT5 EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_EDGE=1
+UMASK_UOPS_EXECUTED_CORE_ACTIVE_COUNT_NO_PORT5 0x1F
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_CYCLES_NO_PORT5 EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_CYCLES_NO_PORT5 0x1F
+DEFAULT_OPTIONS_UOPS_EXECUTED_CORE_STALL_COUNT_NO_PORT5 EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_EDGE=1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_CORE_STALL_COUNT_NO_PORT5 0x1F
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT015_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_EXECUTED_PORT015_STALL_CYCLES   0x40
+DEFAULT_OPTIONS_UOPS_EXECUTED_PORT015_STALL_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_INVERT=1,EVENT_OPTION_EDGE=1
+UMASK_UOPS_EXECUTED_PORT015_STALL_COUNT   0x40
+
 
 EVENT_OFFCORE_REQUESTS_SQ_FULL     0xB2  PMC
 UMASK_OFFCORE_REQUESTS_SQ_FULL     0x01
@@ -343,10 +383,14 @@ UMASK_INST_RETIRED_MMX              0x04
 
 EVENT_UOPS_RETIRED                  0xC2  PMC
 UMASK_UOPS_RETIRED_ANY              0x01
-UMASK_UOPS_RETIRED_STALL_CYCLES     0x01 0xC1  0x01
-UMASK_UOPS_RETIRED_ACTIVE_CYCLES    0x01 0x41  0x01
 UMASK_UOPS_RETIRED_RETIRE_SLOTS     0x02
 UMASK_UOPS_RETIRED_MACRO_FUSED      0x04
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xF,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_TOTAL_CYCLES     0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_ACTIVE_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_RETIRED_ACTIVE_CYCLES    0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_RETIRED_STALL_CYCLES     0x01
 
 EVENT_MACHINE_CLEARS              0xC3  PMC
 UMASK_MACHINE_CLEARS_CYCLES       0x01
@@ -392,10 +436,26 @@ EVENT_MACRO_INSTS            0xD0    PMC
 UMASK_MACRO_INSTS_DECODED            0x01
 
 EVENT_UOPS_DECODED               0xD1   PMC
-UMASK_UOPS_DECODED_STALL_CYCLES  0x01 0xC1 0x01
+UMASK_UOPS_DECODED_ANY           0x01
+DEFAULT_OPTIONS_UOPS_DECODED_ACTIVE_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_DECODED_ACTIVE_CYCLES 0x01
+DEFAULT_OPTIONS_UOPS_DECODED_ACTIVE_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_UOPS_DECODED_ACTIVE_COUNT  0x01
+DEFAULT_OPTIONS_UOPS_DECODED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_DECODED_STALL_CYCLES  0x01
+DEFAULT_OPTIONS_UOPS_DECODED_STALL_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_DECODED_STALL_COUNT   0x01
 UMASK_UOPS_DECODED_MS            0x02
-UMASK_UOPS_DECODED_ESP_FOLDING   0x04
-UMASK_UOPS_DECODED_ESP_SYNC      0x08
+DEFAULT_OPTIONS_UOPS_DECODED_MS_ACTIVE_CYCLES EVENT_OPTION_THRESHOLD=0x1
+UMASK_UOPS_DECODED_MS_ACTIVE_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_DECODED_MS_ACTIVE_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1
+UMASK_UOPS_DECODED_MS_ACTIVE_COUNT 0x02
+DEFAULT_OPTIONS_UOPS_DECODED_MS_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_DECODED_MS_STALL_CYCLES 0x02
+DEFAULT_OPTIONS_UOPS_DECODED_MS_STALL_COUNT EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_EDGE=1,EVENT_OPTION_INVERT=1
+UMASK_UOPS_DECODED_MS_STALL_COUNT 0x02
+UMASK_UOPS_DECODED_ESP_FOLDING    0x04
+UMASK_UOPS_DECODED_ESP_SYNC       0x08
 
 EVENT_RAT_STALLS               0xD2   PMC
 UMASK_RAT_STALLS_FLAGS         0x01
@@ -437,8 +497,8 @@ UMASK_L2_TRANSACTIONS_IFETCH        0x04
 UMASK_L2_TRANSACTIONS_PREFETCH      0x08
 UMASK_L2_TRANSACTIONS_L1D_WB        0x10
 UMASK_L2_TRANSACTIONS_L1D_FILL      0x20
-UMASK_L2_TRANSACTIONS_L1D_WB        0x40
-UMASK_L2_TRANSACTIONS_L1D_ANY       0x80
+UMASK_L2_TRANSACTIONS_L2_WB         0x40
+UMASK_L2_TRANSACTIONS_ANY           0x80
 
 EVENT_L2_LINES_IN                   0xF1   PMC
 UMASK_L2_LINES_IN_S_STATE           0x02
@@ -450,6 +510,8 @@ UMASK_L2_LINES_OUT_DEMAND_CLEAN     0x01
 UMASK_L2_LINES_OUT_DEMAND_DIRTY     0x02
 UMASK_L2_LINES_OUT_PREFETCH_CLEAN   0x04
 UMASK_L2_LINES_OUT_PREFETCH_DIRTY   0x08
+UMASK_L2_LINES_OUT_CLEAN_ANY        0x05
+UMASK_L2_LINES_OUT_DIRTY_ANY        0x0A
 UMASK_L2_LINES_OUT_ANY              0x0F
 
 EVENT_SQ_MISC                         0xF4  PMC
@@ -473,6 +535,17 @@ UMASK_SIMD_INT_64_PACKED_LOGICAL        0x10
 UMASK_SIMD_INT_64_PACKED_ARITH          0x20
 UMASK_SIMD_INT_64_SHUFFLE_MOVE          0x40
 
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+
+EVENT_OFFCORE_RESPONSE_1                            0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x01 0xFF 0xFF
+
+EVENT_UNCORE_CLOCK                 0x00 UPMCFIX
+UMASK_UNCORE_CLOCK                 0x00
+
 EVENT_UNC_GQ_CYCLES_FULL                0x00   UPMC
 UMASK_UNC_GQ_CYCLES_FULL_READ_TRACKER         0x01
 UMASK_UNC_GQ_CYCLES_FULL_WRITE_TRACKER        0x02
@@ -678,10 +751,36 @@ UMASK_UNC_QHL_SLEEPS_IOH_CONFLICT             0x08
 UMASK_UNC_QHL_SLEEPS_REMOTE_CONFLICT          0x10
 UMASK_UNC_QHL_SLEEPS_LOCAL_CONFLICT           0x20
 
-EVENT_UNC_ADDR_OPCODE_MATCH                         0x35   UPMC
-UMASK_UNC_ADDR_OPCODE_MATCH_IOH                     0x01
-UMASK_UNC_ADDR_OPCODE_MATCH_REMOTE                  0x02
-UMASK_UNC_ADDR_OPCODE_MATCH_LOCAL                   0x04
+EVENT_UNC_ADDR_OPCODE_MATCH_AND                 0x35   UPMC
+OPTIONS_UNC_ADDR_OPCODE_MATCH_AND_IOH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_AND_IOH             0x01 0x02 0x0
+OPTIONS_UNC_ADDR_OPCODE_MATCH_AND_REMOTE        EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_AND_REMOTE          0x02 0x02 0x0
+OPTIONS_UNC_ADDR_OPCODE_MATCH_AND_LOCAL         EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_AND_LOCAL           0x04 0x02 0x0
+
+EVENT_UNC_ADDR_OPCODE_MATCH_OR                  0x35   UPMC
+OPTIONS_UNC_ADDR_OPCODE_MATCH_OR_IOH            EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_OR_IOH              0x01 0x0C 0x0
+OPTIONS_UNC_ADDR_OPCODE_MATCH_OR_REMOTE         EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_OR_REMOTE           0x02 0x0C 0x0
+OPTIONS_UNC_ADDR_OPCODE_MATCH_OR_LOCAL          EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_OR_LOCAL            0x04 0x0C 0x0
+
+EVENT_UNC_ADDR_OPCODE_MATCH_RSPFWDS             0x35   UPMC
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPFWDS_IOH         0x01 0x04 0x1A
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPFWDS_REMOTE      0x02 0x04 0x1A
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPFWDS_LOCAL       0x04 0x04 0x1A
+
+EVENT_UNC_ADDR_OPCODE_MATCH_RSPIWB              0x35   UPMC
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPIWB_IOH          0x01 0x04 0x1D
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPIWB_REMOTE       0x02 0x04 0x1D
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPIWB_LOCAL        0x04 0x04 0x1D
+
+EVENT_UNC_ADDR_OPCODE_MATCH_RSPIWB              0x35   UPMC
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPIWB_IOH          0x01 0x04 0x00
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPIWB_REMOTE       0x02 0x04 0x00
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPIWB_LOCAL        0x04 0x04 0x00
 
 EVENT_UNC_QPI_TX_STALLED_SINGLE_FLIT                0x40  UPMC
 UMASK_UNC_QPI_TX_STALLED_SINGLE_FLIT_HOME_LINK_0    0x01
@@ -789,3 +888,4 @@ UMASK_UNC_CYCLES_UNHALTED_L3_FLL_ENABLE             0x02
 
 EVENT_UNC_CYCLES_UNHALTED_L3_FLL_DISABLE            0x86  UPMC
 UMASK_UNC_CYCLES_UNHALTED_L3_FLL_DISABLE            0x01
+
diff --git a/src/includes/power.h b/src/includes/power.h
index 6cb5fd3..b6c26d8 100644
--- a/src/includes/power.h
+++ b/src/includes/power.h
@@ -6,13 +6,14 @@
  *      Description:  Header File Power Module
  *                    Implements Intel RAPL Interface.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -35,46 +36,174 @@
 #include <types.h>
 #include <registers.h>
 #include <bitUtil.h>
-#include <msr.h>
+#include <error.h>
+#include <access.h>
 
-extern PowerInfo power_info;
-extern  const uint32_t power_regs[4];
+const char* power_names[NUM_POWER_DOMAINS] = {"PKG", "PP0", "PP1", "DRAM"};
 
-extern void power_init(int cpuId);
-static inline void power_start(PowerData* data, int cpuId, PowerType type);
-static inline void power_stop(PowerData* data, int cpuId, PowerType type);
-static inline uint32_t power_read(int cpuId, uint64_t reg);
-static inline uint32_t power_tread(int socket_fd, int cpuId, uint64_t reg);
-static inline double power_printEnergy(PowerData* data);
+uint32_t power_regs[NUM_POWER_DOMAINS] = {MSR_PKG_ENERGY_STATUS,
+                                MSR_PP0_ENERGY_STATUS,
+                                MSR_PP1_ENERGY_STATUS,
+                                MSR_DRAM_ENERGY_STATUS};
 
-static double
+uint32_t limit_regs[NUM_POWER_DOMAINS] = {MSR_PKG_RAPL_POWER_LIMIT,
+                                MSR_PP0_RAPL_POWER_LIMIT,
+                                MSR_PP1_RAPL_POWER_LIMIT,
+                                MSR_DRAM_RAPL_POWER_LIMIT};
+
+uint32_t policy_regs[NUM_POWER_DOMAINS] = {0,
+                                MSR_PP0_ENERGY_POLICY,
+                                MSR_PP1_ENERGY_POLICY,
+                                0};
+
+uint32_t perf_regs[NUM_POWER_DOMAINS] = {MSR_PKG_PERF_STATUS,
+                                MSR_PP0_PERF_STATUS,
+                                0,
+                                MSR_DRAM_PERF_STATUS};
+
+uint32_t info_regs[NUM_POWER_DOMAINS] = {MSR_PKG_POWER_INFO,
+                                0,
+                                0,
+                                MSR_DRAM_POWER_INFO};
+
+
+double
 power_printEnergy(PowerData* data)
 {
-    return  (double) ((data->after - data->before) * power_info.energyUnit);
+    return  (double) ((data->after - data->before) * power_info.domains[data->domain].energyUnit);
 }
 
-static void
+int
 power_start(PowerData* data, int cpuId, PowerType type)
 {
-    data->before = extractBitField(msr_read(cpuId, power_regs[type]),32,0);
+    if (power_info.hasRAPL)
+    {
+        if (power_info.domains[type].supportFlags & POWER_DOMAIN_SUPPORT_STATUS)
+        {
+            uint64_t result = 0;
+            data->before = 0;
+            CHECK_MSR_READ_ERROR(HPMread(cpuId, MSR_DEV, power_regs[type], &result))
+            data->before = field64(result, 0, 32);
+            data->domain = type;
+            return 0;
+        }
+        else
+        {
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, RAPL domain %s not supported, power_names[type]);
+            return -EFAULT;
+        }
+    }
+    else
+    {
+        DEBUG_PLAIN_PRINT(DEBUGLEV_DEVELOP, No RAPL support);
+        return -EIO;
+    }
 }
 
-static void
+int
 power_stop(PowerData* data, int cpuId, PowerType type)
 {
-    data->after = extractBitField(msr_read(cpuId, power_regs[type]),32,0);
+    if (power_info.hasRAPL)
+    {
+        if (power_info.domains[type].supportFlags & POWER_DOMAIN_SUPPORT_STATUS)
+        {
+            uint64_t result = 0;
+            data->after = 0;
+            CHECK_MSR_READ_ERROR(HPMread(cpuId, MSR_DEV, power_regs[type], &result))
+            data->after = field64(result, 0, 32);
+            data->domain = type;
+            return 0;
+        }
+        else
+        {
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, RAPL domain %s not supported, power_names[type]);
+            return -EFAULT;
+        }
+    }
+    else
+    {
+        DEBUG_PLAIN_PRINT(DEBUGLEV_DEVELOP, No RAPL support);
+        return -EIO;
+    }
+}
+
+int
+power_read(int cpuId, uint64_t reg, uint32_t *data)
+{
+    int i;
+    PowerType type = -1;
+
+    if (power_info.hasRAPL)
+    {
+        for (i = 0; i < NUM_POWER_DOMAINS; i++)
+        {
+            if (reg == power_regs[i])
+            {
+                type = i;
+                break;
+            }
+        }
+        if (power_info.domains[type].supportFlags & POWER_DOMAIN_SUPPORT_STATUS)
+        {
+            uint64_t result = 0;
+            *data = 0;
+            CHECK_MSR_READ_ERROR(HPMread(cpuId, MSR_DEV, reg, &result))
+            *data = field64(result, 0, 32);
+            return 0;
+        }
+        else
+        {
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, RAPL domain %s not supported, power_names[type]);
+            return -EFAULT;
+        }
+    }
+    else
+    {
+        DEBUG_PLAIN_PRINT(DEBUGLEV_DEVELOP, No RAPL support);
+        return -EIO;
+    }
 }
 
-static uint32_t
-power_read(int cpuId, uint64_t reg)
+int
+power_tread(int socket_fd, int cpuId, uint64_t reg, uint32_t *data)
 {
-    return extractBitField(msr_read(cpuId, reg),32,0);
+    int i;
+    PowerType type;
+    if (power_info.hasRAPL)
+    {
+        for (i = 0; i < NUM_POWER_DOMAINS; i++)
+        {
+            if (reg == power_regs[i])
+            {
+                type = i;
+                break;
+            }
+        }
+        if (power_info.domains[type].supportFlags & POWER_DOMAIN_SUPPORT_STATUS)
+        {
+            uint64_t result = 0;
+            *data = 0;
+            CHECK_MSR_READ_ERROR(HPMread(cpuId, MSR_DEV, reg, &result))
+            *data = field64(result, 0, 32);
+            return 0;
+        }
+        else
+        {
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, RAPL domain %s not supported, power_names[type]);
+            return -EFAULT;
+        }
+    }
+    else
+    {
+        DEBUG_PLAIN_PRINT(DEBUGLEV_DEVELOP, No RAPL support);
+        return -EIO;
+    }
 }
 
-static uint32_t
-power_tread(int socket_fd, int cpuId, uint64_t reg)
+double
+power_getEnergyUnit(int domain)
 {
-    return extractBitField(msr_tread(socket_fd, cpuId, reg),32,0);
+    return power_info.domains[domain].energyUnit;
 }
 
 #endif /*POWER_H*/
diff --git a/src/includes/power_types.h b/src/includes/power_types.h
index b53ce85..337e091 100644
--- a/src/includes/power_types.h
+++ b/src/includes/power_types.h
@@ -5,13 +5,14 @@
  *
  *      Description:  Types file for power module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -32,36 +33,10 @@
 #define POWER_TYPES_H
 
 #include <stdint.h>
+#include <likwid.h>
 
-typedef enum {
-    PKG = 0,
-    PP0,
-    PP1,
-    DRAM
-} PowerType;
 
-typedef struct {
-    int numSteps;
-    double* steps;
-} TurboBoost;
-
-typedef struct {
-    double baseFrequency;
-    double minFrequency;
-    TurboBoost turbo;
-    double powerUnit;
-    double energyUnit;
-    double timeUnit;
-    double tdp;
-    double minPower;
-    double maxPower;
-    double maxTimeWindow;
-} PowerInfo;
-
-typedef struct {
-    uint32_t before;
-    uint32_t after;
-} PowerData;
+extern uint32_t power_regs[NUM_POWER_DOMAINS];
 
 
 #endif /*POWER_TYPES_H*/
diff --git a/src/includes/registers.h b/src/includes/registers.h
index ae80e28..32d975e 100644
--- a/src/includes/registers.h
+++ b/src/includes/registers.h
@@ -5,13 +5,14 @@
  *
  *      Description:  Register Defines for the perfmon module
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -47,6 +48,10 @@
 #define MSR_PERFEVTSEL1           0x187
 #define MSR_PERFEVTSEL2           0x188
 #define MSR_PERFEVTSEL3           0x189
+#define MSR_PERFEVTSEL4           0x190
+#define MSR_PERFEVTSEL5           0x191
+#define MSR_PERFEVTSEL6           0x192
+#define MSR_PERFEVTSEL7           0x193
 #define MSR_PMC0                  0x0C1
 #define MSR_PMC1                  0x0C2
 #define MSR_PMC2                  0x0C3
@@ -60,6 +65,7 @@
 #define MSR_PERF_GLOBAL_STATUS    0x38E
 #define MSR_PERF_GLOBAL_OVF_CTRL  0x390
 #define MSR_PEBS_ENABLE           0x3F1
+#define MSR_PEBS_LD_LAT           0x3F6
 /* Perfmon V3 */
 #define MSR_OFFCORE_RESP0              0x1A6
 #define MSR_OFFCORE_RESP1              0x1A7
@@ -85,19 +91,20 @@
 #define MSR_UNCORE_PMC5                0x3B5
 #define MSR_UNCORE_PMC6                0x3B6
 #define MSR_UNCORE_PMC7                0x3B7
-/*
- * Perfmon V4 (starting with Haswell, according to
+/* 
+ * Perfmon V3 (starting with Haswell, according to 
  * Intel software developers guide also for SandyBridge,
  * IvyBridge not mentioned in this section)
  */
-#define MSR_UNC_PERF_GLOBAL_CTRL       MSR_UNCORE_PERF_GLOBAL_CTRL
-#define MSR_UNC_PERF_GLOBAL_STATUS     MSR_UNCORE_PERF_GLOBAL_STATUS
-#define MSR_UNC_PERF_FIXED_CTRL        MSR_UNCORE_FIXED_CTR0
-#define MSR_UNC_PERF_FIXED_CTR         MSR_UNCORE_FIXED_CTR_CTRL
-#define MSR_UNC_ARB_PERFEVTSEL0        MSR_UNCORE_PMC2
-#define MSR_UNC_ARB_PERFEVTSEL1        MSR_UNCORE_PMC3
-#define MSR_UNC_ARB_CTR0               MSR_UNCORE_PMC0
-#define MSR_UNC_ARB_CTR1               MSR_UNCORE_PMC1
+#define MSR_UNC_PERF_GLOBAL_CTRL       0x391
+#define MSR_UNC_PERF_GLOBAL_STATUS     0x392
+#define MSR_UNC_PERF_GLOBAL_OVF_CTRL   0x393
+#define MSR_UNC_PERF_FIXED_CTRL        0x394
+#define MSR_UNC_PERF_FIXED_CTR         0x395
+#define MSR_UNC_ARB_PERFEVTSEL0        0x3B2
+#define MSR_UNC_ARB_PERFEVTSEL1        0x3B3
+#define MSR_UNC_ARB_CTR0               0x3B0
+#define MSR_UNC_ARB_CTR1               0x3B1
 #define MSR_UNC_CBO_CONFIG             0x396
 #define MSR_UNC_CBO_0_PERFEVTSEL0      0x700
 #define MSR_UNC_CBO_0_PERFEVTSEL1      0x701
@@ -115,6 +122,38 @@
 #define MSR_UNC_CBO_3_PERFEVTSEL1      0x731
 #define MSR_UNC_CBO_3_CTR0             0x736
 #define MSR_UNC_CBO_3_CTR1             0x737
+/* Perfmon V4 starting with Skylake */
+#define MSR_V4_PERF_GLOBAL_STATUS       0x38E
+#define MSR_V4_PERF_GLOBAL_STATUS_SET   0x391
+#define MSR_V4_PERF_GLOBAL_STATUS_RESET 0x390
+#define MSR_V4_PERF_GLOBAL_INUSE        0x392
+#define MSR_V4_PEBS_FRONTEND            0x3F7
+#define MSR_V4_UNC_PERF_GLOBAL_CTRL     0xE01
+#define MSR_V4_UNC_PERF_GLOBAL_STATUS   0xE02
+#define MSR_V4_UNC_PERF_FIXED_CTRL      0x394
+#define MSR_V4_UNC_PERF_FIXED_CTR       0x395
+#define MSR_V4_ARB_PERF_FIXED_CTRL0      0x3B2
+#define MSR_V4_ARB_PERF_FIXED_CTR0       0x3B0
+#define MSR_V4_ARB_PERF_FIXED_CTRL1      0x3B3
+#define MSR_V4_ARB_PERF_FIXED_CTR1       0x3B1
+#define MSR_V4_C0_PERF_FIXED_CTRL0      0x700
+#define MSR_V4_C0_PERF_FIXED_CTR0       0x706
+#define MSR_V4_C0_PERF_FIXED_CTRL1      0x701
+#define MSR_V4_C0_PERF_FIXED_CTR1       0x707
+#define MSR_V4_C1_PERF_FIXED_CTRL0      0x710
+#define MSR_V4_C1_PERF_FIXED_CTR0       0x716
+#define MSR_V4_C1_PERF_FIXED_CTRL1      0x711
+#define MSR_V4_C1_PERF_FIXED_CTR1       0x717
+#define MSR_V4_C2_PERF_FIXED_CTRL0      0x720
+#define MSR_V4_C2_PERF_FIXED_CTR0       0x726
+#define MSR_V4_C2_PERF_FIXED_CTRL1      0x721
+#define MSR_V4_C2_PERF_FIXED_CTR1       0x727
+#define MSR_V4_C3_PERF_FIXED_CTRL0      0x730
+#define MSR_V4_C3_PERF_FIXED_CTR0       0x736
+#define MSR_V4_C3_PERF_FIXED_CTRL1      0x731
+#define MSR_V4_C3_PERF_FIXED_CTR1       0x737
+/* V4 Uncore registers the same as in V3 */
+
 /* Xeon Phi */
 #define MSR_MIC_TSC                   0x010
 #define MSR_MIC_PERFEVTSEL0           0x028
@@ -125,6 +164,10 @@
 #define MSR_MIC_PERF_GLOBAL_STATUS    0x02D
 #define MSR_MIC_PERF_GLOBAL_OVF_CTRL  0x02E
 #define MSR_MIC_PERF_GLOBAL_CTRL      0x02F
+/* Xeon Phi (Knights Landing)*/
+#define MSR_MIC2_PMC0                 0x4C1
+#define MSR_MIC2_PMC1                 0x4C2
+#define MSR_MIC2_TURBO_RATIO_LIMIT    0x1AD
 
 
 /* Core v1/v2 type uncore
@@ -324,7 +367,10 @@
 #define MSR_UNC_PCU_PMON_CTL2           0xC32
 #define MSR_UNC_PCU_PMON_CTL3           0xC33
 #define MSR_UNC_PCU_PMON_BOX_FILTER     0xC34
-#define MSR_UNC_PCU_PMON_BOX_CTL        0xD24
+#define MSR_UNC_PCU_PMON_BOX_CTL        0xC24
+#define MSR_UNC_PCU_PMON_BOX_STATUS     0xC35
+#define MSR_UNC_PCU_PMON_FIXED_CTR0     0x3FC
+#define MSR_UNC_PCU_PMON_FIXED_CTR1     0x3FD
 
 /* UBox Performance Monitoring */
 
@@ -342,6 +388,7 @@
 /* HA Box Performance Monitoring */
 
 #define PCI_UNC_HA_PMON_BOX_CTL         0xF4
+#define PCI_UNC_HA_PMON_BOX_STATUS      0xF8
 #define PCI_UNC_HA_PMON_CTL_0           0xD8
 #define PCI_UNC_HA_PMON_CTL_1           0xDC
 #define PCI_UNC_HA_PMON_CTL_2           0xE0
@@ -378,9 +425,22 @@
 #define PCI_UNC_MC_PMON_CTR_2_B         0xB0
 #define PCI_UNC_MC_PMON_CTR_3_B         0xB8
 
+/* IRP Performance Monitoring */
+#define PCI_UNC_IRP_PMON_BOX_STATUS     0xF8
+#define PCI_UNC_IRP_PMON_BOX_CTL        0xF4
+#define PCI_UNC_IRP0_PMON_CTL_0         0xD8
+#define PCI_UNC_IRP0_PMON_CTL_1         0xDC
+#define PCI_UNC_IRP0_PMON_CTR_0         0xA0
+#define PCI_UNC_IRP0_PMON_CTR_1         0xB0
+#define PCI_UNC_IRP1_PMON_CTL_0         0xE0
+#define PCI_UNC_IRP1_PMON_CTL_1         0xE4
+#define PCI_UNC_IRP1_PMON_CTR_0         0xB8
+#define PCI_UNC_IRP1_PMON_CTR_1         0xC0
+
 /* QPI Box Performance Monitoring */
 
 #define PCI_UNC_QPI_PMON_BOX_CTL         0xF4
+#define PCI_UNC_QPI_PMON_BOX_STATUS      0xF8
 #define PCI_UNC_QPI_PMON_CTL_0           0xD8
 #define PCI_UNC_QPI_PMON_CTL_1           0xDC
 #define PCI_UNC_QPI_PMON_CTL_2           0xE0
@@ -402,6 +462,7 @@
 /* R2PCIE Box Performance Monitoring */
 
 #define PCI_UNC_R2PCIE_PMON_BOX_CTL         0xF4
+#define PCI_UNC_R2PCIE_PMON_BOX_STATUS      0xF8
 #define PCI_UNC_R2PCIE_PMON_CTL_0           0xD8
 #define PCI_UNC_R2PCIE_PMON_CTL_1           0xDC
 #define PCI_UNC_R2PCIE_PMON_CTL_2           0xE0
@@ -418,6 +479,7 @@
 /* R3QPI Box Performance Monitoring */
 
 #define PCI_UNC_R3QPI_PMON_BOX_CTL         0xF4
+#define PCI_UNC_R3QPI_PMON_BOX_STATUS      0xF8
 #define PCI_UNC_R3QPI_PMON_CTL_0           0xD8
 #define PCI_UNC_R3QPI_PMON_CTL_1           0xDC
 #define PCI_UNC_R3QPI_PMON_CTL_2           0xE0
@@ -428,6 +490,438 @@
 #define PCI_UNC_R3QPI_PMON_CTR_1_B         0xA8
 #define PCI_UNC_R3QPI_PMON_CTR_2_B         0xB0
 
+/* ########################################################## */
+/* Core v3 type uncore.
+ * Register naming follows the Intel Uncore Performance Monitoring Guide,
+ * Ref. Nr. 331051-001.
+ * */
+
+/* UBox Performance Monitoring */
+#define MSR_UNC_V3_U_PMON_CTR0             0x709
+#define MSR_UNC_V3_U_PMON_CTR1             0x70A
+#define MSR_UNC_V3_U_PMON_CTL0             0x705
+#define MSR_UNC_V3_U_PMON_CTL1             0x706
+#define MSR_UNC_V3_U_UCLK_FIXED_CTR        0x704
+#define MSR_UNC_V3_U_UCLK_FIXED_CTL        0x703
+#define MSR_UNC_V3_U_PMON_BOX_STATUS       0x708
+#define MSR_UNC_V3_U_PMON_GLOBAL_STATUS    0x701
+#define MSR_UNC_V3_U_PMON_GLOBAL_CTL       0x700
+#define MSR_UNC_V3_U_PMON_GLOBAL_CONFIG    0x702
+
+/* CBox Performance Monitoring */
+#define MSR_UNC_V3_C0_PMON_BOX_CTL         0xE00
+#define MSR_UNC_V3_C0_PMON_BOX_STATUS      0xE07
+#define MSR_UNC_V3_C0_PMON_BOX_FILTER0     0xE05
+#define MSR_UNC_V3_C0_PMON_BOX_FILTER1     0xE06
+#define MSR_UNC_V3_C0_PMON_CTL0            0xE01
+#define MSR_UNC_V3_C0_PMON_CTL1            0xE02
+#define MSR_UNC_V3_C0_PMON_CTL2            0xE03
+#define MSR_UNC_V3_C0_PMON_CTL3            0xE04
+#define MSR_UNC_V3_C0_PMON_CTR0            0xE08
+#define MSR_UNC_V3_C0_PMON_CTR1            0xE09
+#define MSR_UNC_V3_C0_PMON_CTR2            0xE0A
+#define MSR_UNC_V3_C0_PMON_CTR3            0xE0B
+
+#define MSR_UNC_V3_C1_PMON_BOX_CTL         0xE10
+#define MSR_UNC_V3_C1_PMON_BOX_STATUS      0xE17
+#define MSR_UNC_V3_C1_PMON_BOX_FILTER0     0xE15
+#define MSR_UNC_V3_C1_PMON_BOX_FILTER1     0xE16
+#define MSR_UNC_V3_C1_PMON_CTL0            0xE11
+#define MSR_UNC_V3_C1_PMON_CTL1            0xE12
+#define MSR_UNC_V3_C1_PMON_CTL2            0xE13
+#define MSR_UNC_V3_C1_PMON_CTL3            0xE14
+#define MSR_UNC_V3_C1_PMON_CTR0            0xE18
+#define MSR_UNC_V3_C1_PMON_CTR1            0xE19
+#define MSR_UNC_V3_C1_PMON_CTR2            0xE1A
+#define MSR_UNC_V3_C1_PMON_CTR3            0xE1B
+
+#define MSR_UNC_V3_C2_PMON_BOX_CTL         0xE20
+#define MSR_UNC_V3_C2_PMON_BOX_STATUS      0xE27
+#define MSR_UNC_V3_C2_PMON_BOX_FILTER0     0xE25
+#define MSR_UNC_V3_C2_PMON_BOX_FILTER1     0xE26
+#define MSR_UNC_V3_C2_PMON_CTL0            0xE21
+#define MSR_UNC_V3_C2_PMON_CTL1            0xE22
+#define MSR_UNC_V3_C2_PMON_CTL2            0xE23
+#define MSR_UNC_V3_C2_PMON_CTL3            0xE24
+#define MSR_UNC_V3_C2_PMON_CTR0            0xE28
+#define MSR_UNC_V3_C2_PMON_CTR1            0xE29
+#define MSR_UNC_V3_C2_PMON_CTR2            0xE2A
+#define MSR_UNC_V3_C2_PMON_CTR3            0xE2B
+
+#define MSR_UNC_V3_C3_PMON_BOX_CTL         0xE30
+#define MSR_UNC_V3_C3_PMON_BOX_STATUS      0xE37
+#define MSR_UNC_V3_C3_PMON_BOX_FILTER0     0xE35
+#define MSR_UNC_V3_C3_PMON_BOX_FILTER1     0xE36
+#define MSR_UNC_V3_C3_PMON_CTL0            0xE31
+#define MSR_UNC_V3_C3_PMON_CTL1            0xE32
+#define MSR_UNC_V3_C3_PMON_CTL2            0xE33
+#define MSR_UNC_V3_C3_PMON_CTL3            0xE34
+#define MSR_UNC_V3_C3_PMON_CTR0            0xE38
+#define MSR_UNC_V3_C3_PMON_CTR1            0xE39
+#define MSR_UNC_V3_C3_PMON_CTR2            0xE3A
+#define MSR_UNC_V3_C3_PMON_CTR3            0xE3B
+
+#define MSR_UNC_V3_C4_PMON_BOX_CTL         0xE40
+#define MSR_UNC_V3_C4_PMON_BOX_STATUS      0xE47
+#define MSR_UNC_V3_C4_PMON_BOX_FILTER0     0xE45
+#define MSR_UNC_V3_C4_PMON_BOX_FILTER1     0xE46
+#define MSR_UNC_V3_C4_PMON_CTL0            0xE41
+#define MSR_UNC_V3_C4_PMON_CTL1            0xE42
+#define MSR_UNC_V3_C4_PMON_CTL2            0xE43
+#define MSR_UNC_V3_C4_PMON_CTL3            0xE44
+#define MSR_UNC_V3_C4_PMON_CTR0            0xE48
+#define MSR_UNC_V3_C4_PMON_CTR1            0xE49
+#define MSR_UNC_V3_C4_PMON_CTR2            0xE4A
+#define MSR_UNC_V3_C4_PMON_CTR3            0xE4B
+
+#define MSR_UNC_V3_C5_PMON_BOX_CTL         0xE50
+#define MSR_UNC_V3_C5_PMON_BOX_STATUS      0xE57
+#define MSR_UNC_V3_C5_PMON_BOX_FILTER0     0xE55
+#define MSR_UNC_V3_C5_PMON_BOX_FILTER1     0xE56
+#define MSR_UNC_V3_C5_PMON_CTL0            0xE51
+#define MSR_UNC_V3_C5_PMON_CTL1            0xE52
+#define MSR_UNC_V3_C5_PMON_CTL2            0xE53
+#define MSR_UNC_V3_C5_PMON_CTL3            0xE54
+#define MSR_UNC_V3_C5_PMON_CTR0            0xE58
+#define MSR_UNC_V3_C5_PMON_CTR1            0xE59
+#define MSR_UNC_V3_C5_PMON_CTR2            0xE5A
+#define MSR_UNC_V3_C5_PMON_CTR3            0xE5B
+
+#define MSR_UNC_V3_C6_PMON_BOX_CTL         0xE60
+#define MSR_UNC_V3_C6_PMON_BOX_STATUS      0xE67
+#define MSR_UNC_V3_C6_PMON_BOX_FILTER0     0xE65
+#define MSR_UNC_V3_C6_PMON_BOX_FILTER1     0xE66
+#define MSR_UNC_V3_C6_PMON_CTL0            0xE61
+#define MSR_UNC_V3_C6_PMON_CTL1            0xE62
+#define MSR_UNC_V3_C6_PMON_CTL2            0xE63
+#define MSR_UNC_V3_C6_PMON_CTL3            0xE64
+#define MSR_UNC_V3_C6_PMON_CTR0            0xE68
+#define MSR_UNC_V3_C6_PMON_CTR1            0xE69
+#define MSR_UNC_V3_C6_PMON_CTR2            0xE6A
+#define MSR_UNC_V3_C6_PMON_CTR3            0xE6B
+
+#define MSR_UNC_V3_C7_PMON_BOX_CTL         0xE70
+#define MSR_UNC_V3_C7_PMON_BOX_STATUS      0xE77
+#define MSR_UNC_V3_C7_PMON_BOX_FILTER0     0xE75
+#define MSR_UNC_V3_C7_PMON_BOX_FILTER1     0xE76
+#define MSR_UNC_V3_C7_PMON_CTL0            0xE71
+#define MSR_UNC_V3_C7_PMON_CTL1            0xE72
+#define MSR_UNC_V3_C7_PMON_CTL2            0xE73
+#define MSR_UNC_V3_C7_PMON_CTL3            0xE74
+#define MSR_UNC_V3_C7_PMON_CTR0            0xE78
+#define MSR_UNC_V3_C7_PMON_CTR1            0xE79
+#define MSR_UNC_V3_C7_PMON_CTR2            0xE7A
+#define MSR_UNC_V3_C7_PMON_CTR3            0xE7B
+
+#define MSR_UNC_V3_C8_PMON_BOX_CTL         0xE80
+#define MSR_UNC_V3_C8_PMON_BOX_STATUS      0xE87
+#define MSR_UNC_V3_C8_PMON_BOX_FILTER0     0xE85
+#define MSR_UNC_V3_C8_PMON_BOX_FILTER1     0xE86
+#define MSR_UNC_V3_C8_PMON_CTL0            0xE81
+#define MSR_UNC_V3_C8_PMON_CTL1            0xE82
+#define MSR_UNC_V3_C8_PMON_CTL2            0xE83
+#define MSR_UNC_V3_C8_PMON_CTL3            0xE84
+#define MSR_UNC_V3_C8_PMON_CTR0            0xE88
+#define MSR_UNC_V3_C8_PMON_CTR1            0xE89
+#define MSR_UNC_V3_C8_PMON_CTR2            0xE8A
+#define MSR_UNC_V3_C8_PMON_CTR3            0xE8B
+
+#define MSR_UNC_V3_C9_PMON_BOX_CTL         0xE90
+#define MSR_UNC_V3_C9_PMON_BOX_STATUS      0xE97
+#define MSR_UNC_V3_C9_PMON_BOX_FILTER0     0xE95
+#define MSR_UNC_V3_C9_PMON_BOX_FILTER1     0xE96
+#define MSR_UNC_V3_C9_PMON_CTL0            0xE91
+#define MSR_UNC_V3_C9_PMON_CTL1            0xE92
+#define MSR_UNC_V3_C9_PMON_CTL2            0xE93
+#define MSR_UNC_V3_C9_PMON_CTL3            0xE94
+#define MSR_UNC_V3_C9_PMON_CTR0            0xE98
+#define MSR_UNC_V3_C9_PMON_CTR1            0xE99
+#define MSR_UNC_V3_C9_PMON_CTR2            0xE9A
+#define MSR_UNC_V3_C9_PMON_CTR3            0xE9B
+
+#define MSR_UNC_V3_C10_PMON_BOX_CTL        0xEA0
+#define MSR_UNC_V3_C10_PMON_BOX_STATUS     0xEA7
+#define MSR_UNC_V3_C10_PMON_BOX_FILTER0    0xEA5
+#define MSR_UNC_V3_C10_PMON_BOX_FILTER1    0xEA6
+#define MSR_UNC_V3_C10_PMON_CTL0           0xEA1
+#define MSR_UNC_V3_C10_PMON_CTL1           0xEA2
+#define MSR_UNC_V3_C10_PMON_CTL2           0xEA3
+#define MSR_UNC_V3_C10_PMON_CTL3           0xEA4
+#define MSR_UNC_V3_C10_PMON_CTR0           0xEA8
+#define MSR_UNC_V3_C10_PMON_CTR1           0xEA9
+#define MSR_UNC_V3_C10_PMON_CTR2           0xEAA
+#define MSR_UNC_V3_C10_PMON_CTR3           0xEAB
+
+#define MSR_UNC_V3_C11_PMON_BOX_CTL        0xEB0
+#define MSR_UNC_V3_C11_PMON_BOX_STATUS     0xEB7
+#define MSR_UNC_V3_C11_PMON_BOX_FILTER0    0xEB5
+#define MSR_UNC_V3_C11_PMON_BOX_FILTER1    0xEB6
+#define MSR_UNC_V3_C11_PMON_CTL0           0xEB1
+#define MSR_UNC_V3_C11_PMON_CTL1           0xEB2
+#define MSR_UNC_V3_C11_PMON_CTL2           0xEB3
+#define MSR_UNC_V3_C11_PMON_CTL3           0xEB4
+#define MSR_UNC_V3_C11_PMON_CTR0           0xEB8
+#define MSR_UNC_V3_C11_PMON_CTR1           0xEB9
+#define MSR_UNC_V3_C11_PMON_CTR2           0xEBA
+#define MSR_UNC_V3_C11_PMON_CTR3           0xEBB
+
+#define MSR_UNC_V3_C12_PMON_BOX_CTL        0xEC0
+#define MSR_UNC_V3_C12_PMON_BOX_STATUS     0xEC7
+#define MSR_UNC_V3_C12_PMON_BOX_FILTER0    0xEC5
+#define MSR_UNC_V3_C12_PMON_BOX_FILTER1    0xEC6
+#define MSR_UNC_V3_C12_PMON_CTL0           0xEC1
+#define MSR_UNC_V3_C12_PMON_CTL1           0xEC2
+#define MSR_UNC_V3_C12_PMON_CTL2           0xEC3
+#define MSR_UNC_V3_C12_PMON_CTL3           0xEC4
+#define MSR_UNC_V3_C12_PMON_CTR0           0xEC8
+#define MSR_UNC_V3_C12_PMON_CTR1           0xEC9
+#define MSR_UNC_V3_C12_PMON_CTR2           0xECA
+#define MSR_UNC_V3_C12_PMON_CTR3           0xECB
+
+#define MSR_UNC_V3_C13_PMON_BOX_CTL        0xED0
+#define MSR_UNC_V3_C13_PMON_BOX_STATUS     0xED7
+#define MSR_UNC_V3_C13_PMON_BOX_FILTER0    0xED5
+#define MSR_UNC_V3_C13_PMON_BOX_FILTER1    0xED6
+#define MSR_UNC_V3_C13_PMON_CTL0           0xED1
+#define MSR_UNC_V3_C13_PMON_CTL1           0xED2
+#define MSR_UNC_V3_C13_PMON_CTL2           0xED3
+#define MSR_UNC_V3_C13_PMON_CTL3           0xED4
+#define MSR_UNC_V3_C13_PMON_CTR0           0xED8
+#define MSR_UNC_V3_C13_PMON_CTR1           0xED9
+#define MSR_UNC_V3_C13_PMON_CTR2           0xEDA
+#define MSR_UNC_V3_C13_PMON_CTR3           0xEDB
+
+#define MSR_UNC_V3_C14_PMON_BOX_CTL        0xEE0
+#define MSR_UNC_V3_C14_PMON_BOX_STATUS     0xEE7
+#define MSR_UNC_V3_C14_PMON_BOX_FILTER0    0xEE5
+#define MSR_UNC_V3_C14_PMON_BOX_FILTER1    0xEE6
+#define MSR_UNC_V3_C14_PMON_CTL0           0xEE1
+#define MSR_UNC_V3_C14_PMON_CTL1           0xEE2
+#define MSR_UNC_V3_C14_PMON_CTL2           0xEE3
+#define MSR_UNC_V3_C14_PMON_CTL3           0xEE4
+#define MSR_UNC_V3_C14_PMON_CTR0           0xEE8
+#define MSR_UNC_V3_C14_PMON_CTR1           0xEE9
+#define MSR_UNC_V3_C14_PMON_CTR2           0xEEA
+#define MSR_UNC_V3_C14_PMON_CTR3           0xEEB
+
+#define MSR_UNC_V3_C15_PMON_BOX_CTL        0xEF0
+#define MSR_UNC_V3_C15_PMON_BOX_STATUS     0xEF7
+#define MSR_UNC_V3_C15_PMON_BOX_FILTER0    0xEF5
+#define MSR_UNC_V3_C15_PMON_BOX_FILTER1    0xEF6
+#define MSR_UNC_V3_C15_PMON_CTL0           0xEF1
+#define MSR_UNC_V3_C15_PMON_CTL1           0xEF2
+#define MSR_UNC_V3_C15_PMON_CTL2           0xEF3
+#define MSR_UNC_V3_C15_PMON_CTL3           0xEF4
+#define MSR_UNC_V3_C15_PMON_CTR0           0xEF8
+#define MSR_UNC_V3_C15_PMON_CTR1           0xEF9
+#define MSR_UNC_V3_C15_PMON_CTR2           0xEFA
+#define MSR_UNC_V3_C15_PMON_CTR3           0xEFB
+
+#define MSR_UNC_V3_C16_PMON_BOX_CTL        0xF00
+#define MSR_UNC_V3_C16_PMON_BOX_STATUS     0xF07
+#define MSR_UNC_V3_C16_PMON_BOX_FILTER0    0xF05
+#define MSR_UNC_V3_C16_PMON_BOX_FILTER1    0xF06
+#define MSR_UNC_V3_C16_PMON_CTL0           0xF01
+#define MSR_UNC_V3_C16_PMON_CTL1           0xF02
+#define MSR_UNC_V3_C16_PMON_CTL2           0xF03
+#define MSR_UNC_V3_C16_PMON_CTL3           0xF04
+#define MSR_UNC_V3_C16_PMON_CTR0           0xF08
+#define MSR_UNC_V3_C16_PMON_CTR1           0xF09
+#define MSR_UNC_V3_C16_PMON_CTR2           0xF0A
+#define MSR_UNC_V3_C16_PMON_CTR3           0xF0B
+
+#define MSR_UNC_V3_C17_PMON_BOX_CTL        0xF10
+#define MSR_UNC_V3_C17_PMON_BOX_STATUS     0xF17
+#define MSR_UNC_V3_C17_PMON_BOX_FILTER0    0xF15
+#define MSR_UNC_V3_C17_PMON_BOX_FILTER1    0xF16
+#define MSR_UNC_V3_C17_PMON_CTL0           0xF11
+#define MSR_UNC_V3_C17_PMON_CTL1           0xF12
+#define MSR_UNC_V3_C17_PMON_CTL2           0xF13
+#define MSR_UNC_V3_C17_PMON_CTL3           0xF14
+#define MSR_UNC_V3_C17_PMON_CTR0           0xF18
+#define MSR_UNC_V3_C17_PMON_CTR1           0xF19
+#define MSR_UNC_V3_C17_PMON_CTR2           0xF1A
+#define MSR_UNC_V3_C17_PMON_CTR3           0xF1B
+
+#define MSR_UNC_V3_C18_PMON_BOX_CTL        0xF20
+#define MSR_UNC_V3_C18_PMON_BOX_STATUS     0xF27
+#define MSR_UNC_V3_C18_PMON_BOX_FILTER0    0xF25
+#define MSR_UNC_V3_C18_PMON_BOX_FILTER1    0xF26
+#define MSR_UNC_V3_C18_PMON_CTL0           0xF21
+#define MSR_UNC_V3_C18_PMON_CTL1           0xF22
+#define MSR_UNC_V3_C18_PMON_CTL2           0xF23
+#define MSR_UNC_V3_C18_PMON_CTL3           0xF24
+#define MSR_UNC_V3_C18_PMON_CTR0           0xF28
+#define MSR_UNC_V3_C18_PMON_CTR1           0xF29
+#define MSR_UNC_V3_C18_PMON_CTR2           0xF2A
+#define MSR_UNC_V3_C18_PMON_CTR3           0xF2B
+
+#define MSR_UNC_V3_C19_PMON_BOX_CTL        0xF30
+#define MSR_UNC_V3_C19_PMON_BOX_STATUS     0xF37
+#define MSR_UNC_V3_C19_PMON_BOX_FILTER0    0xF35
+#define MSR_UNC_V3_C19_PMON_BOX_FILTER1    0xF36
+#define MSR_UNC_V3_C19_PMON_CTL0           0xF31
+#define MSR_UNC_V3_C19_PMON_CTL1           0xF32
+#define MSR_UNC_V3_C19_PMON_CTL2           0xF33
+#define MSR_UNC_V3_C19_PMON_CTL3           0xF34
+#define MSR_UNC_V3_C19_PMON_CTR0           0xF38
+#define MSR_UNC_V3_C19_PMON_CTR1           0xF39
+#define MSR_UNC_V3_C19_PMON_CTR2           0xF3A
+#define MSR_UNC_V3_C19_PMON_CTR3           0xF3B
+
+#define MSR_UNC_V3_C20_PMON_BOX_CTL        0xF40
+#define MSR_UNC_V3_C20_PMON_BOX_STATUS     0xF47
+#define MSR_UNC_V3_C20_PMON_BOX_FILTER0    0xF45
+#define MSR_UNC_V3_C20_PMON_BOX_FILTER1    0xF46
+#define MSR_UNC_V3_C20_PMON_CTL0           0xF41
+#define MSR_UNC_V3_C20_PMON_CTL1           0xF42
+#define MSR_UNC_V3_C20_PMON_CTL2           0xF43
+#define MSR_UNC_V3_C20_PMON_CTL3           0xF44
+#define MSR_UNC_V3_C20_PMON_CTR0           0xF48
+#define MSR_UNC_V3_C20_PMON_CTR1           0xF49
+#define MSR_UNC_V3_C20_PMON_CTR2           0xF4A
+#define MSR_UNC_V3_C20_PMON_CTR3           0xF4B
+
+#define MSR_UNC_V3_C21_PMON_BOX_CTL        0xF50
+#define MSR_UNC_V3_C21_PMON_BOX_STATUS     0xF57
+#define MSR_UNC_V3_C21_PMON_BOX_FILTER0    0xF55
+#define MSR_UNC_V3_C21_PMON_BOX_FILTER1    0xF56
+#define MSR_UNC_V3_C21_PMON_CTL0           0xF51
+#define MSR_UNC_V3_C21_PMON_CTL1           0xF52
+#define MSR_UNC_V3_C21_PMON_CTL2           0xF53
+#define MSR_UNC_V3_C21_PMON_CTL3           0xF54
+#define MSR_UNC_V3_C21_PMON_CTR0           0xF58
+#define MSR_UNC_V3_C21_PMON_CTR1           0xF59
+#define MSR_UNC_V3_C21_PMON_CTR2           0xF5A
+#define MSR_UNC_V3_C21_PMON_CTR3           0xF5B
+
+#define MSR_UNC_V3_C22_PMON_BOX_CTL        0xF60
+#define MSR_UNC_V3_C22_PMON_BOX_STATUS     0xF67
+#define MSR_UNC_V3_C22_PMON_BOX_FILTER0    0xF65
+#define MSR_UNC_V3_C22_PMON_BOX_FILTER1    0xF66
+#define MSR_UNC_V3_C22_PMON_CTL0           0xF61
+#define MSR_UNC_V3_C22_PMON_CTL1           0xF62
+#define MSR_UNC_V3_C22_PMON_CTL2           0xF63
+#define MSR_UNC_V3_C22_PMON_CTL3           0xF64
+#define MSR_UNC_V3_C22_PMON_CTR0           0xF68
+#define MSR_UNC_V3_C22_PMON_CTR1           0xF69
+#define MSR_UNC_V3_C22_PMON_CTR2           0xF6A
+#define MSR_UNC_V3_C22_PMON_CTR3           0xF6B
+
+#define MSR_UNC_V3_C23_PMON_BOX_CTL        0xF70
+#define MSR_UNC_V3_C23_PMON_BOX_STATUS     0xF77
+#define MSR_UNC_V3_C23_PMON_BOX_FILTER0    0xF75
+#define MSR_UNC_V3_C23_PMON_BOX_FILTER1    0xF76
+#define MSR_UNC_V3_C23_PMON_CTL0           0xF71
+#define MSR_UNC_V3_C23_PMON_CTL1           0xF72
+#define MSR_UNC_V3_C23_PMON_CTL2           0xF73
+#define MSR_UNC_V3_C23_PMON_CTL3           0xF74
+#define MSR_UNC_V3_C23_PMON_CTR0           0xF78
+#define MSR_UNC_V3_C23_PMON_CTR1           0xF79
+#define MSR_UNC_V3_C23_PMON_CTR2           0xF7A
+#define MSR_UNC_V3_C23_PMON_CTR3           0xF7B
+
+/* Sbox */
+#define MSR_UNC_V3_S0_PMON_BOX_CTL         0x720
+#define MSR_UNC_V3_S0_PMON_BOX_STATUS      0x725
+#define MSR_UNC_V3_S0_PMON_CTL_0           0x721
+#define MSR_UNC_V3_S0_PMON_CTL_1           0x722
+#define MSR_UNC_V3_S0_PMON_CTL_2           0x723
+#define MSR_UNC_V3_S0_PMON_CTL_3           0x724
+#define MSR_UNC_V3_S0_PMON_CTR_0           0x726
+#define MSR_UNC_V3_S0_PMON_CTR_1           0x727
+#define MSR_UNC_V3_S0_PMON_CTR_2           0x728
+#define MSR_UNC_V3_S0_PMON_CTR_3           0x729
+
+#define MSR_UNC_V3_S1_PMON_BOX_CTL         0x72A
+#define MSR_UNC_V3_S1_PMON_BOX_STATUS      0x72F
+#define MSR_UNC_V3_S1_PMON_CTL_0           0x72B
+#define MSR_UNC_V3_S1_PMON_CTL_1           0x72C
+#define MSR_UNC_V3_S1_PMON_CTL_2           0x72D
+#define MSR_UNC_V3_S1_PMON_CTL_3           0x72E
+#define MSR_UNC_V3_S1_PMON_CTR_0           0x730
+#define MSR_UNC_V3_S1_PMON_CTR_1           0x731
+#define MSR_UNC_V3_S1_PMON_CTR_2           0x732
+#define MSR_UNC_V3_S1_PMON_CTR_3           0x733
+
+#define MSR_UNC_V3_S2_PMON_BOX_CTL         0x734
+#define MSR_UNC_V3_S2_PMON_BOX_STATUS      0x739
+#define MSR_UNC_V3_S2_PMON_CTL_0           0x735
+#define MSR_UNC_V3_S2_PMON_CTL_1           0x736
+#define MSR_UNC_V3_S2_PMON_CTL_2           0x737
+#define MSR_UNC_V3_S2_PMON_CTL_3           0x738
+#define MSR_UNC_V3_S2_PMON_CTR_0           0x73A
+#define MSR_UNC_V3_S2_PMON_CTR_1           0x73B
+#define MSR_UNC_V3_S2_PMON_CTR_2           0x73C
+#define MSR_UNC_V3_S2_PMON_CTR_3           0x73D
+
+#define MSR_UNC_V3_S3_PMON_BOX_CTL         0x73E
+#define MSR_UNC_V3_S3_PMON_BOX_STATUS      0x743
+#define MSR_UNC_V3_S3_PMON_CTL_0           0x73F
+#define MSR_UNC_V3_S3_PMON_CTL_1           0x740
+#define MSR_UNC_V3_S3_PMON_CTL_2           0x741
+#define MSR_UNC_V3_S3_PMON_CTL_3           0x742
+#define MSR_UNC_V3_S3_PMON_CTR_0           0x744
+#define MSR_UNC_V3_S3_PMON_CTR_1           0x745
+#define MSR_UNC_V3_S3_PMON_CTR_2           0x746
+#define MSR_UNC_V3_S3_PMON_CTR_3           0x747
+
+/* V3 HA similar to V1/V2 */
+/* V3 iMC similar to V1/V2 */
+
+
+/* PCU (Power Control) Performance Monitoring */
+
+#define MSR_UNC_V3_PCU_PMON_CTR0           0x717
+#define MSR_UNC_V3_PCU_PMON_CTR1           0x718
+#define MSR_UNC_V3_PCU_PMON_CTR2           0x719
+#define MSR_UNC_V3_PCU_PMON_CTR3           0x71A
+#define MSR_UNC_V3_PCU_PMON_CTL0           0x711
+#define MSR_UNC_V3_PCU_PMON_CTL1           0x712
+#define MSR_UNC_V3_PCU_PMON_CTL2           0x713
+#define MSR_UNC_V3_PCU_PMON_CTL3           0x714
+#define MSR_UNC_V3_PCU_PMON_BOX_FILTER     0x715
+#define MSR_UNC_V3_PCU_PMON_BOX_CTL        0x710
+#define MSR_UNC_V3_PCU_PMON_BOX_STATUS     0x716
+#define MSR_UNC_V3_PCU_CC6_CTR             0x3FD
+#define MSR_UNC_V3_PCU_CC3_CTR             0x3FC
+#define MSR_UNC_V3_PCU_PC2_CTR             0x60D
+#define MSR_UNC_V3_PCU_PC3_CTR             0x3F8
+
+/* V3 QPI Box Performance Monitoring, mostly similar to V1/V2 */
+
+#define PCI_UNC_V3_QPI_PMON_BOX_CTL         0xF4
+#define PCI_UNC_V3_QPI_PMON_BOX_STATUS      0xF8
+#define PCI_UNC_V3_QPI_PMON_CTL_0           0xD8
+#define PCI_UNC_V3_QPI_PMON_CTL_1           0xDC
+#define PCI_UNC_V3_QPI_PMON_CTL_2           0xE0
+#define PCI_UNC_V3_QPI_PMON_CTL_3           0xE4
+#define PCI_UNC_V3_QPI_PMON_CTR_0_A         0xA4
+#define PCI_UNC_V3_QPI_PMON_CTR_1_A         0xAC
+#define PCI_UNC_V3_QPI_PMON_CTR_2_A         0xB4
+#define PCI_UNC_V3_QPI_PMON_CTR_3_A         0xBC
+#define PCI_UNC_V3_QPI_PMON_CTR_0_B         0xA0
+#define PCI_UNC_V3_QPI_PMON_CTR_1_B         0xA8
+#define PCI_UNC_V3_QPI_PMON_CTR_2_B         0xB0
+#define PCI_UNC_V3_QPI_PMON_CTR_3_B         0xB8
+#define PCI_UNC_V3_QPI_PMON_RX_MASK_0          0x238
+#define PCI_UNC_V3_QPI_PMON_RX_MASK_1          0x23C
+#define PCI_UNC_V3_QPI_PMON_RX_MATCH_0         0x228
+#define PCI_UNC_V3_QPI_PMON_RX_MATCH_1         0x22C
+#define PCI_UNC_V3_QPI_PMON_TX_MASK_0          0x210
+#define PCI_UNC_V3_QPI_PMON_TX_MASK_1          0x214
+#define PCI_UNC_V3_QPI_PMON_TX_MATCH_0         0x200
+#define PCI_UNC_V3_QPI_PMON_TX_MATCH_1         0x204
+#define PCI_UNC_V3_QPI_RATE_STATUS          0xD4
+#define PCI_UNC_V3_QPI_LINK_LLR             0xD0
+#define PCI_UNC_V3_QPI_LINK_IDLE            0xC8
+
+
+/* V3 R2PCIE Box Performance Monitoring similar to V1/V2 */
+
+/* V3 R3QPI Box Performance Monitoring similar to V1/V2 */
+
+/* ########################################################## */
 
 /* EX type uncore */
 /* U box - System Config Controller */
@@ -774,6 +1268,7 @@
 /* Match/Mask MSRs */
 #define MSR_B0_PMON_MATCH               0xE45
 #define MSR_B0_PMON_MASK                0xE46
+#define MSR_S0_PMON_MM_CFG              0xE48 /* MM_CFG sits directly before MATCH (0xE49) and MASK (0xE4A), mirroring S1 (0xE58/0xE59/0xE5A) */
 #define MSR_S0_PMON_MATCH               0xE49
 #define MSR_S0_PMON_MASK                0xE4A
 #define MSR_B1_PMON_MATCH               0xE4D
@@ -781,6 +1276,7 @@
 #define MSR_M0_PMON_MM_CONFIG           0xE54
 #define MSR_M0_PMON_ADDR_MATCH          0xE55
 #define MSR_M0_PMON_ADDR_MASK           0xE56
+#define MSR_S1_PMON_MM_CFG              0xE58
 #define MSR_S1_PMON_MATCH               0xE59
 #define MSR_S1_PMON_MASK                0xE5A
 #define MSR_M1_PMON_MM_CONFIG           0xE5C
@@ -803,6 +1299,11 @@
 #define MSR_DRAM_ENERGY_STATUS          0x619
 #define MSR_DRAM_PERF_STATUS            0x61B
 #define MSR_DRAM_POWER_INFO             0x61C
+#define MSR_PLATFORM_ENERGY_STATUS      0x64D
+#define MSR_PLATFORM_POWER_LIMIT        0x65C
+
+/* Intel Silvermont's RAPL registers */
+#define MSR_PKG_POWER_INFO_SILVERMONT   0x66E
 
 /* TM/TM2 interface */
 #define IA32_THERM_STATUS               0x19C
@@ -811,12 +1312,22 @@
 
 /* Turbo Boost Interface */
 #define MSR_IA32_MISC_ENABLE            0x1A0
+#define MSR_PREFETCH_ENABLE             0x1A4
 #define MSR_PLATFORM_INFO               0x0CE
 #define MSR_TURBO_POWER_CURRENT_LIMIT   0x1AC
 #define MSR_TURBO_RATIO_LIMIT           0x1AD
+#define MSR_TURBO_RATIO_LIMIT1          0x1AE
+#define MSR_TURBO_RATIO_LIMIT2          0x1AF
+#define MSR_TURBO_RATIO_LIMIT3          0x1AC /* NOTE(review): same address as MSR_TURBO_POWER_CURRENT_LIMIT above; presumably a model-specific reuse of 0x1AC - confirm against the Intel SDM */
 
-/* Intel Silvermont's RAPL registers */
-#define MSR_PKG_POWER_INFO_SILVERMONT   0x66E
+/* MISC Intel register */
+#define MSR_MPERF                       0xE7
+#define MSR_APERF                       0xE8
+#define MSR_PPERF                       0x64E
+#define MSR_WEIGHTED_CORE_C0            0x658
+#define MSR_ANY_CORE_C0                 0x659
+#define MSR_ANY_GFXE_C0                 0x65A
+#define MSR_CORE_GFXE_OVERLAP_C0        0x65B
 /*
  * AMD
  */
@@ -866,6 +1377,15 @@
 #define MSR_AMD16_PMC2                  0xC0010006
 #define MSR_AMD16_PMC3                  0xC0010007
 
+#define MSR_AMD16_L2_PERFEVTSEL0        0xC0010230
+#define MSR_AMD16_L2_PERFEVTSEL1        0xC0010232
+#define MSR_AMD16_L2_PERFEVTSEL2        0xC0010234
+#define MSR_AMD16_L2_PERFEVTSEL3        0xC0010236
+#define MSR_AMD16_L2_PMC0               0xC0010231
+#define MSR_AMD16_L2_PMC1               0xC0010233
+#define MSR_AMD16_L2_PMC2               0xC0010235
+#define MSR_AMD16_L2_PMC3               0xC0010237
+
 #define MSR_AMD16_NB_PERFEVTSEL0        0xC0010240
 #define MSR_AMD16_NB_PERFEVTSEL1        0xC0010242
 #define MSR_AMD16_NB_PERFEVTSEL2        0xC0010244
diff --git a/src/includes/registers_types.h b/src/includes/registers_types.h
new file mode 100644
index 0000000..e588e3e
--- /dev/null
+++ b/src/includes/registers_types.h
@@ -0,0 +1,209 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  registers_types.h
+ *
+ *      Description:  Header File of registers.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef REGISTERS_TYPES_H
+#define REGISTERS_TYPES_H
+
+#include <pci_types.h>
+
+typedef enum { /* Index of one counter register slot; this is the type of RegisterMap.index. */
+    PMC0 = 0,
+    PMC1, PMC2, PMC3, PMC4, PMC5, PMC6,
+    PMC7, PMC8, PMC9, PMC10, PMC11, PMC12,
+    PMC13, PMC14, PMC15, PMC16, PMC17, PMC18,
+    PMC19, PMC20, PMC21, PMC22, PMC23, PMC24,
+    PMC25, PMC26, PMC27, PMC28, PMC29, PMC30,
+    PMC31, PMC32, PMC33, PMC34, PMC35, PMC36,
+    PMC37, PMC38, PMC39, PMC40, PMC41, PMC42,
+    PMC43, PMC44, PMC45, PMC46, PMC47, PMC48,
+    PMC49, PMC50, PMC51, PMC52, PMC53, PMC54,
+    PMC55, PMC56, PMC57, PMC58, PMC59, PMC60,
+    PMC61, PMC62, PMC63, PMC64, PMC65, PMC66,
+    PMC67, PMC68, PMC69, PMC70, PMC71, PMC72,
+    PMC73, PMC74, PMC75, PMC76, PMC77, PMC78,
+    PMC79, PMC80, PMC81, PMC82, PMC83, PMC84,
+    PMC85, PMC86, PMC87, PMC88, PMC89, PMC90,
+    PMC91, PMC92, PMC93, PMC94, PMC95, PMC96,
+    PMC97, PMC98, PMC99, PMC100, PMC101, PMC102,
+    PMC103, PMC104, PMC105, PMC106, PMC107, PMC108,
+    PMC109, PMC110, PMC111, PMC112, PMC113, PMC114,
+    PMC115, PMC116, PMC117, PMC118, PMC119, PMC120,
+    PMC121, PMC122, PMC123, PMC124, PMC125, PMC126,
+    PMC127, PMC128, PMC129, PMC130, PMC131, PMC132,
+    PMC133, PMC134, PMC135, PMC136, PMC137, PMC138,
+    PMC139, PMC140, PMC141, PMC142, PMC143, PMC144,
+    PMC145, PMC146, PMC147, PMC148, PMC149, PMC150,
+    PMC151, PMC152, PMC153, PMC154, PMC155, PMC156,
+    PMC157, PMC158, PMC159, PMC160, PMC161, PMC162,
+    PMC163, PMC164, PMC165, PMC166, PMC167, PMC168,
+    PMC169, PMC170, PMC171, PMC172, PMC173, PMC174,
+    PMC175, PMC176, PMC177, PMC178, PMC179, PMC180,
+    PMC181, PMC182, PMC183, PMC184, PMC185, PMC186,
+    PMC187, PMC188, PMC189, PMC190, PMC191, PMC192,
+    PMC193, PMC194, PMC195, PMC196, PMC197, PMC198,
+    PMC199, PMC200, PMC201, PMC202, PMC203, PMC204,
+    PMC205, PMC206, PMC207, PMC208, PMC209, PMC210,
+    PMC211, PMC212, PMC213, PMC214, PMC215, PMC216,
+    NUM_PMC /* Sentinel: number of PMC<n> enumerators above (PMC0..PMC216, i.e. 217). */
+} RegisterIndex;
+
+typedef enum { /* Kind of counter unit; values index RegisterTypeNames and are the argument of REG_TYPE_MASK. */
+    PMC = 0, FIXED, THERMAL,
+    POWER, UNCORE, MBOX0,
+    MBOX1, MBOX2, MBOX3,
+    MBOX4, MBOX5, MBOX6, MBOX7,
+    MBOX0FIX, MBOX1FIX, MBOX2FIX,
+    MBOX3FIX, MBOX4FIX, MBOX5FIX,
+    MBOX6FIX, MBOX7FIX,
+    BBOX0, BBOX1,
+    RBOX0, RBOX1, RBOX2,
+    WBOX,
+    WBOX0FIX, WBOX1FIX,
+    SBOX0, SBOX1, SBOX2, SBOX3,
+    SBOX0FIX, SBOX1FIX, SBOX2FIX, SBOX3FIX,
+    CBOX0, CBOX1, CBOX2,
+    CBOX3, CBOX4, CBOX5,
+    CBOX6, CBOX7, CBOX8,
+    CBOX9, CBOX10, CBOX11,
+    CBOX12, CBOX13, CBOX14,
+    CBOX15, CBOX16, CBOX17,
+    CBOX18, CBOX19, CBOX20,
+    CBOX21, CBOX22, CBOX23,
+    PBOX,
+    UBOX,
+    UBOXFIX,
+    IBOX0, IBOX1,
+    QBOX0, QBOX1, QBOX2,
+    QBOX0FIX, QBOX1FIX, QBOX2FIX,
+    NUM_UNITS, NOTYPE, MAX_UNITS /* NUM_UNITS = count of real unit types (72); NOTYPE is a placeholder type; MAX_UNITS sizes RegisterTypeNames. */
+} RegisterType;
+
+static char* RegisterTypeNames[MAX_UNITS] = { /* Human-readable description of each RegisterType, indexed by the enum value. */
+    [PMC] = "Core-local general purpose counters",
+    [FIXED] = "Fixed counters",
+    [THERMAL] = "Thermal",
+    [POWER] = "Energy/Power counters (RAPL)",
+    [UNCORE] = "Socket-local general/fixed purpose counters",
+    [MBOX0] = "Memory Controller 0 Channel 0",
+    [MBOX1] = "Memory Controller 0 Channel 1",
+    [MBOX2] = "Memory Controller 0 Channel 2",
+    [MBOX3] = "Memory Controller 0 Channel 3",
+    [MBOX4] = "Memory Controller 1 Channel 0",
+    [MBOX5] = "Memory Controller 1 Channel 1",
+    [MBOX6] = "Memory Controller 1 Channel 2",
+    [MBOX7] = "Memory Controller 1 Channel 3",
+    [MBOX0FIX] = "Memory Controller 0 Channel 0 Fixed Counter",
+    [MBOX1FIX] = "Memory Controller 0 Channel 1 Fixed Counter",
+    [MBOX2FIX] = "Memory Controller 0 Channel 2 Fixed Counter",
+    [MBOX3FIX] = "Memory Controller 0 Channel 3 Fixed Counter",
+    [MBOX4FIX] = "Memory Controller 1 Channel 0 Fixed Counter",
+    [MBOX5FIX] = "Memory Controller 1 Channel 1 Fixed Counter",
+    [MBOX6FIX] = "Memory Controller 1 Channel 2 Fixed Counter",
+    [MBOX7FIX] = "Memory Controller 1 Channel 3 Fixed Counter",
+    [BBOX0] = "Home Agent box 0",
+    [BBOX1] = "Home Agent box 1",
+    [RBOX0] = "Routing box 0",
+    [RBOX1] = "Routing box 1",
+    [RBOX2] = "Routing box 2",
+    [WBOX] = "Power control box",
+    [WBOX0FIX] = "Power control box fixed counter 0",
+    [WBOX1FIX] = "Power control box fixed counter 1",
+    [SBOX0] = "QPI Link Layer box 0",
+    [SBOX1] = "QPI Link Layer box 1",
+    [SBOX2] = "QPI Link Layer box 2",
+    [SBOX3] = "QPI Link Layer box 3",
+    [SBOX0FIX] = "QPI Link Layer box fixed 0",
+    [SBOX1FIX] = "QPI Link Layer box fixed 1",
+    [SBOX2FIX] = "QPI Link Layer box fixed 2",
+    [SBOX3FIX] = "QPI Link Layer box fixed 3",
+    [CBOX0] = "Caching Agent box 0",
+    [CBOX1] = "Caching Agent box 1",
+    [CBOX2] = "Caching Agent box 2",
+    [CBOX3] = "Caching Agent box 3",
+    [CBOX4] = "Caching Agent box 4",
+    [CBOX5] = "Caching Agent box 5",
+    [CBOX6] = "Caching Agent box 6",
+    [CBOX7] = "Caching Agent box 7",
+    [CBOX8] = "Caching Agent box 8",
+    [CBOX9] = "Caching Agent box 9",
+    [CBOX10] = "Caching Agent box 10",
+    [CBOX11] = "Caching Agent box 11",
+    [CBOX12] = "Caching Agent box 12",
+    [CBOX13] = "Caching Agent box 13",
+    [CBOX14] = "Caching Agent box 14",
+    [CBOX15] = "Caching Agent box 15",
+    [CBOX16] = "Caching Agent box 16",
+    [CBOX17] = "Caching Agent box 17",
+    [CBOX18] = "Caching Agent box 18",
+    [CBOX19] = "Caching Agent box 19",
+    [CBOX20] = "Caching Agent box 20",
+    [CBOX21] = "Caching Agent box 21",
+    [CBOX22] = "Caching Agent box 22",
+    [CBOX23] = "Caching Agent box 23",
+    [PBOX] = "Physical Layer box",
+    [UBOX] = "System Configuration box",
+    [UBOXFIX] = "System Configuration box fixed counter",
+    [IBOX0] = "Coherency Maintainer for IIO traffic",
+    [IBOX1] = "Coherency Maintainer for IIO traffic",
+    [QBOX0] = "QPI Link Layer 0", [QBOX1] = "QPI Link Layer 1",
+    [QBOX2] = "QPI Link Layer 2", /* QBOX2 and QBOX2FIX need explicit entries; omitted designators are NULL. */
+    [QBOX0FIX] = "QPI Link Layer rate status 0", [QBOX1FIX] = "QPI Link Layer rate status 1",
+    [QBOX2FIX] = "QPI Link Layer rate status 2",
+    [NUM_UNITS] = "Maximally usable register types",
+    [NOTYPE] = "No Type, used for skipping unavailable counters"
+};
+
+#define REG_TYPE_MASK(type) ((type) < NUM_UNITS ? (0x1ULL<<(type)) : 0x0ULL) /* One bit per RegisterType, 0 for NUM_UNITS and above. NOTE(review): NUM_UNITS is 72, so types 64..71 shift past a 64-bit value - confirm intended handling. */
+
+typedef struct { /* Description of one counter: key string, index, unit type, registers, PCI device and option mask. */
+    char*               key; /* presumably the counter's name - confirm against the counter tables */
+    RegisterIndex       index;
+    RegisterType        type;
+    uint64_t            configRegister; /* presumably an address/offset such as the MSR_*/PCI_* constants in registers.h - confirm */
+    uint64_t            counterRegister;
+    uint64_t            counterRegister2; /* presumably the second register of counters split in two (cf. the *_CTR_n_A / *_CTR_n_B defines) - confirm */
+    PciDeviceIndex      device; /* PciDeviceIndex comes from pci_types.h */
+    uint64_t            optionMask;
+} RegisterMap;
+
+typedef struct { /* Description of one unit (box): control/status/overflow registers, PCI flag and device, width, filter registers. */
+    uint32_t  ctrlRegister;
+    uint32_t  statusRegister;
+    uint32_t  ovflRegister;
+    int       ovflOffset; /* presumably a bit offset inside ovflRegister - confirm */
+    uint8_t   isPci; /* presumably non-zero when the box is reached through PCI rather than MSRs - confirm */
+    PciDeviceIndex device; /* PciDeviceIndex comes from pci_types.h */
+    uint32_t  regWidth; /* presumably the counter width in bits - confirm */
+    uint32_t  filterRegister1;
+    uint32_t  filterRegister2;
+} BoxMap;
+
+#endif
diff --git a/src/includes/strUtil.h b/src/includes/strUtil.h
deleted file mode 100644
index 18236b6..0000000
--- a/src/includes/strUtil.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  strUtil.h
- *
- *      Description:  Header File strUtil Module. 
- *                    Helper routines for bstrlib and command line parsing
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef STRUTIL_H
-#define STRUTIL_H
-
-#include <bstrlib.h>
-#include <types.h>
-#include <time.h>
-
-#define CHECK_OPTION_STRING  \
-if (! (argString = bSecureInput(400,optarg))) {  \
-    ERROR_PLAIN_PRINT(Failed to read argument string!);  \
-}
-
-extern int str2int(const char* str);
-extern uint32_t bstr_to_cpuset_physical(uint32_t* threads,  const_bstring q);
-extern int bstr_to_cpuset(int* threads,  const_bstring str);
-extern void bstr_to_eventset(StrUtilEventSet* set, const_bstring str);
-extern bstring bSecureInput (int maxlen, char* vgcCtx);
-extern int bJustifyCenter (bstring b, int width);
-extern void bstr_to_workgroup(Workgroup* threads,  const_bstring str, DataType type, int numberOfStreams);
-extern FILE* bstr_to_outstream(const_bstring argString, bstring filter);
-extern uint64_t bstr_to_doubleSize(const_bstring str, DataType type);
-extern void bstr_to_interval(const_bstring str, struct timespec* interval);
-
-#endif /*STRUTIL_H*/
diff --git a/src/includes/strUtil_types.h b/src/includes/strUtil_types.h
deleted file mode 100644
index 25766ff..0000000
--- a/src/includes/strUtil_types.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  strUtil_types.h
- *
- *      Description:  Types file for strUtil module.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef STRUTIL_TYPES_H
-#define STRUTIL_TYPES_H
-
-#include  <bstrlib.h>
-
-
-typedef struct {
-    bstring eventName;
-    bstring counterName;
-} StrUtilEvent;
-
-typedef struct {
-    StrUtilEvent* events;
-    int numberOfEvents;
-} StrUtilEventSet;
-
-typedef struct {
-    bstring domain;
-    int offset;
-    void* ptr;
-} Stream;
-
-typedef struct {
-    uint32_t numberOfThreads;
-    int* processorIds;
-    uint64_t size;
-    Stream* streams;
-} Workgroup;
-
-
-#endif /*STRUTIL_TYPES_H*/
diff --git a/src/includes/test_types.h b/src/includes/test_types.h
deleted file mode 100644
index 45c0932..0000000
--- a/src/includes/test_types.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  test_types.h
- *
- *      Description:  Type definitions for benchmarking framework
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef TEST_TYPES_H
-#define TEST_TYPES_H
-
-#include <stdint.h>
-#include <bstrlib.h>
-
-typedef void (*FuncPrototype)();
-
-typedef enum {
-    SINGLE = 0,
-    DOUBLE,
-    SINGLE_RAND,
-    DOUBLE_RAND
-} DataType;
-
-typedef enum {
-    STREAM_1 = 1,
-    STREAM_2,
-    STREAM_3,
-    STREAM_4,
-    STREAM_5,
-    STREAM_6,
-    STREAM_7,
-    STREAM_8,
-    STREAM_9,
-    STREAM_10,
-    STREAM_11,
-    STREAM_12,
-    STREAM_13,
-    STREAM_14,
-    STREAM_15,
-    STREAM_16,
-    STREAM_17,
-    STREAM_18,
-    STREAM_19,
-    STREAM_20,
-    STREAM_21,
-    STREAM_22,
-    STREAM_23,
-    STREAM_24,
-    STREAM_25,
-    STREAM_26,
-    STREAM_27,
-    STREAM_28,
-    STREAM_29,
-    STREAM_30,
-    STREAM_31,
-    STREAM_32,
-    STREAM_33,
-    STREAM_34,
-    STREAM_35,
-    STREAM_36,
-    STREAM_37,
-    STREAM_38,
-    MAX_STREAMS
-} Pattern;
-
-typedef struct {
-    char* name;
-    Pattern streams;
-    DataType type ;
-    int stride;
-    FuncPrototype kernel;
-    double flops;
-    int bytes;
-} TestCase;
-
-typedef struct {
-    uint64_t size;
-    uint32_t iter;
-    const TestCase* test;
-    uint64_t cycles;
-    uint32_t numberOfThreads;
-    int* processors;
-    void** streams;
-} ThreadUserData;
-
-#endif /*TEST_TYPES_H*/
diff --git a/src/includes/textcolor.h b/src/includes/textcolor.h
index 4c1b7b1..d0a3e10 100644
--- a/src/includes/textcolor.h
+++ b/src/includes/textcolor.h
@@ -7,13 +7,13 @@
  *                    Allows toggling of terminal escape sequences for 
  *                    colored text.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/thermal.h b/src/includes/thermal.h
index 3153386..ac37261 100644
--- a/src/includes/thermal.h
+++ b/src/includes/thermal.h
@@ -6,13 +6,13 @@
  *      Description:  Header File Thermal Module.
  *                    Implements Intel TM/TM2 Interface.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -34,20 +34,43 @@
 #include <types.h>
 #include <registers.h>
 #include <bitUtil.h>
-#include <msr.h>
+#include <error.h>
+#include <access.h>
 
-extern ThermalInfo thermal_info;
 
-extern void thermal_init(int cpuId);
-static inline uint32_t thermal_read(int cpuId);
 
-static uint32_t
-thermal_read(int cpuId)
+int
+thermal_read(int cpuId, uint32_t *data) /* Reads IA32_THERM_STATUS for cpuId via HPMread and stores the derived value in *data. Returns 0 on success; if HPMread returns non-zero, sets *data = 0 and returns -EIO. data is written on every path and is not checked for NULL. */
 {
-    uint32_t readout = extractBitField(msr_read(cpuId, IA32_THERM_STATUS),7,16);
-    return (readout == 0 ? 
-            thermal_info.activationT - thermal_info.offset :
-            (thermal_info.activationT-thermal_info.offset) - readout );
+    uint64_t result = 0; /* raw register value filled in by HPMread */
+    uint32_t readout = 0; /* bit field extracted from result */
+    if (HPMread(cpuId, MSR_DEV, IA32_THERM_STATUS, &result))
+    {
+        *data = 0; /* give the caller a defined value on failure */
+        return -EIO;
+    }
+    readout = extractBitField(result,7,16); /* presumably a 7-bit field starting at bit 16 (the digital readout) - confirm extractBitField's parameter order */
+    *data = (readout == 0 ? /* NOTE(review): both branches yield the same value when readout == 0, so the conditional is redundant */
+                thermal_info.activationT - thermal_info.offset :
+                (thermal_info.activationT - thermal_info.offset) - readout );
+    return 0;
+}
+
+int
+thermal_tread(int socket_fd, int cpuId, uint32_t *data) /* Same steps as thermal_read: reads IA32_THERM_STATUS for cpuId via HPMread and stores the derived value in *data; returns 0, or -EIO with *data = 0 on failure. NOTE(review): socket_fd is not used anywhere in this body. */
+{
+    uint64_t result = 0; /* raw register value filled in by HPMread */
+    uint32_t readout = 0; /* bit field extracted from result */
+    if (HPMread(cpuId, MSR_DEV, IA32_THERM_STATUS, &result))
+    {
+        *data = 0; /* give the caller a defined value on failure */
+        return -EIO;
+    }
+    readout = extractBitField(result,7,16); /* presumably a 7-bit field starting at bit 16 (the digital readout) - confirm extractBitField's parameter order */
+    *data = (readout == 0 ? /* NOTE(review): both branches yield the same value when readout == 0, so the conditional is redundant */
+                thermal_info.activationT - thermal_info.offset :
+                (thermal_info.activationT - thermal_info.offset) - readout );
+    return 0;
 }
 
 #endif /*THERMAL_H*/
diff --git a/src/includes/thermal_types.h b/src/includes/thermal_types.h
index a619180..feb17fa 100644
--- a/src/includes/thermal_types.h
+++ b/src/includes/thermal_types.h
@@ -5,13 +5,13 @@
  *
  *      Description:  Types file for thermal module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -33,7 +33,9 @@
 
 #include <stdint.h>
 
-
+/** \addtogroup ThermalMon
+ *  @{
+ */
 typedef struct {
     uint16_t highT;
     uint32_t resolution;
@@ -41,5 +43,11 @@ typedef struct {
     uint32_t offset;
 } ThermalInfo;
 
+/** \brief Pointer for exporting the ThermalInfo data structure */
+typedef ThermalInfo* ThermalInfo_t;
+/** @}*/
+
+extern ThermalInfo thermal_info;
+
 
 #endif /*THERMAL_TYPES_H*/
diff --git a/src/includes/threads.h b/src/includes/threads.h
deleted file mode 100644
index 6e00191..0000000
--- a/src/includes/threads.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  threads.h
- *
- *      Description:  Header file of pthread interface module
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef THREADS_H
-#define THREADS_H
-
-#include <types.h>
-#include <pthread.h>
-#include <threads_types.h>
-#include <stdio.h>
-
-#define THREADS_BARRIER pthread_barrier_wait(&threads_barrier)
-
-extern pthread_barrier_t threads_barrier;
-extern ThreadData* threads_data;
-extern ThreadGroup* threads_groups;
-
-
-/**
- * @brief  Initialization of the thread module
- * @param  numberOfThreads  The total number of threads
- */
-extern void threads_init(FILE* OUTSTREAM, int numberOfThreads);
-
-/**
- * @brief  Create all threads
- * @param  startRoutine thread entry function pointer
- */
-extern void threads_create(void *(*startRoutine)(void*));
-
-/**
- * @brief  Register User thread data for all threads
- * @param  data  Reference to the user data structo
- * @param  func  Optional function pointer to copy data
- */
-extern void threads_registerDataAll(
-        ThreadUserData* data,
-        threads_copyDataFunc func);
-
-/**
- * @brief  Register User thread data for one thread
- * @param  threadId thread Id 
- * @param  data  Reference to the user data structo
- * @param  func  Optional function pointer to copy data
- */
-extern void threads_registerDataThread(
-        int threadId,
-        ThreadUserData* data,
-        threads_copyDataFunc func);
-
-/**
- * @brief  Register User thread data for a thread group
- * @param  groupId  group Id
- * @param  data  Reference to the user data structo
- * @param  func  Optional function pointer to copy data
- */
-extern void threads_registerDataGroup(
-        int groupId,
-        ThreadUserData* data,
-        threads_copyDataFunc func);
-
-/**
- * @brief  Join the threads and free pthread related data structures
- * @param
- */
-extern void threads_join(void);
-
-/**
- * @brief  Free memory of thread data structures
- * @param  numberOfGroups The number of groups to destroy
- */
-extern void threads_destroy(int numberOfGroups);
-
-/**
- * @brief  Create Thread groups
- * @param  numberOfGroups The number of groups to create
- */
-extern void threads_createGroups(int numberOfGroups);
-
-#endif /* THREADS_H */
diff --git a/src/includes/threads_types.h b/src/includes/threads_types.h
deleted file mode 100644
index dfa13f3..0000000
--- a/src/includes/threads_types.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  threads_types.h
- *
- *      Description:  Types file for threads module.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef THREADS_TYPES_H
-#define THREADS_TYPES_H
-
-#include <stdio.h>
-#include <stdint.h>
-
-typedef struct {
-    int globalNumberOfThreads;
-    int numberOfThreads;
-    int globalThreadId;
-    int threadId;
-    int numberOfGroups;
-    int groupId;
-    double time;
-    uint64_t cycles;
-    FILE* output;
-    ThreadUserData data;
-} ThreadData;
-
-typedef struct {
-    int numberOfThreads;
-    int* threadIds;
-} ThreadGroup;
-
-typedef void (*threads_copyDataFunc)(ThreadUserData* src,ThreadUserData* dst);
-
-#endif /*THREADS_TYPES_H*/
diff --git a/src/includes/timer.h b/src/includes/timer.h
index b97f4ac..a7ea870 100644
--- a/src/includes/timer.h
+++ b/src/includes/timer.h
@@ -10,13 +10,13 @@
  *      with rdtsc of 100 cycles in the worst case. Therefore sensible
  *      measurements should be over 1000 cycles.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -37,80 +37,20 @@
 
 #include <types.h>
 
-#define RDTSC(cpu_c) \
-    __asm__ volatile("xor %%eax,%%eax\n\t" \
-                     "cpuid\n\t"           \
-                     "rdtsc\n\t"           \
-                     "movl %%eax, %0\n\t"  \
-                     "movl %%edx, %1\n\t"  \
-    : "=r" ((cpu_c).int32.lo), "=r" ((cpu_c).int32.hi) \
-    : : "%eax","%ebx","%ecx","%edx")
-
-#define RDTSC_CR(cpu_c) \
-    __asm__ volatile("rdtsc\n\t"           \
-                     "movl %%eax, %0\n\t"  \
-                     "movl %%edx, %1\n\t"  \
-    : "=r" ((cpu_c).int32.lo), "=r" ((cpu_c).int32.hi) \
-    : : "%eax","%ebx","%ecx","%edx")
-
-#define RDTSCP(cpu_c) \
-    __asm__ volatile("rdtscp\n\t"          \
-                     "movl %%eax, %0\n\t"  \
-                     "movl %%edx, %1\n\t"  \
-                     "cpuid\n\t"           \
-    : "=r" ((cpu_c).int32.lo), "=r" ((cpu_c).int32.hi) \
-    : : "%eax","%ebx","%ecx","%edx")
-
-#ifdef HAS_RDTSCP
-#define RDTSC_STOP(cpu_c) RDTSCP(cpu_c);
-#else
-#define RDTSC_STOP(cpu_c) RDTSC_CR(cpu_c);
-#endif
-
 
 extern void timer_init( void );
 extern double timer_print( TimerData* );
 extern uint64_t timer_printCycles( TimerData* );
 extern uint64_t timer_getCpuClock( void );
+extern uint64_t timer_getCpuClockCurrent( int cpu_id );
+extern uint64_t timer_getCycleClock( void );
 extern uint64_t timer_getBaseline( void );
 
-static inline void timer_start( TimerData* );
-static inline void timer_stop ( TimerData* );
-
-void timer_start( TimerData* time )
-{
-#ifdef __x86_64
-    RDTSC(time->start);
-#endif
-#ifdef _ARCH_PPC
-    uint32_t tbl, tbu0, tbu1;
-
-    do {
-        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
-        __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
-        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
-    } while (tbu0 != tbu1);
+extern void timer_start( TimerData* );
+extern void timer_stop ( TimerData* );
 
-    time->start.int64 = (((uint64_t)tbu0) << 32) | tbl;
-#endif
-}
 
-void timer_stop( TimerData* time )
-{
-#ifdef __x86_64
-    RDTSC_STOP(time->stop)
-#endif
-#ifdef _ARCH_PPC
-    uint32_t tbl, tbu0, tbu1;
-    do {
-        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
-        __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
-        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
-    } while (tbu0 != tbu1);
 
-    time->stop.int64 = (((uint64_t)tbu0) << 32) | tbl;
-#endif
-}
 
 
 #endif /* TIMER_H */
diff --git a/src/includes/timer_types.h b/src/includes/timer_types.h
index 265d5c9..2dac362 100644
--- a/src/includes/timer_types.h
+++ b/src/includes/timer_types.h
@@ -5,13 +5,13 @@
  *
  *      Description:  Types file for timer module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/tlb-info.h b/src/includes/tlb-info.h
new file mode 100644
index 0000000..1f322c9
--- /dev/null
+++ b/src/includes/tlb-info.h
@@ -0,0 +1,89 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  tlb-info.h
+ *
+ *      Description:  Header file of the topology module that contains the
+ *                    TLB description strings. Currently unused.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+
+#ifndef TLB_INFO_H
+#define TLB_INFO_H
+
+static char* intel_tlb_info[256] = { /* Description strings indexed by a one-byte code; NULL marks codes without a description. The indices appear to be Intel CPUID leaf 0x02 descriptor bytes (e.g. 176 == 0xB0) - confirm. Being static in a header, every including file gets its own copy. */
+    [0] = NULL,
+    [1] = "Instruction TLB: 4 KByte pages, 4-way set associative, 32 entries",
+    [2] = "Instruction TLB: 4 MByte pages, fully associative, 2 entries",
+    [3] = "Data TLB: 4 KByte pages, 4-way set associative, 64 entries",
+    [4] = "Data TLB: 4 MByte pages, 4-way set associative, 8 entries",
+    [5] = "Data TLB1: 4 MByte pages, 4-way set associative, 32 entries",
+    [6 ... 10] = NULL,
+    [11] = "Instruction TLB: 4 MByte pages, 4-way set associative, 4 entries",
+    [12 ... 78] = NULL,
+    [79] = "Instruction TLB: 4 KByte pages, 32 entries",
+    [80] = "Instruction TLB: 4 KByte and 2-MByte or 4-MByte pages, 64 entries",
+    [81] = "Instruction TLB: 4 KByte and 2-MByte or 4-MByte pages, 128 entries",
+    [82] = "Instruction TLB: 4 KByte and 2-MByte or 4-MByte pages, 256 entries",
+    [83 ... 84] = NULL,
+    [85] = "Instruction TLB: 2-MByte or 4-MByte pages, fully associative, 7 entries",
+    [86] = "Data TLB0: 4 MByte pages, 4-way set associative, 16 entries",
+    [87] = "Data TLB0: 4 KByte pages, 4-way associative, 16 entries",
+    [88] = NULL,
+    [89] = "Data TLB0: 4 KByte pages, fully associative, 16 entries",
+    [90] = "Data TLB0: 2-MByte or 4 MByte pages, 4-way set associative, 32 entries",
+    [91] = "Data TLB: 4 KByte and 4 MByte pages, 64 entries",
+    [92] = "Data TLB: 4 KByte and 4 MByte pages,128 entries",
+    [93] = "Data TLB: 4 KByte and 4 MByte pages,256 entries",
+    [94 ... 96] = NULL,
+    [97] = "Instruction TLB: 4 KByte pages, fully associative, 48 entries",
+    [98] = NULL,
+    [99] = "Data TLB: 1 GByte pages, 4-way set associative, 4 entries",
+    [100 ... 117] = NULL,
+    [118] = "Instruction TLB: 2M/4M pages, fully associative, 8 entries",
+    [119 ... 159] = NULL,
+    [160] = "DTLB: 4k pages, fully associative, 32 entries",
+    [161 ... 175] = NULL,
+    [176] = "Instruction TLB: 4 KByte pages, 4-way set associative, 128 entries",
+    [177] = "Instruction TLB: 2M pages, 4-way, 8 entries or 4M pages, 4-way, 4 entries",
+    [178] = "Instruction TLB: 4KByte pages, 4-way set associative, 64 entries",
+    [179] = "Data TLB: 4 KByte pages, 4-way set associative, 128 entries",
+    [180] = "Data TLB1: 4 KByte pages, 4-way associative, 256 entries",
+    [181] = "Instruction TLB: 4KByte pages, 8-way set associative, 64 entries",
+    [182] = "Instruction TLB: 4KByte pages, 8-way set associative, 128 entries",
+    [183 ... 185] = NULL,
+    [186] = "Data TLB1: 4 KByte pages, 4-way associative, 64 entries",
+    [187 ... 191] = NULL,
+    [192] = "Data TLB: 4 KByte and 4 MByte pages, 4-way associative, 8 entries",
+    [193] = "Shared 2nd-Level TLB: 4 KByte/2MByte pages, 8-way associative, 1024 entries",
+    [194] = "DTLB: 4 KByte/2 MByte pages, 4-way associative, 16 entries",
+    [195 ... 201] = NULL,
+    [202] = "Shared 2nd-Level TLB: 4 KByte pages, 4-way associative, 512 entries",
+    [203 ... 239] = NULL,
+    [240] = "64-Byte prefetching",
+    [241] = "128-Byte prefetching",
+    [242 ... 255] = NULL
+}; 
+#endif
diff --git a/src/includes/topology.h b/src/includes/topology.h
new file mode 100644
index 0000000..77129fb
--- /dev/null
+++ b/src/includes/topology.h
@@ -0,0 +1,144 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  topology.h
+ *
+ *      Description:  Header File of topology module.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef LIKWID_TOPOLOGY
+#define LIKWID_TOPOLOGY
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <topology_cpuid.h>
+#include <topology_proc.h>
+#ifdef LIKWID_USE_HWLOC
+#include <topology_hwloc.h>
+#endif
+#include <types.h>
+#include <tree.h>
+
+
+#define MAX_FEATURE_STRING_LENGTH 512
+#define MAX_MODEL_STRING_LENGTH 512
+
+extern int affinity_thread2tile_lookup[MAX_NUM_THREADS]; /* lookup table with MAX_NUM_THREADS entries, defined in another translation unit */
+/* NOTE(review): an identical second declaration of affinity_thread2tile_lookup stood here; possibly a different lookup table was intended - confirm. */
+struct topology_functions { /* Table of function pointers for one topology backend; the signatures match the cpuid_init_* and hwloc_init_* functions declared in topology_cpuid.h and topology_hwloc.h. */
+    void (*init_cpuInfo) (cpu_set_t cpuSet);      /* same signature as cpuid_init_cpuInfo / hwloc_init_cpuInfo */
+    void (*init_cpuFeatures) (void);              /* same signature as cpuid_init_cpuFeatures / hwloc_init_cpuFeatures */
+    void (*init_nodeTopology) (cpu_set_t cpuSet); /* same signature as cpuid_init_nodeTopology / hwloc_init_nodeTopology */
+    void (*init_cacheTopology) (void);            /* same signature as cpuid_init_cacheTopology / hwloc_init_cacheTopology */
+    void (*init_fileTopology) (FILE*);            /* takes an open FILE*; presumably reads topology from a file - confirm */
+    void (*close_topology) (void);                /* same signature as hwloc_close; presumably releases backend resources - confirm */
+};
+
+/* Intel P6 */
+#define PENTIUM_M_BANIAS     0x09U
+#define PENTIUM_M_DOTHAN     0x0DU
+#define CORE_DUO             0x0EU
+#define CORE2_65             0x0FU
+#define CORE2_45             0x17U
+#define ATOM                 0x1CU
+#define ATOM_45              0x26U
+#define ATOM_32              0x36U
+#define ATOM_22              0x27U
+#define ATOM_SILVERMONT_E    0x37U
+#define ATOM_SILVERMONT_C    0x4DU
+#define ATOM_SILVERMONT_Z1   0x4AU
+#define ATOM_SILVERMONT_Z2   0x5AU
+#define ATOM_SILVERMONT_F    0x5DU
+#define ATOM_SILVERMONT_AIR  0x4CU
+#define ATOM_SILVERMONT_GOLD 0x5CU
+#define NEHALEM              0x1AU
+#define NEHALEM_BLOOMFIELD   0x1AU
+#define NEHALEM_LYNNFIELD    0x1EU
+#define NEHALEM_LYNNFIELD_M  0x1FU
+#define NEHALEM_WESTMERE     0x2CU
+#define NEHALEM_WESTMERE_M   0x25U
+#define SANDYBRIDGE          0x2AU
+#define SANDYBRIDGE_EP       0x2DU
+#define HASWELL              0x3CU
+#define HASWELL_EP           0x3FU
+#define HASWELL_M1           0x45U
+#define HASWELL_M2           0x46U
+#define IVYBRIDGE            0x3AU
+#define IVYBRIDGE_EP         0x3EU
+#define NEHALEM_EX           0x2EU
+#define WESTMERE_EX          0x2FU
+#define XEON_MP              0x1DU
+#define BROADWELL            0x3DU
+#define BROADWELL_E          0x4FU
+#define BROADWELL_D          0x56U
+#define SKYLAKE1             0x4EU
+#define SKYLAKE2             0x5EU
+
+/* Intel MIC */
+#define XEON_PHI           0x01U
+#define XEON_PHI2          0x57U
+
+/* AMD K10 */
+#define BARCELONA      0x02U
+#define SHANGHAI       0x04U
+#define ISTANBUL       0x08U
+#define MAGNYCOURS     0x09U
+
+/* AMD K8 */
+#define OPTERON_SC_1MB  0x05U
+#define OPTERON_DC_E    0x21U
+#define OPTERON_DC_F    0x41U
+#define ATHLON64_X2     0x43U
+#define ATHLON64_X2_F   0x4BU
+#define ATHLON64_F1     0x4FU
+#define ATHLON64_F2     0x5FU
+#define ATHLON64_X2_G   0x6BU
+#define ATHLON64_G1     0x6FU
+#define ATHLON64_G2     0x7FU
+
+
+#define  P6_FAMILY        0x6U
+#define  MIC_FAMILY       0xBU
+#define  NETBURST_FAMILY  0xFFU
+#define  K15_FAMILY       0x15U
+#define  K16_FAMILY       0x16U
+#define  K10_FAMILY       0x10U
+#define  K8_FAMILY        0xFU
+
+
+
+
+
+extern int cpu_count(cpu_set_t* set);
+
+static inline int cpuid_hasFeature(FeatureBit bit) /* Returns 1 if bit number 'bit' is set in cpuid_info.featureFlags, otherwise 0. */
+{
+      return ((cpuid_info.featureFlags & (1ULL<<bit)) != 0); /* unsigned 64-bit shift: a plain int shift is undefined for bit >= 31; != 0 keeps the int result from being truncated */
+}
+
+
+#endif
diff --git a/src/includes/topology_cpuid.h b/src/includes/topology_cpuid.h
new file mode 100644
index 0000000..9e39641
--- /dev/null
+++ b/src/includes/topology_cpuid.h
@@ -0,0 +1,43 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  topology_cpuid.h
+ *
+ *      Description:  Header File of topology backend using cpuid instruction.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef LIKWID_TOPOLOGY_CPUID
+#define LIKWID_TOPOLOGY_CPUID
+
+#include <sched.h>
+
+void cpuid_init_cpuInfo(cpu_set_t cpuSet);
+void cpuid_init_cpuFeatures(void);
+void cpuid_init_nodeTopology(cpu_set_t cpuSet);
+void cpuid_init_cacheTopology(void);
+
+
+#endif
diff --git a/src/includes/topology_hwloc.h b/src/includes/topology_hwloc.h
new file mode 100644
index 0000000..4595a08
--- /dev/null
+++ b/src/includes/topology_hwloc.h
@@ -0,0 +1,52 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  topology_hwloc.h
+ *
+ *      Description:  Header File of topology backend using the hwloc library
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef LIKWID_TOPOLOGY_HWLOC
+#define LIKWID_TOPOLOGY_HWLOC
+
+
+#include <hwloc.h>
+#include <sched.h>
+
+
+extern hwloc_topology_t hwloc_topology;
+
+int likwid_hwloc_record_objs_of_type_below_obj(hwloc_topology_t t, hwloc_obj_t obj, hwloc_obj_type_t type, int* index, uint32_t **list);
+
+
+
+void hwloc_init_cpuInfo(cpu_set_t cpuSet);
+void hwloc_init_cpuFeatures(void);
+void hwloc_init_nodeTopology(cpu_set_t cpuSet);
+void hwloc_init_cacheTopology(void);
+void hwloc_close(void);
+
+
+#endif
diff --git a/src/includes/topology_proc.h b/src/includes/topology_proc.h
new file mode 100644
index 0000000..1efd81b
--- /dev/null
+++ b/src/includes/topology_proc.h
@@ -0,0 +1,51 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  topology_proc.h
+ *
+ *      Description:  Header File of topology backend using procfs/sysfs
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef LIKWID_TOPOLOGY_PROC
+#define LIKWID_TOPOLOGY_PROC
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sched.h>
+#include <unistd.h>
+#include <sched.h>
+
+#include <error.h>
+#include <tree.h>
+#include <bitUtil.h>
+#include <topology.h>
+
+void proc_init_cpuInfo(cpu_set_t cpuSet);
+void proc_init_cpuFeatures(void);
+void proc_init_nodeTopology(cpu_set_t cpuSet);
+void proc_init_cacheTopology(void);
+
+
+#endif
diff --git a/src/includes/topology_types.h b/src/includes/topology_types.h
new file mode 100644
index 0000000..82cf954
--- /dev/null
+++ b/src/includes/topology_types.h
@@ -0,0 +1,73 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  topology_types.h
+ *
+ *      Description:  Types file for topology module. External definitions are
+ *                    in likwid.h
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com,
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef CPUID_TYPES_H
+#define CPUID_TYPES_H
+
+/** \addtogroup CPUTopology CPU information module
+*  @{
+*/
+/*! \brief Enum of possible CPU features
+
+CPUs implement different features that likely improve application performance if
+optimized using the feature. The list contains all features that are currently 
+supported by LIKWID. LIKWID does not perform any action based on these features,
+it gathers the data only for output purposes. It is not a complete list.
+\extends CpuInfo
+*/
+typedef enum {
+    SSE3=0, /*!< \brief Streaming SIMD Extensions 3 */
+    MMX, /*!< \brief Multi Media Extension */
+    SSE, /*!< \brief Streaming SIMD Extensions */
+    SSE2, /*!< \brief Streaming SIMD Extensions 2 */
+    MONITOR, /*!< \brief MONITOR and MWAIT instructions (part of SSE3) */
+    ACPI, /*!< \brief Advanced Configuration and Power Interface */
+    RDTSCP, /*!< \brief Serializing Read of the Time Stamp Counter */
+    VMX, /*!< \brief Virtual Machine eXtensions (VT-x) */
+    EIST, /*!< \brief Enhanced Intel SpeedStep */
+    TM, /*!< \brief Thermal Monitor */
+    TM2, /*!< \brief Thermal Monitor 2 */
+    AES, /*!< \brief AES instruction set */
+    RDRAND, /*!< \brief Random numbers from an on-chip hardware random number generator */
+    SSSE3, /*!< \brief Supplemental Streaming SIMD Extensions 3 */
+    SSE41, /*!< \brief Streaming SIMD Extensions 4.1 */
+    SSE42, /*!< \brief Streaming SIMD Extensions 4.2 */
+    AVX, /*!< \brief Advanced Vector Extensions */
+    FMA, /*!< \brief Fused multiply-add (FMA3) */
+    AVX2, /*!< \brief Advanced Vector Extensions 2 */
+    RTM, /*!< \brief Restricted Transactional Memory */
+    HLE, /*!< \brief Hardware Lock Elision */
+    HTT, /*!< \brief Hyper-Threading Technology */
+    RDSEED, /*!< \brief Non-deterministic random bit generator */
+} FeatureBit;
+/** @}*/
+#endif /*CPUID_TYPES_H*/
diff --git a/src/includes/tree.h b/src/includes/tree.h
index 9816cf7..66cfa97 100644
--- a/src/includes/tree.h
+++ b/src/includes/tree.h
@@ -6,13 +6,13 @@
  *      Description:  Header File tree Module. 
  *                    Implements a simple tree data structure.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -36,6 +36,7 @@
 
 extern void tree_init(TreeNode** root, int id);
 extern void tree_print(TreeNode* nodePtr);
+extern void tree_destroy(TreeNode* nodePtr);
 extern void tree_insertNode(TreeNode* nodePtr, int id);
 extern int tree_nodeExists(TreeNode* nodePtr, int id);
 extern int tree_countChildren(TreeNode* nodePtr);
diff --git a/src/includes/tree_types.h b/src/includes/tree_types.h
index b449e39..d2eb7d5 100644
--- a/src/includes/tree_types.h
+++ b/src/includes/tree_types.h
@@ -5,13 +5,13 @@
  *
  *      Description:  Types file for tree module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -31,14 +31,24 @@
 #ifndef TREE_TYPES_H
 #define TREE_TYPES_H
 
-/* For arbitrary trees llink are the children and
- * rlink are the neighbours
- */
-typedef struct treeNode {
-    int id;
-    struct treeNode* llink;
-    struct treeNode* rlink;
-} TreeNode;
 
+/** \addtogroup CPUTopology
+*  @{
+*/
+/*! \brief Structure of a tree node
+
+This structure is used to form the tree of the system topology. The information
+describing each node is stored in other places, therefore an ID is enough.
+\extends CpuTopology
+*/
+struct treeNode {
+    int id; /*!< \brief ID of the node */
+    struct treeNode* llink; /*!< \brief List of children of the current node */
+    struct treeNode* rlink; /*!< \brief List of neighbors of the current node */
+};
+
+/** \brief Shorter name for struct treeNode */
+typedef struct treeNode TreeNode;
+/** @}*/
 
 #endif /*TREE_TYPES_H*/
diff --git a/src/includes/types.h b/src/includes/types.h
index 2b0745a..c32d870 100644
--- a/src/includes/types.h
+++ b/src/includes/types.h
@@ -5,13 +5,14 @@
  *
  *      Description:  Global  Types file
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -34,27 +35,17 @@
 
 /* #####   HEADER FILE INCLUDES   ######################################### */
 #include <stdint.h>
+#include <bstrlib.h>
 
-#include <accessClient_types.h>
+#include <access_client_types.h>
+#include <registers_types.h>
 #include <pci_types.h>
 #include <power_types.h>
 #include <thermal_types.h>
-#include <strUtil_types.h>
-#include <test_types.h>
-#include <barrier_types.h>
-#include <timer_types.h>
 #include <tree_types.h>
-#include <cpuid_types.h>
-#include <affinity_types.h>
-#include <threads_types.h>
-#include <cpuFeatures_types.h>
-#include <asciiBoxes_types.h>
-#include <asciiTable_types.h>
+#include <topology_types.h>
 #include <perfmon_types.h>
 #include <libperfctr_types.h>
-#include <multiplex_types.h>
-#include <numa_types.h>
-#include <pci_types.h>
 
 
 typedef struct {
@@ -83,4 +74,7 @@ typedef struct {
 #define STRINGIFY(x) #x
 #define TOSTRING(x) STRINGIFY(x)
 
+#define likely(x)       __builtin_expect(!!(x), 1)
+#define unlikely(x)     __builtin_expect(!!(x), 0)
+
 #endif /*TYPES_H*/
diff --git a/src/libperfctr.c b/src/libperfctr.c
index a4b2158..6f0ff0f 100644
--- a/src/libperfctr.c
+++ b/src/libperfctr.c
@@ -5,13 +5,14 @@
  *
  *      Description:  Marker API interface of module perfmon
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Authors:  Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -38,51 +39,34 @@
 #include <unistd.h>
 #include <sched.h>
 #include <pthread.h>
+#include <inttypes.h>
 
-#include <error.h>
-#include <types.h>
+#include <likwid.h>
 #include <bitUtil.h>
-#include <bstrlib.h>
-#include <cpuid.h>
-#include <numa.h>
-#include <affinity.h>
 #include <lock.h>
 #include <tree.h>
-#include <accessClient.h>
-#include <msr.h>
-#include <pci.h>
-#include <power.h>
-#include <thermal.h>
 #include <timer.h>
 #include <hashTable.h>
 #include <registers.h>
-#include <likwid.h>
-
-#include <perfmon_core2_counters.h>
-#include <perfmon_haswell_counters.h>
-#include <perfmon_interlagos_counters.h>
-#include <perfmon_kabini_counters.h>
-#include <perfmon_k10_counters.h>
-#include <perfmon_nehalem_counters.h>
-#include <perfmon_phi_counters.h>
-#include <perfmon_pm_counters.h>
-#include <perfmon_sandybridge_counters.h>
-#include <perfmon_ivybridge_counters.h>
-#include <perfmon_westmereEX_counters.h>
-#include <perfmon_silvermont_counters.h>
+#include <error.h>
+#include <access.h>
 
+#include <perfmon.h>
 
 /* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
 
-static int perfmon_numCounters=0;     /* total number of counters */
-static int perfmon_numCountersCore=0; /* max index of core counters */
-static int perfmon_numCountersUncore=0; /* max index of conventional uncore counters */
-static PerfmonCounterMap* perfmon_counter_map = NULL;
-static int socket_lock[MAX_NUM_NODES];
-static int thread_socketFD[MAX_NUM_THREADS];
-static int hasPCICounters = 0;
+int socket_lock[MAX_NUM_NODES];
 static int likwid_init = 0;
-static BitMask counterMask;
+static int numberOfGroups = 0;
+static int* groups;
+static int threads2Cpu[MAX_NUM_THREADS];
+static pthread_t threads2Pthread[MAX_NUM_THREADS];
+static int realThreads2Cpu[MAX_NUM_THREADS] = { [ 0 ... (MAX_NUM_THREADS-1)] = -1};
+static int num_cpus = 0;
+static int registered_cpus = 0;
+static pthread_mutex_t globalLock = PTHREAD_MUTEX_INITIALIZER;
+static int use_locks = 0;
+static pthread_mutex_t threadLocks[MAX_NUM_THREADS] = { [ 0 ... (MAX_NUM_THREADS-1)] = PTHREAD_MUTEX_INITIALIZER};
 
 /* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
 
@@ -90,61 +74,87 @@ static BitMask counterMask;
 
 /* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
 
-void str2BitMask(const char* str, BitMask* mask)
+
+static int getProcessorID(cpu_set_t* cpu_set)
 {
-    char* endptr;
-    errno = 0;
-    struct bstrList* tokens;
-    bstring q = bfromcstralloc (60, str);
-    tokens = bsplit(q,' ');
+    int processorId;
 
-    for (int i=0; i<tokens->qty; i++)
+    for (processorId=0;processorId<MAX_NUM_THREADS;processorId++)
     {
-        uint64_t val =  strtoull((char*) tokens->entry[i]->data, &endptr, 16);
-
-        if ((errno == ERANGE && val == LONG_MAX ) || (errno != 0 && val == 0))
+        if (CPU_ISSET(processorId,cpu_set))
         {
-            ERROR;
+            break;
         }
+    }
+    return processorId;
+}
 
-        if (endptr == str)
+static int getThreadID(int cpu_id)
+{
+    int i;
+    for(i=0;i<groupSet->numberOfThreads;i++)
+    {
+        if (cpu_id == groupSet->threads[i].processorId)
         {
-            ERROR_PLAIN_PRINT(No digits were found);
+            return i;
         }
-
-        mask->mask[i] = val;
     }
-
-    bstrListDestroy(tokens);
-    bdestroy(q);
+    return -1;
 }
 
-static int getProcessorID(cpu_set_t* cpu_set)
+static double
+calculateMarkerResult(RegisterIndex index, uint64_t start, uint64_t stop, int overflows)
 {
-    int processorId;
+    double result = 0.0;
 
-    for (processorId=0;processorId<MAX_NUM_THREADS;processorId++)
+    if (overflows == 0)
     {
-        if (CPU_ISSET(processorId,cpu_set))
-        {
-            break;
-        }
+        result = (double) (stop - start);
     }
-    return processorId;
+    else if (overflows > 0)
+    {
+        result += (double) ((perfmon_getMaxCounterValue(counter_map[index].type) - start) + stop);
+        overflows--;
+    }
+    result += (double) (overflows * perfmon_getMaxCounterValue(counter_map[index].type));
+    if (counter_map[index].type == POWER)
+    {
+        result *= power_getEnergyUnit(getCounterTypeOffset(index));
+    }
+    else if (counter_map[index].type == THERMAL)
+    {
+        result = (double)stop;
+    }
+    return result;
 }
 
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
 void likwid_markerInit(void)
 {
-    int cpuId = likwid_getProcessorId();
+    int i;
+    int verbosity;
+    bstring bThreadStr;
+    bstring bEventStr;
+    struct bstrList* threadTokens;
+    struct bstrList* eventStrings;
     char* modeStr = getenv("LIKWID_MODE");
-    char* maskStr = getenv("LIKWID_MASK");
-
-    if ((modeStr != NULL) && (maskStr != NULL))
+    char* eventStr = getenv("LIKWID_EVENTS");
+    char* cThreadStr = getenv("LIKWID_THREADS");
+    char* filepath = getenv("LIKWID_FILEPATH");
+    /* Dirty hack to avoid nonnull warnings */
+    int (*ownatoi)(const char*);
+    ownatoi = &atoi;
+
+    if ((modeStr != NULL) && (filepath != NULL) && (eventStr != NULL) && (cThreadStr != NULL))
     {
         likwid_init = 1;
     }
+    else if (likwid_init == 0)
+    {
+        fprintf(stderr, "Running without Marker API. Activate Marker API with -m on commandline.\n");
+        return;
+    }
     else
     {
         return;
@@ -156,226 +166,130 @@ void likwid_markerInit(void)
         exit(EXIT_FAILURE);
     }
 
-    cpuid_init();
+    topology_init();
     numa_init();
     affinity_init();
-    timer_init();
     hashTable_init();
 
-    for(int i=0; i<MAX_NUM_THREADS; i++) thread_socketFD[i] = -1;
     for(int i=0; i<MAX_NUM_NODES; i++) socket_lock[i] = LOCK_INIT;
 
-    accessClient_mode = atoi(modeStr);
-    str2BitMask(maskStr, &counterMask);
+    HPMmode(atoi(modeStr));
 
-    if (accessClient_mode != DAEMON_AM_DIRECT)
+    if (getenv("LIKWID_DEBUG") != NULL)
     {
-        accessClient_init(&thread_socketFD[cpuId]);
+        perfmon_verbosity = atoi(getenv("LIKWID_DEBUG"));
+        verbosity = perfmon_verbosity;
     }
 
-    msr_init(thread_socketFD[cpuId]);
-    thermal_init(cpuId);
-
-    switch ( cpuid_info.family )
+    bThreadStr = bfromcstr(cThreadStr);
+    threadTokens = bsplit(bThreadStr,',');
+    num_cpus = threadTokens->qty;
+    for (i=0; i<num_cpus; i++)
     {
-        case P6_FAMILY:
-
-            switch ( cpuid_info.model )
+        threads2Cpu[i] = ownatoi(bdata(threadTokens->entry[i]));
+    }
+    bdestroy(bThreadStr);
+    bstrListDestroy(threadTokens);
+    
+    if (getenv("LIKWID_PIN") != NULL)
+    {
+        likwid_pinThread(threads2Cpu[0]);
+        if (getenv("OMP_NUM_THREADS") != NULL)
+        {
+            if (ownatoi(getenv("OMP_NUM_THREADS")) > num_cpus)
             {
-                case PENTIUM_M_BANIAS:
-
-                case PENTIUM_M_DOTHAN:
-
-                    perfmon_counter_map = pm_counter_map;
-                    perfmon_numCounters = NUM_COUNTERS_PM;
-                    perfmon_numCountersCore = NUM_COUNTERS_CORE_PM;
-                    break;
-
-                case ATOM_45:
-
-                case ATOM_32:
-
-                case ATOM_22:
-
-                case ATOM:
-
-                    perfmon_counter_map = core2_counter_map;
-                    perfmon_numCounters = NUM_COUNTERS_CORE2;
-                    perfmon_numCountersCore = NUM_COUNTERS_CORE_CORE2;
-                    break;
-
-                case ATOM_SILVERMONT_C:
-                case ATOM_SILVERMONT_E:
-                case ATOM_SILVERMONT_F1:
-                case ATOM_SILVERMONT_F2:
-                case ATOM_SILVERMONT_F3:
-                    power_init(0);
-                    perfmon_counter_map = silvermont_counter_map;
-                    perfmon_numCounters = NUM_COUNTERS_SILVERMONT;
-                    perfmon_numCountersCore = NUM_COUNTERS_CORE_SILVERMONT;
-                    break;
-
-                case CORE_DUO:
-                    ERROR_PLAIN_PRINT(Unsupported Processor);
-                    break;
-
-                case XEON_MP:
-
-                case CORE2_65:
-
-                case CORE2_45:
-
-                    perfmon_counter_map = core2_counter_map;
-                    perfmon_numCounters = NUM_COUNTERS_CORE2;
-                    perfmon_numCountersCore = NUM_COUNTERS_CORE_CORE2;
-                    break;
-
-                case NEHALEM_EX:
-
-                case WESTMERE_EX:
-
-                    perfmon_counter_map = westmereEX_counter_map;
-                    perfmon_numCounters = NUM_COUNTERS_WESTMEREEX;
-                    perfmon_numCountersCore = NUM_COUNTERS_CORE_WESTMEREEX;
-                    perfmon_numCountersUncore = NUM_COUNTERS_UNCORE_WESTMEREEX;
-                    break;
-
-                case NEHALEM_BLOOMFIELD:
-
-                case NEHALEM_LYNNFIELD:
-
-                case NEHALEM_WESTMERE_M:
-
-                case NEHALEM_WESTMERE:
-
-                    perfmon_counter_map = nehalem_counter_map;
-                    perfmon_numCounters = NUM_COUNTERS_NEHALEM;
-                    perfmon_numCountersCore = NUM_COUNTERS_CORE_NEHALEM;
-                    perfmon_numCountersUncore = NUM_COUNTERS_UNCORE_NEHALEM;
-                    break;
-
-                case IVYBRIDGE:
-
-                case IVYBRIDGE_EP:
-
-                    {
-                        int socket_fd = thread_socketFD[cpuId];
-                        hasPCICounters = 1;
-                        power_init(0); /* FIXME Static coreId is dangerous */
-                        pci_init(socket_fd);
-                        perfmon_counter_map = ivybridge_counter_map;
-                        perfmon_numCounters = NUM_COUNTERS_IVYBRIDGE;
-                        perfmon_numCountersCore = NUM_COUNTERS_CORE_IVYBRIDGE;
-                        perfmon_numCountersUncore = NUM_COUNTERS_UNCORE_IVYBRIDGE;
-                    }
-                    break;
-
-                case HASWELL:
-
-                case HASWELL_EX:
-
-                case HASWELL_M1:
-
-                case HASWELL_M2:
-
-                    power_init(0); /* FIXME Static coreId is dangerous */
-
-                    perfmon_counter_map = haswell_counter_map;
-                    perfmon_numCounters = NUM_COUNTERS_HASWELL;
-                    perfmon_numCountersCore = NUM_COUNTERS_CORE_HASWELL;
-                    break;
-
-                case SANDYBRIDGE:
-
-                case SANDYBRIDGE_EP:
-
-                    {
-                        int socket_fd = thread_socketFD[cpuId];
-                        hasPCICounters = 1;
-                        power_init(0); /* FIXME Static coreId is dangerous */
-                        pci_init(socket_fd);
-                        perfmon_counter_map = sandybridge_counter_map;
-                        perfmon_numCounters = NUM_COUNTERS_SANDYBRIDGE;
-                        perfmon_numCountersCore = NUM_COUNTERS_CORE_SANDYBRIDGE;
-                        perfmon_numCountersUncore = NUM_COUNTERS_UNCORE_SANDYBRIDGE;
-                    }
-                    break;
-
-                default:
-                    ERROR_PLAIN_PRINT(Unsupported Processor);
-                    break;
+                use_locks = 1;
             }
-            break;
-
-        case MIC_FAMILY:
-
-            switch ( cpuid_info.model )
+        }
+        if (getenv("CILK_NWORKERS") != NULL)
+        {
+            if (ownatoi(getenv("CILK_NWORKERS")) > num_cpus)
             {
-                case XEON_PHI:
-
-                    perfmon_counter_map = phi_counter_map;
-                    perfmon_numCounters = NUM_COUNTERS_PHI;
-                    perfmon_numCountersCore = NUM_COUNTERS_CORE_PHI;
-                    break;
-
-                default:
-                    ERROR_PLAIN_PRINT(Unsupported Processor);
-                    break;
+                use_locks = 1;
             }
-            break;
-
-        case K8_FAMILY:
-
-            perfmon_counter_map = k10_counter_map;
-            perfmon_numCounters = NUM_COUNTERS_K10;
-            perfmon_numCountersCore = NUM_COUNTERS_CORE_K10;
-            break;
-
-        case K10_FAMILY:
-
-            perfmon_counter_map = k10_counter_map;
-            perfmon_numCounters = NUM_COUNTERS_K10;
-            perfmon_numCountersCore = NUM_COUNTERS_CORE_K10;
-            break;
-
-        case K15_FAMILY:
-
-            perfmon_counter_map = interlagos_counter_map;
-            perfmon_numCounters = NUM_COUNTERS_INTERLAGOS;
-            perfmon_numCountersCore = NUM_COUNTERS_CORE_INTERLAGOS;
-            break;
+        }
+    }
 
-        case K16_FAMILY:
+    if (perfmon_init(num_cpus, threads2Cpu) < 0)
+    {
+        fprintf(stderr,"Failed to initialize LIKWID perfmon library.\n");
+        likwid_init = 0; /* perfmon is unusable: keep all other marker calls as no-ops */
+        return;
+    }
 
-            perfmon_counter_map = kabini_counter_map;
-            perfmon_numCounters = NUM_COUNTERS_KABINI;
-            perfmon_numCountersCore = NUM_COUNTERS_CORE_KABINI;
-            break;
+    bEventStr = bfromcstr(eventStr);
+    eventStrings = bsplit(bEventStr,'|');
+    numberOfGroups = eventStrings->qty;
+    groups = malloc(numberOfGroups * sizeof(int));
+    if (!groups)
+    {
+        fprintf(stderr,"Cannot allocate space for group handling.\n");
+        bstrListDestroy(eventStrings);
+        exit(EXIT_FAILURE);
+    }
+    for (i=0; i<eventStrings->qty; i++)
+    {
+        groups[i] = perfmon_addEventSet(bdata(eventStrings->entry[i]));
+    }
+    bstrListDestroy(eventStrings);
+    bdestroy(bEventStr);
 
-        default:
-            ERROR_PLAIN_PRINT(Unsupported Processor);
-            break;
+    for (i=0; i<num_cpus; i++)
+    {
+        hashTable_initThread(threads2Cpu[i]);
+        for(int j=0; j<groupSet->groups[groups[0]].numberOfEvents;j++)
+        {
+            groupSet->groups[groups[0]].events[j].threadCounter[i].init = TRUE;
+            groupSet->groups[groups[0]].state = STATE_START;
+        }
     }
+
+    groupSet->activeGroup = 0;
 }
 
 void likwid_markerThreadInit(void)
 {
-    if ( ! likwid_init )
+    int myID;
+    if ( !likwid_init )
     {
         return;
     }
+    
+    pthread_mutex_lock(&globalLock);
+    myID = registered_cpus++;
+    pthread_mutex_unlock(&globalLock);
 
-    int cpuId = likwid_getProcessorId();
-
-    if (accessClient_mode != DAEMON_AM_DIRECT)
+    if (getenv("LIKWID_PIN") != NULL)
     {
-        if (thread_socketFD[cpuId] == -1)
+        cpu_set_t cpuset;
+        CPU_ZERO(&cpuset);
+        sched_getaffinity(gettid(), sizeof(cpu_set_t), &cpuset);
+        if ((CPU_COUNT(&cpuset) > 1) || (likwid_getProcessorId() != threads2Cpu[myID % num_cpus]))
         {
-            accessClient_init(&thread_socketFD[cpuId]);
+            likwid_pinThread(threads2Cpu[myID % num_cpus]);
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, Pin thread %lu to CPU %d, gettid(), threads2Cpu[myID % num_cpus]); /* format passed unquoted, matching the other DEBUG_PRINT calls in this file */
         }
     }
 }
 
+/* Advance to the next configured event group (wrapping around after the last);
+ * does nothing if the Marker API is inactive or no groups are registered. */
+void likwid_markerNextGroup(void)
+{
+    int next_group;
+
+    /* numberOfGroups is still 0 if likwid_markerInit() returned early: avoid % 0 */
+    if ((!likwid_init) || (numberOfGroups <= 0))
+    {
+        return;
+    }
+    next_group = (groupSet->activeGroup + 1) % numberOfGroups;
+    if (next_group != groupSet->activeGroup)
+    {
+        perfmon_switchActiveGroup(next_group);
+    }
+}
+
 /* File format
  * 1 numberOfThreads numberOfRegions
  * 2 regionID:regionTag0
@@ -387,45 +301,67 @@ void likwid_markerClose(void)
 {
     FILE *file = NULL;
     LikwidResults* results = NULL;
-    int numberOfThreads;
-    int numberOfRegions;
+    int numberOfThreads = 0;
+    int numberOfRegions = 0;
+    char* markerfile = NULL;
+    int lineidx = 0;
+    char line[1024];
 
     if ( ! likwid_init )
     {
         return;
     }
-
     hashTable_finalize(&numberOfThreads, &numberOfRegions, &results);
-
-    file = fopen(getenv("LIKWID_FILEPATH"),"w");
+    if ((numberOfThreads == 0)||(numberOfRegions == 0)) /* nothing recorded: skip the marker file */
+    {
+        fprintf(stderr, "No threads or regions defined in hash table\n");
+        return;
+    }
+    markerfile = getenv("LIKWID_FILEPATH");
+    if (markerfile == NULL)
+    {
+        fprintf(stderr, "Is the application executed with LIKWID wrapper? No file path for the Marker API output defined.\n");
+        return;
+    }
+    file = fopen(markerfile,"w");
 
     if (file != NULL)
     {
-        fprintf(file,"%d %d\n",numberOfThreads,numberOfRegions);
-
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, Creating Marker file %s with %d regions %d groups and %d threads, markerfile, numberOfRegions, numberOfGroups, numberOfThreads);
+        fprintf(file,"%d %d %d\n",numberOfThreads, numberOfRegions, numberOfGroups);
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, %d %d %d, numberOfThreads, numberOfRegions, numberOfGroups);
         for (int i=0; i<numberOfRegions; i++)
         {
             fprintf(file,"%d:%s\n",i,bdata(results[i].tag));
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, %d:%s, i,bdata(results[i].tag));
         }
-
         for (int i=0; i<numberOfRegions; i++)
         {
             for (int j=0; j<numberOfThreads; j++)
             {
                 fprintf(file,"%d ",i);
-                fprintf(file,"%d ",j);
+                fprintf(file,"%d ",results[i].groupID);
+                fprintf(file,"%d ",results[i].cpulist[j]);
                 fprintf(file,"%u ",results[i].count[j]);
                 fprintf(file,"%e ",results[i].time[j]);
-
-                for (int k=0; k<NUM_PMC; k++)
+                fprintf(file,"%d ",groupSet->groups[results[i].groupID].numberOfEvents);
+                lineidx = sprintf(&(line[0]), "%d %d %d %u %e %d ", i, results[i].groupID,results[i].cpulist[j],results[i].count[j],results[i].time[j],groupSet->groups[results[i].groupID].numberOfEvents);
+                for (int k=0; k<groupSet->groups[results[i].groupID].numberOfEvents; k++)
                 {
                     fprintf(file,"%e ",results[i].counters[j][k]);
+                    if (lineidx < (int)sizeof(line) - 32) lineidx += sprintf(&(line[lineidx]), "%e ", results[i].counters[j][k]); /* debug copy only: never write past line[] */
                 }
                 fprintf(file,"\n");
+                DEBUG_PRINT(DEBUGLEV_DEVELOP, %s,line);
             }
         }
         fclose(file);
     }
+    else
+    {
+        /* Single call: errno is read before stdio gets a chance to overwrite it. */
+        fprintf(stderr, "Cannot open file %s\n%s\n", markerfile, strerror(errno));
+    }
 
     for (int i=0;i<numberOfRegions; i++)
     {
@@ -436,6 +372,7 @@ void likwid_markerClose(void)
         free(results[i].time);
         bdestroy(results[i].tag);
         free(results[i].count);
+        free(results[i].cpulist);
         free(results[i].counters);
     }
 
@@ -443,282 +380,179 @@ void likwid_markerClose(void)
     {
         free(results);
     }
-
-    msr_finalize();
-    pci_finalize();
-
-    for (int i=0; i<MAX_NUM_THREADS; i++)
-    {
-        accessClient_finalize(thread_socketFD[i]);
-        thread_socketFD[i] = -1;
-    }
+    likwid_init = 0;
+    HPMfinalize();
 }
 
-
-void likwid_markerStartRegion(const char* regionTag)
+int likwid_markerRegisterRegion(const char* regionTag)
 {
     if ( ! likwid_init )
     {
-        return;
+        return -EFAULT;
     }
-
+    TimerData timer;
     bstring tag = bfromcstralloc(100, regionTag);
     LikwidThreadResults* results;
-    uint64_t res;
+    char groupSuffix[10];
+    sprintf(groupSuffix, "-%d", groupSet->activeGroup);
+    bcatcstr(tag, groupSuffix);
     int cpu_id = hashTable_get(tag, &results);
     bdestroy(tag);
-    int socket_fd = thread_socketFD[cpu_id];
+    return 0;
+}
 
-    if (accessClient_mode != DAEMON_AM_DIRECT)
+
+int likwid_markerStartRegion(const char* regionTag)
+{
+    if ( ! likwid_init )
     {
-        if (socket_fd == -1)
-        {
-            printf("ERROR: Invalid socket file handle on processor %d. \
-                    Did you call likwid_markerThreadInit() ?\n", cpu_id);
-        }
+        return -EFAULT;
     }
-
-    results->count++;
-
-    /* Core specific counters */
-    for ( int i=0; i<perfmon_numCountersCore; i++ )
+    int myCPU = likwid_getProcessorId();
+    if (getThreadID(myCPU) < 0)
     {
-        bitMask_test(res,counterMask,i);
-        if ( res )
-        {
-            if (perfmon_counter_map[i].type != THERMAL)
-            {
-                results->StartPMcounters[i] =
-                    (double) msr_tread(
-                            socket_fd,
-                            cpu_id,
-                            perfmon_counter_map[i].counterRegister);
-            }
-        }
+        return -EFAULT;
     }
 
-    /* Uncore specific counters */
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
-            lock_acquire((int*)
-                &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id))
+    bstring tag = bfromcstralloc(100, regionTag);
+    LikwidThreadResults* results;
+    char groupSuffix[10];
+    sprintf(groupSuffix, "-%d", groupSet->activeGroup);
+    bcatcstr(tag, groupSuffix);
+    
+    int cpu_id = hashTable_get(tag, &results);
+    int thread_id = getThreadID(cpu_id);
+    perfmon_readCountersCpu(cpu_id);
+    results->cpuID = cpu_id;
+    for(int i=0;i<groupSet->groups[groupSet->activeGroup].numberOfEvents;i++)
     {
-        /* Conventional Uncore counters */
-        for ( int i=perfmon_numCountersCore; i<perfmon_numCountersUncore; i++ )
-        {
-            bitMask_test(res,counterMask,i);
-            if ( res )
-            {
-                if (perfmon_counter_map[i].type != POWER)
-                {
-                    results->StartPMcounters[i] =
-                        (double) msr_tread(
-                                socket_fd,
-                                cpu_id,
-                                perfmon_counter_map[i].counterRegister);
-                }
-                else
-                {
-                    results->StartPMcounters[i] =
-                        (double) power_tread(
-                                socket_fd,
-                                cpu_id,
-                                perfmon_counter_map[i].counterRegister);
-                }
-            }
-        }
-
-        /* PCI Uncore counters */
-        if ( hasPCICounters && (accessClient_mode != DAEMON_AM_DIRECT) )
-        {
-            for ( int i=perfmon_numCountersUncore; i<perfmon_numCounters; i++ )
-            {
-                bitMask_test(res,counterMask,i);
-                if ( res )
-                {
-                    uint64_t counter_result =
-                        pci_tread(
-                                socket_fd,
-                                cpu_id,
-                                perfmon_counter_map[i].device,
-                                perfmon_counter_map[i].counterRegister);
-
-                    counter_result = (counter_result<<32) +
-                        pci_tread(
-                                socket_fd,
-                                cpu_id,
-                                perfmon_counter_map[i].device,
-                                perfmon_counter_map[i].counterRegister2);
-
-                    results->StartPMcounters[perfmon_counter_map[i].index] =
-                        (double) counter_result;
-                }
-            }
-        }
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, START [%s] READ EVENT [%d=%d] EVENT %d VALUE %llu , regionTag, thread_id, cpu_id, i,
+                        LLU_CAST groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].counterData);
+        //groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].startData =
+        //        groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].counterData;
+        
+        results->StartPMcounters[i] = groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].counterData;
+        results->StartOverflows[i] = groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].overflows;
     }
-
+    
+    bdestroy(tag);
     timer_start(&(results->startTime));
+    return 0;
 }
 
-#define READ_END_MEM_CHANNEL(channel, reg, cid)                      \
-    counter_result = pci_tread(socket_fd, cpu_id, channel, reg##_A); \
-    counter_result = (counter_result<<32) +                          \
-    pci_tread(socket_fd, cpu_id, channel, reg##_B);                  \
-    results->PMcounters[cid] += (double) counter_result - results->StartPMcounters[cid]
-
 
-/* TODO: Readout hash at the end. Compute result at the end of the function to
- * keep overhead in region low */
 
-void likwid_markerStopRegion(const char* regionTag)
+int likwid_markerStopRegion(const char* regionTag)
 {
     if (! likwid_init)
     {
-        return;
+        return -EFAULT;
     }
 
     TimerData timestamp;
     timer_stop(&timestamp);
-    int cpu_id = likwid_getProcessorId();
-    uint64_t res;
-    int socket_fd = thread_socketFD[cpu_id];
-    double PMcounters[NUM_PMC];
-
-    /* Core specific counters */
-    for ( int i=0; i<perfmon_numCountersCore; i++ )
+    double result = 0.0;
+    int cpu_id;
+    int myCPU = likwid_getProcessorId();
+    if (getThreadID(myCPU) < 0)
     {
-        bitMask_test(res,counterMask,i);
-        if ( res )
-        {
-            if (perfmon_counter_map[i].type != THERMAL)
-            {
-                PMcounters[i] = (double) msr_tread(
-                        socket_fd,
-                        cpu_id,
-                        perfmon_counter_map[i].counterRegister);
-            }
-            else
-            {
-                PMcounters[i] = (double) thermal_read(cpu_id);
-            }
-        }
+        return -EFAULT;
     }
+    int thread_id;
+    bstring tag = bfromcstr(regionTag);
+    char groupSuffix[100];
+    LikwidThreadResults* results;
+    sprintf(groupSuffix, "-%d", groupSet->activeGroup);
+    bcatcstr(tag, groupSuffix);
+    if (use_locks == 1)
+    {
+        pthread_mutex_lock(&threadLocks[myCPU]);
+    }
+    
+    cpu_id = hashTable_get(tag, &results);
+    thread_id = getThreadID(cpu_id);
+    results->groupID = groupSet->activeGroup;
+    results->startTime.stop.int64 = timestamp.stop.int64;
+    results->time += timer_print(&(results->startTime));
+    results->count++;
+    bdestroy(tag);
+
+    perfmon_readCountersCpu(cpu_id);
 
-    /* Uncore specific counters */
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    for(int i=0;i<groupSet->groups[groupSet->activeGroup].numberOfEvents;i++)
     {
-        /* Conventional Uncore counters */
-        for ( int i=perfmon_numCountersCore; i<perfmon_numCountersUncore; i++ )
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, STOP [%s] READ EVENT [%d=%d] EVENT %d VALUE %llu, regionTag, thread_id, cpu_id, i,
+                        LLU_CAST groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].counterData);
+        result = calculateMarkerResult(groupSet->groups[groupSet->activeGroup].events[i].index, results->StartPMcounters[i],
+                                        groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].counterData,
+                                        groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].overflows - 
+                                        results->StartOverflows[i]);
+        if (counter_map[groupSet->groups[groupSet->activeGroup].events[i].index].type != THERMAL)
         {
-            bitMask_test(res,counterMask,i);
-            if ( res )
-            {
-                if (perfmon_counter_map[i].type != POWER)
-                {
-                    PMcounters[i] = (double) msr_tread(
-                            socket_fd,
-                            cpu_id,
-                            perfmon_counter_map[i].counterRegister);
-                }
-                else
-                {
-                    PMcounters[i] = (double) power_tread(
-                            socket_fd,
-                            cpu_id,
-                            perfmon_counter_map[i].counterRegister);
-                }
-            }
+            results->PMcounters[i] += result;
         }
-
-        /* PCI Uncore counters */
-        if ( hasPCICounters && (accessClient_mode != DAEMON_AM_DIRECT) )
+        else
         {
-            for ( int i=perfmon_numCountersUncore; i<perfmon_numCounters; i++ )
-            {
-                bitMask_test(res,counterMask,i);
-                if ( res )
-                {
-                    uint64_t counter_result =
-                        pci_tread(
-                                socket_fd,
-                                cpu_id,
-                                perfmon_counter_map[i].device,
-                                perfmon_counter_map[i].counterRegister);
-
-                    counter_result = (counter_result<<32) +
-                        pci_tread(
-                                socket_fd,
-                                cpu_id,
-                                perfmon_counter_map[i].device,
-                                perfmon_counter_map[i].counterRegister2);
-
-                    PMcounters[i] = (double) counter_result;
-                }
-            }
+            results->PMcounters[i] = result;
         }
     }
+    if (use_locks == 1)
+    {
+        pthread_mutex_unlock(&threadLocks[myCPU]);
+    }
+    return 0;
+}
 
-    bstring tag = bfromcstralloc(100, regionTag);
-    LikwidThreadResults* results;
-    hashTable_get(tag, &results);
-    results->startTime.stop = timestamp.stop;
-    results->time += timer_print(&(results->startTime));
-    bdestroy(tag);
 
-    /* Accumulate the results */
-    /* Core counters */
-    for ( int i=0; i<perfmon_numCountersCore; i++ )
+void likwid_markerGetRegion(const char* regionTag, int* nr_events, double* events, double *time, int *count)
+{
+    if (! likwid_init)
     {
-        bitMask_test(res,counterMask,i);
-        if ( res )
-        {
-            if (perfmon_counter_map[i].type != THERMAL)
-            {
-                results->PMcounters[i] += (PMcounters[i] - results->StartPMcounters[i]);
-            }
-            else
-            {
-                results->PMcounters[i] = PMcounters[i];
-            }
-        }
+        *nr_events = 0;
+        *time = 0;
+        *count = 0;
+        return;
     }
-
-    /* Uncore counters */
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    int length = 0;
+    int cpu_id;
+    int myCPU = likwid_getProcessorId();
+    int thread_id;
+    bstring tag = bfromcstr(regionTag);
+    char groupSuffix[100];
+    LikwidThreadResults* results;
+    sprintf(groupSuffix, "-%d", groupSet->activeGroup);
+    bcatcstr(tag, groupSuffix);
+
+    cpu_id = hashTable_get(tag, &results);
+    thread_id = getThreadID(myCPU);
+    *count = results->count;
+    *time = results->time;
+    length = MIN(groupSet->groups[groupSet->activeGroup].numberOfEvents, *nr_events);
+    for(int i=0;i<length;i++)
     {
-        for ( int i=perfmon_numCountersCore; i<perfmon_numCounters; i++ )
-        {
-            bitMask_test(res,counterMask,i);
-            if ( res )
-            {
-                if ( perfmon_counter_map[i].type == POWER )
-                {
-                    if (PMcounters[i] >= results->StartPMcounters[i])
-                    {
-                        results->PMcounters[i] += power_info.energyUnit *
-                            (PMcounters[i] - results->StartPMcounters[i]);
-                    }
-                    else
-                    {
-                        results->PMcounters[i] += power_info.energyUnit *
-                            (((double)0xFFFFFFFF) - results->StartPMcounters[i] + PMcounters[i]);
-                    }
-                }
-                else
-                {
-                    results->PMcounters[i] += (PMcounters[i] - results->StartPMcounters[i]);
-                }
-            }
-        }
+        events[i] = results->PMcounters[i];
     }
+    *nr_events = length;
+    bdestroy(tag);
+    return;
 }
 
+
 int  likwid_getProcessorId()
 {
+    int i;
     cpu_set_t  cpu_set;
     CPU_ZERO(&cpu_set);
     sched_getaffinity(gettid(),sizeof(cpu_set_t), &cpu_set);
-
-    return getProcessorID(&cpu_set);
+    if (CPU_COUNT(&cpu_set) > 1)
+    {
+        return sched_getcpu();
+    }
+    else
+    {
+        return getProcessorID(&cpu_set);
+    }
+    return -1;
 }
 
 #ifdef HAS_SCHEDAFFINITY
@@ -735,7 +569,7 @@ int  likwid_pinThread(int processorId)
 
     if (ret != 0)
     {
-        ERROR;
+        ERROR_PRINT("ERROR: Pinning of thread to CPU %d failed\n", processorId);
         return FALSE;
     }
 
@@ -755,7 +589,7 @@ int  likwid_pinProcess(int processorId)
 
     if (ret < 0)
     {
-        ERROR;
+        ERROR_PRINT("ERROR: Pinning of process to CPU %d failed\n", processorId);
         return FALSE;
     }
 
diff --git a/src/likwid.f90 b/src/likwid.f90
index 1215dd4..f7096e5 100644
--- a/src/likwid.f90
+++ b/src/likwid.f90
@@ -4,13 +4,14 @@
 !
 !     Description: Marker API f90 module
 !
-!      Version:   3.1.3
-!      Released:  4.11.2014
+!      Version:   4.1
+!      Released:  19.5.2016
 !
-!     Author:  Jan Treibig (jt), jan.treibig at gmail.com
+!     Authors:  Jan Treibig (jt), jan.treibig at gmail.com,
+!               Thomas Roehl (tr), thomas.roehl at googlemail.com
 !     Project:  likwid
 !
-!      Copyright (C) 2014 Jan Treibig
+!      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
 !
 !      This program is free software: you can redistribute it and/or modify it under
 !      the terms of the GNU General Public License as published by the Free Software
@@ -26,28 +27,95 @@
 !
 ! =======================================================================================
 
+!> \defgroup Fortran_Interface Likwid Fortran90 Module
 
-
+!> \ingroup Fortran_Interface
+!> Likwid Fortran90 Module for embedding the Marker API into Fortran applications
+!> In the basic configuration the module is compiled with the Intel Fortran Compiler
 module likwid
 
 interface
 
-    subroutine likwid_markerInit()
-    end subroutine likwid_markerInit
+!> \ingroup Fortran_Interface
+!> \brief Initialize the Likwid Marker API
+!! This routine initializes the Marker API for Fortran. It reads some
+!! environment variables commonly set by likwid-perfctr.
+!! \note Must be called once in a serial region.
+  subroutine likwid_markerInit()
+  end subroutine likwid_markerInit
+
+!> \ingroup Fortran_Interface
+!> \brief Add current thread to Likwid for Marker API measurements
+!! This routine adds the current thread to Likwid so that it performs measurements
+!! for this thread. If using the daemon access mode, it starts a daemon for the
+!! current thread.
+!! \note  Must be called once in a parallel region.
+  subroutine likwid_markerThreadInit()
+  end subroutine likwid_markerThreadInit
+
+!> \ingroup Fortran_Interface
+!> \brief Setup performance counters for the next event set
+!> If multiple groups should be measured this function
+!> switches to the next group in a round robin fashion.
+!> Each call reprograms the performance counters for the current CPU.
+!> \note Do not call it while measuring a code region.
+  subroutine likwid_markerNextGroup()
+  end subroutine likwid_markerNextGroup
+
+!> \ingroup Fortran_Interface
+!> \brief Close the Likwid Marker API
+!> Close the Likwid Marker API and write measured results to temporary file
+!> for evaluation done by likwid-perfctr
+!> \note Must be called once in a serial region and no further
+!> Likwid calls should be used
+  subroutine likwid_markerClose()
+  end subroutine likwid_markerClose
+
+!> \ingroup Fortran_Interface
+!> \brief Register a code region
+!> Initializes the hash table with an empty entry to reduce the overhead
+!> at likwid_markerStartRegion()
+  subroutine likwid_markerRegisterRegion( regionTag )
+!> \param regionTag Name for the code region for later identification
+  character(*) :: regionTag
+  end subroutine likwid_markerRegisterRegion
 
-    subroutine likwid_markerThreadInit()
-    end subroutine likwid_markerThreadInit
 
-    subroutine likwid_markerClose()
-    end subroutine likwid_markerClose
+!> \ingroup Fortran_Interface
+!> \brief Start the measurement for a code region
+!> Reads the currently running event set and stores the results as start values
+!> for the measurement group identified by regionTag.
+  subroutine likwid_markerStartRegion( regionTag )
+!> \param regionTag Name for the code region for later identification
+  character(*) :: regionTag
+  end subroutine likwid_markerStartRegion
 
-    subroutine likwid_markerStartRegion( regionTag )
-    character(*) :: regionTag
-    end subroutine likwid_markerStartRegion
+!> \ingroup Fortran_Interface
+!> \brief Stop the measurement for a code region
+!> Reads the currently running event set and accumulates the difference between
+!> stop and start data in the measurement group identified by regionTag.
+  subroutine likwid_markerStopRegion( regionTag )
+!> \param regionTag Name for the code region for later identification
+  character(*) :: regionTag
+  end subroutine likwid_markerStopRegion
 
-    subroutine likwid_markerStopRegion( regionTag )
-    character(*) :: regionTag
-    end subroutine likwid_markerStopRegion
+!> \ingroup Fortran_Interface
+!> \brief Get accumulated measurement results for a code region
+!> Get the accumulated data in the measurement group identified by regionTag
+!> for the current thread.
+!> \warning Experimental
+  subroutine likwid_markerGetRegion( regionTag, nr_events, events, time, count )
+!> \param regionTag [in] Name for the code region for later identification
+!> \param nr_events [in,out] Length of the events array
+!> \param events [out] Events array to store intermediate results
+!> \param time [out] Accumulated measurement time
+!> \param count [out] Call count of the region
+  character(*) :: regionTag
+  INTEGER :: nr_events
+  DOUBLE PRECISION, DIMENSION(*) :: events
+  DOUBLE PRECISION :: time
+  INTEGER :: count
+  end subroutine likwid_markerGetRegion
 
 end interface
 
diff --git a/src/likwid_f90_interface.c b/src/likwid_f90_interface.c
index 31bad92..51285ec 100644
--- a/src/likwid_f90_interface.c
+++ b/src/likwid_f90_interface.c
@@ -5,13 +5,14 @@
  *
  *      Description: F90 interface for marker API
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com,
+ *               Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -33,22 +34,44 @@
 
 #include <likwid.h>
 
-void likwid_markerinit_(void)
+void __attribute__ ((visibility ("default") )) likwid_markerinit_(void)
 {
     likwid_markerInit();
 }
 
-void likwid_markerthreadinit_(void)
+void __attribute__ ((visibility ("default") )) likwid_markerthreadinit_(void)
 {
     likwid_markerThreadInit();
 }
 
-void likwid_markerclose_(void)
+void __attribute__ ((visibility ("default") )) likwid_markerclose_(void)
 {
     likwid_markerClose();
 }
 
-void likwid_markerstartregion_(char* regionTag, int len)
+void __attribute__ ((visibility ("default") )) likwid_markernextgroup_(void) /* F90 interface entry: exported with default visibility */
+{
+    likwid_markerNextGroup(); /* no arguments to translate; forward directly to the C marker API */
+}
+
+/* F90 interface: copies the blank-padded tag, strips the padding and calls likwid_markerRegisterRegion(). */
+void __attribute__ ((visibility ("default") )) likwid_markerregisterregion_(char* regionTag, int len)
+{
+    char* tmp = (char*) malloc((len+1) * sizeof(char) );
+    if (!tmp) return; /* allocation failed: nothing can be registered */
+    strncpy(tmp, regionTag, len * sizeof(char) );
+    tmp[len] = 0; /* strncpy leaves the copy unterminated when the tag fills all len bytes */
+    /* Trim trailing blanks; the previous loop decremented len while i never moved. */
+    for (int i=(len-1); i >= 0 && tmp[i] == ' '; i--)
+    {
+        tmp[i] = 0;
+    }
+
+    likwid_markerRegisterRegion( tmp );
+    free(tmp);
+}
+
+void __attribute__ ((visibility ("default") )) likwid_markerstartregion_(char* regionTag, int len)
 {
     char* tmp = (char*) malloc((len+1) * sizeof(char) );
     strncpy(tmp, regionTag, len * sizeof(char) );
@@ -65,7 +88,7 @@ void likwid_markerstartregion_(char* regionTag, int len)
     free(tmp);
 }
 
-void likwid_markerstopregion_(char* regionTag, int len)
+void __attribute__ ((visibility ("default") )) likwid_markerstopregion_(char* regionTag, int len)
 {
     char* tmp = (char*) malloc((len+1) * sizeof(char));
     strncpy(tmp, regionTag, len * sizeof(char) );
@@ -82,3 +105,19 @@ void likwid_markerstopregion_(char* regionTag, int len)
     free(tmp);
 }
 
+/* F90 interface: strips the blank padding from regionTag and forwards to likwid_markerGetRegion(). */
+void __attribute__ ((visibility ("default") )) likwid_markergetregion_(char* regionTag, int* nr_events, double* events, double *time, int *count, int len)
+{
+    char* tmp = (char*) malloc((len+1) * sizeof(char));
+    if (!tmp) { *nr_events = 0; *time = 0; *count = 0; return; } /* allocation failed: report an empty result */
+    strncpy(tmp, regionTag, len * sizeof(char) );
+    tmp[len] = 0; /* strncpy leaves the copy unterminated when the tag fills all len bytes */
+    /* Trim trailing blanks; the previous loop decremented len while i never moved. */
+    for (int i=(len-1); i >= 0 && tmp[i] == ' '; i--)
+    {
+        tmp[i] = 0;
+    }
+    likwid_markerGetRegion( tmp, nr_events,  events, time, count);
+    free(tmp);
+}
+
diff --git a/src/loadData.S b/src/loadData.S
new file mode 100644
index 0000000..86de4d6
--- /dev/null
+++ b/src/loadData.S
@@ -0,0 +1,44 @@
+.intel_syntax noprefix
+/* _loadData(size, ptr): loads one word every 64 bytes from ptr, 256 bytes per iteration, until the offset reaches size. */
+.text
+.globl _loadData
+.type _loadData, @function
+_loadData :
+#ifdef __x86_64
+xor rax, rax
+.align 16
+1:
+mov  r8,  [rsi + rax]
+mov  r9,  [rsi + rax + 64]
+mov  r10, [rsi + rax + 128]
+mov r11,  [rsi + rax + 192]
+add rax, 256
+cmp rax, rdi
+jb 1b
+
+ret
+#else
+#ifdef __i386__
+/* Arguments are on the stack here: load the buffer pointer into a register instead of indexing off ebp. */
+push	ebp
+mov	ebp, esp
+push esi                              /* esi is modified below, so save and restore it */
+mov esi, DWORD PTR [ebp + 12]         /* esi = ptr (second argument) */
+xor eax, eax
+1:
+mov ecx, DWORD PTR [esi + eax]
+mov edx, DWORD PTR [esi + eax + 64]
+mov ecx, DWORD PTR [esi + eax + 128]
+mov edx, DWORD PTR [esi + eax + 192]
+add eax, 256
+cmp eax, DWORD PTR [ebp+8]            /* size (first argument) bounds the offset */
+jb 1b
+pop esi
+mov esp, ebp
+pop ebp
+ret
+#endif
+#endif
+.size _loadData, .-_loadData
+
+
diff --git a/src/loadData.s b/src/loadData.s
deleted file mode 100644
index e176c53..0000000
--- a/src/loadData.s
+++ /dev/null
@@ -1,22 +0,0 @@
-.intel_syntax noprefix
-
-.text
-.globl _loadData
-.type _loadData, @function
-_loadData :
-
-xor rax, rax
-.align 16
-1:
-mov  r8,  [rsi + rax]
-mov  r9,  [rsi + rax + 64]
-mov  r10, [rsi + rax + 128]
-mov r11,  [rsi + rax + 192]
-add rax, 256
-cmp rax, rdi
-jb 1b
-
-ret
-.size _loadData, .-_loadData
-
-
diff --git a/src/loadData.s.tmp b/src/loadData.s.tmp
deleted file mode 100644
index e69de29..0000000
diff --git a/src/luawid.c b/src/luawid.c
new file mode 100644
index 0000000..6e5ced8
--- /dev/null
+++ b/src/luawid.c
@@ -0,0 +1,2334 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  luawid.c
+ *
+ *      Description:  C part of the Likwid Lua interface
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <time.h>
+#include <sched.h>
+
+#include <lua.h>                               /* Always include this */
+#include <lauxlib.h>                           /* Always include this */
+#include <lualib.h>                            /* Always include this */
+
+#include <likwid.h>
+#include <tree.h>
+#include <access.h>
+#include <bstrlib.h>
+
+#ifdef COLOR
+#include <textcolor.h>
+#endif
+
+static int topology_isInitialized = 0;
+CpuInfo_t cpuinfo = NULL;
+CpuTopology_t cputopo = NULL;
+
+static int numa_isInitialized = 0;
+NumaTopology_t numainfo = NULL;
+static int affinity_isInitialized = 0;
+AffinityDomains_t affinity = NULL;
+static int perfmon_isInitialized = 0;
+static int timer_isInitialized = 0;
+static int power_isInitialized = 0;
+PowerInfo_t power;
+static int power_hasRAPL = 0;
+static int config_isInitialized = 0;
+Configuration_t configfile = NULL;
+
+/* Lua binding: returns one table with the configuration fields; calls init_configuration() on first use. */
+static int lua_likwid_getConfiguration(lua_State* L)
+{
+    int ret = 0;
+    if (config_isInitialized == 0)
+    {
+        ret = init_configuration();
+        if (ret == 0)
+        {
+            config_isInitialized = 1;
+            configfile = get_configuration();
+        }
+        else
+        {
+            lua_newtable(L); /* init failed: build a placeholder table (nil paths, mode -1, limits 0) */
+            lua_pushstring(L, "configFile");
+            lua_pushnil(L);
+            lua_settable(L,-3);
+            lua_pushstring(L, "topologyFile");
+            lua_pushnil(L);
+            lua_settable(L,-3);
+            lua_pushstring(L, "daemonPath");
+            lua_pushnil(L);
+            lua_settable(L,-3);
+            lua_pushstring(L, "groupPath");
+            lua_pushnil(L);
+            lua_settable(L,-3);
+            lua_pushstring(L, "daemonMode");
+            lua_pushinteger(L, -1);
+            lua_settable(L,-3);
+            lua_pushstring(L, "maxNumThreads");
+            lua_pushinteger(L, 0);
+            lua_settable(L,-3);
+            lua_pushstring(L, "maxNumNodes");
+            lua_pushinteger(L, 0);
+            lua_settable(L,-3);
+            return 1; /* one Lua return value: the placeholder table */
+        }
+    }
+    if ((config_isInitialized) && (configfile == NULL)) /* flag can be set without the handle cached (see lua_likwid_setGroupPath) */
+    {
+        configfile = get_configuration();
+    }
+    lua_newtable(L); /* same keys as the placeholder, filled from configfile */
+    lua_pushstring(L, "configFile");
+    lua_pushstring(L, configfile->configFileName);
+    lua_settable(L,-3);
+    lua_pushstring(L, "topologyFile");
+    lua_pushstring(L, configfile->topologyCfgFileName);
+    lua_settable(L,-3);
+    lua_pushstring(L, "daemonPath");
+    lua_pushstring(L, configfile->daemonPath);
+    lua_settable(L,-3);
+    lua_pushstring(L, "groupPath");
+    lua_pushstring(L, configfile->groupPath);
+    lua_settable(L,-3);
+    lua_pushstring(L, "daemonMode");
+    lua_pushinteger(L, (int)configfile->daemonMode);
+    lua_settable(L,-3);
+    lua_pushstring(L, "maxNumThreads");
+    lua_pushinteger(L, configfile->maxNumThreads);
+    lua_settable(L,-3);
+    lua_pushstring(L, "maxNumNodes");
+    lua_pushinteger(L, configfile->maxNumNodes);
+    lua_settable(L,-3);
+    return 1; /* one Lua return value: the configuration table */
+}
+/* Lua binding: destroys the configuration if this module initialized it; returns nothing to Lua. */
+static int lua_likwid_putConfiguration(lua_State* L)
+{
+    if (config_isInitialized == 1)
+    {
+        destroy_configuration();
+        config_isInitialized = 0;
+        configfile = NULL; /* drop the cached handle so later calls fetch a fresh one */
+    }
+    return 0;
+}
+/* Lua binding: passes string argument 1 to config_setGroupPath(); raises a Lua error if that returns < 0. */
+static int lua_likwid_setGroupPath(lua_State* L)
+{
+    int ret;
+    const char* tmpString;
+    if (config_isInitialized == 0)
+    {
+        ret = init_configuration(); /* a failure here is not reported; the call below still runs */
+        if (ret == 0)
+        {
+            config_isInitialized = 1; /* note: configfile is not cached here */
+        }
+    }
+    tmpString = luaL_checkstring(L, 1);
+    ret = config_setGroupPath((char*)tmpString);
+    if (ret < 0)
+    {
+        lua_pushstring(L,"Cannot set group path");
+        lua_error(L);
+    }
+    return 0;
+}
+/* Lua binding: checks that argument 1 is 0 or 1 and hands it to HPMmode(); always returns 0 to Lua. */
+static int lua_likwid_setAccessMode(lua_State* L)
+{
+    int flag;
+    flag = luaL_checknumber(L,1);
+    luaL_argcheck(L, flag >= 0 && flag <= 1, 1, "invalid access mode, only 0 (direct) and 1 (accessdaemon) allowed");
+    HPMmode(flag);
+    lua_pushinteger(L,0);
+    return 1;
+}
+/* Lua binding: init(nrThreads, cpuTable) sets up topology, NUMA, timer and perfmon; returns the perfmon status. */
+static int lua_likwid_init(lua_State* L)
+{
+    int ret;
+    int nrThreads = luaL_checknumber(L,1);
+    luaL_argcheck(L, nrThreads > 0, 1, "CPU count must be greater than 0");
+    int cpus[nrThreads];
+    if (!lua_istable(L, -1)) {
+      lua_pushstring(L,"No table given as second argument");
+      lua_error(L);
+    }
+    for (ret = 1; ret<=nrThreads; ret++) /* ret doubles as the 1-based table index */
+    {
+        lua_rawgeti(L,-1,ret);
+        cpus[ret-1] = ((lua_Unsigned)lua_tointegerx(L,-1, NULL));
+        lua_pop(L,1);
+    }
+    if (topology_isInitialized == 0)
+    {
+        topology_init();
+        topology_isInitialized = 1;
+        cpuinfo = get_cpuInfo();
+        cputopo = get_cpuTopology();
+    }
+    if ((topology_isInitialized) && (cpuinfo == NULL))
+    {
+        cpuinfo = get_cpuInfo();
+    }
+    if ((topology_isInitialized) && (cputopo == NULL))
+    {
+        cputopo = get_cpuTopology();
+    }
+    if (numa_isInitialized == 0)
+    {
+        numa_init();
+        numa_isInitialized = 1;
+        numainfo = get_numaTopology();
+    }
+    if ((numa_isInitialized) && (numainfo == NULL))
+    {
+        numainfo = get_numaTopology();
+    }
+    if (timer_isInitialized == 0)
+    {
+        timer_init();
+        timer_isInitialized = 1;
+    }
+    ret = 0; /* status returned when perfmon is already initialized (previously nothing was pushed) */
+    if (perfmon_isInitialized == 0)
+    {
+        ret = perfmon_init(nrThreads, &(cpus[0]));
+        if (ret != 0)
+        {
+            perfmon_finalize();
+            lua_pushinteger(L,ret); /* only the status is returned, so no message string is pushed */
+            return 1;
+        }
+        perfmon_isInitialized = 1;
+        timer_isInitialized = 1;
+    }
+    lua_pushinteger(L,ret); /* always push a status so Lua never receives the cpu table back */
+    return 1;
+}
+
+/* Lua binding: adds the event string (last argument) via perfmon_addEventSet(); returns its group id plus one. */
+static int lua_likwid_addEventSet(lua_State* L)
+{
+    int groupId, n;
+    const char* tmpString;
+    if (perfmon_isInitialized == 0)
+    {
+        return 0; /* perfmon not initialized: return nothing to Lua */
+    }
+    n = lua_gettop(L);
+    tmpString = luaL_checkstring(L, n);
+    luaL_argcheck(L, strlen(tmpString) > 0, n, "Event string must be larger than 0");
+
+    groupId = perfmon_addEventSet((char*)tmpString);
+    lua_pushinteger(L, groupId+1); /* Lua side sees the id shifted by one (setupCounters subtracts it again) */
+    return 1;
+}
+/* Lua binding: calls perfmon_setupCounters() with argument 1 minus one and returns its result. */
+static int lua_likwid_setupCounters(lua_State* L)
+{
+    int ret;
+    int groupId = lua_tonumber(L,1);
+    if (perfmon_isInitialized == 0)
+    {
+        return 0; /* perfmon not initialized: return nothing to Lua */
+    }
+    ret = perfmon_setupCounters(groupId-1);
+    lua_pushinteger(L,ret);
+    return 1;
+}
+
+/* Lua binding: returns the result of perfmon_startCounters(), or nothing if perfmon is not initialized. */
+static int lua_likwid_startCounters(lua_State* L)
+{
+    int ret;
+    if (perfmon_isInitialized == 0)
+    {
+        return 0;
+    }
+    ret = perfmon_startCounters();
+    lua_pushinteger(L,ret);
+    return 1;
+}
+
+/* Lua binding: call perfmon_stopCounters() and push its return code.
+ * Returns no value when perfmon has not been initialized. */
+static int lua_likwid_stopCounters(lua_State* L)
+{
+    if (!perfmon_isInitialized)
+        return 0;
+
+    const int status = perfmon_stopCounters();
+    lua_pushinteger(L, status);
+    return 1;
+}
+
+/* Lua binding: call perfmon_readCounters() and push its return code.
+ * Returns no value when perfmon has not been initialized. */
+static int lua_likwid_readCounters(lua_State* L)
+{
+    if (!perfmon_isInitialized)
+        return 0;
+
+    const int status = perfmon_readCounters();
+    lua_pushinteger(L, status);
+    return 1;
+}
+
+/* Lua binding: switch the active group to the one given as argument 1
+ * (decremented by one). A target at or beyond the number of groups wraps to
+ * group 0. Pushes -1 when the target already is the active group, otherwise
+ * the return code of perfmon_switchActiveGroup().
+ * Returns no value when perfmon has not been initialized. */
+static int lua_likwid_switchGroup(lua_State* L)
+{
+    int target = lua_tonumber(L,1)-1;
+    int status = -1;
+    if (!perfmon_isInitialized)
+        return 0;
+
+    if (target >= perfmon_getNumberOfGroups())
+        target = 0;
+
+    /* only switch when the target differs from the active group */
+    if (target != perfmon_getIdOfActiveGroup())
+        status = perfmon_switchActiveGroup(target);
+
+    lua_pushinteger(L, status);
+    return 1;
+}
+
+/* Lua binding: finalize every subsystem whose "initialized" flag is set, in
+ * the order perfmon, affinity, numa, topology, timer, configuration. Each
+ * flag is reset and the cached pointers belonging to that subsystem are
+ * cleared. Nothing is returned to Lua. */
+static int lua_likwid_finalize(lua_State* L)
+{
+    if (perfmon_isInitialized == 1)
+    {
+        perfmon_finalize();
+        perfmon_isInitialized = 0;
+    }
+
+    if (affinity_isInitialized == 1)
+    {
+        affinity_finalize();
+        affinity = NULL;
+        affinity_isInitialized = 0;
+    }
+
+    if (numa_isInitialized == 1)
+    {
+        numa_finalize();
+        numainfo = NULL;
+        numa_isInitialized = 0;
+    }
+
+    if (topology_isInitialized == 1)
+    {
+        topology_finalize();
+        cpuinfo = NULL;
+        cputopo = NULL;
+        topology_isInitialized = 0;
+    }
+
+    if (timer_isInitialized == 1)
+    {
+        timer_finalize();
+        timer_isInitialized = 0;
+    }
+
+    if (config_isInitialized == 1)
+    {
+        destroy_configuration();
+        configfile = NULL;
+        config_isInitialized = 0;
+    }
+    return 0;
+}
+
+/* Lua binding: push perfmon_getResult() for the (group, event, thread)
+ * triple given as arguments 1-3; each id is decremented by one first. */
+static int lua_likwid_getResult(lua_State* L)
+{
+    const int group = lua_tonumber(L,1);
+    const int event = lua_tonumber(L,2);
+    const int thread = lua_tonumber(L,3);
+    const double value = perfmon_getResult(group-1, event-1, thread-1);
+    lua_pushnumber(L, value);
+    return 1;
+}
+
+/* Lua binding: push perfmon_getLastResult() for the (group, event, thread)
+ * triple given as arguments 1-3; each id is decremented by one first. */
+static int lua_likwid_getLastResult(lua_State* L)
+{
+    const int group = lua_tonumber(L,1);
+    const int event = lua_tonumber(L,2);
+    const int thread = lua_tonumber(L,3);
+    const double value = perfmon_getLastResult(group-1, event-1, thread-1);
+    lua_pushnumber(L, value);
+    return 1;
+}
+
+/* Lua binding: push perfmon_getMetric() for the (group, metric, thread)
+ * triple given as arguments 1-3; each id is decremented by one first. */
+static int lua_likwid_getMetric(lua_State* L)
+{
+    const int group = lua_tonumber(L,1);
+    const int metric = lua_tonumber(L,2);
+    const int thread = lua_tonumber(L,3);
+    const double value = perfmon_getMetric(group-1, metric-1, thread-1);
+    lua_pushnumber(L, value);
+    return 1;
+}
+
+/* Lua binding: push perfmon_getLastMetric() for the (group, metric, thread)
+ * triple given as arguments 1-3; each id is decremented by one first. */
+static int lua_likwid_getLastMetric(lua_State* L)
+{
+    const int group = lua_tonumber(L,1);
+    const int metric = lua_tonumber(L,2);
+    const int thread = lua_tonumber(L,3);
+    const double value = perfmon_getLastMetric(group-1, metric-1, thread-1);
+    lua_pushnumber(L, value);
+    return 1;
+}
+
+/* Lua binding: push the value of perfmon_getNumberOfGroups().
+ * Returns no value when perfmon has not been initialized. */
+static int lua_likwid_getNumberOfGroups(lua_State* L)
+{
+    if (!perfmon_isInitialized)
+        return 0;
+
+    const int groupCount = perfmon_getNumberOfGroups();
+    lua_pushinteger(L, groupCount);
+    return 1;
+}
+
+/* Lua binding: push perfmon_getIdOfActiveGroup() plus one.
+ * Returns no value when perfmon has not been initialized. */
+static int lua_likwid_getIdOfActiveGroup(lua_State* L)
+{
+    if (!perfmon_isInitialized)
+        return 0;
+
+    const int active = perfmon_getIdOfActiveGroup();
+    lua_pushinteger(L, active+1);
+    return 1;
+}
+
+/* Lua binding: push perfmon_getTimeOfGroup() for the group given as
+ * argument 1 (decremented by one).
+ * Returns no value when perfmon has not been initialized. */
+static int lua_likwid_getRuntimeOfGroup(lua_State* L)
+{
+    if (!perfmon_isInitialized)
+        return 0;
+
+    const int luaGroup = lua_tonumber(L,1);
+    const double runtime = perfmon_getTimeOfGroup(luaGroup-1);
+    lua_pushnumber(L, runtime);
+    return 1;
+}
+
+/* Lua binding: push perfmon_getNumberOfEvents() for the group given as
+ * argument 1 (decremented by one).
+ * Returns no value when perfmon has not been initialized. */
+static int lua_likwid_getNumberOfEvents(lua_State* L)
+{
+    if (!perfmon_isInitialized)
+        return 0;
+
+    const int luaGroup = lua_tonumber(L,1);
+    const int eventCount = perfmon_getNumberOfEvents(luaGroup-1);
+    lua_pushinteger(L, eventCount);
+    return 1;
+}
+
+/* Lua binding: push the value of perfmon_getNumberOfThreads().
+ * Returns no value when perfmon has not been initialized. */
+static int lua_likwid_getNumberOfThreads(lua_State* L)
+{
+    if (!perfmon_isInitialized)
+        return 0;
+
+    const int threadCount = perfmon_getNumberOfThreads();
+    lua_pushinteger(L, threadCount);
+    return 1;
+}
+
+/* Lua binding: push the string returned by perfmon_getEventName() for the
+ * group (argument 1) and event (argument 2); both are decremented by one.
+ * Returns no value when perfmon has not been initialized. */
+static int lua_likwid_getNameOfEvent(lua_State* L)
+{
+    if (!perfmon_isInitialized)
+        return 0;
+
+    const int luaGroup = lua_tonumber(L,1);
+    const int luaEvent = lua_tonumber(L,2);
+    char* name = perfmon_getEventName(luaGroup-1, luaEvent-1);
+    lua_pushstring(L, name);
+    return 1;
+}
+
+/* Lua binding: push the string returned by perfmon_getCounterName() for the
+ * group (argument 1) and event (argument 2); both are decremented by one.
+ * Returns no value when perfmon has not been initialized. */
+static int lua_likwid_getNameOfCounter(lua_State* L)
+{
+    if (!perfmon_isInitialized)
+        return 0;
+
+    const int luaGroup = lua_tonumber(L,1);
+    const int luaEvent = lua_tonumber(L,2);
+    char* name = perfmon_getCounterName(luaGroup-1, luaEvent-1);
+    lua_pushstring(L, name);
+    return 1;
+}
+
+/* Lua binding: push perfmon_getNumberOfMetrics() for the group given as
+ * argument 1 (decremented by one).
+ * Returns no value when perfmon has not been initialized. */
+static int lua_likwid_getNumberOfMetrics(lua_State* L)
+{
+    if (!perfmon_isInitialized)
+        return 0;
+
+    const int luaGroup = lua_tonumber(L,1);
+    const int metricCount = perfmon_getNumberOfMetrics(luaGroup-1);
+    lua_pushinteger(L, metricCount);
+    return 1;
+}
+
+/* Lua binding: push the string returned by perfmon_getMetricName() for the
+ * group (argument 1) and metric (argument 2); both are decremented by one.
+ * Returns no value when perfmon has not been initialized. */
+static int lua_likwid_getNameOfMetric(lua_State* L)
+{
+    if (!perfmon_isInitialized)
+        return 0;
+
+    const int luaGroup = lua_tonumber(L,1);
+    const int luaMetric = lua_tonumber(L,2);
+    char* name = perfmon_getMetricName(luaGroup-1, luaMetric-1);
+    lua_pushstring(L, name);
+    return 1;
+}
+
+/* Lua binding: push the string returned by perfmon_getGroupName() for the
+ * group given as argument 1 (decremented by one).
+ * Returns no value when perfmon has not been initialized. */
+static int lua_likwid_getNameOfGroup(lua_State* L)
+{
+    if (!perfmon_isInitialized)
+        return 0;
+
+    const int luaGroup = lua_tonumber(L,1);
+    char* name = perfmon_getGroupName(luaGroup-1);
+    lua_pushstring(L, name);
+    return 1;
+}
+
+/* Lua binding: push the string returned by perfmon_getGroupInfoShort() for
+ * the group given as argument 1 (decremented by one).
+ * Returns no value when perfmon has not been initialized. */
+static int lua_likwid_getShortInfoOfGroup(lua_State* L)
+{
+    if (!perfmon_isInitialized)
+        return 0;
+
+    const int luaGroup = lua_tonumber(L,1);
+    char* info = perfmon_getGroupInfoShort(luaGroup-1);
+    lua_pushstring(L, info);
+    return 1;
+}
+
+/* Lua binding: push the string returned by perfmon_getGroupInfoLong() for
+ * the group given as argument 1 (decremented by one).
+ * Returns no value when perfmon has not been initialized. */
+static int lua_likwid_getLongInfoOfGroup(lua_State* L)
+{
+    if (!perfmon_isInitialized)
+        return 0;
+
+    const int luaGroup = lua_tonumber(L,1);
+    char* info = perfmon_getGroupInfoLong(luaGroup-1);
+    lua_pushstring(L, info);
+    return 1;
+}
+
+/* Lua binding: return a table listing the groups reported by
+ * perfmon_getGroups(). Entry i (counting from 1) is a table with the string
+ * fields "Name", "Info" and "Long". When perfmon_getGroups() reports no
+ * groups (return value <= 0) nothing is returned to Lua. */
+static int lua_likwid_getGroups(lua_State* L)
+{
+    int i, ret;
+    char** tmp, **infos, **longs;
+    if (topology_isInitialized == 0)
+    {
+        topology_init();
+        /* Record the initialization like the other bindings do, so that
+         * lua_likwid_finalize()/lua_likwid_putTopology() later call
+         * topology_finalize(). cpuinfo/cputopo stay NULL here; the other
+         * bindings fetch them lazily when the flag is set. */
+        topology_isInitialized = 1;
+    }
+    ret = perfmon_getGroups(&tmp, &infos, &longs);
+    if (ret > 0)
+    {
+        lua_newtable(L);
+        for (i=0;i<ret;i++)
+        {
+            lua_pushinteger(L, (lua_Integer)( i+1));
+            lua_newtable(L);
+            lua_pushstring(L, "Name");
+            lua_pushstring(L, tmp[i]);
+            lua_settable(L,-3);
+            lua_pushstring(L, "Info");
+            lua_pushstring(L, infos[i]);
+            lua_settable(L,-3);
+            lua_pushstring(L, "Long");
+            lua_pushstring(L, longs[i]);
+            lua_settable(L,-3);
+            lua_settable(L,-3);
+        }
+        /* hand the string arrays back to perfmon once they are copied into Lua */
+        perfmon_returnGroups(ret, tmp, infos, longs);
+        return 1;
+    }
+    return 0;
+}
+
+
+/* Lua binding: call print_supportedCPUs(); nothing is returned to Lua. */
+static int lua_likwid_printSupportedCPUs(lua_State* L)
+{
+    (void)L; /* the Lua state is not used by this binding */
+    print_supportedCPUs();
+    return 0;
+}
+
+/* Lua binding: return a table with the fields of the cached cpuinfo
+ * structure (family, model, stepping, clock, turbo, name, osname,
+ * short_name, features, isIntel, featureFlags and the perf_* values).
+ * Initializes the topology module on first use. */
+static int lua_likwid_getCpuInfo(lua_State* L)
+{
+    if (!topology_isInitialized)
+    {
+        topology_init();
+        topology_isInitialized = 1;
+        cpuinfo = get_cpuInfo();
+    }
+    /* topology may have been set up without the pointer being fetched */
+    if (topology_isInitialized && !cpuinfo)
+    {
+        cpuinfo = get_cpuInfo();
+    }
+
+    lua_newtable(L);
+    lua_pushinteger(L, (lua_Integer)(cpuinfo->family));
+    lua_setfield(L, -2, "family");
+    lua_pushinteger(L, (lua_Integer)(cpuinfo->model));
+    lua_setfield(L, -2, "model");
+    lua_pushinteger(L, (lua_Integer)(cpuinfo->stepping));
+    lua_setfield(L, -2, "stepping");
+    lua_pushinteger(L, (lua_Integer)(cpuinfo->clock));
+    lua_setfield(L, -2, "clock");
+    lua_pushinteger(L,cpuinfo->turbo);
+    lua_setfield(L, -2, "turbo");
+    lua_pushstring(L,cpuinfo->name);
+    lua_setfield(L, -2, "name");
+    lua_pushstring(L,cpuinfo->osname);
+    lua_setfield(L, -2, "osname");
+    lua_pushstring(L,cpuinfo->short_name);
+    lua_setfield(L, -2, "short_name");
+    lua_pushstring(L,cpuinfo->features);
+    lua_setfield(L, -2, "features");
+    lua_pushinteger(L,cpuinfo->isIntel);
+    lua_setfield(L, -2, "isIntel");
+    lua_pushinteger(L, (lua_Integer)(cpuinfo->featureFlags));
+    lua_setfield(L, -2, "featureFlags");
+    lua_pushinteger(L, (lua_Integer)( cpuinfo->perf_version));
+    lua_setfield(L, -2, "perf_version");
+    lua_pushinteger(L, (lua_Integer)(cpuinfo->perf_num_ctr));
+    lua_setfield(L, -2, "perf_num_ctr");
+    lua_pushinteger(L, (lua_Integer)(cpuinfo->perf_width_ctr));
+    lua_setfield(L, -2, "perf_width_ctr");
+    lua_pushinteger(L, (lua_Integer)(cpuinfo->perf_num_fixed_ctr));
+    lua_setfield(L, -2, "perf_num_fixed_ctr");
+    return 1;
+}
+
+/* Lua binding: return a table describing the cached cputopo structure.
+ * The table carries the scalar counts, a "threadPool" table (keys start at
+ * 0), a "cacheLevels" table (keys start at 1) and a "topologyTree" table
+ * with nested socket -> core -> thread entries (keys start at 0, children
+ * stored under "Childs"). Initializes topology and NUMA on first use. */
+static int lua_likwid_getCpuTopology(lua_State* L)
+{
+    int idx;
+    int socketIdx = 0;
+    int coreIdx = 0;
+    int hwtIdx = 0;
+    TreeNode* socket;
+    TreeNode* core;
+    TreeNode* hwt;
+
+    if (!topology_isInitialized)
+    {
+        topology_init();
+        topology_isInitialized = 1;
+        cputopo = get_cpuTopology();
+    }
+    if (topology_isInitialized && !cputopo)
+    {
+        cputopo = get_cpuTopology();
+    }
+    /* the NUMA flag is only set when numa_init() reports success */
+    if (!numa_isInitialized && numa_init() == 0)
+    {
+        numa_isInitialized = 1;
+        numainfo = get_numaTopology();
+    }
+    if (numa_isInitialized && !numainfo)
+    {
+        numainfo = get_numaTopology();
+    }
+
+    /* result table with the scalar members */
+    lua_newtable(L);
+    lua_pushinteger(L, (lua_Integer)(cputopo->numHWThreads));
+    lua_setfield(L, -2, "numHWThreads");
+    lua_pushinteger(L, (lua_Integer)(cputopo->activeHWThreads));
+    lua_setfield(L, -2, "activeHWThreads");
+    lua_pushinteger(L, (lua_Integer)(cputopo->numSockets));
+    lua_setfield(L, -2, "numSockets");
+    lua_pushinteger(L, (lua_Integer)(cputopo->numCoresPerSocket));
+    lua_setfield(L, -2, "numCoresPerSocket");
+    lua_pushinteger(L, (lua_Integer)(cputopo->numThreadsPerCore));
+    lua_setfield(L, -2, "numThreadsPerCore");
+    lua_pushinteger(L,cputopo->numCacheLevels);
+    lua_setfield(L, -2, "numCacheLevels");
+
+    /* threadPool: one sub-table per hardware thread */
+    lua_newtable(L);
+    for (idx=0; idx<cputopo->numHWThreads; idx++)
+    {
+        lua_pushnumber(L,idx);
+        lua_newtable(L);
+        lua_pushinteger(L, (lua_Integer)(cputopo->threadPool[idx].threadId));
+        lua_setfield(L, -2, "threadId");
+        lua_pushinteger(L, (lua_Integer)(cputopo->threadPool[idx].coreId));
+        lua_setfield(L, -2, "coreId");
+        lua_pushinteger(L, (lua_Integer)(cputopo->threadPool[idx].packageId));
+        lua_setfield(L, -2, "packageId");
+        lua_pushinteger(L, (lua_Integer)(cputopo->threadPool[idx].apicId));
+        lua_setfield(L, -2, "apicId");
+        lua_pushinteger(L, (lua_Integer)(cputopo->threadPool[idx].inCpuSet));
+        lua_setfield(L, -2, "inCpuSet");
+        lua_settable(L,-3);
+    }
+    lua_setfield(L, -2, "threadPool");
+
+    /* cacheLevels: one sub-table per cache level */
+    lua_newtable(L);
+    for (idx=0; idx<cputopo->numCacheLevels; idx++)
+    {
+        const char* typeName;
+
+        lua_pushnumber(L,idx+1);
+        lua_newtable(L);
+        lua_pushinteger(L, (lua_Integer)(cputopo->cacheLevels[idx].level));
+        lua_setfield(L, -2, "level");
+        lua_pushinteger(L, (lua_Integer)(cputopo->cacheLevels[idx].associativity));
+        lua_setfield(L, -2, "associativity");
+        lua_pushinteger(L, (lua_Integer)(cputopo->cacheLevels[idx].sets));
+        lua_setfield(L, -2, "sets");
+        lua_pushinteger(L, (lua_Integer)(cputopo->cacheLevels[idx].lineSize));
+        lua_setfield(L, -2, "lineSize");
+        lua_pushinteger(L, (lua_Integer)(cputopo->cacheLevels[idx].size));
+        lua_setfield(L, -2, "size");
+        lua_pushinteger(L, (lua_Integer)(cputopo->cacheLevels[idx].threads));
+        lua_setfield(L, -2, "threads");
+        lua_pushinteger(L, (lua_Integer)(cputopo->cacheLevels[idx].inclusive));
+        lua_setfield(L, -2, "inclusive");
+
+        /* the cache type is exported as the name of its enum constant */
+        switch (cputopo->cacheLevels[idx].type)
+        {
+            case DATACACHE:
+                typeName = "DATACACHE";
+                break;
+            case INSTRUCTIONCACHE:
+                typeName = "INSTRUCTIONCACHE";
+                break;
+            case UNIFIEDCACHE:
+                typeName = "UNIFIEDCACHE";
+                break;
+            case ITLB:
+                typeName = "ITLB";
+                break;
+            case DTLB:
+                typeName = "DTLB";
+                break;
+            case NOCACHE:
+            default:
+                typeName = "NOCACHE";
+                break;
+        }
+        lua_pushstring(L, typeName);
+        lua_setfield(L, -2, "type");
+        lua_settable(L,-3);
+    }
+    lua_setfield(L, -2, "cacheLevels");
+
+    /* topologyTree: sockets -> cores -> hardware threads */
+    lua_newtable(L);
+    for (socket = tree_getChildNode(cputopo->topologyTree);
+         socket != NULL;
+         socket = tree_getNextNode(socket))
+    {
+        lua_pushinteger(L, socketIdx);
+        lua_newtable(L);
+        lua_pushinteger(L, (lua_Integer)(socket->id));
+        lua_setfield(L, -2, "ID");
+
+        lua_newtable(L);
+        coreIdx = 0;
+        for (core = tree_getChildNode(socket);
+             core != NULL;
+             core = tree_getNextNode(core))
+        {
+            lua_pushinteger(L, coreIdx);
+            lua_newtable(L);
+            lua_pushinteger(L, (lua_Integer)(core->id));
+            lua_setfield(L, -2, "ID");
+
+            lua_newtable(L);
+            hwtIdx = 0;
+            for (hwt = tree_getChildNode(core);
+                 hwt != NULL;
+                 hwt = tree_getNextNode(hwt))
+            {
+                lua_pushinteger(L, (lua_Integer)(hwtIdx));
+                lua_pushinteger(L, (lua_Integer)(hwt->id));
+                lua_settable(L,-3);
+                hwtIdx++;
+            }
+            /* attach the thread list to the core, then the core to its socket */
+            lua_setfield(L, -2, "Childs");
+            lua_settable(L,-3);
+            coreIdx++;
+        }
+        /* attach the core list to the socket, then the socket to the tree */
+        lua_setfield(L, -2, "Childs");
+        lua_settable(L,-3);
+        socketIdx++;
+    }
+    lua_setfield(L, -2, "topologyTree");
+    return 1;
+}
+
+/* Lua binding: finalize the topology module if its flag is set and clear the
+ * cached cpuinfo/cputopo pointers. Nothing is returned to Lua. */
+static int lua_likwid_putTopology(lua_State* L)
+{
+    if (topology_isInitialized != 1)
+    {
+        return 0;
+    }
+    topology_finalize();
+    topology_isInitialized = 0;
+    cputopo = NULL;
+    cpuinfo = NULL;
+    return 0;
+}
+
+
+/* Lua binding: return a table with two sub-tables built from the perfmon
+ * maps: "Counters" (fields Name, Options, Type, TypeName, Index per entry)
+ * and "Events" (fields Name, ID, UMask, Limit, Options per entry). Entries
+ * are keyed from 1. "Options" is a string of option names, each followed by
+ * a '|'. Initializes topology on first use and calls perfmon_init_maps(). */
+static int lua_likwid_getEventsAndCounters(lua_State* L)
+{
+    int idx;
+
+    if (!topology_isInitialized)
+    {
+        topology_init();
+        topology_isInitialized = 1;
+        cpuinfo = get_cpuInfo();
+    }
+    if (topology_isInitialized && !cpuinfo)
+    {
+        cpuinfo = get_cpuInfo();
+    }
+    perfmon_init_maps();
+
+    lua_newtable(L);
+
+    /* Counters */
+    lua_newtable(L);
+    for (idx=0; idx<perfmon_numCounters; idx++)
+    {
+        bstring options = bfromcstr("");
+        lua_pushinteger(L, (lua_Integer)(idx+1));
+        lua_newtable(L);
+        lua_pushstring(L,counter_map[idx].key);
+        lua_setfield(L, -2, "Name");
+        /* collect the names of all options enabled in the counter's mask */
+        for (int opt=1; opt<NUM_EVENT_OPTIONS; opt++)
+        {
+            if (counter_map[idx].optionMask & REG_TYPE_MASK(opt))
+            {
+                bstring piece = bformat("%s|", eventOptionTypeName[opt]);
+                bconcat(options, piece);
+                bdestroy(piece);
+            }
+        }
+        lua_pushstring(L,bdata(options));
+        lua_setfield(L, -2, "Options");
+        lua_pushinteger(L, (lua_Integer)( counter_map[idx].type));
+        lua_setfield(L, -2, "Type");
+        lua_pushstring(L, RegisterTypeNames[counter_map[idx].type]);
+        lua_setfield(L, -2, "TypeName");
+        lua_pushinteger(L, (lua_Integer)(counter_map[idx].index));
+        lua_setfield(L, -2, "Index");
+        lua_settable(L,-3);
+        bdestroy(options);
+    }
+    lua_setfield(L, -2, "Counters");
+
+    /* Events */
+    lua_newtable(L);
+    for (idx=0; idx<perfmon_numArchEvents; idx++)
+    {
+        bstring options = bfromcstr("");
+        lua_pushinteger(L, (lua_Integer)(idx+1));
+        lua_newtable(L);
+        lua_pushstring(L,eventHash[idx].name);
+        lua_setfield(L, -2, "Name");
+        lua_pushinteger(L, (lua_Integer)(eventHash[idx].eventId));
+        lua_setfield(L, -2, "ID");
+        lua_pushinteger(L, (lua_Integer)(eventHash[idx].umask));
+        lua_setfield(L, -2, "UMask");
+        lua_pushstring(L,eventHash[idx].limit);
+        lua_setfield(L, -2, "Limit");
+        /* collect the names of all options enabled in the event's mask */
+        for (int opt=1; opt<NUM_EVENT_OPTIONS; opt++)
+        {
+            if (eventHash[idx].optionMask & REG_TYPE_MASK(opt))
+            {
+                bstring piece = bformat("%s|", eventOptionTypeName[opt]);
+                bconcat(options, piece);
+                bdestroy(piece);
+            }
+        }
+        lua_pushstring(L,bdata(options));
+        lua_setfield(L, -2, "Options");
+        lua_settable(L,-3);
+        bdestroy(options);
+    }
+    lua_setfield(L, -2, "Events");
+    return 1;
+}
+
+/* Lua binding: return a table with one entry per PCI device marked online.
+ * Each entry is keyed by the device's likwid_name and holds the string
+ * fields "Name", "Path", "Type" and "TypeDescription". */
+static int lua_likwid_getOnlineDevices(lua_State* L)
+{
+    int i;
+    lua_newtable(L);
+    /* NOTE(review): the inclusive bound is kept from the original; if
+     * pci_devices has exactly MAX_NUM_PCI_DEVICES elements this reads one
+     * entry past the end - confirm against the array declaration. */
+    for(i=0;i<=MAX_NUM_PCI_DEVICES;i++)
+    {
+        if (!pci_devices[i].online)
+        {
+            /* nothing was pushed for this device, so nothing may be stored */
+            continue;
+        }
+        lua_pushstring(L,pci_devices[i].likwid_name);
+        lua_newtable(L);
+        lua_pushstring(L, "Name");
+        lua_pushstring(L,pci_devices[i].name);
+        lua_settable(L,-3);
+        lua_pushstring(L, "Path");
+        lua_pushstring(L,pci_devices[i].path);
+        lua_settable(L,-3);
+        lua_pushstring(L, "Type");
+        lua_pushstring(L,pci_types[pci_devices[i].type].name);
+        lua_settable(L,-3);
+        lua_pushstring(L, "TypeDescription");
+        lua_pushstring(L,pci_types[pci_devices[i].type].desc);
+        lua_settable(L,-3);
+        /* store the device table in the result; this used to run for offline
+         * devices as well, popping values that had never been pushed */
+        lua_settable(L,-3);
+    }
+    return 1;
+}
+
+/* Lua binding: return a table describing the cached numainfo structure:
+ * "numberOfNodes" plus a "nodes" table (keyed from 1) whose entries carry
+ * id, totalMemory, freeMemory, numberOfProcessors, numberOfDistances, a
+ * "processors" list and a "distances" list. When numa_init() fails, a table
+ * with numberOfNodes = 0 and an empty "nodes" table is returned instead.
+ * Initializes topology, NUMA and affinity on first use. */
+static int lua_likwid_getNumaInfo(lua_State* L)
+{
+    uint32_t node, k;
+
+    if (!topology_isInitialized)
+    {
+        topology_init();
+        topology_isInitialized = 1;
+        cpuinfo = get_cpuInfo();
+        cputopo = get_cpuTopology();
+    }
+    if (topology_isInitialized && !cpuinfo)
+    {
+        cpuinfo = get_cpuInfo();
+    }
+    if (topology_isInitialized && !cputopo)
+    {
+        cputopo = get_cpuTopology();
+    }
+
+    if (!numa_isInitialized)
+    {
+        if (numa_init() != 0)
+        {
+            /* NUMA setup failed: report zero nodes */
+            lua_newtable(L);
+            lua_pushinteger(L, (lua_Integer)(0));
+            lua_setfield(L, -2, "numberOfNodes");
+            lua_newtable(L);
+            lua_setfield(L, -2, "nodes");
+            return 1;
+        }
+        numa_isInitialized = 1;
+        numainfo = get_numaTopology();
+    }
+    if (numa_isInitialized && !numainfo)
+    {
+        numainfo = get_numaTopology();
+    }
+
+    if (!affinity_isInitialized)
+    {
+        affinity_init();
+        affinity_isInitialized = 1;
+        affinity = get_affinityDomains();
+    }
+    if (affinity_isInitialized && !affinity)
+    {
+        affinity = get_affinityDomains();
+    }
+
+    lua_newtable(L);
+    lua_pushinteger(L, (lua_Integer)(numainfo->numberOfNodes));
+    lua_setfield(L, -2, "numberOfNodes");
+
+    lua_newtable(L);
+    for (node=0; node<numainfo->numberOfNodes; node++)
+    {
+        lua_pushinteger(L, node+1);
+        lua_newtable(L);
+
+        lua_pushinteger(L, (lua_Integer)(numainfo->nodes[node].id));
+        lua_setfield(L, -2, "id");
+        lua_pushinteger(L, (lua_Integer)(numainfo->nodes[node].totalMemory));
+        lua_setfield(L, -2, "totalMemory");
+        lua_pushinteger(L, (lua_Integer)(numainfo->nodes[node].freeMemory));
+        lua_setfield(L, -2, "freeMemory");
+        lua_pushinteger(L, (lua_Integer)(numainfo->nodes[node].numberOfProcessors));
+        lua_setfield(L, -2, "numberOfProcessors");
+        lua_pushinteger(L, (lua_Integer)(numainfo->nodes[node].numberOfDistances));
+        lua_setfield(L, -2, "numberOfDistances");
+
+        /* processors: plain list keyed from 1 */
+        lua_newtable(L);
+        for (k=0; k<numainfo->nodes[node].numberOfProcessors; k++)
+        {
+            lua_pushinteger(L, (lua_Integer)(k+1));
+            lua_pushinteger(L, (lua_Integer)(numainfo->nodes[node].processors[k]));
+            lua_settable(L,-3);
+        }
+        lua_setfield(L, -2, "processors");
+
+        /* a "processorsCompact" list is not exported; the code for it was
+         * commented out in the original */
+
+        /* distances: entry k+1 is a one-element table { [k] = distance } */
+        lua_newtable(L);
+        for (k=0; k<numainfo->nodes[node].numberOfDistances; k++)
+        {
+            lua_pushinteger(L,k+1);
+            lua_newtable(L);
+            lua_pushinteger(L,k);
+            lua_pushinteger(L, (lua_Integer)(numainfo->nodes[node].distances[k]));
+            lua_settable(L,-3);
+            lua_settable(L,-3);
+        }
+        lua_setfield(L, -2, "distances");
+
+        lua_settable(L,-3);
+    }
+    lua_setfield(L, -2, "nodes");
+    return 1;
+}
+
+/* Lua binding: finalize the NUMA module if its flag is set and clear the
+ * cached numainfo pointer. Nothing is returned to Lua. */
+static int lua_likwid_putNumaInfo(lua_State* L)
+{
+    if (!numa_isInitialized)
+    {
+        return 0;
+    }
+    numa_finalize();
+    numainfo = NULL;
+    numa_isInitialized = 0;
+    return 0;
+}
+
+/* Lua binding: read a thread count (argument 1) and a table of CPU ids (top
+ * of the stack) and pass them to numa_setInterleaved(). Raises a Lua error
+ * when the count is not positive or the top of the stack is not a table.
+ * Nothing is returned to Lua. */
+static int lua_likwid_setMemInterleaved(lua_State* L)
+{
+    int idx;
+    int nrThreads = luaL_checknumber(L,1);
+    luaL_argcheck(L, nrThreads > 0, 1, "Thread count must be greater than 0");
+    int cpus[nrThreads];
+
+    if (!lua_istable(L, -1))
+    {
+        lua_pushstring(L,"No table given as second argument");
+        lua_error(L);
+    }
+    /* the Lua table is read at indices 1..nrThreads, the C array is 0-based */
+    for (idx = 0; idx < nrThreads; idx++)
+    {
+        lua_rawgeti(L,-1,idx+1);
+        cpus[idx] = ((lua_Unsigned)lua_tointegerx(L,-1, NULL));
+        lua_pop(L,1);
+    }
+    numa_setInterleaved(cpus, nrThreads);
+    return 0;
+}
+
+/* Lua binding: return a table describing the cached affinity structure: the
+ * scalar domain counts plus a "domains" table (keyed from 1) whose entries
+ * carry tag, numberOfProcessors, numberOfCores and a "processorList".
+ * Initializes topology, NUMA and affinity on first use; raises a Lua error
+ * when no affinity information is available afterwards. */
+static int lua_likwid_getAffinityInfo(lua_State* L)
+{
+    int dom, proc;
+
+    if (!topology_isInitialized)
+    {
+        topology_init();
+        topology_isInitialized = 1;
+        cpuinfo = get_cpuInfo();
+        cputopo = get_cpuTopology();
+    }
+    if (topology_isInitialized && !cpuinfo)
+    {
+        cpuinfo = get_cpuInfo();
+    }
+    if (topology_isInitialized && !cputopo)
+    {
+        cputopo = get_cpuTopology();
+    }
+    /* the NUMA flag is only set when numa_init() reports success */
+    if (!numa_isInitialized && numa_init() == 0)
+    {
+        numa_isInitialized = 1;
+        numainfo = get_numaTopology();
+    }
+    if (numa_isInitialized && !numainfo)
+    {
+        numainfo = get_numaTopology();
+    }
+    if (!affinity_isInitialized)
+    {
+        affinity_init();
+        affinity_isInitialized = 1;
+        affinity = get_affinityDomains();
+    }
+    if (affinity_isInitialized && !affinity)
+    {
+        affinity = get_affinityDomains();
+    }
+
+    if (affinity == NULL)
+    {
+        lua_pushstring(L,"Cannot initialize affinity groups");
+        lua_error(L);
+    }
+
+    lua_newtable(L);
+    lua_pushinteger(L, (lua_Integer)(affinity->numberOfAffinityDomains));
+    lua_setfield(L, -2, "numberOfAffinityDomains");
+    lua_pushinteger(L, (lua_Integer)(affinity->numberOfSocketDomains));
+    lua_setfield(L, -2, "numberOfSocketDomains");
+    lua_pushinteger(L, (lua_Integer)(affinity->numberOfNumaDomains));
+    lua_setfield(L, -2, "numberOfNumaDomains");
+    lua_pushinteger(L, (lua_Integer)(affinity->numberOfProcessorsPerSocket));
+    lua_setfield(L, -2, "numberOfProcessorsPerSocket");
+    lua_pushinteger(L, (lua_Integer)(affinity->numberOfCacheDomains));
+    lua_setfield(L, -2, "numberOfCacheDomains");
+    lua_pushinteger(L, (lua_Integer)(affinity->numberOfCoresPerCache));
+    lua_setfield(L, -2, "numberOfCoresPerCache");
+    lua_pushinteger(L, (lua_Integer)(affinity->numberOfProcessorsPerCache));
+    lua_setfield(L, -2, "numberOfProcessorsPerCache");
+
+    /* domains: one sub-table per affinity domain */
+    lua_newtable(L);
+    for (dom=0; dom<affinity->numberOfAffinityDomains; dom++)
+    {
+        lua_pushinteger(L, (lua_Integer)( dom+1));
+        lua_newtable(L);
+        lua_pushstring(L,bdata(affinity->domains[dom].tag));
+        lua_setfield(L, -2, "tag");
+        lua_pushinteger(L, (lua_Integer)(affinity->domains[dom].numberOfProcessors));
+        lua_setfield(L, -2, "numberOfProcessors");
+        lua_pushinteger(L, (lua_Integer)(affinity->domains[dom].numberOfCores));
+        lua_setfield(L, -2, "numberOfCores");
+
+        lua_newtable(L);
+        for (proc=0; proc<affinity->domains[dom].numberOfProcessors; proc++)
+        {
+            lua_pushinteger(L, (lua_Integer)(proc+1));
+            lua_pushinteger(L, (lua_Integer)(affinity->domains[dom].processorList[proc]));
+            lua_settable(L,-3);
+        }
+        lua_setfield(L, -2, "processorList");
+
+        lua_settable(L,-3);
+    }
+    lua_setfield(L, -2, "domains");
+    return 1;
+}
+
+/* Lua binding: parse the CPU string given as argument 1 with
+ * cpustr_to_cpulist() and return two values: the number of entries and a
+ * table (keyed from 1) with the entries. Raises a Lua error when the
+ * temporary list cannot be allocated or the string cannot be parsed. */
+static int lua_likwid_cpustr_to_cpulist(lua_State* L)
+{
+    int ret = 0;
+    char* cpustr = (char *)luaL_checkstring(L, 1);
+    int* cpulist = (int*) malloc(MAX_NUM_THREADS * sizeof(int));
+    if (cpulist == NULL)
+    {
+        lua_pushstring(L,"Cannot allocate data for the CPU list");
+        lua_error(L);
+    }
+    ret = cpustr_to_cpulist(cpustr, cpulist, MAX_NUM_THREADS);
+    if (ret <= 0)
+    {
+        /* lua_error() does not return, so release the buffer first */
+        free(cpulist);
+        lua_pushstring(L,"Cannot parse cpustring");
+        lua_error(L);
+    }
+    lua_pushnumber(L, ret);
+    lua_newtable(L);
+    for (int i=0;i<ret;i++)
+    {
+        lua_pushinteger(L, (lua_Integer)( i+1));
+        lua_pushinteger(L, (lua_Integer)( cpulist[i]));
+        lua_settable(L,-3);
+    }
+    free(cpulist);
+    return 2;
+}
+
+/* Lua binding: parse the node string given as argument 1 with
+ * nodestr_to_nodelist() and return two values: the number of entries and a
+ * table (keyed from 1) with the entries. Raises a Lua error when the
+ * temporary list cannot be allocated or the string cannot be parsed. */
+static int lua_likwid_nodestr_to_nodelist(lua_State* L)
+{
+    int ret = 0;
+    char* nodestr = (char *)luaL_checkstring(L, 1);
+    int* nodelist = (int*) malloc(MAX_NUM_NODES * sizeof(int));
+    if (nodelist == NULL)
+    {
+        lua_pushstring(L,"Cannot allocate data for the node list");
+        lua_error(L);
+    }
+    ret = nodestr_to_nodelist(nodestr, nodelist, MAX_NUM_NODES);
+    if (ret <= 0)
+    {
+        /* lua_error() does not return, so release the buffer first */
+        free(nodelist);
+        lua_pushstring(L,"Cannot parse node string");
+        lua_error(L);
+    }
+    lua_pushnumber(L, ret);
+    lua_newtable(L);
+    for (int i=0;i<ret;i++)
+    {
+        lua_pushinteger(L, (lua_Integer)( i+1));
+        lua_pushinteger(L, (lua_Integer)( nodelist[i]));
+        lua_settable(L,-3);
+    }
+    free(nodelist);
+    return 2;
+}
+
+/* Lua binding: parse the socket string given as argument 1 and return two
+ * values: the number of entries and a table (keyed from 1) with the entries.
+ * Raises a Lua error when the temporary list cannot be allocated or the
+ * string cannot be parsed. */
+static int lua_likwid_sockstr_to_socklist(lua_State* L)
+{
+    int ret = 0;
+    char* sockstr = (char *)luaL_checkstring(L, 1);
+    int* socklist = (int*) malloc(MAX_NUM_NODES * sizeof(int));
+    if (socklist == NULL)
+    {
+        lua_pushstring(L,"Cannot allocate data for the socket list");
+        lua_error(L);
+    }
+    /* NOTE(review): the socket string is parsed with the node-string parser
+     * and sized by MAX_NUM_NODES - confirm this is intended. */
+    ret = nodestr_to_nodelist(sockstr, socklist, MAX_NUM_NODES);
+    if (ret <= 0)
+    {
+        /* lua_error() does not return, so release the buffer first */
+        free(socklist);
+        lua_pushstring(L,"Cannot parse socket string");
+        lua_error(L);
+    }
+    lua_pushnumber(L, ret);
+    lua_newtable(L);
+    for (int i=0;i<ret;i++)
+    {
+        lua_pushinteger(L, (lua_Integer)( i+1));
+        lua_pushinteger(L, (lua_Integer)( socklist[i]));
+        lua_settable(L,-3);
+    }
+    free(socklist);
+    return 2;
+}
+
+/* Lua binding: finalize the affinity module if its flag is set and clear the
+ * cached affinity pointer. Nothing is returned to Lua. */
+static int lua_likwid_putAffinityInfo(lua_State* L)
+{
+    if (!affinity_isInitialized)
+    {
+        return 0;
+    }
+    affinity_finalize();
+    affinity = NULL;
+    affinity_isInitialized = 0;
+    return 0;
+}
+
+/*
+ * Lua binding: returns one table built from the data of get_powerInfo().
+ *
+ * Topology, affinity and power modules are initialized on first use. When
+ * power_init() returns 0 nothing is pushed and the function returns 0.
+ * Otherwise the table carries the fields hasRAPL, baseFrequency,
+ * minFrequency, powerUnit, timeUnit, turbo (numSteps, steps[1..numSteps])
+ * and domains (one sub-table per entry of power_names).
+ */
+static int lua_likwid_getPowerInfo(lua_State* L)
+{
+    int idx;
+
+    /* Lazy initialization of the modules this function depends on. */
+    if (!topology_isInitialized)
+    {
+        topology_init();
+        topology_isInitialized = 1;
+        cpuinfo = get_cpuInfo();
+        cputopo = get_cpuTopology();
+    }
+    if (cpuinfo == NULL)
+    {
+        cpuinfo = get_cpuInfo();
+    }
+    if (cputopo == NULL)
+    {
+        cputopo = get_cpuTopology();
+    }
+    if (!affinity_isInitialized)
+    {
+        affinity_init();
+        affinity_isInitialized = 1;
+        affinity = get_affinityDomains();
+    }
+    if (affinity == NULL)
+    {
+        affinity = get_affinityDomains();
+    }
+
+    if (!power_isInitialized)
+    {
+        power_hasRAPL = power_init(0);
+        /* Register the first processor of every domain whose tag contains 'S'. */
+        for (idx = 0; idx < affinity->numberOfAffinityDomains; idx++)
+        {
+            if (bstrchrp(affinity->domains[idx].tag, 'S', 0) == BSTR_ERR)
+            {
+                continue;
+            }
+            HPMaddThread(affinity->domains[idx].processorList[0]);
+        }
+        if (!power_hasRAPL)
+        {
+            return 0;
+        }
+        power_isInitialized = 1;
+        power = get_powerInfo();
+    }
+
+    /* Top-level result table with the scalar fields. */
+    lua_newtable(L);
+    lua_pushboolean(L, power_hasRAPL);
+    lua_setfield(L, -2, "hasRAPL");
+    lua_pushnumber(L, power->baseFrequency);
+    lua_setfield(L, -2, "baseFrequency");
+    lua_pushnumber(L, power->minFrequency);
+    lua_setfield(L, -2, "minFrequency");
+    lua_pushnumber(L, power->powerUnit);
+    lua_setfield(L, -2, "powerUnit");
+    lua_pushnumber(L, power->timeUnit);
+    lua_setfield(L, -2, "timeUnit");
+
+    /* turbo = { numSteps = n, steps = { [1] = ..., ... } } */
+    lua_newtable(L);
+    lua_pushinteger(L, (lua_Integer)(power->turbo.numSteps));
+    lua_setfield(L, -2, "numSteps");
+    lua_newtable(L);
+    for (idx = 0; idx < power->turbo.numSteps; idx++)
+    {
+        lua_pushnumber(L, power->turbo.steps[idx]);
+        lua_rawseti(L, -2, idx + 1);
+    }
+    lua_setfield(L, -2, "steps");
+    lua_setfield(L, -2, "turbo");
+
+    /* domains = { [power_names[i]] = { ... } } */
+    lua_pushstring(L, "domains");
+    lua_newtable(L);
+    for (idx = 0; idx < NUM_POWER_DOMAINS; idx++)
+    {
+        int hasInfo = (power->domains[idx].supportFlags & POWER_DOMAIN_SUPPORT_INFO) != 0;
+
+        lua_pushstring(L, power_names[idx]);
+        lua_newtable(L);
+
+        lua_pushnumber(L, power->domains[idx].type);
+        lua_setfield(L, -2, "ID");
+        lua_pushnumber(L, power->domains[idx].energyUnit);
+        lua_setfield(L, -2, "energyUnit");
+
+        lua_pushboolean(L, (power->domains[idx].supportFlags & POWER_DOMAIN_SUPPORT_STATUS) != 0);
+        lua_setfield(L, -2, "supportStatus");
+        lua_pushboolean(L, (power->domains[idx].supportFlags & POWER_DOMAIN_SUPPORT_PERF) != 0);
+        lua_setfield(L, -2, "supportPerf");
+        lua_pushboolean(L, (power->domains[idx].supportFlags & POWER_DOMAIN_SUPPORT_POLICY) != 0);
+        lua_setfield(L, -2, "supportPolicy");
+        lua_pushboolean(L, (power->domains[idx].supportFlags & POWER_DOMAIN_SUPPORT_LIMIT) != 0);
+        lua_setfield(L, -2, "supportLimit");
+
+        lua_pushboolean(L, hasInfo);
+        lua_setfield(L, -2, "supportInfo");
+        if (hasInfo)
+        {
+            /* These fields are only exported when the INFO flag is set. */
+            lua_pushnumber(L, power->domains[idx].tdp);
+            lua_setfield(L, -2, "tdp");
+            lua_pushnumber(L, power->domains[idx].minPower);
+            lua_setfield(L, -2, "minPower");
+            lua_pushnumber(L, power->domains[idx].maxPower);
+            lua_setfield(L, -2, "maxPower");
+            lua_pushnumber(L, power->domains[idx].maxTimeWindow);
+            lua_setfield(L, -2, "maxTimeWindow");
+        }
+
+        lua_settable(L, -3);    /* domains[name] = entry */
+    }
+    lua_settable(L, -3);        /* result.domains = domains */
+
+    return 1;
+}
+
+/* Lua binding: when power_isInitialized is set, finalizes the power module
+ * and clears the cached info pointer. Pushes no values. */
+static int lua_likwid_putPowerInfo(lua_State* L)
+{
+    if (!power_isInitialized)
+    {
+        return 0;
+    }
+    power_finalize();
+    power_isInitialized = 0;
+    power = NULL;
+    return 0;
+}
+
+/*
+ * Lua binding around power_start().
+ * Arg 1: CPU id, must be >= 0.
+ * Arg 2: 1-based power domain, must lie in PKG+1 .. DRAM+1; it is converted
+ *        to the 0-based PowerType before the call.
+ * Returns the 'before' value that power_start() stored in the PowerData.
+ */
+static int lua_likwid_startPower(lua_State* L)
+{
+    PowerData pwrdata;
+    int cpuId = lua_tonumber(L,1);
+    /* CPU 0 is accepted by the check, so the message must not exclude it. */
+    luaL_argcheck(L, cpuId >= 0, 1, "CPU ID must be greater or equal 0");
+    PowerType type = (PowerType) ((lua_Unsigned)lua_tointegerx(L,2, NULL));
+    luaL_argcheck(L, type >= PKG+1 && type <= DRAM+1, 2, "Type not valid");
+    power_start(&pwrdata, cpuId, type-1);
+    lua_pushnumber(L,pwrdata.before);
+    return 1;
+}
+
+/*
+ * Lua binding around power_stop().
+ * Arg 1: CPU id, must be >= 0.
+ * Arg 2: 1-based power domain, must lie in PKG+1 .. DRAM+1; it is converted
+ *        to the 0-based PowerType before the call.
+ * Returns the 'after' value that power_stop() stored in the PowerData.
+ */
+static int lua_likwid_stopPower(lua_State* L)
+{
+    PowerData pwrdata;
+    int cpuId = lua_tonumber(L,1);
+    /* CPU 0 is accepted by the check, so the message must not exclude it. */
+    luaL_argcheck(L, cpuId >= 0, 1, "CPU ID must be greater or equal 0");
+    PowerType type = (PowerType) ((lua_Unsigned)lua_tointegerx(L,2, NULL));
+    luaL_argcheck(L, type >= PKG+1 && type <= DRAM+1, 2, "Type not valid");
+    power_stop(&pwrdata, cpuId, type-1);
+    lua_pushnumber(L,pwrdata.after);
+    return 1;
+}
+
+/* Lua binding: fills a PowerData from args 1 (before), 2 (after) and
+ * 3 (domain) and returns the result of power_printEnergy() on it. */
+static int lua_likwid_printEnergy(lua_State* L)
+{
+    PowerData sample;
+
+    sample.before = lua_tonumber(L, 1);
+    sample.after  = lua_tonumber(L, 2);
+    sample.domain = lua_tonumber(L, 3);
+
+    lua_pushnumber(L, power_printEnergy(&sample));
+    return 1;
+}
+
+/*
+ * Lua binding around power_limitGet().
+ * Arg 1: CPU id, arg 2: domain.
+ * Returns the two output values (power, time) on success; when the call
+ * returns a negative code, that code is returned as the single result.
+ */
+static int lua_likwid_power_limitGet(lua_State* L)
+{
+    double watts = 0.0;
+    double window = 0.0;
+    int cpuId = lua_tonumber(L,1);
+    int domain = lua_tonumber(L,2);
+    int err = power_limitGet(cpuId, domain, &watts, &window);
+
+    if (err >= 0)
+    {
+        lua_pushnumber(L, watts);
+        lua_pushnumber(L, window);
+        return 2;
+    }
+    lua_pushnumber(L, err);
+    return 1;
+}
+
+/* Lua binding around power_limitSet(). Args 1..5: CPU id, domain, power,
+ * time, clamp. Returns the integer result of the call. */
+static int lua_likwid_power_limitSet(lua_State* L)
+{
+    int cpuId     = lua_tonumber(L, 1);
+    int domain    = lua_tonumber(L, 2);
+    double watts  = lua_tonumber(L, 3);
+    double window = lua_tonumber(L, 4);
+    int clamp     = lua_tonumber(L, 5);
+
+    lua_pushinteger(L, power_limitSet(cpuId, domain, watts, window, clamp));
+    return 1;
+}
+
+/* Lua binding around power_limitState(). Arg 1: CPU id, arg 2: domain.
+ * Returns the call's result as a Lua number. */
+static int lua_likwid_power_limitState(lua_State* L)
+{
+    int cpuId  = lua_tonumber(L, 1);
+    int domain = lua_tonumber(L, 2);
+
+    lua_pushnumber(L, power_limitState(cpuId, domain));
+    return 1;
+}
+
+/* Lua binding: returns timer_getCpuClock(), initializing the timer module
+ * first if timer_isInitialized is still 0. */
+static int lua_likwid_getCpuClock(lua_State* L)
+{
+    if (!timer_isInitialized)
+    {
+        timer_init();
+        timer_isInitialized = 1;
+    }
+
+    lua_pushnumber(L, timer_getCpuClock());
+    return 1;
+}
+
+/* Lua binding: returns timer_getCycleClock(), initializing the timer module
+ * first if timer_isInitialized is still 0. */
+static int lua_likwid_getCycleClock(lua_State* L)
+{
+    if (!timer_isInitialized)
+    {
+        timer_init();
+        timer_isInitialized = 1;
+    }
+
+    lua_pushnumber(L, timer_getCycleClock());
+    return 1;
+}
+
+/* Lua binding: passes the topmost stack value (as unsigned integer) to
+ * timer_sleep() and returns that call's result. */
+static int lua_sleep(lua_State* L)
+{
+    lua_Unsigned duration = (lua_Unsigned)lua_tointegerx(L, -1, NULL);
+
+    lua_pushnumber(L, timer_sleep(duration));
+    return 1;
+}
+
+/* Lua binding: calls timer_start() on a local TimerData and returns its
+ * start.int64 value converted to double. Initializes the timer on demand. */
+static int lua_likwid_startClock(lua_State* L)
+{
+    TimerData stamp;
+
+    if (!timer_isInitialized)
+    {
+        timer_init();
+        timer_isInitialized = 1;
+    }
+    timer_start(&stamp);
+    lua_pushnumber(L, (double)stamp.start.int64);
+    return 1;
+}
+
+/* Lua binding: calls timer_stop() on a local TimerData and returns its
+ * stop.int64 value converted to double. Initializes the timer on demand. */
+static int lua_likwid_stopClock(lua_State* L)
+{
+    TimerData stamp;
+
+    if (!timer_isInitialized)
+    {
+        timer_init();
+        timer_isInitialized = 1;
+    }
+    timer_stop(&stamp);
+    lua_pushnumber(L, (double)stamp.stop.int64);
+    return 1;
+}
+
+/* Lua binding: rebuilds a TimerData from args 1 (start) and 2 (stop) and
+ * returns timer_printCycles() for it as a double. */
+static int lua_likwid_getClockCycles(lua_State* L)
+{
+    TimerData span;
+    double begin = lua_tonumber(L, 1);
+    double end = lua_tonumber(L, 2);
+
+    span.start.int64 = (uint64_t)begin;
+    span.stop.int64 = (uint64_t)end;
+
+    if (!timer_isInitialized)
+    {
+        timer_init();
+        timer_isInitialized = 1;
+    }
+    lua_pushnumber(L, (double)timer_printCycles(&span));
+    return 1;
+}
+
+/* Lua binding: rebuilds a TimerData from args 1 (start) and 2 (stop) and
+ * returns the value of timer_print() for it. */
+static int lua_likwid_getClock(lua_State* L)
+{
+    TimerData span;
+    double begin, end;
+
+    if (!timer_isInitialized)
+    {
+        timer_init();
+        timer_isInitialized = 1;
+    }
+
+    begin = lua_tonumber(L, 1);
+    end = lua_tonumber(L, 2);
+    span.start.int64 = (uint64_t)begin;
+    span.stop.int64 = (uint64_t)end;
+
+    lua_pushnumber(L, timer_print(&span));
+    return 1;
+}
+
+/* Lua binding: calls thermal_init() with the CPU id taken from the top of
+ * the stack. Pushes no values. */
+static int lua_likwid_initTemp(lua_State* L)
+{
+    int cpu = ((lua_Unsigned)lua_tointegerx(L, -1, NULL));
+
+    thermal_init(cpu);
+    return 0;
+}
+
+/* Lua binding: reads the thermal value for the CPU id on top of the stack
+ * via thermal_read() and returns it. A non-zero return of thermal_read()
+ * raises a Lua error. */
+static int lua_likwid_readTemp(lua_State* L)
+{
+    uint32_t reading;
+    int cpu = ((lua_Unsigned)lua_tointegerx(L, -1, NULL));
+
+    if (thermal_read(cpu, &reading) != 0)
+    {
+        lua_pushstring(L,"Cannot read thermal data");
+        lua_error(L);
+    }
+    lua_pushnumber(L, reading);
+    return 1;
+}
+
+
+/* Number of SIGINT deliveries counted by signal_catcher(). */
+static volatile int recv_sigint = 0;
+
+/* Signal handler: counts SIGINT, ignores every other signal number. */
+static void signal_catcher(int signo)
+{
+    if (signo != SIGINT)
+    {
+        return;
+    }
+    recv_sigint++;
+}
+
+/* Lua binding: installs signal_catcher() as the SIGINT handler. */
+static int lua_likwid_catch_signal(lua_State* L)
+{
+    (void)signal(SIGINT, signal_catcher);
+    return 0;
+}
+
+/* Lua binding: returns the current value of the recv_sigint counter. */
+static int lua_likwid_return_signal_state(lua_State* L)
+{
+    int seen = recv_sigint;
+
+    lua_pushnumber(L, seen);
+    return 1;
+}
+
+/*
+ * Splits 'line' in place into whitespace-separated tokens.
+ *
+ * Every blank, tab and newline that precedes a token is overwritten with
+ * '\0', and a pointer to each token is stored in 'argv'. The list is
+ * terminated with a NULL pointer so it can be handed to the exec family.
+ * Trailing whitespace does not produce an empty token.
+ *
+ * The caller must provide an 'argv' array with room for all tokens plus the
+ * terminating NULL; no bound is checked here.
+ */
+void parse(char *line, char **argv)
+{
+     while (*line != '\0') {
+          /* turn leading separators into string terminators */
+          while (*line == ' ' || *line == '\t' || *line == '\n')
+               *line++ = '\0';
+          /* only separators were left: do not record an empty argument */
+          if (*line == '\0')
+               break;
+          *argv++ = line;          /* remember where the token starts */
+          /* advance to the end of the token */
+          while (*line != '\0' && *line != ' ' &&
+                 *line != '\t' && *line != '\n')
+               line++;
+     }
+     *argv = NULL;                 /* terminate the argument list */
+}
+
+/* Flag set while a program started by lua_likwid_startProgram() is
+ * considered running. */
+static volatile int program_running = 0;
+
+/* Signal handler: clears program_running, regardless of the signal number. */
+static void catch_sigchild(int signo)
+{
+    program_running = 0;
+}
+
+/*
+ * Lua binding: forks and executes a command line.
+ *
+ * Arg 1: command line string; it is tokenized with parse().
+ * Arg 2: number of CPUs in the table on top of the stack. When it is <= 0
+ *        the apicId of every entry in cpuid_topology.threadPool is used.
+ * Top of stack: table of CPU ids (only read when arg 2 is > 0).
+ *
+ * The child pins itself with affinity_pinProcesses() and then calls
+ * execvp(). Returns the child's pid to Lua, or nothing if the command line
+ * is empty, memory cannot be allocated or fork() fails.
+ */
+static int lua_likwid_startProgram(lua_State* L)
+{
+    pid_t pid;
+    int slot;
+    size_t cmdlen;
+    char *exec = NULL;
+    char **argv = NULL;
+    const char *cmdline = luaL_checkstring(L, 1);
+    int nrThreads = luaL_checknumber(L,2);
+    int cpus[MAX_NUM_THREADS];
+
+    /* cpus[] has a fixed size; reject counts that would write past it. */
+    luaL_argcheck(L, nrThreads <= MAX_NUM_THREADS, 2,
+                  "Thread count exceeds MAX_NUM_THREADS");
+    if (nrThreads > 0)
+    {
+        if (!lua_istable(L, -1)) {
+          lua_pushstring(L,"No table given as second argument");
+          lua_error(L);
+        }
+        for (slot = 1; slot <= nrThreads; slot++)
+        {
+            lua_rawgeti(L,-1,slot);
+            cpus[slot-1] = ((lua_Unsigned)lua_tointegerx(L,-1, NULL));
+            lua_pop(L,1);
+        }
+    }
+    else
+    {
+        /* No list given: use all hardware threads that fit into cpus[]. */
+        for (nrThreads = 0;
+             nrThreads < cpuid_topology.numHWThreads && nrThreads < MAX_NUM_THREADS;
+             nrThreads++)
+        {
+            cpus[nrThreads] = cpuid_topology.threadPool[nrThreads].apicId;
+        }
+    }
+
+    /*
+     * parse() writes into its input, and the buffer returned by
+     * luaL_checkstring() belongs to Lua and must not be modified, so work on
+     * a private copy. A string of length n holds at most n/2 + 1 tokens, so
+     * n/2 + 2 slots always leave room for the terminating NULL.
+     * Allocation happens after the last call that can raise a Lua error so
+     * that no error path leaks the buffers.
+     */
+    cmdlen = strlen(cmdline);
+    exec = strdup(cmdline);
+    argv = (char **)malloc((cmdlen / 2 + 2) * sizeof(char *));
+    if (exec == NULL || argv == NULL)
+    {
+        free(exec);
+        free(argv);
+        return 0;
+    }
+    parse(exec, argv);
+    if (argv[0] == NULL)
+    {
+        /* Empty command line: nothing to execute. */
+        free(exec);
+        free(argv);
+        return 0;
+    }
+
+    /* Install the handler before fork() so an early child exit is not missed. */
+    signal(SIGCHLD, catch_sigchild);
+    program_running = 1;
+    pid = fork();
+    if (pid < 0)
+    {
+        program_running = 0;
+        free(exec);
+        free(argv);
+        return 0;
+    }
+    else if (pid == 0)
+    {
+        if (nrThreads > 0)
+        {
+            affinity_pinProcesses(nrThreads, cpus);
+        }
+        timer_sleep(10);
+        execvp(argv[0], argv);
+        /*
+         * Only reached when execvp() failed. Leave the child immediately
+         * instead of returning into the Lua interpreter, which would run the
+         * calling script a second time. The parent is notified through the
+         * regular SIGCHLD raised by the exit.
+         */
+        _exit(127);
+    }
+
+    free(exec);
+    free(argv);
+    lua_pushnumber(L, pid);
+    return 1;
+}
+/* Lua binding: returns program_running as a boolean. When called with
+ * exactly one argument (a pid), the flag is cleared first if a non-blocking
+ * waitpid() returns that pid. */
+static int lua_likwid_checkProgram(lua_State* L)
+{
+    if (lua_gettop(L) == 1)
+    {
+        int status;
+        pid_t pid = lua_tonumber(L, 1);
+
+        if (waitpid(pid, &status, WNOHANG) == pid)
+        {
+            program_running = 0;
+        }
+    }
+    lua_pushboolean(L, program_running);
+    return 1;
+}
+
+/* Lua binding: sends SIGTERM to the pid given as arg 1 and clears
+ * program_running. Pushes no values. */
+static int lua_likwid_killProgram(lua_State* L)
+{
+    pid_t target = lua_tonumber(L, 1);
+
+    kill(target, SIGTERM);
+    program_running = 0;
+    return 0;
+}
+
+/* Lua binding: blocking waitpid() on the pid given as arg 1. The exit status
+ * is discarded and nothing is pushed. */
+static int lua_likwid_waitwid(lua_State* L)
+{
+    int exitStatus;
+    pid_t target = lua_tonumber(L, 1);
+
+    waitpid(target, &exitStatus, 0);
+    return 0;
+}
+
+/*
+ * Lua binding around memsweep_threadGroup().
+ * Arg 1: number of CPUs (> 0). Top of stack: table holding the CPU ids at
+ * indices 1..n. Raises a Lua error when the top of the stack is no table.
+ */
+static int lua_likwid_memSweep(lua_State* L)
+{
+    int slot;
+    int nrThreads = luaL_checknumber(L,1);
+    luaL_argcheck(L, nrThreads > 0, 1, "Thread count must be greater than 0");
+    int cpus[nrThreads];
+
+    if (lua_istable(L, -1) == 0)
+    {
+        lua_pushstring(L,"No table given as second argument");
+        lua_error(L);
+    }
+    /* Lua tables are 1-based, the C array is 0-based. */
+    for (slot = 0; slot < nrThreads; slot++)
+    {
+        lua_rawgeti(L, -1, slot + 1);
+        cpus[slot] = ((lua_Unsigned)lua_tointegerx(L,-1, NULL));
+        lua_pop(L, 1);
+    }
+    memsweep_threadGroup(cpus, nrThreads);
+    return 0;
+}
+
+/* Lua binding around memsweep_domain(). Arg 1: domain id, must be >= 0. */
+static int lua_likwid_memSweepDomain(lua_State* L)
+{
+    int domainId = luaL_checknumber(L, 1);
+
+    luaL_argcheck(L, domainId >= 0, 1, "Domain ID must be greater or equal 0");
+    memsweep_domain(domainId);
+    return 0;
+}
+
+/*
+ * Lua binding around affinity_pinProcess().
+ * Stack (from the top): silent flag at -1, CPU id (>= 0) at -2.
+ * The affinity module is initialized on demand. Unless 'silent' is non-zero
+ * a confirmation line is printed to stdout.
+ */
+static int lua_likwid_pinProcess(lua_State* L)
+{
+    int cpuID = luaL_checknumber(L,-2);
+    int silent = luaL_checknumber(L,-1);
+    luaL_argcheck(L, cpuID >= 0, 1, "CPU ID must be greater or equal 0");
+
+    if (!affinity_isInitialized)
+    {
+        affinity_init();
+        affinity_isInitialized = 1;
+        affinity = get_affinityDomains();
+    }
+    affinity_pinProcess(cpuID);
+
+    if (silent)
+    {
+        return 0;
+    }
+#ifdef COLOR
+    color_on(BRIGHT, COLOR);
+#endif
+    printf("[likwid-pin] Main PID -> core %d - OK",  cpuID);
+#ifdef COLOR
+    color_reset();
+#endif
+    printf("\n");
+    return 0;
+}
+
+/* Lua binding: setenv(name, value, 1) with the name at stack index -2 and
+ * the value at -1; an existing variable is overwritten. */
+static int lua_likwid_setenv(lua_State* L)
+{
+    const char* name = (const char*)luaL_checkstring(L, -2);
+    const char* content = (const char*)luaL_checkstring(L, -1);
+
+    setenv(name, content, 1);
+    return 0;
+}
+
+/* Lua binding: returns getpid() as a Lua integer. */
+static int lua_likwid_getpid(lua_State* L)
+{
+    lua_Integer self = (lua_Integer)(getpid());
+
+    lua_pushinteger(L, self);
+    return 1;
+}
+
+/* Lua binding: stores the integer on top of the stack in perfmon_verbosity
+ * after checking that it lies in 0 .. DEBUGLEV_DEVELOP. */
+static int lua_likwid_setVerbosity(lua_State* L)
+{
+    int level = lua_tointeger(L, -1);
+    int inRange = (level >= 0 && level <= DEBUGLEV_DEVELOP);
+
+    luaL_argcheck(L, inRange, -1,
+                "Verbosity must be between 0 (only errors) and 3 (developer)");
+    perfmon_verbosity = level;
+    return 0;
+}
+
+/*
+ * Lua binding around access(2).
+ * Arg 1: file name.
+ * Arg 2 (optional): permission string built from the characters
+ *        'r' (R_OK), 'w' (W_OK), 'x' (X_OK) and 'e' (F_OK); other characters
+ *        are ignored. When the argument is absent or nil only F_OK is used.
+ * Returns the result of access().
+ */
+static int lua_likwid_access(lua_State* L)
+{
+    int flags = 0;
+    const char* cursor;
+    const char* file = (const char*)luaL_checkstring(L, 1);
+    /* luaL_checkstring() raises an error instead of returning NULL, which
+     * made the F_OK default unreachable; luaL_optstring() yields NULL for a
+     * missing argument. */
+    const char* perm = (const char*)luaL_optstring(L, 2, NULL);
+
+    if (perm == NULL)
+    {
+        flags = F_OK;
+    }
+    else
+    {
+        for (cursor = perm; *cursor != '\0'; cursor++)
+        {
+            switch (*cursor)
+            {
+                case 'r':
+                    flags |= R_OK;
+                    break;
+                case 'w':
+                    flags |= W_OK;
+                    break;
+                case 'x':
+                    flags |= X_OK;
+                    break;
+                case 'e':
+                    flags |= F_OK;
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    lua_pushinteger(L, access(file, flags));
+    return 1;
+}
+
+/* Lua binding: forwards to likwid_markerInit(). Takes and returns nothing. */
+static int lua_likwid_markerInit(lua_State* L)
+{
+    (void)L;
+    likwid_markerInit();
+    return 0;
+}
+
+/* Lua binding: forwards to likwid_markerThreadInit(). Takes and returns
+ * nothing. */
+static int lua_likwid_markerThreadInit(lua_State* L)
+{
+    (void)L;
+    likwid_markerThreadInit();
+    return 0;
+}
+
+/* Lua binding: forwards to likwid_markerClose(). Takes and returns nothing. */
+static int lua_likwid_markerClose(lua_State* L)
+{
+    (void)L;
+    likwid_markerClose();
+    return 0;
+}
+
+/* Lua binding: forwards to likwid_markerNextGroup(). Takes and returns
+ * nothing. */
+static int lua_likwid_markerNext(lua_State* L)
+{
+    (void)L;
+    likwid_markerNextGroup();
+    return 0;
+}
+
+/* Lua binding: passes the string on top of the stack to
+ * likwid_markerRegisterRegion() and returns its result. */
+static int lua_likwid_registerRegion(lua_State* L)
+{
+    const char* regionTag = (const char*)luaL_checkstring(L, -1);
+
+    lua_pushinteger(L, likwid_markerRegisterRegion(regionTag));
+    return 1;
+}
+
+/* Lua binding: passes the string on top of the stack to
+ * likwid_markerStartRegion() and returns its result. */
+static int lua_likwid_startRegion(lua_State* L)
+{
+    const char* regionTag = (const char*)luaL_checkstring(L, -1);
+
+    lua_pushinteger(L, likwid_markerStartRegion(regionTag));
+    return 1;
+}
+
+/* Lua binding: passes the string on top of the stack to
+ * likwid_markerStopRegion() and returns its result. */
+static int lua_likwid_stopRegion(lua_State* L)
+{
+    const char* regionTag = (const char*)luaL_checkstring(L, -1);
+
+    lua_pushinteger(L, likwid_markerStopRegion(regionTag));
+    return 1;
+}
+
+/*
+ * Lua binding around likwid_markerGetRegion().
+ * The region tag is read from stack index -2. A zeroed buffer sized by the
+ * event count of the active group is handed to the call.
+ * Returns four values: number of events, table of event values (1-based),
+ * time and call count. Raises a Lua error if the buffer cannot be allocated.
+ */
+static int lua_likwid_getRegion(lua_State* L)
+{
+    int ev;
+    const char* tag = (const char*)luaL_checkstring(L, -2);
+    int nr_events = perfmon_getNumberOfEvents(perfmon_getIdOfActiveGroup());
+    double time = 0.0;
+    int count = 0;
+    double* events = (double*) malloc(nr_events * sizeof(double));
+
+    if (events == NULL)
+    {
+        lua_pushstring(L,"Cannot allocate memory for event data\n");
+        lua_error(L);
+    }
+    for (ev = 0; ev < nr_events; ev++)
+    {
+        events[ev] = 0.0;
+    }
+
+    /* nr_events is an in/out parameter and is used as updated by the call. */
+    likwid_markerGetRegion(tag, &nr_events, events, &time, &count);
+
+    lua_pushinteger(L, nr_events);
+    lua_newtable(L);
+    for (ev = 0; ev < nr_events; ev++)
+    {
+        lua_pushnumber(L, events[ev]);
+        lua_rawseti(L, -2, ev + 1);
+    }
+    lua_pushnumber(L, time);
+    lua_pushinteger(L, count);
+
+    free(events);
+    return 4;
+}
+
+/* Lua binding: forwards to cpuFeatures_init(). Takes and returns nothing. */
+static int lua_likwid_cpuFeatures_init(lua_State* L)
+{
+    (void)L;
+    cpuFeatures_init();
+    return 0;
+}
+
+/* Lua binding: calls cpuFeatures_print() for the CPU id on top of the
+ * stack. Pushes no values. */
+static int lua_likwid_cpuFeatures_print(lua_State* L)
+{
+    int cpuId = lua_tointeger(L, -1);
+
+    cpuFeatures_print(cpuId);
+    return 0;
+}
+
+/* Lua binding around cpuFeatures_get(). Stack: CPU id at -2, feature at -1.
+ * Returns the call's result as an integer. */
+static int lua_likwid_cpuFeatures_get(lua_State* L)
+{
+    CpuFeature which = lua_tointeger(L, -1);
+    int cpuId = lua_tointeger(L, -2);
+
+    lua_pushinteger(L, cpuFeatures_get(cpuId, which));
+    return 1;
+}
+
+/* Lua binding around cpuFeatures_name() for the feature id on top of the
+ * stack. Returns the name, or nothing when the lookup yields NULL. */
+static int lua_likwid_cpuFeatures_name(lua_State* L)
+{
+    CpuFeature which = ((lua_Unsigned)lua_tointegerx(L, -1, NULL));
+    char* label = cpuFeatures_name(which);
+
+    if (label == NULL)
+    {
+        return 0;
+    }
+    lua_pushstring(L, label);
+    return 1;
+}
+
+/* Lua binding around cpuFeatures_enable(). Stack: CPU id at -3, feature at
+ * -2, verbose flag at -1. Returns the call's result as an integer. */
+static int lua_likwid_cpuFeatures_enable(lua_State* L)
+{
+    int verbose = lua_tointeger(L, -1);
+    CpuFeature which = lua_tointeger(L, -2);
+    int cpuId = lua_tointeger(L, -3);
+
+    lua_pushinteger(L, cpuFeatures_enable(cpuId, which, verbose));
+    return 1;
+}
+
+/* Lua binding around cpuFeatures_disable(). Stack: CPU id at -3, feature at
+ * -2, verbose flag at -1. Returns the call's result as an integer. */
+static int lua_likwid_cpuFeatures_disable(lua_State* L)
+{
+    int verbose = lua_tointeger(L, -1);
+    CpuFeature which = lua_tointeger(L, -2);
+    int cpuId = lua_tointeger(L, -3);
+
+    lua_pushinteger(L, cpuFeatures_disable(cpuId, which, verbose));
+    return 1;
+}
+
+/* Lua binding: passes the file name on top of the stack to
+ * perfmon_readMarkerFile(). Pushes no values. */
+static int lua_likwid_markerFile_read(lua_State* L)
+{
+    const char* path = (const char*)luaL_checkstring(L, -1);
+
+    perfmon_readMarkerFile(path);
+    return 0;
+}
+
+/* Lua binding: forwards to perfmon_destroyMarkerResults(). Takes and
+ * returns nothing. */
+static int lua_likwid_markerFile_destroy(lua_State* L)
+{
+    (void)L;
+    perfmon_destroyMarkerResults();
+    return 0;
+}
+
+/* Lua binding: returns perfmon_getNumberOfRegions() as an integer. */
+static int lua_likwid_markerNumRegions(lua_State* L)
+{
+    lua_pushinteger(
+        L,
+        perfmon_getNumberOfRegions());
+    return 1;
+}
+
+/* Lua binding around perfmon_getGroupOfRegion(). The 1-based region id on
+ * top of the stack is shifted to 0-based for the call and the result is
+ * shifted back by +1 before it is returned. */
+static int lua_likwid_markerRegionGroup(lua_State* L)
+{
+    int regionId = lua_tointeger(L, -1);
+    int zeroBased = regionId - 1;
+
+    lua_pushinteger(L, perfmon_getGroupOfRegion(zeroBased)+1);
+    return 1;
+}
+
+/* Lua binding around perfmon_getTagOfRegion(). The 1-based region id on top
+ * of the stack is shifted to 0-based; the tag is returned as a string. */
+static int lua_likwid_markerRegionTag(lua_State* L)
+{
+    int regionId = lua_tointeger(L, -1);
+    int zeroBased = regionId - 1;
+
+    lua_pushstring(L, perfmon_getTagOfRegion(zeroBased));
+    return 1;
+}
+
+/* Lua binding around perfmon_getEventsOfRegion(). The 1-based region id on
+ * top of the stack is shifted to 0-based for the call. */
+static int lua_likwid_markerRegionEvents(lua_State* L)
+{
+    int regionId = lua_tointeger(L, -1);
+    int zeroBased = regionId - 1;
+
+    lua_pushinteger(L, perfmon_getEventsOfRegion(zeroBased));
+    return 1;
+}
+
+/* Lua binding around perfmon_getThreadsOfRegion(). The 1-based region id on
+ * top of the stack is shifted to 0-based for the call. */
+static int lua_likwid_markerRegionThreads(lua_State* L)
+{
+    int regionId = lua_tointeger(L, -1);
+    int zeroBased = regionId - 1;
+
+    lua_pushinteger(L, perfmon_getThreadsOfRegion(zeroBased));
+    return 1;
+}
+
+/*
+ * Lua binding around perfmon_getCpulistOfRegion().
+ * The 1-based region id on top of the stack is shifted to 0-based. The
+ * topology module is initialized on demand because its numHWThreads value
+ * sizes the temporary CPU buffer.
+ * Returns a 1-based table of CPU ids, or nothing when the buffer cannot be
+ * allocated or the call reports no CPUs.
+ */
+static int lua_likwid_markerRegionCpulist(lua_State* L)
+{
+    int i = 0;
+    int region = lua_tointeger(L,-1);
+    int* cpulist;
+    int regionCPUs = 0;
+    int results = 0;
+    if (topology_isInitialized == 0)
+    {
+        topology_init();
+        topology_isInitialized = 1;
+        cpuinfo = get_cpuInfo();
+        cputopo = get_cpuTopology();
+    }
+    if ((topology_isInitialized) && (cpuinfo == NULL))
+    {
+        cpuinfo = get_cpuInfo();
+    }
+    if ((topology_isInitialized) && (cputopo == NULL))
+    {
+        cputopo = get_cpuTopology();
+    }
+    cpulist = (int*)malloc(cputopo->numHWThreads * sizeof(int));
+    if (cpulist == NULL)
+    {
+        return 0;
+    }
+    regionCPUs = perfmon_getCpulistOfRegion(region-1, cputopo->numHWThreads, cpulist);
+    if (regionCPUs > 0)
+    {
+        lua_newtable(L);
+        for (i=0; i < regionCPUs; i++)
+        {
+            lua_pushinteger(L, i+1);
+            lua_pushinteger(L, cpulist[i]);
+            lua_settable(L, -3);
+        }
+        results = 1;
+    }
+    /* The values were copied into the Lua table; release the scratch buffer
+     * on every path instead of leaking it on each call. */
+    free(cpulist);
+    return results;
+}
+
+/* Lua binding around perfmon_getTimeOfRegion(). Stack: 1-based region id
+ * at -2, 1-based thread id at -1; both are shifted to 0-based. */
+static int lua_likwid_markerRegionTime(lua_State* L)
+{
+    int threadId = lua_tointeger(L, -1);
+    int regionId = lua_tointeger(L, -2);
+
+    lua_pushnumber(L, perfmon_getTimeOfRegion(regionId - 1, threadId - 1));
+    return 1;
+}
+
+/* Lua binding around perfmon_getCountOfRegion(). Stack: 1-based region id
+ * at -2, 1-based thread id at -1; both are shifted to 0-based. */
+static int lua_likwid_markerRegionCount(lua_State* L)
+{
+    int threadId = lua_tointeger(L, -1);
+    int regionId = lua_tointeger(L, -2);
+
+    lua_pushinteger(L, perfmon_getCountOfRegion(regionId - 1, threadId - 1));
+    return 1;
+}
+
+/* Lua binding around perfmon_getResultOfRegionThread(). Stack: 1-based
+ * region id at -3, event id at -2, thread id at -1; all three are shifted
+ * to 0-based for the call. */
+static int lua_likwid_markerRegionResult(lua_State* L)
+{
+    int threadId = lua_tointeger(L, -1);
+    int eventId = lua_tointeger(L, -2);
+    int regionId = lua_tointeger(L, -3);
+
+    lua_pushnumber(L,
+        perfmon_getResultOfRegionThread(regionId - 1, eventId - 1, threadId - 1));
+    return 1;
+}
+
+/* Lua binding around perfmon_getMetricOfRegionThread(). Stack: 1-based
+ * region id at -3, metric id at -2, thread id at -1; all three are shifted
+ * to 0-based for the call. */
+static int lua_likwid_markerRegionMetric(lua_State* L)
+{
+    int threadId = lua_tointeger(L, -1);
+    int metricId = lua_tointeger(L, -2);
+    int regionId = lua_tointeger(L, -3);
+
+    lua_pushnumber(L,
+        perfmon_getMetricOfRegionThread(regionId - 1, metricId - 1, threadId - 1));
+    return 1;
+}
+
+/*
+ * Library entry point: registers every binding of this file as a global Lua
+ * function. The table below lists the global name next to the C function;
+ * entries are registered in table order. On builds with __MIC__ defined the
+ * process additionally calls setuid(0) and seteuid(0).
+ */
+int __attribute__ ((visibility ("default") )) luaopen_liblikwid(lua_State* L){
+    static const struct {
+        const char*   name;
+        lua_CFunction func;
+    } bindings[] = {
+        /* Configuration functions */
+        {"likwid_getConfiguration", lua_likwid_getConfiguration},
+        {"likwid_setGroupPath", lua_likwid_setGroupPath},
+        {"likwid_putConfiguration", lua_likwid_putConfiguration},
+        /* Perfmon functions */
+        {"likwid_setAccessClientMode", lua_likwid_setAccessMode},
+        {"likwid_init", lua_likwid_init},
+        {"likwid_addEventSet", lua_likwid_addEventSet},
+        {"likwid_setupCounters", lua_likwid_setupCounters},
+        {"likwid_startCounters", lua_likwid_startCounters},
+        {"likwid_stopCounters", lua_likwid_stopCounters},
+        {"likwid_readCounters", lua_likwid_readCounters},
+        {"likwid_switchGroup", lua_likwid_switchGroup},
+        {"likwid_finalize", lua_likwid_finalize},
+        {"likwid_getEventsAndCounters", lua_likwid_getEventsAndCounters},
+        /* Perfmon results functions */
+        {"likwid_getResult", lua_likwid_getResult},
+        {"likwid_getLastResult", lua_likwid_getLastResult},
+        {"likwid_getMetric", lua_likwid_getMetric},
+        {"likwid_getLastMetric", lua_likwid_getLastMetric},
+        {"likwid_getNumberOfGroups", lua_likwid_getNumberOfGroups},
+        {"likwid_getRuntimeOfGroup", lua_likwid_getRuntimeOfGroup},
+        {"likwid_getIdOfActiveGroup", lua_likwid_getIdOfActiveGroup},
+        {"likwid_getNumberOfEvents", lua_likwid_getNumberOfEvents},
+        {"likwid_getNumberOfMetrics", lua_likwid_getNumberOfMetrics},
+        {"likwid_getNumberOfThreads", lua_likwid_getNumberOfThreads},
+        {"likwid_getNameOfEvent", lua_likwid_getNameOfEvent},
+        {"likwid_getNameOfCounter", lua_likwid_getNameOfCounter},
+        {"likwid_getNameOfMetric", lua_likwid_getNameOfMetric},
+        {"likwid_getNameOfGroup", lua_likwid_getNameOfGroup},
+        {"likwid_getGroups", lua_likwid_getGroups},
+        {"likwid_getShortInfoOfGroup", lua_likwid_getShortInfoOfGroup},
+        {"likwid_getLongInfoOfGroup", lua_likwid_getLongInfoOfGroup},
+        /* Topology functions */
+        {"likwid_getCpuInfo", lua_likwid_getCpuInfo},
+        {"likwid_getCpuTopology", lua_likwid_getCpuTopology},
+        {"likwid_putTopology", lua_likwid_putTopology},
+        {"likwid_getNumaInfo", lua_likwid_getNumaInfo},
+        {"likwid_putNumaInfo", lua_likwid_putNumaInfo},
+        {"likwid_setMemInterleaved", lua_likwid_setMemInterleaved},
+        {"likwid_getAffinityInfo", lua_likwid_getAffinityInfo},
+        {"likwid_putAffinityInfo", lua_likwid_putAffinityInfo},
+        {"likwid_getPowerInfo", lua_likwid_getPowerInfo},
+        {"likwid_putPowerInfo", lua_likwid_putPowerInfo},
+        {"likwid_getOnlineDevices", lua_likwid_getOnlineDevices},
+        {"likwid_printSupportedCPUs", lua_likwid_printSupportedCPUs},
+        /* CPU string parse functions */
+        {"likwid_cpustr_to_cpulist", lua_likwid_cpustr_to_cpulist},
+        {"likwid_nodestr_to_nodelist", lua_likwid_nodestr_to_nodelist},
+        {"likwid_sockstr_to_socklist", lua_likwid_sockstr_to_socklist},
+        /* Timer functions */
+        {"likwid_getCpuClock", lua_likwid_getCpuClock},
+        {"likwid_getCycleClock", lua_likwid_getCycleClock},
+        {"likwid_startClock", lua_likwid_startClock},
+        {"likwid_stopClock", lua_likwid_stopClock},
+        {"likwid_getClockCycles", lua_likwid_getClockCycles},
+        {"likwid_getClock", lua_likwid_getClock},
+        {"sleep", lua_sleep},
+        /* Power functions */
+        {"likwid_startPower", lua_likwid_startPower},
+        {"likwid_stopPower", lua_likwid_stopPower},
+        {"likwid_printEnergy", lua_likwid_printEnergy},
+        {"likwid_powerLimitGet", lua_likwid_power_limitGet},
+        {"likwid_powerLimitSet", lua_likwid_power_limitSet},
+        {"likwid_powerLimitState", lua_likwid_power_limitState},
+        /* Temperature functions */
+        {"likwid_initTemp", lua_likwid_initTemp},
+        {"likwid_readTemp", lua_likwid_readTemp},
+        /* MemSweep functions */
+        {"likwid_memSweep", lua_likwid_memSweep},
+        {"likwid_memSweepDomain", lua_likwid_memSweepDomain},
+        /* Pinning functions */
+        {"likwid_pinProcess", lua_likwid_pinProcess},
+        /* Helper functions */
+        {"likwid_setenv", lua_likwid_setenv},
+        {"likwid_getpid", lua_likwid_getpid},
+        {"likwid_access", lua_likwid_access},
+        {"likwid_startProgram", lua_likwid_startProgram},
+        {"likwid_checkProgram", lua_likwid_checkProgram},
+        {"likwid_killProgram", lua_likwid_killProgram},
+        {"likwid_catchSignal", lua_likwid_catch_signal},
+        {"likwid_getSignalState", lua_likwid_return_signal_state},
+        {"likwid_waitwid", lua_likwid_waitwid},
+        /* Verbosity functions */
+        {"likwid_setVerbosity", lua_likwid_setVerbosity},
+        /* Marker API functions */
+        {"likwid_markerInit", lua_likwid_markerInit},
+        {"likwid_markerThreadInit", lua_likwid_markerThreadInit},
+        {"likwid_markerNextGroup", lua_likwid_markerNext},
+        {"likwid_markerClose", lua_likwid_markerClose},
+        {"likwid_registerRegion", lua_likwid_registerRegion},
+        {"likwid_startRegion", lua_likwid_startRegion},
+        {"likwid_stopRegion", lua_likwid_stopRegion},
+        {"likwid_getRegion", lua_likwid_getRegion},
+        /* CPU feature manipulation functions */
+        {"likwid_cpuFeaturesInit", lua_likwid_cpuFeatures_init},
+        {"likwid_cpuFeaturesGet", lua_likwid_cpuFeatures_get},
+        {"likwid_cpuFeaturesEnable", lua_likwid_cpuFeatures_enable},
+        {"likwid_cpuFeaturesDisable", lua_likwid_cpuFeatures_disable},
+        /* Marker API related functions */
+        {"likwid_readMarkerFile", lua_likwid_markerFile_read},
+        {"likwid_destroyMarkerFile", lua_likwid_markerFile_destroy},
+        {"likwid_markerNumRegions", lua_likwid_markerNumRegions},
+        {"likwid_markerRegionGroup", lua_likwid_markerRegionGroup},
+        {"likwid_markerRegionTag", lua_likwid_markerRegionTag},
+        {"likwid_markerRegionEvents", lua_likwid_markerRegionEvents},
+        {"likwid_markerRegionThreads", lua_likwid_markerRegionThreads},
+        {"likwid_markerRegionCpulist", lua_likwid_markerRegionCpulist},
+        {"likwid_markerRegionTime", lua_likwid_markerRegionTime},
+        {"likwid_markerRegionCount", lua_likwid_markerRegionCount},
+        {"likwid_markerRegionResult", lua_likwid_markerRegionResult},
+        {"likwid_markerRegionMetric", lua_likwid_markerRegionMetric},
+    };
+    unsigned int entry;
+
+    for (entry = 0; entry < sizeof(bindings) / sizeof(bindings[0]); entry++)
+    {
+        lua_register(L, bindings[entry].name, bindings[entry].func);
+    }
+#ifdef __MIC__
+    setuid(0);
+    seteuid(0);
+#endif
+    return 0;
+}
diff --git a/src/memsweep.c b/src/memsweep.c
index 8abf796..012c000 100644
--- a/src/memsweep.c
+++ b/src/memsweep.c
@@ -5,13 +5,13 @@
  *
  *      Description:  Implementation of sweeper module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -37,13 +37,12 @@
 #include <error.h>
 #include <types.h>
 #include <memsweep.h>
-#include <cpuid.h>
+#include <topology.h>
 #include <numa.h>
 #include <affinity.h>
 
 extern void _loadData(uint32_t size, void* ptr);
 
-
 /* #####   EXPORTED VARIABLES   ########################################### */
 
 
@@ -57,14 +56,14 @@ static uint64_t  memoryFraction = 80ULL;
 
 /* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
 
-static void*
+static void* 
 allocateOnNode(size_t size, int domainId)
 {
-    char *ptr; 
+	char *ptr; 
 
-    ptr = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
+	ptr = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);  
 
-    if (ptr == (char *)-1)
+	if (ptr == (char *)-1)
     {
         ERROR;
     }
@@ -74,7 +73,7 @@ allocateOnNode(size_t size, int domainId)
     return ptr;
 }
 
-static void
+static void 
 initMemory(size_t size, char* ptr, int domainId)
 {
     affinity_pinProcess(numa_info.nodes[domainId].processors[0]);
@@ -101,20 +100,18 @@ findProcessor(uint32_t nodeId, uint32_t coreId)
 }
 
 /* evict all dirty cachelines from last level cache */
-static void cleanupCache(FILE* OUTSTREAM, char* ptr)
+static void cleanupCache(char* ptr)
 {
-#ifdef __x86_64
+#if defined(__x86_64__) || defined(__i386__)
     uint32_t cachesize = 2 * cpuid_topology.cacheLevels[cpuid_topology.numCacheLevels-1].size;
-    if (OUTSTREAM != NULL)
-    {
-        fprintf(OUTSTREAM, "Cleanup LLC using %u MB\n", cachesize / (1000000));
-    }
+    printf("Cleaning LLC with %g MB\n", (double)cachesize/(1024.0 * 1024.0));
     _loadData(cachesize,ptr);
 #else
-    ERROR_PLAIN_PRINT(Cleanup cache is currently only available on 64bit X86 systems.);
+    ERROR_PLAIN_PRINT(Cleanup cache is currently only available on X86 systems.);
 #endif
 }
 
+
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
 void
@@ -125,35 +122,32 @@ memsweep_setMemoryFraction(uint64_t fraction)
 
 
 void
-memsweep_node(FILE* OUTSTREAM)
+memsweep_node(void)
 {
     for ( uint32_t i=0; i < numa_info.numberOfNodes; i++)
     {
-        memsweep_domain(OUTSTREAM, i);
+        memsweep_domain(i);
     }
 }
 
 
 void
-memsweep_domain(FILE* OUTSTREAM, int domainId)
+memsweep_domain(int domainId)
 {
     char* ptr = NULL;
     size_t size = numa_info.nodes[domainId].totalMemory * 1024ULL * memoryFraction / 100ULL;
-    if (OUTSTREAM != NULL)
-    {
-        fprintf(OUTSTREAM, "Sweeping domain %d: Using %g MB of %g MB\n",
-                domainId,
-                size / (1000.0 * 1000.0),
-                numa_info.nodes[domainId].totalMemory/ 1000.0);
-    }
+    printf("Sweeping domain %d: Using %g MB of %g MB\n",
+            domainId,
+            size / (1024.0 * 1024.0),
+            numa_info.nodes[domainId].totalMemory/ 1024.0);
     ptr = (char*) allocateOnNode(size, domainId);
     initMemory(size, ptr, domainId);
-    cleanupCache(OUTSTREAM, ptr);
+    cleanupCache(ptr);
     munmap(ptr, size);
 }
 
 void
-memsweep_threadGroup(FILE* OUTSTREAM, int* processorList, int numberOfProcessors)
+memsweep_threadGroup(int* processorList, int numberOfProcessors)
 {
     for (uint32_t i=0; i<numa_info.numberOfNodes; i++)
     {
@@ -161,10 +155,13 @@ memsweep_threadGroup(FILE* OUTSTREAM, int* processorList, int numberOfProcessors
         {
             if (findProcessor(i,processorList[j]))
             {
-                memsweep_domain(OUTSTREAM, i);
+                memsweep_domain(i);
                 break;
             }
         }
     }
 }
 
+
+
+
diff --git a/src/msr.c b/src/msr.c
deleted file mode 100644
index cb867f2..0000000
--- a/src/msr.c
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  msr.c
- *
- *      Description:  Implementation of msr module.
- *                   Provides API to read and write values to the model
- *                   specific registers on x86 processors using the msr
- *                   sys interface of the Linux 2.6 kernel. This module 
- *                   is based on the msr-util tools.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-/* #####   HEADER FILE INCLUDES   ######################################### */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <fcntl.h>
-#include <string.h>
-#include <unistd.h>
-#include <signal.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <sys/wait.h>
-
-#include <types.h>
-#include <error.h>
-#include <cpuid.h>
-#include <accessClient.h>
-#include <msr.h>
-#include <registers.h>
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-#define MAX_LENGTH_MSR_DEV_NAME  20
-#define STRINGIFY(x) #x
-#define TOSTRING(x) STRINGIFY(x)
-
-/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
-static int FD[MAX_NUM_THREADS];
-static int socket_fd = -1;
-static int rdpmc_works = 0;
-
-/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-static inline int __rdpmc(int counter, uint64_t* value)
-{
-    unsigned low, high;
-    __asm__ volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
-    *value = ((low) | ((uint64_t )(high) << 32));
-    return 0;
-}
-//Needed for rdpmc check
-void segfault_sigaction(int signal, siginfo_t *si, void *arg)
-{
-    exit(1);
-}
-
-int test_rdpmc(int flag)
-{
-    int ret, waiting;
-    int pid;
-    int status = 0;
-    uint64_t tmp;
-    struct sigaction sa;
-    memset(&sa, 0, sizeof(struct sigaction));
-    sigemptyset(&sa.sa_mask);
-    sa.sa_sigaction = segfault_sigaction;
-    sa.sa_flags   = SA_SIGINFO;
-
-    pid = fork();
-
-    if (pid < 0)
-    {
-        return -1;
-    }
-    if (!pid)
-    {
-        sigaction(SIGSEGV, &sa, NULL);
-        if (flag == 0)
-        {
-            __rdpmc(0, &tmp);
-        }
-        exit(0);
-    } else {
-    
-        waiting = waitpid(pid, &status, 0);
-        if (waiting < 0 || status)
-        {
-            ret = 0;
-        } else 
-        {
-            ret = 1;
-        }
-    }
-    return ret;
-}
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-
-void
-msr_init(int initSocket_fd)
-{
-    if (accessClient_mode == DAEMON_AM_DIRECT)
-    {
-        char* msr_file_name = (char*) malloc(MAX_LENGTH_MSR_DEV_NAME * sizeof(char));
-
-        sprintf(msr_file_name,"/dev/msr0");
-        if( access( msr_file_name, F_OK ) == -1 )
-        {
-            sprintf(msr_file_name,"/dev/cpu/0/msr");
-        }
-
-        if (access(msr_file_name, R_OK|W_OK))
-        {
-            ERROR_PRINT(Cannot access MSR device file %s: %s.\n
-                        Please check if 'msr' module is loaded and device files have correct permissions\n
-                        Alternatively you might want to look into (sys)daemonmode\n,msr_file_name , strerror(errno));
-            free(msr_file_name);
-            exit(127);
-        }
-        rdpmc_works = test_rdpmc(0);
-
-        /* NOTICE: This assumes consecutive processor Ids! */
-        for ( uint32_t i=0; i < cpuid_topology.numHWThreads; i++ )
-        {
-            sprintf(msr_file_name,"/dev/msr%d",i);
-            if( access( msr_file_name, F_OK ) == -1 )
-            {
-                sprintf(msr_file_name,"/dev/cpu/%d/msr",i);
-            }
-            FD[i] = open(msr_file_name, O_RDWR);
-            if ( FD[i] < 0 )
-            {
-                ERROR_PRINT(Cannot access MSR device file %s: %s\n,
-                                msr_file_name , strerror(errno));
-                free(msr_file_name);
-                ERROR;
-            }
-        }
-        free(msr_file_name);
-    }
-    else
-    {
-        socket_fd = initSocket_fd;
-    }
-}
-
-void
-msr_finalize(void)
-{
-    if (accessClient_mode == DAEMON_AM_DIRECT)
-    {
-        for ( uint32_t i=0; i < cpuid_topology.numHWThreads; i++ )
-        {
-            close(FD[i]);
-        }
-        rdpmc_works = 0;
-    }
-    else
-    {
-        socket_fd = -1;
-    }
-}
-
-
-uint64_t 
-msr_tread(const int tsocket_fd, const int cpu, uint32_t reg)
-{
-    if (accessClient_mode == DAEMON_AM_DIRECT) 
-    {
-        uint64_t data;
-
-        if (rdpmc_works && reg >= MSR_PMC0 && reg <=MSR_PMC3)
-        {
-            if (__rdpmc(reg - MSR_PMC0, &data) )
-            {
-                ERROR_PRINT(Cannot read MSR reg 0x%x with RDPMC instruction on CPU %d\n,
-                        reg,cpu);
-            }
-        }
-        else if (rdpmc_works && reg >= MSR_PERF_FIXED_CTR0 && reg <= MSR_PERF_FIXED_CTR2)
-        {
-            if (__rdpmc(0x4000000ULL + (reg - MSR_PERF_FIXED_CTR0), &data) )
-            {
-                ERROR_PRINT(Cannot read MSR reg 0x%x with RDPMC instruction on CPU %d\n,
-                        reg,cpu);
-            }
-        }
-        else
-        {
-            if ( pread(FD[cpu], &data, sizeof(data), reg) != sizeof(data) )
-            {
-                ERROR_PRINT(Cannot read MSR reg 0x%x with RDMSR instruction on CPU %d\n,
-                        reg, cpu);
-            }
-        }
-
-        return data;
-    }
-    else
-    { /* daemon or sysdaemon-mode */
-        return accessClient_read(tsocket_fd, cpu, DAEMON_AD_MSR, reg);
-    }
-}
-
-
-void 
-msr_twrite(const int tsocket_fd, const int cpu, uint32_t reg, uint64_t data)
-{
-    if (accessClient_mode == DAEMON_AM_DIRECT) 
-    {
-        if (pwrite(FD[cpu], &data, sizeof(data), reg) != sizeof(data))
-        {
-            ERROR_PRINT(Cannot write MSR reg 0x%x with WRMSR instruction on CPU %d\n,
-                        reg, cpu);
-        }
-    }
-    else
-    { /* daemon or sysdaemon-mode */
-        accessClient_write(tsocket_fd, cpu, DAEMON_AD_MSR, reg, data);
-    }
-}
-
-
-uint64_t 
-msr_read( const int cpu, uint32_t reg)
-{
-    if (accessClient_mode == DAEMON_AM_DIRECT) 
-    {
-        uint64_t data;
-
-        if (rdpmc_works && reg >= MSR_PMC0 && reg <=MSR_PMC3)
-        {
-            if (__rdpmc(reg - MSR_PMC0, &data) )
-            {
-                ERROR_PRINT(Cannot read MSR reg 0x%x with RDPMC instruction on CPU %d\n,
-                        reg,cpu);
-            }
-        }
-        else if (rdpmc_works && reg >= MSR_PERF_FIXED_CTR0 && reg <= MSR_PERF_FIXED_CTR2)
-        {
-            if (__rdpmc(0x4000000ULL + (reg - MSR_PERF_FIXED_CTR0), &data) )
-            {
-                ERROR_PRINT(Cannot read MSR reg 0x%x with RDPMC instruction on CPU %d\n,
-                        reg,cpu);
-            }
-        }
-        else
-        {
-            if ( pread(FD[cpu], &data, sizeof(data), reg) != sizeof(data) )
-            {
-                ERROR_PRINT(Cannot read MSR reg 0x%x with RDMSR instruction on CPU %d\n,
-                        reg, cpu);
-            }
-        }
-
-        return data;
-    }
-    else
-    { /* daemon or sysdaemon-mode */
-        return accessClient_read(socket_fd, cpu, DAEMON_AD_MSR, reg);
-    }
-}
-
-
-void
-msr_write( const int cpu, uint32_t reg, uint64_t data)
-{
-    if (accessClient_mode == DAEMON_AM_DIRECT) 
-    {
-        if (pwrite(FD[cpu], &data, sizeof(data), reg) != sizeof(data))
-        {
-            ERROR_PRINT(Cannot write MSR reg 0x%x with WRMSR instruction on CPU %d\n,
-                        reg, cpu);
-        }
-    }
-    else
-    { /* daemon or sysdaemon-mode */
-        accessClient_write(socket_fd, cpu, DAEMON_AD_MSR, reg, data);
-    }
-}
-
-
diff --git a/src/multiplex.c b/src/multiplex.c
deleted file mode 100644
index 68a6b88..0000000
--- a/src/multiplex.c
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  multiplex.c
- *
- *      Description:  
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <signal.h>
-#include <sys/time.h>
-
-#include <timer.h>
-#include <perfmon.h>
-#include <multiplex.h>
-
-#if 0
-static int currentCollection = -1;
-static MultiplexCollections* multiplex_set = NULL;
-static TimerData timeData;
-static int  multiplex_useMarker = 0;
-
-void
-multiplex_printCounters ()
-{
-
-
-
-}
-
-
-
-void
-multiplex_swapEventSet ()
-{
-    int threadId;
-    PerfmonEventSet* collection;
-
-    /* collection from last run */
-    collection = multiplex_set->collections + currentCollection;
-
-    for (threadId = 0; threadId < perfmon_numThreads; threadId++)
-    {
-        /* Stop counters */
-        if (!multiplex_useMarker) perfmon_stopCountersThread(threadId);
-        /* Accumulate counters */
-        for (int i=0; i<collection->numberOfEvents; i++)
-        {
-//            collection->events[i].result[threadId] += 
- //               (double) perfmon_threadData[threadId].counters[collection->events[i].index].counterData;
-        }
-    }
-
-    /* switch to next collection */
-    if( currentCollection == multiplex_set->numberOfCollections-1)
-    {
-        currentCollection = 0;
-    }
-    else
-    {
-        currentCollection++;
-    }
-    collection = multiplex_set->collections + currentCollection;
-
-    for (threadId = 0; threadId < perfmon_numThreads; threadId++)
-    {
-        /* Reconfigure counters */
-        for (int i=0; i<collection->numberOfEvents; i++)
-        {
-            perfmon_setupCounterThread(threadId,
-                    collection->events[i].event.eventId,
-                    collection->events[i].event.umask,
-                    collection->events[i].index);
-        }
-
-        /* Start counters */
-       if (!multiplex_useMarker)  perfmon_startCountersThread(threadId);
-    }
-}
-
-void
-multiplex_init(MultiplexCollections* set)
-{
-    int i;
-
-    multiplex_set = set;
-
-    for (i=0;i<multiplex_set->numberOfCollections; i++)
-    {
-//        perfmon_initEventset(multiplex_set->collections+i);
-    }
-}
-
-void
-multiplex_start()
-{
-    struct itimerval val;
-    struct sigaction sa;
-
-//    multiplex_useMarker = useMarker;
-
-    val.it_interval.tv_sec = 0;
-    val.it_interval.tv_usec = 500;
-    val.it_value.tv_sec = 0; 
-    val.it_value.tv_usec = 100;
-
-    sa.sa_handler = multiplex_printCounters;
-    sigemptyset(&sa.sa_mask);
-    sa.sa_flags = SA_RESTART;
-    if (sigaction(SIGALRM, &sa, NULL) == -1)
-    {
-        /* Handle error */;
-        perror("sigaction");
-    }
-
-    perfmon_startCounters();
-    setitimer(ITIMER_REAL, &val,0);
-    timer_start(&timeData);
-}
-
-void
-multiplex_stop()
-{
-    struct itimerval val;
-
-    val.it_interval.tv_sec = 0;
-    val.it_interval.tv_usec = 0;
-    val.it_value.tv_sec = 0; 
-    val.it_value.tv_usec = 0;
-
-    timer_stop(&timeData);
-    setitimer(ITIMER_REAL, &val,0);
-    perfmon_stopCounters();
-
-    multiplex_set->time = timer_print(&timeData);
-}
-
-#endif
-
-
diff --git a/src/numa.c b/src/numa.c
index 2f72765..09459d3 100644
--- a/src/numa.c
+++ b/src/numa.c
@@ -3,15 +3,17 @@
  *
  *      Filename:  numa.c
  *
- *      Description:  Implementation of Linux NUMA interface
+ *      Description:  Implementation of Linux NUMA interface. Selects between hwloc and
+ *                    procfs/sysfs backends.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -37,352 +39,194 @@
 #include <sched.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
+#include <error.h>
 #include <dirent.h>
 #ifdef HAS_MEMPOLICY
 #include <linux/mempolicy.h>
 #endif
+#include <topology.h>
+
+#include <configuration.h>
 
 #include <error.h>
 #include <bstrlib.h>
+
 #include <numa.h>
-#include <strUtil.h>
+#include <numa_proc.h>
 
-/* #####   EXPORTED VARIABLES   ########################################### */
+#ifdef LIKWID_USE_HWLOC
+#include <hwloc.h>
+#include <topology_hwloc.h>
+#include <numa_hwloc.h>
+#endif
 
 
-NumaTopology numa_info;
+/* #####   EXPORTED VARIABLES   ########################################### */
+NumaTopology numa_info = {0,NULL};
+static int numaInitialized = 0;
 
 /* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-#ifdef HAS_MEMPOLICY
-#define get_mempolicy(policy,nmask,maxnode,addr,flags) syscall(SYS_get_mempolicy,policy,nmask,maxnode,addr,flags)
-#define set_mempolicy(mode,nmask,maxnode) syscall(SYS_set_mempolicy,mode,nmask,maxnode)
-#define mbind(start, len, nmask, maxnode, flags) syscall(SYS_mbind,(start),len,MPOL_BIND,(nmask),maxnode,flags)
-#endif
-
 /* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
-
-static int maxIdConfiguredNode = 0;
-
 /* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
 
-static void
-setConfiguredNodes(void)
+int str2int(const char* str)
 {
-    DIR *dir;
-    struct dirent *de;
-
-    dir = opendir("/sys/devices/system/node");
+    char* endptr;
+    errno = 0;
+    unsigned long val;
+    val = strtoul(str, &endptr, 10);
 
-    if (!dir) 
+    if ((errno == ERANGE && val == LONG_MAX)
+        || (errno != 0 && val == 0))
     {
-        maxIdConfiguredNode = 0;
+        fprintf(stderr, "Value in string out of range\n");
+        return -EINVAL;
     }
-    else
-    {
-        while ((de = readdir(dir)) != NULL) 
-        {
-            int nd;
-            if (strncmp(de->d_name, "node", 4))
-            {
-                continue;
-            }
 
-            nd = str2int(de->d_name+4);
-
-            if (maxIdConfiguredNode < nd)
-            {
-                maxIdConfiguredNode = nd;
-            }
-        }
-        closedir(dir);
-    }
-}
-
-
-static void
-nodeMeminfo(int node, uint64_t* totalMemory, uint64_t* freeMemory)
-{
-    FILE *fp;
-    bstring filename;
-    bstring totalString = bformat("MemTotal:");
-    bstring freeString  = bformat("MemFree:");
-    int i;
-
-    filename = bformat("/sys/devices/system/node/node%d/meminfo", node);
-
-    if (NULL != (fp = fopen (bdata(filename), "r"))) 
-    {
-        bstring src = bread ((bNread) fread, fp);
-        struct bstrList* tokens = bsplit(src,(char) '\n');
-
-        for (i=0;i<tokens->qty;i++)
-        {
-            if (binstr(tokens->entry[i],0,totalString) != BSTR_ERR)
-            {
-                 bstring tmp = bmidstr (tokens->entry[i], 18, blength(tokens->entry[i])-18 );
-                 bltrimws(tmp);
-                 struct bstrList* subtokens = bsplit(tmp,(char) ' ');
-                 *totalMemory = str2int(bdata(subtokens->entry[0]));
-            }
-            else if (binstr(tokens->entry[i],0,freeString) != BSTR_ERR)
-            {
-                 bstring tmp = bmidstr (tokens->entry[i], 18, blength(tokens->entry[i])-18  );
-                 bltrimws(tmp);
-                 struct bstrList* subtokens = bsplit(tmp,(char) ' ');
-                 *freeMemory = str2int(bdata(subtokens->entry[0]));
-            }
-        }
-    }
-    else
+    if (endptr == str)
     {
-        ERROR;
+        fprintf(stderr, "No digits were found\n");
+        return -EINVAL;
     }
 
-    fclose(fp);
+    return (int) val;
 }
 
-static int
-nodeProcessorList(int node, uint32_t** list)
-{
-    FILE *fp;
-    bstring filename;
-    int count = 0;
-    bstring src;
-    int i,j;
-    struct bstrList* tokens;
-    unsigned long val;
-    char* endptr;
-    int cursor=0;
-//    int unitSize = (int) (sizeof(unsigned long)*8);
-    int unitSize = (int) 32; /* 8 nibbles */
-
-    *list = (uint32_t*) malloc(MAX_NUM_THREADS * sizeof(uint32_t));
-
-    /* the cpumap interface should be always there */
-    filename = bformat("/sys/devices/system/node/node%d/cpumap", node); 
-
-    if (NULL != (fp = fopen (bdata(filename), "r"))) 
-    {
-
-        src = bread ((bNread) fread, fp);
-        tokens = bsplit(src,',');
-
-        for (i=(tokens->qty-1); i>=0 ;i--)
-        {
-            val = strtoul((char*) tokens->entry[i]->data, &endptr, 16);
-
-            if ((errno != 0 && val == LONG_MAX )
-                    || (errno != 0 && val == 0)) 
-            {
-                ERROR;
-            }
-
-            if (endptr == (char*) tokens->entry[i]->data) 
-            {
-                ERROR_PLAIN_PRINT(No digits were found);
-            }
-
-            if (val != 0UL)
-            {
-                for (j=0; j<unitSize; j++)
-                {
-                    if (val&(1UL<<j))
-                    {
-                        if (count < MAX_NUM_THREADS)
-                        {
-                            (*list)[count] = (j+cursor);
-                        }
-                        else
-                        {
-                            ERROR_PRINT(Number Of threads %d too large,count);
-                        }
-                        count++;
-                    }
-                }
-            }
-            cursor += unitSize;
-        }
-
-        bstrListDestroy(tokens);
-        bdestroy(src);
-        bdestroy(filename);
-        fclose(fp); 
-
-        /* FIXME: CPU list here is not physical cores first but numerical sorted */
-
-        return count;
-    }
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
-    /* something went wrong */
-    return -1;
-}
- 
-static int
-nodeDistanceList(int node, int numberOfNodes, uint32_t** list)
+int
+empty_numa_init()
 {
-    FILE *fp;
-    bstring filename;
-    int count = 0;
-    bstring src;
-    struct bstrList* tokens;
-
-    *list = (uint32_t*) malloc(numberOfNodes * sizeof(uint32_t));
-
-    /* the distance interface should be always there */
-    filename = bformat("/sys/devices/system/node/node%d/distance", node);
-
-    if (NULL != (fp = fopen (bdata(filename), "r")))
-    {
-
-        src = bread ((bNread) fread, fp);
-        tokens = bsplit(src,' ');
-
-        for (int i=0; i<(tokens->qty); i++)
-        {
-            if (count < numberOfNodes)
-            {
-                (*list)[count] = (uint32_t)strtoul((char*) (tokens->entry[i]->data), NULL, 10);
-            }
-            else
-            {
-                ERROR_PRINT(Number Of nodes %d too large,count);
-            }
-            count++;
-        }
-
-        bstrListDestroy(tokens);
-        bdestroy(src);
-        bdestroy(filename);
-        fclose(fp);
-        return count;
-    }
-
-    /* something went wrong */
-    return -1;
+    printf("MEMPOLICY NOT supported in kernel!\n");
+    return 0;
 }
 
-
-
-static int
-findProcessor(uint32_t nodeId, uint32_t coreId)
+void 
+empty_numa_setInterleaved(int* processorList, int numberOfProcessors)
 {
-    int i;
+    printf("MEMPOLICY NOT supported in kernel!\n");
+    return;
+}
 
-    for (i=0; i<numa_info.nodes[nodeId].numberOfProcessors; i++)
-    {
-        if (numa_info.nodes[nodeId].processors[i] == coreId)
-        {
-            return 1;
-        }
-    }
-    return 0;
+void
+empty_numa_membind(void* ptr, size_t size, int domainId)
+{
+    printf("MBIND NOT supported in kernel!\n");
+    return;
 }
 
 
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+const struct numa_functions numa_funcs = {
+#ifndef HAS_MEMPOLICY
+    .numa_init = empty_numa_init,
+    .numa_setInterleaved = empty_numa_setInterleaved,
+    .numa_membind = empty_numa_membind
+#else
+#ifdef LIKWID_USE_HWLOC
+    .numa_init = hwloc_numa_init,
+#else
+    .numa_init = proc_numa_init,
+#endif
+    .numa_setInterleaved = proc_numa_setInterleaved,
+    .numa_membind = proc_numa_membind
+#endif
+};
 
-#ifdef HAS_MEMPOLICY
-int
-numa_init()
-{
-    int errno;
-    uint32_t i;
 
-    if (get_mempolicy(NULL, NULL, 0, 0, 0) < 0 && errno == ENOSYS)
+int numa_init(void)
+{
+    const struct numa_functions funcs = numa_funcs;
+    int ret = 0;
+    if (init_config == 0)
     {
-        return -1; 
+        init_configuration();
     }
-
-    /* First determine maximum number of nodes */
-    setConfiguredNodes();
-    numa_info.numberOfNodes = maxIdConfiguredNode+1;
-    numa_info.nodes = (NumaNode*) malloc(numa_info.numberOfNodes * sizeof(NumaNode));
-
-    for (i=0; i<numa_info.numberOfNodes; i++)
+    if (numaInitialized == 1)
     {
-        nodeMeminfo(i, &numa_info.nodes[i].totalMemory, &numa_info.nodes[i].freeMemory);
-        numa_info.nodes[i].id = i;
-        numa_info.nodes[i].numberOfProcessors = nodeProcessorList(i,&numa_info.nodes[i].processors);
-        numa_info.nodes[i].numberOfDistances = nodeDistanceList(i, numa_info.numberOfNodes, &numa_info.nodes[i].distances);
+        return 0;
     }
 
-    if (numa_info.nodes[0].numberOfProcessors < 0)
+    if ((config.topologyCfgFileName != NULL) && (!access(config.topologyCfgFileName, R_OK)) && (numa_info.nodes != NULL))
     {
-        return -1;
+        /* If we read in the topology file, the NUMA related stuff is already initialized */
+        numaInitialized = 1;
+        return 0;
     }
     else
     {
-        return 0;
+        cpu_set_t cpuSet;
+        CPU_ZERO(&cpuSet);
+        sched_getaffinity(0,sizeof(cpu_set_t), &cpuSet);
+        if (cpuid_topology.activeHWThreads < cpuid_topology.numHWThreads)
+        {
+            ret = proc_numa_init();
+        }
+        else
+        {
+            ret = funcs.numa_init();
+        }
+        if (ret == 0)
+            numaInitialized = 1;
     }
+    return ret;
 }
 
-void 
-numa_setInterleaved(int* processorList, int numberOfProcessors)
+void numa_setInterleaved(int* processorList, int numberOfProcessors)
 {
-    long i;
-    int j;
-    int ret=0;
-    unsigned long numberOfNodes = 65;
-    unsigned long mask = 0UL;
+    const struct numa_functions funcs = numa_funcs;
+    return funcs.numa_setInterleaved(processorList, numberOfProcessors);
+}
+
+void numa_membind(void* ptr, size_t size, int domainId)
+{
+    const struct numa_functions funcs = numa_funcs;
+    return funcs.numa_membind(ptr, size, domainId);
+}
 
-    for (i=0; i<numa_info.numberOfNodes; i++)
+#ifndef HAS_MEMPOLICY
+void numa_finalize(void)
+{
+    return;
+}
+#else
+void numa_finalize(void)
+{
+    int i;
+    if (!numaInitialized)
     {
-        for (j=0; j<numberOfProcessors; j++)
+        return;
+    }
+    for(i=0;i<numa_info.numberOfNodes;i++)
+    {
+        if (numa_info.nodes[i].processors)
+        {
+            free(numa_info.nodes[i].processors);
+        }
+        if (numa_info.nodes[i].distances)
         {
-            if (findProcessor(i,processorList[j]))
-            {
-                mask |= (1UL<<i);
-                break;
-            }
+            free(numa_info.nodes[i].distances);
         }
+        numa_info.nodes[i].id = 0;
+        numa_info.nodes[i].totalMemory = 0;
+        numa_info.nodes[i].freeMemory = 0;
+        numa_info.nodes[i].numberOfProcessors = 0;
+        numa_info.nodes[i].numberOfDistances = 0;
     }
-
-    ret = set_mempolicy(MPOL_INTERLEAVE,&mask,numberOfNodes);
-
-    if (ret < 0)
+    if (numa_info.nodes)
     {
-        ERROR;
+        free(numa_info.nodes);
     }
+    numa_info.numberOfNodes = 0;
+    numaInitialized = 0;
+    return;
 }
 
-void
-numa_membind(void* ptr, size_t size, int domainId)
+int likwid_getNumberOfNodes()
 {
-    int ret=0;
-    unsigned long mask = 0UL;
-    unsigned int flags = 0U;
-
-    flags |= MPOL_MF_STRICT;
-    mask |= (1UL<<domainId);
-
-    ret = mbind(ptr, size, &mask, numa_info.numberOfNodes+1, flags);
-
-    if (ret < 0)
+    if (numaInitialized)
     {
-        ERROR;
+        return numa_info.numberOfNodes;
     }
+    return 0;
 }
-
-#else
-int
-numa_init()
-{
-    printf("MEMPOLICY NOT supported in kernel!\n");
-}
-
-void 
-numa_setInterleaved(int* processorList, int numberOfProcessors)
-{
-    printf("MEMPOLICY NOT supported in kernel!\n");
-}
-
-void
-numa_membind(void* ptr, size_t size, int domainId)
-{
-    printf("MBIND NOT supported in kernel!\n");
-}
-
 #endif
-
-
diff --git a/src/numa_hwloc.c b/src/numa_hwloc.c
new file mode 100644
index 0000000..94639fc
--- /dev/null
+++ b/src/numa_hwloc.c
@@ -0,0 +1,415 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  numa_hwloc.c
+ *
+ *      Description:  Interface to hwloc for NUMA topology
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:  Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <error.h>
+
+#include <numa.h>
+#include <topology.h>
+#ifdef LIKWID_USE_HWLOC
+#include <hwloc.h>
+#include <topology_hwloc.h>
+#endif
+
+
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+#ifdef LIKWID_USE_HWLOC
+/* Look up the "MemFree:" value for one NUMA node.
+ *
+ * The per-node file /sys/devices/system/node/node<nodeId>/meminfo is tried
+ * first; if it cannot be opened and /proc/meminfo is readable, that file is
+ * parsed instead. If neither source is usable the ERROR macro is invoked.
+ * The text after column 18 (per-node file) resp. column 10 (/proc/meminfo)
+ * of a matching line is parsed.
+ *
+ * Returns the parsed number, or 0 if no matching line was found. */
+uint64_t getFreeNodeMem(int nodeId)
+{
+    FILE *fp;
+    int line;
+    int valueColumn = 18;
+    uint64_t freeMem = 0;
+    bstring key = bformat("MemFree:");
+    bstring path = bformat("/sys/devices/system/node/node%d/meminfo", nodeId);
+
+    fp = fopen(bdata(path), "r");
+    if (fp == NULL)
+    {
+        if (access("/proc/meminfo", R_OK) != 0)
+        {
+            /* neither the per-node nor the global file can be used */
+            bdestroy(key);
+            bdestroy(path);
+            ERROR;
+        }
+        else
+        {
+            /* fall back to the system-wide numbers */
+            bdestroy(path);
+            path = bfromcstr("/proc/meminfo");
+            valueColumn = 10;
+            fp = fopen(bdata(path), "r");
+        }
+    }
+    if (fp != NULL)
+    {
+        bstring content = bread ((bNread) fread, fp);
+        struct bstrList* lines = bsplit(content,(char) '\n');
+
+        for (line=0; line<lines->qty; line++)
+        {
+            bstring value;
+            struct bstrList* fields;
+
+            if (binstr(lines->entry[line],0,key) == BSTR_ERR)
+            {
+                continue;
+            }
+            /* cut off the key, strip the padding, keep the leading number */
+            value = bmidstr (lines->entry[line], valueColumn, blength(lines->entry[line])-valueColumn);
+            bltrimws(value);
+            fields = bsplit(value,(char) ' ');
+            freeMem = str2int(bdata(fields->entry[0]));
+            bdestroy(value);
+            bstrListDestroy(fields);
+        }
+        bstrListDestroy(lines);
+        bdestroy(content);
+        fclose(fp);
+    }
+    bdestroy(key);
+    bdestroy(path);
+    return freeMem;
+}
+
+/* Look up the "MemTotal:" value for one NUMA node.
+ *
+ * Reads /sys/devices/system/node/node<nodeId>/meminfo when it can be opened,
+ * otherwise /proc/meminfo when that is readable; the ERROR macro is invoked
+ * if neither is usable. The text after column 18 (per-node file) resp.
+ * column 10 (/proc/meminfo) of a matching line is parsed.
+ *
+ * Returns the parsed number, or 0 if no matching line was found. */
+uint64_t getTotalNodeMem(int nodeId)
+{
+    int line;
+    int valueColumn = 18;
+    FILE *fp;
+    uint64_t totalMem = 0;
+    bstring key = bformat("MemTotal:");
+    bstring nodePath = bformat("/sys/devices/system/node/node%d/meminfo", nodeId);
+    bstring procPath = bformat("/proc/meminfo");
+
+    fp = fopen(bdata(nodePath), "r");
+    if (fp == NULL)
+    {
+        if (access(bdata(procPath), R_OK) == 0)
+        {
+            /* per-node file unavailable: use the system-wide numbers */
+            valueColumn = 10;
+            fp = fopen(bdata(procPath), "r");
+        }
+        else
+        {
+            bdestroy(key);
+            bdestroy(nodePath);
+            bdestroy(procPath);
+            ERROR;
+        }
+    }
+    if (fp != NULL)
+    {
+        bstring content = bread ((bNread) fread, fp);
+        struct bstrList* lines = bsplit(content,(char) '\n');
+
+        for (line=0; line<lines->qty; line++)
+        {
+            bstring value;
+            struct bstrList* fields;
+
+            if (binstr(lines->entry[line],0,key) == BSTR_ERR)
+            {
+                continue;
+            }
+            /* drop the key, strip the padding, keep the leading number */
+            value = bmidstr (lines->entry[line], valueColumn, blength(lines->entry[line])-valueColumn);
+            bltrimws(value);
+            fields = bsplit(value,(char) ' ');
+            totalMem = str2int(bdata(fields->entry[0]));
+            bdestroy(value);
+            bstrListDestroy(fields);
+        }
+        bstrListDestroy(lines);
+        bdestroy(content);
+        fclose(fp);
+    }
+    bdestroy(key);
+    bdestroy(nodePath);
+    bdestroy(procPath);
+    return totalMem;
+}
+
+/* Report whether a processing unit with OS index cpuID exists in the
+ * hwloc topology.
+ *
+ * NOTE(review): nodeID is accepted but never consulted, so the answer does
+ * not depend on the NUMA node - confirm whether that is intended.
+ *
+ * Returns 1 if a matching PU was found, 0 otherwise. */
+int likwid_hwloc_findProcessor(int nodeID, int cpuID)
+{
+    int idx = 0;
+    int numPUs = likwid_hwloc_get_nbobjs_by_type(hwloc_topology, HWLOC_OBJ_PU);
+
+    while (idx < numPUs)
+    {
+        hwloc_obj_t pu = likwid_hwloc_get_obj_by_type(hwloc_topology, HWLOC_OBJ_PU, idx);
+        /* indices for which no object is returned are skipped silently */
+        if (pu && pu->os_index == cpuID)
+        {
+            return 1;
+        }
+        idx++;
+    }
+    return 0;
+}
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+/* Fill numa_info from the hwloc topology.
+ *
+ * Without NUMA node objects a single node 0 is synthesized that collects the
+ * PUs of all sockets. Otherwise one entry is built per node object (memory
+ * sizes, PU list, distance row).
+ *
+ * Returns 0 on success, -1 on allocation failure or if node 0 has no PUs. */
+int hwloc_numa_init(void)
+{
+    uint32_t i;
+    int d;
+    int depth;
+    int cores_per_socket;
+    hwloc_obj_t obj;
+    const struct hwloc_distances_s* distances;
+    hwloc_obj_type_t hwloc_type = HWLOC_OBJ_NODE;
+
+    if (!hwloc_topology)
+    {
+        likwid_hwloc_topology_init(&hwloc_topology);
+        likwid_hwloc_topology_load(hwloc_topology);
+    }
+    numa_info.numberOfNodes = likwid_hwloc_get_nbobjs_by_type(hwloc_topology, hwloc_type);
+    /* If the amount of NUMA nodes == 0, there is actually no NUMA node, hence
+       aggregate all sockets in the system into the single virtually created NUMA node */
+    if (numa_info.numberOfNodes == 0)
+    {
+        hwloc_type = HWLOC_OBJ_SOCKET;
+        numa_info.numberOfNodes = 1;
+
+        numa_info.nodes = (NumaNode*) malloc(sizeof(NumaNode));
+        if (!numa_info.nodes)
+        {
+            fprintf(stderr,"No memory to allocate %zu byte for nodes array\n",sizeof(NumaNode));
+            return -1;
+        }
+        numa_info.nodes[0].id = 0;
+        numa_info.nodes[0].numberOfProcessors = 0;
+        numa_info.nodes[0].totalMemory = getTotalNodeMem(0);
+        numa_info.nodes[0].freeMemory = getFreeNodeMem(0);
+        numa_info.nodes[0].processors = (uint32_t*) malloc(MAX_NUM_THREADS * sizeof(uint32_t));
+        if (!numa_info.nodes[0].processors)
+        {
+            fprintf(stderr,"No memory to allocate %zu byte for processors array of NUMA node %d\n",MAX_NUM_THREADS * sizeof(uint32_t),0);
+            return -1;
+        }
+        numa_info.nodes[0].distances = (uint32_t*) malloc(sizeof(uint32_t));
+        if (!numa_info.nodes[0].distances)
+        {
+            fprintf(stderr,"No memory to allocate %zu byte for distances array of NUMA node %d\n",sizeof(uint32_t),0);
+            return -1;
+        }
+        numa_info.nodes[0].distances[0] = 10;
+        numa_info.nodes[0].numberOfDistances = 1;
+        cores_per_socket = cpuid_topology.numHWThreads/cpuid_topology.numSockets;
+        for (d=0; d<likwid_hwloc_get_nbobjs_by_type(hwloc_topology, hwloc_type); d++)
+        {
+            obj = likwid_hwloc_get_obj_by_type(hwloc_topology, hwloc_type, d);
+            /* depth is here used as index in the processors array */
+            depth = d * cores_per_socket;
+            numa_info.nodes[0].numberOfProcessors += likwid_hwloc_record_objs_of_type_below_obj(
+                    likwid_hwloc_topology, obj, HWLOC_OBJ_PU, &depth, &numa_info.nodes[0].processors);
+        }
+    }
+    else
+    {
+        numa_info.nodes = (NumaNode*) malloc(numa_info.numberOfNodes * sizeof(NumaNode));
+        if (!numa_info.nodes)
+        {
+            fprintf(stderr,"No memory to allocate %zu byte for nodes array\n",numa_info.numberOfNodes * sizeof(NumaNode));
+            return -1;
+        }
+        depth = likwid_hwloc_get_type_depth(hwloc_topology, hwloc_type);
+        distances = likwid_hwloc_get_whole_distance_matrix_by_type(hwloc_topology, hwloc_type);
+        for (i=0; i<numa_info.numberOfNodes; i++)
+        {
+            obj = likwid_hwloc_get_obj_by_depth(hwloc_topology, depth, i);
+            numa_info.nodes[i].id = obj->os_index;
+            if (obj->memory.local_memory != 0)
+            {
+                numa_info.nodes[i].totalMemory = (uint64_t)(obj->memory.local_memory/1024);
+            }
+            else if (obj->memory.total_memory != 0)
+            {
+                numa_info.nodes[i].totalMemory = (uint64_t)(obj->memory.total_memory/1024);
+            }
+            else
+            {
+                numa_info.nodes[i].totalMemory = getTotalNodeMem(numa_info.nodes[i].id);
+            }
+
+            /* freeMemory not detected by hwloc, do it the native way */
+            numa_info.nodes[i].freeMemory = getFreeNodeMem(numa_info.nodes[i].id);
+            numa_info.nodes[i].processors = (uint32_t*) malloc(MAX_NUM_THREADS * sizeof(uint32_t));
+            if (!numa_info.nodes[i].processors)
+            {
+                fprintf(stderr,"No memory to allocate %zu byte for processors array of NUMA node %u\n",MAX_NUM_THREADS * sizeof(uint32_t), i);
+                return -1;
+            }
+            d = 0;
+            numa_info.nodes[i].numberOfProcessors = likwid_hwloc_record_objs_of_type_below_obj(
+                    hwloc_topology, obj, HWLOC_OBJ_PU, &d, &numa_info.nodes[i].processors);
+
+            numa_info.nodes[i].distances = (uint32_t*) malloc(numa_info.numberOfNodes * sizeof(uint32_t));
+            if (!numa_info.nodes[i].distances)
+            {
+                fprintf(stderr,"No memory to allocate %zu byte for distances array of NUMA node %u\n",numa_info.numberOfNodes*sizeof(uint32_t),i);
+                return -1;
+            }
+            if (distances)
+            {
+                numa_info.nodes[i].numberOfDistances = distances->nbobjs;
+                for(d=0;d<distances->nbobjs;d++)
+                {
+                    numa_info.nodes[i].distances[d] = distances->latency[i*distances->nbobjs + d] * distances->latency_base;
+                }
+            }
+            else
+            {
+                numa_info.nodes[i].numberOfDistances = numa_info.numberOfNodes;
+                for(d=0;d<numa_info.numberOfNodes;d++)
+                {
+                    numa_info.nodes[i].distances[d] = 10;
+                }
+            }
+
+        }
+
+    }
+
+    if (numa_info.nodes[0].numberOfProcessors == 0)
+    {
+        return -1;
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+/* Bind the memory area given by ptr and size to NUMA node domainId using the
+ * hwloc BIND policy with the STRICT and PROCESS flags; the ERROR macro is
+ * invoked when hwloc reports a negative result. */
+void hwloc_numa_membind(void* ptr, size_t size, int domainId)
+{
+    int status;
+    hwloc_nodeset_t target = likwid_hwloc_bitmap_alloc();
+    hwloc_membind_flags_t bindFlags = HWLOC_MEMBIND_STRICT|HWLOC_MEMBIND_PROCESS;
+    /* the node set contains exactly the requested domain */
+    likwid_hwloc_bitmap_zero(target);
+    likwid_hwloc_bitmap_set(target, domainId);
+    status = likwid_hwloc_set_area_membind_nodeset(hwloc_topology, ptr, size, target, HWLOC_MEMBIND_BIND, bindFlags);
+    likwid_hwloc_bitmap_free(target);
+    if (status < 0)
+    {
+        ERROR;
+    }
+}
+
+
+
+/* Request the hwloc INTERLEAVE memory policy (flags STRICT|PROCESS).
+ *
+ * Bit <n> of the bitmap handed to hwloc is set for every NUMA node index n
+ * for which likwid_hwloc_findProcessor() accepts at least one entry of
+ * processorList. The ERROR macro is invoked on a negative hwloc result. */
+void hwloc_numa_setInterleaved(int* processorList, int numberOfProcessors)
+{
+    int node, entry;
+    int status;
+    likwid_hwloc_cpuset_t selected = hwloc_bitmap_alloc();
+    likwid_hwloc_membind_flags_t bindFlags = HWLOC_MEMBIND_STRICT|HWLOC_MEMBIND_PROCESS;
+    likwid_hwloc_bitmap_zero(selected);
+    for (node=0; node<numa_info.numberOfNodes; node++)
+    {
+        for (entry=0; entry<numberOfProcessors; entry++)
+        {
+            if (!likwid_hwloc_findProcessor(node,processorList[entry]))
+            {
+                continue;
+            }
+            likwid_hwloc_bitmap_set(selected, node);
+        }
+    }
+    status = likwid_hwloc_set_membind(hwloc_topology, selected, HWLOC_MEMBIND_INTERLEAVE, bindFlags);
+    likwid_hwloc_bitmap_free(selected);
+    if (status < 0)
+    {
+        ERROR;
+    }
+}
+#else
+int hwloc_numa_init(void) /* variant compiled when LIKWID_USE_HWLOC is not defined: does nothing and always returns 1 */
+{
+    return 1;
+}
+
+void hwloc_numa_membind(void* ptr, size_t size, int domainId)
+{
+    /* compiled when LIKWID_USE_HWLOC is not defined: the request is ignored */
+}
+
+void hwloc_numa_setInterleaved(int* processorList, int numberOfProcessors)
+{
+    /* compiled when LIKWID_USE_HWLOC is not defined: the request is ignored */
+}
+
+#endif
diff --git a/src/numa_proc.c b/src/numa_proc.c
new file mode 100644
index 0000000..a17d824
--- /dev/null
+++ b/src/numa_proc.c
@@ -0,0 +1,383 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  numa_proc.c
+ *
+ *      Description:  Get NUMA topology from procfs and sysfs
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+ 
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <dirent.h>
+#include <error.h>
+//#include <strUtil.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#ifdef HAS_MEMPOLICY
+#include <linux/mempolicy.h>
+#endif
+
+#include <numa.h>
+#include <topology.h>
+
+/* #####   EXPORTED VARIABLES   ########################################### */
+
+
+
+
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+
+#ifdef HAS_MEMPOLICY
+#define get_mempolicy(policy,nmask,maxnode,addr,flags) syscall(SYS_get_mempolicy,policy,nmask,maxnode,addr,flags)
+#define set_mempolicy(mode,nmask,maxnode) syscall(SYS_set_mempolicy,mode,nmask,maxnode)
+#define mbind(start, len, nmask, maxnode, flags) syscall(SYS_mbind,(start),len,MPOL_BIND,(nmask),maxnode,flags)
+#endif
+
+/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
+/* Return 1 if coreId is listed among the processors of NUMA node nodeId
+ * in numa_info, 0 otherwise. nodeId is used as an index without range check. */
+int
+proc_findProcessor(uint32_t nodeId, uint32_t coreId)
+{
+    int pos = 0;
+
+    while (pos < numa_info.nodes[nodeId].numberOfProcessors)
+    {
+        if (numa_info.nodes[nodeId].processors[pos] == coreId) return 1;
+        pos++;
+    }
+    return 0;
+}
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+/* Scan /sys/devices/system/node for entries whose name starts with "node"
+ * and return the largest number that str2int() yields for the text after
+ * that prefix.
+ *
+ * Returns 0 when the directory cannot be opened or no number above 0 is seen. */
+static int
+setConfiguredNodes(void)
+{
+    int highestId = 0;
+    struct dirent *entry;
+    DIR *nodeDir = opendir("/sys/devices/system/node");
+
+    if (nodeDir == NULL)
+    {
+        return 0;
+    }
+
+    for (entry = readdir(nodeDir); entry != NULL; entry = readdir(nodeDir))
+    {
+        /* only entries starting with "node" are of interest */
+        if (strncmp(entry->d_name, "node", 4) == 0)
+        {
+            int id = str2int(entry->d_name+4);
+            /* remember the highest id seen so far */
+            if (id > highestId)
+            {
+                highestId = id;
+            }
+        }
+    }
+
+    closedir(nodeDir);
+    return highestId;
+}
+
+
+/* Read MemTotal and MemFree of NUMA node <node> from its sysfs meminfo file.
+ *
+ * totalMemory/freeMemory receive the number parsed from the "MemTotal:"
+ * resp. "MemFree:" line and are left untouched when that line is missing.
+ * If the file cannot be opened the ERROR macro is invoked. */
+static void
+nodeMeminfo(int node, uint64_t* totalMemory, uint64_t* freeMemory)
+{
+    FILE *fp;
+    bstring filename;
+    bstring totalString = bformat("MemTotal:");
+    bstring freeString  = bformat("MemFree:");
+    int i;
+
+    filename = bformat("/sys/devices/system/node/node%d/meminfo", node);
+    if (NULL != (fp = fopen (bdata(filename), "r")))
+    {
+        bstring src = bread ((bNread) fread, fp);
+        struct bstrList* tokens = bsplit(src,(char) '\n');
+        for (i=0;i<tokens->qty;i++)
+        {
+            if (binstr(tokens->entry[i],0,totalString) != BSTR_ERR)
+            {
+                bstring tmp = bmidstr (tokens->entry[i], 18, blength(tokens->entry[i])-18 );
+                bltrimws(tmp);
+                struct bstrList* subtokens = bsplit(tmp,(char) ' ');
+                *totalMemory = str2int(bdata(subtokens->entry[0]));
+                bstrListDestroy(subtokens);
+                bdestroy(tmp);
+            }
+            else if (binstr(tokens->entry[i],0,freeString) != BSTR_ERR)
+            {
+                bstring tmp = bmidstr (tokens->entry[i], 18, blength(tokens->entry[i])-18 );
+                bltrimws(tmp);
+                struct bstrList* subtokens = bsplit(tmp,(char) ' ');
+                *freeMemory = str2int(bdata(subtokens->entry[0]));
+                bstrListDestroy(subtokens);
+                bdestroy(tmp);
+            }
+        }
+        bdestroy(src);
+        bstrListDestroy(tokens);
+        /* close only here, where fp is known to be a valid stream */
+        fclose(fp);
+    }
+    else
+    {
+        ERROR;
+    }
+    bdestroy(filename);
+    bdestroy(totalString);
+    bdestroy(freeString);
+}
+
+/* Build the list of processor ids belonging to one NUMA node.
+ *
+ * The ids are decoded from /sys/devices/system/node/node<node>/cpumap, a
+ * comma separated list of hexadecimal words of which the last one holds the
+ * lowest ids. Each word contributes unitSize (32) bit positions.
+ *
+ * node  id used to build the sysfs path
+ * list  receives a malloc'ed array of MAX_NUM_THREADS entries; it stays
+ *       allocated on every return path except -ENOMEM
+ *
+ * Returns the number of ids stored, -ENOMEM if the array cannot be
+ * allocated, -1 if the cpumap file cannot be opened, or -EFAULT if a word
+ * cannot be parsed or more than MAX_NUM_THREADS ids are present. */
+static int
+nodeProcessorList(int node, uint32_t** list)
+{
+    FILE *fp;
+    bstring filename;
+    bstring src;
+    struct bstrList* tokens;
+    unsigned long val;
+    char* endptr;
+    int i,j;
+    int count = 0;
+    int cursor = 0;
+    int status = 0;
+    int unitSize = (int) 32; /* 8 nibbles */
+
+    *list = (uint32_t*) malloc(MAX_NUM_THREADS * sizeof(uint32_t));
+    if (!(*list))
+    {
+        return -ENOMEM;
+    }
+
+    /* the cpumap interface should be always there */
+    filename = bformat("/sys/devices/system/node/node%d/cpumap", node);
+    fp = fopen (bdata(filename), "r");
+    bdestroy(filename);
+    if (fp == NULL)
+    {
+        return -1;
+    }
+    src = bread ((bNread) fread, fp);
+    fclose(fp);
+    tokens = bsplit(src,',');
+    for (i=(tokens->qty-1); i>=0 && status == 0; i--)
+    {
+        /* strtoul reports failure only through errno, so start from a clean value */
+        errno = 0;
+        val = strtoul((char*) tokens->entry[i]->data, &endptr, 16);
+        if (errno != 0 && (val == ULONG_MAX || val == 0))
+        {
+            status = -EFAULT;
+        }
+        else if (endptr == (char*) tokens->entry[i]->data)
+        {
+            ERROR_PLAIN_PRINT(No digits were found);
+            status = -EFAULT;
+        }
+        for (j=0; j<unitSize && status == 0; j++)
+        {
+            if (!(val&(1UL<<j)))
+            {
+                continue;
+            }
+            if (count < MAX_NUM_THREADS)
+            {
+                (*list)[count++] = (j+cursor);
+            }
+            else
+            {
+                ERROR_PRINT(Number Of threads %d too large,count);
+                status = -EFAULT;
+            }
+        }
+        cursor += unitSize;
+    }
+    /* single exit: the parse buffers are released on success and failure alike */
+    bstrListDestroy(tokens);
+    bdestroy(src);
+    return (status == 0) ? count : status;
+}
+ 
+/* Read the distance row of one NUMA node from
+ * /sys/devices/system/node/node<node>/distance (blank separated decimals).
+ *
+ * *list receives a malloc'ed array of numberOfNodes entries. Returns the
+ * number of values stored, -ENOMEM (no array), -1 (file not opened, array
+ * kept) or -EFAULT (more than numberOfNodes values, array kept). */
+static int
+nodeDistanceList(int node, int numberOfNodes, uint32_t** list)
+{
+    FILE *fp;
+    bstring filename;
+    bstring src;
+    struct bstrList* tokens;
+    int count = 0;
+    int status = 0;
+
+    *list = (uint32_t*) malloc(numberOfNodes * sizeof(uint32_t));
+    if (!(*list))
+    {
+        return -ENOMEM;
+    }
+    /* the distance interface should be always there */
+    filename = bformat("/sys/devices/system/node/node%d/distance", node);
+    fp = fopen (bdata(filename), "r");
+    bdestroy(filename);
+    if (fp == NULL)
+    {
+        return -1;
+    }
+    src = bread ((bNread) fread, fp);
+    fclose(fp);
+    tokens = bsplit(src,' ');
+    for (int i=0; i<(tokens->qty) && status == 0; i++)
+    {
+        if (count < numberOfNodes)
+        {
+            (*list)[count++] = (uint32_t)strtoul((char*) (tokens->entry[i]->data), NULL, 10);
+        }
+        else
+        {
+            ERROR_PRINT(Number Of nodes %d too large,count);
+            status = -EFAULT;
+        }
+    }
+    bstrListDestroy(tokens);
+    bdestroy(src);
+    return (status == 0) ? count : status;
+}
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+
+/* Fill numa_info from sysfs with one entry per node id from 0 up to the
+ * highest configured id (memory sizes, processor list, distance row).
+ *
+ * Returns 0 on success, -1 if get_mempolicy reports ENOSYS, -ENOMEM if the
+ * node array cannot be allocated, -EFAULT or a helper's negative code otherwise. */
+int proc_numa_init(void)
+{
+    int ret;
+    uint32_t i;
+
+    if (get_mempolicy(NULL, NULL, 0, 0, 0) < 0 && errno == ENOSYS)
+    {
+        numa_info.numberOfNodes = 0;
+        numa_info.nodes = NULL;
+        return -1;
+    }
+    /* First determine maximum number of nodes */
+    numa_info.numberOfNodes = setConfiguredNodes()+1;
+    numa_info.nodes = (NumaNode*) malloc(numa_info.numberOfNodes * sizeof(NumaNode));
+    if (!numa_info.nodes)
+    {
+        return -ENOMEM;
+    }
+    for (i=0; i<numa_info.numberOfNodes; i++)
+    {
+        numa_info.nodes[i].id = i;
+        nodeMeminfo(i, &numa_info.nodes[i].totalMemory, &numa_info.nodes[i].freeMemory);
+        /* keep helper results in an int: their negative error codes must stay visible */
+        ret = nodeProcessorList(i,&numa_info.nodes[i].processors);
+        numa_info.nodes[i].numberOfProcessors = (ret > 0) ? ret : 0;
+        if (ret <= 0) return (ret < 0) ? ret : -EFAULT;
+        ret = nodeDistanceList(i, numa_info.numberOfNodes, &numa_info.nodes[i].distances);
+        numa_info.nodes[i].numberOfDistances = (ret > 0) ? ret : 0;
+        if (ret <= 0) return (ret < 0) ? ret : -EFAULT;
+    }
+    return 0;
+}
+
+/* Set the interleave memory policy over all NUMA nodes holding at least one
+ * processor of processorList; the ERROR macro is invoked if set_mempolicy fails. */
+void
+proc_numa_setInterleaved(int* processorList, int numberOfProcessors)
+{
+    unsigned long i;
+    int j;
+    int ret=0;
+    unsigned long mask = 0UL;
+    const unsigned long maskBits = 8 * sizeof(mask); /* nodes beyond this cannot be expressed in mask */
+
+    for (i=0; i<numa_info.numberOfNodes && i<maskBits; i++)
+    {
+        for (j=0; j<numberOfProcessors; j++)
+        {
+            if (proc_findProcessor(i,processorList[j]))
+            {
+                mask |= (1UL<<i);
+                break;
+            }
+        }
+    }
+    ret = set_mempolicy(MPOL_INTERLEAVE,&mask,maskBits+1); /* replaces the fixed 65 (= 64+1) so the kernel never reads past mask */
+    if (ret < 0)
+    {
+        ERROR;
+    }
+}
+
+/* Bind the memory area given by ptr and size to NUMA node domainId via
+ * mbind with MPOL_MF_STRICT; the ERROR macro is invoked if the call fails.
+ * NOTE(review): domainId is shifted into a single unsigned long without a
+ * range check - confirm callers keep it below the word width. */
+void
+proc_numa_membind(void* ptr, size_t size, int domainId)
+{
+    int status;
+    unsigned long nodeBit = 1UL<<domainId;
+    unsigned int strict = MPOL_MF_STRICT;
+
+    status = mbind(ptr, size, &nodeBit, numa_info.numberOfNodes+1, strict);
+    if (status < 0)
+    {
+        ERROR;
+    }
+}
diff --git a/src/pci.c b/src/pci.c
deleted file mode 100644
index 2e8a22f..0000000
--- a/src/pci.c
+++ /dev/null
@@ -1,398 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  pci.c
- *
- *      Description:  Implementation of pci module.
- *                   Provides API to read and write values to the hardware
- *                   performance monitoring registers in PCI Cfg space
- *                   for Intel Sandy Bridge Processors.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-/* #####   HEADER FILE INCLUDES   ######################################### */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <fcntl.h>
-#include <string.h>
-#include <unistd.h>
-#include <signal.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-
-#include <types.h>
-#include <accessClient.h>
-#include <bstrlib.h>
-#include <error.h>
-#include <pci.h>
-#include <cpuid.h>
-#include <affinity.h>
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-#define STRINGIFY(x) #x
-#define TOSTRING(x) STRINGIFY(x)
-
-#define PCI_ROOT_PATH  "/proc/bus/pci/"
-
-/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
-
-static int socket_fd = -1;
-static int FD[MAX_NUM_NODES][MAX_NUM_DEVICES];
-
-static char* pci_DevicePath[MAX_NUM_DEVICES] = {
- "13.5",   /* PCI_R3QPI_DEVICE_LINK_0 */
- "13.6",   /* PCI_R3QPI_DEVICE_LINK_1 */
- "13.1",   /* PCI_R2PCIE_DEVICE */
- "10.0",   /* PCI_IMC_DEVICE_CH_0 */
- "10.1",   /* PCI_IMC_DEVICE_CH_1 */
- "10.4",   /* PCI_IMC_DEVICE_CH_2 */
- "10.5",   /* PCI_IMC_DEVICE_CH_3 */
- "0e.1",   /* PCI_HA_DEVICE */
- "08.2",   /* PCI_QPI_DEVICE_PORT_0 */
- "09.2",   /* PCI_QPI_DEVICE_PORT_1 */
- "08.6",   /* PCI_QPI_MASK_DEVICE_PORT_0 */
- "09.6",   /* PCI_QPI_MASK_DEVICE_PORT_1 */
- "08.0",   /* PCI_QPI_MISC_DEVICE_PORT_0 */
- "09.0" }; /* PCI_QPI_MISC_DEVICE_PORT_1 */
-
-/* Socket to bus mapping -- will be determined at runtime;
- * typical mappings are:
- * Socket  Bus (2S)  Bus (4s)
- *   0        0xff      0x3f
- *   1        0x7f      0x7f
- *   2                  0xbf
- *   3                  0xff
- */
-static char* socket_bus[MAX_NUM_NODES];
-static int socket_count = 0;
-
-/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-void
-pci_init(int initSocket_fd)
-{
-    FILE *fptr;
-    char buf[1024];
-    uint32_t testDevice;
-    uint32_t sbus, sdevfn, svend;
-    int cntr = 0;
-    int active_devs = 0;
-
-    for ( int j=0; j<MAX_NUM_NODES; j++ )
-    {
-        socket_bus[j] = "N-A";
-        for (int i=0; i<MAX_NUM_DEVICES; i++)
-        {
-            FD[j][i] = 0;
-        }
-    }
-
-    if (cpuid_info.model == SANDYBRIDGE_EP)
-    {
-        testDevice = 0x80863c44;
-    }
-    else if (cpuid_info.model == IVYBRIDGE_EP)
-    {
-        testDevice = 0x80860e36;
-    }
-    else
-    {
-        /*
-        fprintf(stderr, "Unsupported architecture for pci based uncore. \
-                Thus, no support for PCI based Uncore counters.\n");
-                */
-        return;
-    }
-
-    if ( (fptr = fopen( "/proc/bus/pci/devices", "r")) == NULL )
-    {
-        fprintf(stderr, "Unable to open /proc/bus/pci/devices. \
-                Thus, no support for PCI based Uncore counters.\n");
-        return;
-    }
-
-    while( fgets(buf, sizeof(buf)-1, fptr) )
-    {
-        if ( sscanf(buf, "%2x%2x %8x", &sbus, &sdevfn, &svend) == 3 &&
-             svend == testDevice )
-        {
-            socket_bus[cntr] = (char*)malloc(4);
-            sprintf(socket_bus[cntr++], "%02x/", sbus);
-        }
-    }
-    fclose(fptr);
-
-    if ( cntr == 0 )
-    {
-        fprintf(stderr, "Uncore not supported on this system\n");
-        return;
-    }
-
-    socket_count = cntr;
-
-    bstring filepath =  bfromcstr ( PCI_ROOT_PATH );
-    bcatcstr(filepath, socket_bus[0]);
-    bcatcstr(filepath, pci_DevicePath[0] );
-
-
-    if (access(bdata(filepath),F_OK))
-    {
-        fprintf(stderr, "INFO\n");
-        fprintf(stderr, "This system has no support for PCI based Uncore counters.\n");
-        fprintf(stderr, "This means you cannot use performance groups as MEM, which require Uncore counters.\n\n");
-        return;
-    }
-    bdestroy(filepath);
-
-    for (int j=0; j<socket_count; j++)
-    {
-        for (int i=0; i<MAX_NUM_DEVICES; i++)
-        {
-
-            bstring filepath =  bfromcstr ( PCI_ROOT_PATH );
-            bcatcstr(filepath, socket_bus[j]);
-            bcatcstr(filepath, pci_DevicePath[i] );
-
-            if (!access(bdata(filepath),F_OK))
-            {
-                FD[j][i] = 0;
-            }
-            else
-            {
-                FD[j][i] = -2;
-            }
-            bdestroy(filepath);
-        }
-    }
-
-    if (accessClient_mode == DAEMON_AM_DIRECT)
-    {
-        if(geteuid() != 0)
-        {
-            fprintf(stderr, "WARNING\n");
-            fprintf(stderr, "Direct access to the PCI Cfg Adressspace is only allowed for uid root!\n");
-            fprintf(stderr, "This means you can use performance groups as MEM only as root in direct mode.\n");
-            fprintf(stderr, "Alternatively you might want to look into (sys)daemonmode.\n\n");
-        }
-    }
-    else /* daemon or sysdaemon-mode */
-    {
-        socket_fd = initSocket_fd;
-    }
-}
-
-
-void
-pci_finalize()
-{
-    for (int j=0; j<socket_count; j++)
-    {
-        for (int i=0; i<MAX_NUM_DEVICES; i++)
-        {
-            if (FD[j][i] > 0)
-            {
-                close(FD[j][i]);
-            }
-        }
-    }
-
-    if (accessClient_mode != DAEMON_AM_DIRECT)
-    {
-        socket_fd = -1;
-    }
-}
-
-
-uint32_t
-pci_read(int cpu, PciDeviceIndex device, uint32_t reg)
-{
-    int socketId = affinity_core2node_lookup[cpu];
-    if ( FD[socketId][device] == -2)
-    {
-        fprintf(stderr, "Trying to access non-existent PCI device (%s) for reading\n", pci_DevicePath[device]);
-        return 0;
-    }
-
-    if (accessClient_mode == DAEMON_AM_DIRECT)
-    {
-        uint32_t data = 0;
-        if ( !FD[socketId][device] )
-        {
-            bstring filepath =  bfromcstr ( PCI_ROOT_PATH );
-            bcatcstr(filepath, socket_bus[socketId]);
-            bcatcstr(filepath, pci_DevicePath[device] );
-            FD[socketId][device] = open( bdata(filepath), O_RDWR);
-
-            if ( FD[socketId][device] < 0)
-            {
-                fprintf(stderr, "ERROR in pci_read: failed to open pci device %s: %s!\n",
-                        bdata(filepath), strerror(errno));
-            }
-            bdestroy(filepath);
-        }
-
-        if ( FD[socketId][device] > 0 &&
-             pread(FD[socketId][device], &data, sizeof data, reg) != sizeof data )
-        {
-            ERROR_PRINT("ERROR in pci_read: failed on CPU %d Register 0x%x", cpu, reg);
-        }
-
-        return data;
-    }
-    else
-    { /* daemon or sysdaemon-mode */
-        return (uint32_t) accessClient_read(socket_fd, socketId, device, reg);
-    }
-}
-
-
-
-void
-pci_write(int cpu, PciDeviceIndex device, uint32_t reg, uint32_t data)
-{
-    int socketId = affinity_core2node_lookup[cpu];
-
-    if ( FD[socketId][device] == -2)
-    {
-        fprintf(stderr, "Trying to access non-existent PCI device (%s) for writing\n", pci_DevicePath[device]);
-        return;
-    }
-    if (accessClient_mode == DAEMON_AM_DIRECT)
-    {
-        if ( !FD[socketId][device] )
-        {
-            bstring filepath =  bfromcstr ( PCI_ROOT_PATH );
-            bcatcstr(filepath, socket_bus[socketId]);
-            bcatcstr(filepath, pci_DevicePath[device] );
-            FD[socketId][device] = open( bdata(filepath), O_RDWR);
-
-            if ( FD[socketId][device] < 0)
-            {
-                fprintf(stderr, "ERROR in pci_write: failed to open pci device %s: %s!\n",
-                        bdata(filepath), strerror(errno));
-            }
-            bdestroy(filepath);
-        }
-
-        if ( FD[socketId][device] > 0 &&
-             pwrite(FD[socketId][device], &data, sizeof data, reg) != sizeof data)
-        {
-            ERROR_PRINT("ERROR in pci_write: failed on CPU %d Register 0x%x", cpu, reg);
-        }
-    }
-    else
-    { /* daemon or sysdaemon-mode */
-        accessClient_write(socket_fd, socketId, device, reg, (uint64_t) data);
-    }
-}
-
-uint32_t
-pci_tread(const int tsocket_fd, const int cpu, PciDeviceIndex device, uint32_t reg)
-{
-    int socketId = affinity_core2node_lookup[cpu];
-    if ( FD[socketId][device] == -2)
-    {
-        return 0;
-    }
-
-    if (accessClient_mode == DAEMON_AM_DIRECT)
-    {
-        uint32_t data = 0;
-        if ( !FD[socketId][device] )
-        {
-            bstring filepath =  bfromcstr ( PCI_ROOT_PATH );
-            bcatcstr(filepath, socket_bus[socketId]);
-            bcatcstr(filepath, pci_DevicePath[device] );
-
-            FD[socketId][device] = open( bdata(filepath), O_RDWR);
-
-            if ( FD[socketId][device] < 0)
-            {
-                fprintf(stderr, "ERROR in pci_tread:\n    failed to open pci device %s: %s!\n",
-                        bdata(filepath), strerror(errno));
-            }
-            bdestroy(filepath);
-        }
-
-        if ( FD[socketId][device] > 0 &&
-             pread(FD[socketId][device], &data, sizeof data, reg) != sizeof data )
-        {
-            ERROR_PRINT("ERROR in pci_tread: failed on CPU %d Register 0x%x", cpu, reg);
-        }
-
-        return data;
-    }
-    else
-    { /* daemon or sysdaemon-mode */
-        return accessClient_read(tsocket_fd, socketId, device, reg);
-    }
-}
-
-void
-pci_twrite( const int tsocket_fd, const int cpu, PciDeviceIndex device, uint32_t reg, uint32_t data)
-{
-    int socketId = affinity_core2node_lookup[cpu];
-    if ( FD[socketId][device] == -2)
-    {
-        return;
-    }
-    if (accessClient_mode == DAEMON_AM_DIRECT)
-    {
-        if ( !FD[socketId][device] )
-        {
-            bstring filepath =  bfromcstr ( PCI_ROOT_PATH );
-            bcatcstr(filepath, socket_bus[socketId]);
-            bcatcstr(filepath, pci_DevicePath[device] );
-
-            FD[socketId][device] = open( bdata(filepath), O_RDWR);
-
-            if ( FD[socketId][device] < 0)
-            {
-                fprintf(stderr, "ERROR in pci_twrite: failed to open pci device %s: %s!\n",
-                        bdata(filepath), strerror(errno));
-            }
-            bdestroy(filepath);
-        }
-
-        if ( FD[socketId][device] > 0 &&
-             pwrite(FD[socketId][device], &data, sizeof data, reg) != sizeof data)
-        {
-            ERROR_PRINT("ERROR in pci_twrite: failed on CPU %d Register 0x%x", cpu, reg);
-        }
-    }
-    else
-    { /* daemon or sysdaemon-mode */
-        accessClient_write(tsocket_fd, socketId, device, reg, data);
-    }
-}
-
-
-
diff --git a/src/pci_hwloc.c b/src/pci_hwloc.c
new file mode 100644
index 0000000..217e447
--- /dev/null
+++ b/src/pci_hwloc.c
@@ -0,0 +1,81 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  pci_hwloc.c
+ *
+ *      Description:  Interface to hwloc for PCI device lookup
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+
+
+#include <hwloc.h>
+#include <types.h>
+#include <bstrlib.h>
+#include <affinity.h>
+#include <topology.h>
+#include <topology_hwloc.h>
+#include <error.h>
+
+int 
+hwloc_pci_init(uint16_t testDevice, char** socket_bus, int* nrSockets)
+{
+    /* Looks up all PCI devices with vendor ID 0x8086 and device ID testDevice through
+     * hwloc and stores one malloc'ed "xx/" bus prefix per match in socket_bus. Sets
+     * *nrSockets and returns 0, -ENODEV if no device matched or -ENOMEM. */
+    int cntr = 0;
+    uint16_t testVendor = 0x8086;
+    hwloc_obj_t obj;
+    int i;
+
+    if (!hwloc_topology)
+    {
+        likwid_hwloc_topology_init(&hwloc_topology);
+        likwid_hwloc_topology_set_flags(hwloc_topology, HWLOC_TOPOLOGY_FLAG_WHOLE_IO );
+        likwid_hwloc_topology_load(hwloc_topology);
+    }
+
+    for(i = 0; i < likwid_hwloc_get_nbobjs_by_type(hwloc_topology, HWLOC_OBJ_PCI_DEVICE); i++)
+    {
+        obj = likwid_hwloc_get_obj_by_type(hwloc_topology, HWLOC_OBJ_PCI_DEVICE, i);
+        if ((obj->attr->pcidev.vendor_id != testVendor) || (obj->attr->pcidev.device_id != testDevice))
+            continue;
+        socket_bus[cntr] = (char*)malloc(4);
+        if (socket_bus[cntr] == NULL)
+        {
+            *nrSockets = cntr;
+            return -ENOMEM;
+        }
+        snprintf(socket_bus[cntr++], 4, "%02x/", obj->attr->pcidev.bus);
+    }
+    *nrSockets = cntr;
+    if (cntr == 0)
+    {
+        return -ENODEV;
+    }
+    return 0;
+}
diff --git a/src/pci_proc.c b/src/pci_proc.c
new file mode 100644
index 0000000..cee436f
--- /dev/null
+++ b/src/pci_proc.c
@@ -0,0 +1,125 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  pci_proc.c
+ *
+ *      Description:  Interface to procfs/sysfs for PCI device lookup
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <fcntl.h>
+
+
+#include <types.h>
+#include <bstrlib.h>
+#include <affinity.h>
+#include <topology.h>
+#include <error.h>
+
+int getBusFromSocket(const uint32_t socket)
+{
+    /* Walks the sockets 0..socket: for each one, reads the 32 bit value at offset 0x108
+     * of /proc/bus/pci/<bus>/05.0 and takes bits 8-15 as that socket's bus; the search
+     * for the next socket starts one bus later. Returns the bus of `socket` or -1. */
+    char pci_filepath[1024];
+    uint32_t cpubusno;
+    uint32_t idx;
+    int bus = 0;
+    int fd;
+    int nread;
+
+    for (idx = 0; idx <= socket; ++idx)
+    {
+        sprintf(pci_filepath, "/proc/bus/pci/%02x/05.0", bus);
+        fd = open(pci_filepath, O_RDONLY);
+        if (fd < 0)
+            return -1;
+        cpubusno = 0;
+        nread = pread(fd, &cpubusno, sizeof(uint32_t), 0x108);
+        /* The descriptor is closed whether or not the read delivered all bytes. */
+        close(fd);
+        if (nread != sizeof(uint32_t))
+            return -1;
+        bus = (cpubusno >> 8) & 0x0ff;
+        if (idx == socket)
+            return bus;
+        /* Continue behind the bus just found; give up once the 8 bit range is left. */
+        if (++bus > 0x0ff)
+            return -1;
+    }
+
+    return -1;
+}
+
+int
+proc_pci_init(uint16_t testDevice, char** socket_bus, int* nrSockets)
+{
+    /* Scans /proc/bus/pci/devices for devices with vendor ID 0x8086 and device ID
+     * testDevice and stores one malloc'ed "xx/" bus prefix per match in socket_bus.
+     * Returns 0, -ENODEV (file unreadable or no match) or -ENOMEM; *nrSockets is set
+     * to the number of stored entries unless the file could not be opened. */
+    FILE *fptr;
+    char buf[1024];
+    int cntr = 0;
+    uint16_t testVendor = 0x8086;
+    uint32_t sbus, sdevfn, svend, sdev;
+    int busID;
+
+    if ( (fptr = fopen( "/proc/bus/pci/devices", "r")) == NULL )
+    {
+        fprintf(stderr, "Unable to open /proc/bus/pci/devices. "
+                "Thus, no support for PCI based Uncore counters.\n");
+        return -ENODEV;
+    }
+
+    while( fgets(buf, sizeof(buf)-1, fptr) )
+    {
+        if ( sscanf(buf, "%2x%2x %4x%4x", &sbus, &sdevfn, &svend, &sdev) == 4 &&
+             svend == testVendor && sdev == testDevice )
+        {
+            socket_bus[cntr] = (char*)malloc(4);
+            if (socket_bus[cntr] == NULL)
+            {
+                fclose(fptr);
+                *nrSockets = cntr;
+                return -ENOMEM;
+            }
+            /* getBusFromSocket() returns -1 on failure; printing that would overflow the
+             * 4 byte buffer, so fall back to the bus parsed from the devices file. */
+            busID = getBusFromSocket(cntr);
+            snprintf(socket_bus[cntr], 4, "%02x/", ((busID >= 0) ? (uint32_t)busID : sbus) & 0xffU);
+            cntr++;
+        }
+    }
+    fclose(fptr);
+    *nrSockets = cntr;
+    if ( cntr == 0 )
+    {
+        return -ENODEV;
+    }
+    return 0;
+}
diff --git a/src/perfgroup.c b/src/perfgroup.c
new file mode 100644
index 0000000..166790e
--- /dev/null
+++ b/src/perfgroup.c
@@ -0,0 +1,1285 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfgroup.c
+ *
+ *      Description:  Handler for performance groups and event sets
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <dirent.h>
+
+#include <error.h>
+#include <perfgroup.h>
+#include <calculator.h>
+#include <likwid.h>
+
+int isdir(char* dirname)
+{
+    /* Returns 1 if dirname names a readable directory and 0 otherwise (also for NULL). */
+    struct stat st;
+    if (NULL == dirname)
+        return 0;
+    /* stat() can still fail after access() succeeded; never read st in that case. */
+    if ((access(dirname, R_OK) != 0) || (stat(dirname, &st) != 0))
+        return 0;
+    return S_ISDIR(st.st_mode) ? 1 : 0;
+}
+
+int get_groups(char* grouppath, char* architecture, char*** groupnames, char*** groupshort, char*** grouplong)
+{
+    /* Collects the performance groups (*.txt files) found in <grouppath>/<architecture>
+     * and, when readable, in $HOME/.likwid/groups/<architecture>. Returns the number of
+     * groups and stores malloc'ed arrays in *groupnames, *groupshort (text of the SHORT
+     * line) and *grouplong (lines following LONG), or returns a negative errno value. */
+    int i = 0, j = 0, s = 0, fsize = 0, hsize = 0;
+    DIR *dp = NULL;
+    FILE* fp = NULL;
+    char buf[256] = { [0 ... 255] = '\0' };
+    struct dirent *ep = NULL;
+    int search_home = 0;
+    int read_long = 0;
+    const char* home = getenv("HOME"); /* NULL when unset: the home directory is skipped */
+    /* Validate all pointers before they are dereferenced and before anything is allocated. */
+    if ((grouppath == NULL)||(architecture == NULL)||(groupnames == NULL)||(groupshort == NULL)||(grouplong == NULL))
+        return -EINVAL;
+    *groupnames = NULL;
+    *groupshort = NULL;
+    *grouplong = NULL;
+    bstring SHORT = bformat("SHORT");
+    bstring LONG = bformat("LONG");
+    /* Both path buffers later get "/<d_name>" appended, so reserve room for the longest name. */
+    char* fullpath = malloc((strlen(grouppath)+strlen(architecture)+sizeof(ep->d_name)+50) * sizeof(char));
+    if (fullpath == NULL)
+    {
+        bdestroy(SHORT);
+        bdestroy(LONG);
+        return -ENOMEM;
+    }
+    char* homepath = malloc((strlen((home != NULL) ? home : "")+strlen(architecture)+sizeof(ep->d_name)+50) * sizeof(char));
+    if (homepath == NULL)
+    {
+        free(fullpath);
+        bdestroy(SHORT);
+        bdestroy(LONG);
+        return -ENOMEM;
+    }
+    fsize = sprintf(fullpath, "%s/%s", grouppath, architecture);
+    if (isdir(fullpath))
+    {
+        dp = opendir(fullpath);
+        if (dp == NULL)
+        {
+            printf("Cannot open directory %s\n", fullpath);
+            free(fullpath);
+            free(homepath);
+            bdestroy(SHORT);
+            bdestroy(LONG);
+            return -EACCES;
+        }
+    }
+    else
+    {
+        printf("Cannot access directory %s\n", fullpath);
+        free(fullpath);
+        free(homepath);
+        bdestroy(SHORT);
+        bdestroy(LONG);
+        return -EACCES;
+    }
+    while (ep = readdir(dp))
+    {
+        if ((strlen(ep->d_name) >= 4) && (strncmp(&(ep->d_name[strlen(ep->d_name)-4]), ".txt", 4) == 0))
+        {
+            i++;
+            if (strlen(ep->d_name)-4 > s)
+                s = strlen(ep->d_name)-4;
+        }
+    }
+    closedir(dp);
+    hsize = sprintf(homepath, "%s/.likwid/groups/%s", (home != NULL) ? home : "", architecture);
+    if ((home != NULL) && isdir(homepath))
+    {
+        dp = opendir(homepath);
+        search_home = (dp != NULL) ? 1 : 0;
+        if (search_home)
+        {
+            while (ep = readdir(dp))
+            {
+                if ((strlen(ep->d_name) >= 4) && (strncmp(&(ep->d_name[strlen(ep->d_name)-4]), ".txt", 4) == 0))
+                {
+                    i++;
+                    if (strlen(ep->d_name)-4 > s)
+                        s = strlen(ep->d_name)-4;
+                }
+            }
+            closedir(dp);
+        }
+    }
+
+    *groupnames = malloc(i * sizeof(char**));
+    if (*groupnames == NULL)
+    {
+        free(fullpath);
+        free(homepath);
+        bdestroy(SHORT);
+        bdestroy(LONG);
+        return -ENOMEM;
+    }
+    *groupshort = malloc(i * sizeof(char**));
+    if (*groupshort == NULL)
+    {
+        free(*groupnames);
+        *groupnames = NULL;
+        free(fullpath);
+        free(homepath);
+        bdestroy(SHORT);
+        bdestroy(LONG);
+        return -ENOMEM;
+    }
+    *grouplong = malloc(i * sizeof(char**));
+    if (*grouplong == NULL)
+    {
+        free(*groupnames);
+        *groupnames = NULL;
+        free(*groupshort);
+        *groupshort = NULL;
+        free(fullpath);
+        free(homepath);
+        bdestroy(SHORT);
+        bdestroy(LONG);
+        return -ENOMEM;
+    }
+    for (j=0; j < i; j++)
+    {
+        (*grouplong)[j] = NULL;  /* stays NULL unless the group file has a LONG section */
+        (*groupshort)[j] = NULL; /* stays NULL unless the group file has a SHORT line */
+        (*groupnames)[j] = malloc((s+1) * sizeof(char));
+        if ((*groupnames)[j] == NULL)
+        {
+            free(*groupnames);
+            *groupnames = NULL;
+            free(*groupshort);
+            *groupshort = NULL;
+            free(*grouplong);
+            *grouplong = NULL;
+            free(fullpath);
+            free(homepath);
+            bdestroy(SHORT);
+            bdestroy(LONG);
+            return -ENOMEM;
+        }
+    }
+    dp = opendir(fullpath);
+    i = 0;
+    /* Second pass: i now counts the groups that could actually be read. */
+    while ((dp != NULL) && ((ep = readdir(dp)) != NULL))
+    {
+        if ((strlen(ep->d_name) >= 4) && (strncmp(&(ep->d_name[strlen(ep->d_name)-4]), ".txt", 4) == 0))
+        {
+            read_long = 0;
+            bstring long_info = bfromcstr("");;
+            sprintf(&(fullpath[fsize]), "/%s", ep->d_name);
+            if (!access(fullpath, R_OK))
+            {
+                (*grouplong)[i] = NULL;
+                s = sprintf((*groupnames)[i], "%.*s", (int)(strlen(ep->d_name)-4), ep->d_name);
+                (*groupnames)[i][s] = '\0';
+                fp = fopen(fullpath,"r");
+                /* fopen() can fail although access() succeeded; such a group keeps NULL texts. */
+                while ((fp != NULL) && fgets (buf, sizeof(buf), fp)) {
+                    bstring bbuf = bfromcstr(buf);
+                    btrimws(bbuf);
+                    if ((blength(bbuf) == 0) || (buf[0] == '#'))
+                    {
+                        bdestroy(bbuf);
+                        continue;
+                    }
+                    if (bstrncmp(bbuf, SHORT, 5) == 0)
+                    {
+                        struct bstrList * linelist = bsplit(bbuf, ' ');
+                        bstring sinfo;
+                        if (linelist->qty == 1)
+                        {
+                            fprintf(stderr,"Cannot read SHORT section in groupfile %s",fullpath);
+                            bdestroy(bbuf);
+                            bstrListDestroy(linelist);
+                            continue;
+                        }
+                        s = 1;
+                        for (j=s;j<linelist->qty; j++)
+                        {
+                            btrimws(linelist->entry[j]);
+                            if (blength(linelist->entry[j]) == 0)
+                                s += 1;
+                            else
+                                break;
+                        }
+                        btrimws(linelist->entry[s]);
+                        sinfo = bformat("%s", bdata(linelist->entry[s]));
+                        for (j=s+1;j<linelist->qty; j++)
+                        {
+                            btrimws(linelist->entry[j]);
+                            bstring tmp = bformat(" %s", bdata(linelist->entry[j]));
+                            bconcat(sinfo, tmp);
+                            bdestroy(tmp);
+                        }
+                        free((*groupshort)[i]); /* drop the text of an earlier SHORT line, if any */
+                        (*groupshort)[i] = malloc((blength(sinfo)+1) * sizeof(char));
+                        if ((*groupshort)[i] == NULL)
+                        {
+                            bdestroy(SHORT);
+                            bdestroy(LONG);
+                            bdestroy(bbuf);
+                            bdestroy(sinfo);
+                            free(homepath);
+                            free(fullpath);
+                            bstrListDestroy(linelist);
+                            return -ENOMEM;
+                        }
+                        s = sprintf((*groupshort)[i], "%s", bdata(sinfo));
+                        (*groupshort)[i][s] = '\0';
+                        bstrListDestroy(linelist);
+                        bdestroy(sinfo);
+                    }
+                    else if (bstrncmp(bbuf, LONG, 4) == 0)
+                    {
+                        read_long = 1;
+                    }
+                    else if ((read_long == 1) && (bstrncmp(bbuf, LONG, 4) != 0))
+                    {
+                        bstring tmp = bfromcstr(buf);
+                        bconcat(long_info, tmp);
+                        bdestroy(tmp);
+                    }
+                    bdestroy(bbuf);
+                }
+                if (read_long)
+                {
+                    (*grouplong)[i] = malloc((blength(long_info) + 1) * sizeof(char) );
+                    if ((*grouplong)[i] != NULL)
+                    {
+                        j = sprintf((*grouplong)[i], "%s", bdata(long_info));
+                        (*grouplong)[i][j] = '\0';
+                    }
+                }
+                if (fp != NULL)
+                    fclose(fp);
+                i++;
+            }
+            bdestroy(long_info);
+        }
+    }
+    if (dp != NULL) closedir(dp);
+    if (!search_home)
+    {
+        free(homepath);
+        free(fullpath);
+        bdestroy(SHORT);
+        bdestroy(LONG);
+        return i;
+    }
+    else
+    {
+        dp = opendir(homepath);
+        while ((dp != NULL) && ((ep = readdir(dp)) != NULL))
+        {
+            if ((strlen(ep->d_name) >= 4) && (strncmp(&(ep->d_name[strlen(ep->d_name)-4]), ".txt", 4) == 0))
+            {
+                read_long = 0;
+                bstring long_info = bfromcstr("");;
+                sprintf(&(homepath[hsize]), "/%s", ep->d_name);
+                if (!access(homepath, R_OK))
+                {
+                    (*grouplong)[i] = NULL;
+                    s = sprintf((*groupnames)[i], "%.*s", (int)(strlen(ep->d_name)-4), ep->d_name);
+                    (*groupnames)[i][s] = '\0';
+                    fp = fopen(homepath,"r");
+                    while ((fp != NULL) && fgets (buf, sizeof(buf), fp)) {
+                        /* Same parsing as for the group files below grouppath. */
+                        bstring bbuf = bfromcstr(buf);
+                        btrimws(bbuf);
+                        if ((blength(bbuf) == 0) || (buf[0] == '#'))
+                        {
+                            bdestroy(bbuf);
+                            continue;
+                        }
+                        if (bstrncmp(bbuf, SHORT, 5) == 0)
+                        {
+                            struct bstrList * linelist = bsplit(bbuf, ' ');
+                            bstring sinfo;
+                            if (linelist->qty == 1)
+                            {
+                                fprintf(stderr,"Cannot read SHORT section in groupfile %s",homepath);
+                                bdestroy(bbuf);
+                                bstrListDestroy(linelist);
+                                continue;
+                            }
+                            s = 1;
+                            for (j=s;j<linelist->qty; j++)
+                            {
+                                btrimws(linelist->entry[j]);
+                                if (blength(linelist->entry[j]) == 0)
+                                    s += 1;
+                                else
+                                    break;
+                            }
+                            btrimws(linelist->entry[s]);
+                            sinfo = bformat("%s", bdata(linelist->entry[s]));
+                            for (j=s+1;j<linelist->qty; j++)
+                            {
+                                btrimws(linelist->entry[j]);
+                                bstring tmp = bformat(" %s", bdata(linelist->entry[j]));
+                                bconcat(sinfo, tmp);
+                                bdestroy(tmp);
+                            }
+                            free((*groupshort)[i]); /* drop the text of an earlier SHORT line, if any */
+                            (*groupshort)[i] = malloc((blength(sinfo)+1) * sizeof(char));
+                            if ((*groupshort)[i] == NULL)
+                            {
+                                bdestroy(SHORT);
+                                bdestroy(LONG);
+                                bdestroy(bbuf);
+                                bdestroy(sinfo);
+                                free(homepath);
+                                free(fullpath);
+                                bstrListDestroy(linelist);
+                                return -ENOMEM;
+                            }
+                            s = sprintf((*groupshort)[i], "%s", bdata(sinfo));
+                            (*groupshort)[i][s] = '\0';
+                            bstrListDestroy(linelist);
+                            bdestroy(sinfo);
+                        }
+                        else if (bstrncmp(bbuf, LONG, 4) == 0)
+                        {
+                            read_long = 1;
+                        }
+                        else if ((read_long == 1) && (bstrncmp(bbuf, LONG, 4) != 0))
+                        {
+                            bstring tmp = bfromcstr(buf);
+                            bconcat(long_info, tmp);
+                            bdestroy(tmp);
+                        }
+                        bdestroy(bbuf);
+                    }
+                    if (read_long)
+                    {
+                        (*grouplong)[i] = malloc((blength(long_info) + 1) * sizeof(char) );
+                        if ((*grouplong)[i] != NULL)
+                        {
+                            j = sprintf((*grouplong)[i], "%s", bdata(long_info));
+                            (*grouplong)[i][j] = '\0';
+                        }
+                    }
+                    if (fp != NULL) fclose(fp);
+                    i++;
+                }
+                bdestroy(long_info);
+            }
+        }
+        if (dp != NULL) closedir(dp);
+    }
+    bdestroy(SHORT);
+    bdestroy(LONG);
+    free(fullpath);
+    free(homepath);
+    return i;
+}
+
+void return_groups(int groups, char** groupnames, char** groupshort, char** grouplong)
+{
+    /* Releases the first `groups` strings of each array and then the arrays themselves. */
+    int i;
+    for (i = 0; i <groups; i++)
+    {
+        /* Test the array itself before indexing it: an array may be NULL, while a */
+        /* NULL entry needs no check because free(NULL) does nothing. */
+        if (groupnames)
+            free(groupnames[i]);
+        if (groupshort)
+            free(groupshort[i]);
+        if (grouplong)
+            free(grouplong[i]);
+    }
+    free(groupnames);
+    free(groupshort);
+    free(grouplong);
+}
+
+
+
+int custom_group(char* eventStr, GroupInfo* ginfo)
+{
+    /* Builds the group "Custom" from an event string "EVENT:COUNTER[:OPTION...],..." and
+     * appends an event for each of FIXC0, FIXC1 and FIXC2 that the string does not
+     * mention. On success returns 0 with ginfo->events, ginfo->counters and
+     * ginfo->nevents filled. On failure returns a negative errno value (-EINVAL or
+     * -ENOMEM) after releasing all allocations and resetting the ginfo pointers. */
+    int i, j;
+    int err = 0;
+    char delim = ',';
+    bstring edelim = bformat(":");
+    int has_fix[3] = { 0, 0, 0 };
+    /* Event/counter pairs added for the fixed counters missing from eventStr. */
+    static const char* fix_events[3] = { "INSTR_RETIRED_ANY", "CPU_CLK_UNHALTED_CORE", "CPU_CLK_UNHALTED_REF" };
+    static const char* fix_counters[3] = { "FIXC0", "FIXC1", "FIXC2" };
+    ginfo->shortinfo = NULL;
+    ginfo->nevents = 0;
+    ginfo->events = NULL;
+    ginfo->counters = NULL;
+    ginfo->nmetrics = 0;
+    ginfo->metricformulas = NULL;
+    ginfo->metricnames = NULL;
+    ginfo->longinfo = NULL;
+    ginfo->groupname = NULL;
+    bstring eventBstr;
+    /* NULL until the string is split, so the cleanup path can always destroy it. */
+    struct bstrList * eventList = NULL;
+    bstring fix0 = bformat("FIXC0");
+    bstring fix1 = bformat("FIXC1");
+    bstring fix2 = bformat("FIXC2");
+    DEBUG_PRINT(DEBUGLEV_INFO, Creating custom group for event string %s, eventStr);
+    
+    ginfo->shortinfo = malloc(7 * sizeof(char));
+    if (ginfo->shortinfo == NULL)
+    {
+        err = -ENOMEM;
+        goto cleanup;
+    }
+    sprintf(ginfo->shortinfo, "%s", "Custom");
+    ginfo->longinfo = malloc(7 * sizeof(char));
+    if (ginfo->longinfo == NULL)
+    {
+        err = -ENOMEM;
+        goto cleanup;
+    }
+    sprintf(ginfo->longinfo, "%s", "Custom");
+    ginfo->groupname = malloc(7 * sizeof(char));
+    if (ginfo->groupname == NULL)
+    {
+        err = -ENOMEM;
+        goto cleanup;
+    }
+    sprintf(ginfo->groupname, "%s", "Custom");
+    
+    eventBstr = bfromcstr(eventStr);
+    eventList = bsplit(eventBstr, delim);
+    if (eventList == NULL)
+    {
+        /* bfromcstr()/bsplit() fail for a NULL event string or when out of memory. */
+        bdestroy(eventBstr);
+        err = -EINVAL;
+        goto cleanup;
+    }
+    ginfo->nevents = eventList->qty;
+
+    /* Reserve one extra slot for every fixed counter that is not part of the string. */
+    has_fix[0] = (binstr(eventBstr, 0, fix0) > 0) ? 1 : 0;
+    has_fix[1] = (binstr(eventBstr, 0, fix1) > 0) ? 1 : 0;
+    has_fix[2] = (binstr(eventBstr, 0, fix2) > 0) ? 1 : 0;
+    ginfo->nevents += (3 - has_fix[0] - has_fix[1] - has_fix[2]);
+    bdestroy(eventBstr);
+
+    /* calloc zeroes the slots, which lets cleanup free every entry unconditionally. */
+    ginfo->events = calloc(ginfo->nevents, sizeof(char*));
+    if (ginfo->events == NULL)
+    {
+        err = -ENOMEM;
+        goto cleanup;
+    }
+    ginfo->counters = calloc(ginfo->nevents, sizeof(char*));
+    if (ginfo->counters == NULL)
+    {
+        err = -ENOMEM;
+        goto cleanup;
+    }
+    for (i = 0; i< eventList->qty; i++)
+    {
+        struct bstrList * elist;
+        elist = bsplit(eventList->entry[i], ':');
+        /* Every entry must provide at least "EVENT:COUNTER". */
+        if ((elist == NULL) || (elist->qty < 2))
+        {
+            bstrListDestroy(elist);
+            err = -EINVAL;
+            goto cleanup;
+        }
+        ginfo->events[i] = malloc((blength(elist->entry[0]) + 1) * sizeof(char));
+        if (ginfo->events[i] == NULL)
+        {
+            bstrListDestroy(elist);
+            err = -ENOMEM;
+            goto cleanup;
+        }
+        bstring ctr = bstrcpy(elist->entry[1]);
+        if (elist->qty > 2)
+        {
+            for (j = 2; j < elist->qty; j++)
+            {
+                bconcat(ctr, edelim);
+                bconcat(ctr, elist->entry[j]);
+            }
+        }
+        ginfo->counters[i] = malloc((blength(ctr) + 1) * sizeof(char));
+        if (ginfo->counters[i] == NULL)
+        {
+            bstrListDestroy(elist);
+            bdestroy(ctr);
+            err = -ENOMEM;
+            goto cleanup;
+        }
+        sprintf(ginfo->events[i], "%s", bdata(elist->entry[0]));
+        snprintf(ginfo->counters[i], blength(ctr)+1, "%s", bdata(ctr));
+        bdestroy(ctr);
+        bstrListDestroy(elist);
+    }
+    i = eventList->qty;
+    for (j = 0; j < 3; j++)
+    {
+        if (has_fix[j])
+            continue;
+        ginfo->events[i] = malloc((strlen(fix_events[j]) + 1) * sizeof(char));
+        ginfo->counters[i] = malloc((strlen(fix_counters[j]) + 1) * sizeof(char));
+        if ((ginfo->events[i] == NULL) || (ginfo->counters[i] == NULL))
+        {
+            err = -ENOMEM;
+            goto cleanup;
+        }
+        sprintf(ginfo->events[i], "%s", fix_events[j]);
+        sprintf(ginfo->counters[i], "%s", fix_counters[j]);
+        i++;
+    }
+
+cleanup:
+    /* Shared exit: the temporaries are released on success and on failure alike. */
+    bstrListDestroy(eventList);
+    bdestroy(fix0);
+    bdestroy(fix1);
+    bdestroy(fix2);
+    bdestroy(edelim);
+    if (err != 0)
+    {
+        /* Hand back an empty GroupInfo so that callers never see dangling pointers. */
+        for (i = 0; (ginfo->events != NULL) && (i < ginfo->nevents); i++)
+            free(ginfo->events[i]);
+        for (i = 0; (ginfo->counters != NULL) && (i < ginfo->nevents); i++)
+            free(ginfo->counters[i]);
+        free(ginfo->shortinfo);
+        free(ginfo->longinfo);
+        free(ginfo->groupname);
+        free(ginfo->events);
+        free(ginfo->counters);
+        ginfo->shortinfo = NULL;
+        ginfo->longinfo = NULL;
+        ginfo->groupname = NULL;
+        ginfo->events = NULL;
+        ginfo->counters = NULL;
+        ginfo->nevents = 0;
+    }
+    return err;
+}
+
+/*
+ * Copy the text held by a bstring into a newly malloc'ed, NUL-terminated
+ * C string. Returns NULL if the allocation fails; the caller frees the copy.
+ */
+static char* read_group_dup(bstring src)
+{
+    int len = blength(src);
+    char* copy = malloc((len + 1) * sizeof(char));
+    if (copy != NULL)
+    {
+        if (len > 0)
+            memcpy(copy, bdata(src), len);
+        copy[len] = '\0';
+    }
+    return copy;
+}
+
+/*
+ * Enlarge the string array *list, which currently holds count entries, by
+ * one slot. On failure the existing array is left untouched (and still owned
+ * by the caller) and -ENOMEM is returned; on success 0 is returned.
+ */
+static int read_group_grow(char*** list, int count)
+{
+    char** tmp = realloc(*list, (count + 1) * sizeof(char*));
+    if (tmp == NULL)
+        return -ENOMEM;
+    tmp[count] = NULL;
+    *list = tmp;
+    return 0;
+}
+
+/*
+ * Free everything read_group() has stored in ginfo so far and reset all
+ * fields, so that no dangling pointer is handed back to the caller.
+ */
+static void read_group_release(GroupInfo* ginfo)
+{
+    int i;
+    free(ginfo->groupname);
+    free(ginfo->shortinfo);
+    free(ginfo->longinfo);
+    for (i = 0; i < ginfo->nevents; i++)
+    {
+        free(ginfo->counters[i]);
+        free(ginfo->events[i]);
+    }
+    for (i = 0; i < ginfo->nmetrics; i++)
+    {
+        free(ginfo->metricformulas[i]);
+        free(ginfo->metricnames[i]);
+    }
+    free(ginfo->counters);
+    free(ginfo->events);
+    free(ginfo->metricformulas);
+    free(ginfo->metricnames);
+    ginfo->groupname = NULL;
+    ginfo->shortinfo = NULL;
+    ginfo->longinfo = NULL;
+    ginfo->counters = NULL;
+    ginfo->events = NULL;
+    ginfo->metricformulas = NULL;
+    ginfo->metricnames = NULL;
+    ginfo->nevents = 0;
+    ginfo->nmetrics = 0;
+}
+
+/*
+ * Read the group file <grouppath>/<architecture>/<groupname>.txt, falling
+ * back to $HOME/.likwid/groups/<architecture>/<groupname>.txt if the first
+ * file is not readable, and fill ginfo from it.
+ *
+ * Lines starting with '#' are skipped. A line starting with one of the
+ * names in groupFileSectionNames switches the current section:
+ *   - GROUP_SHORT:    the rest of that line becomes ginfo->shortinfo.
+ *   - GROUP_EVENTSET: each following line is "<counter> <event>"; an empty
+ *                     line ends the section.
+ *   - GROUP_METRICS:  each following line is "<name words...> <formula>",
+ *                     the last word being the formula; an empty line ends
+ *                     the section.
+ *   - GROUP_LONG:     all following lines are appended to ginfo->longinfo.
+ *
+ * Returns 0 on success, -EINVAL for NULL arguments or for an event/metric
+ * line with fewer than two words, -EACCES if no group file can be read and
+ * -ENOMEM if an allocation fails. After the fields of ginfo have been
+ * initialized, every error path releases all partial data and leaves the
+ * fields of ginfo set to NULL/0.
+ */
+int read_group(char* grouppath, char* architecture, char* groupname, GroupInfo* ginfo)
+{
+    FILE* fp = NULL;
+    int i, s, err = 0;
+    char buf[512];
+    char* home = NULL;
+    /* Copies that are allocated but not yet stored in ginfo. */
+    char* first = NULL;
+    char* second = NULL;
+    bstring fullpath = NULL;
+    bstring homepath = NULL;
+    bstring bbuf = NULL;
+    struct bstrList * linelist = NULL;
+    GroupFileSections sec = GROUP_NONE;
+    if ((grouppath == NULL)||(architecture == NULL)||(groupname == NULL)||(ginfo == NULL))
+        return -EINVAL;
+
+    /* getenv() may return NULL, which must not be passed to %s. */
+    home = getenv("HOME");
+    fullpath = bformat("%s/%s/%s.txt", grouppath,architecture, groupname);
+    homepath = bformat("%s/.likwid/groups/%s/%s.txt", (home != NULL ? home : ""),architecture, groupname);
+    if ((fullpath == NULL) || (homepath == NULL))
+    {
+        bdestroy(fullpath);
+        bdestroy(homepath);
+        return -ENOMEM;
+    }
+
+    if (access(bdata(fullpath), R_OK))
+    {
+        DEBUG_PRINT(DEBUGLEV_INFO, Cannot read group file %s. Trying %s, bdata(fullpath), bdata(homepath));
+        if (access(bdata(homepath), R_OK))
+        {
+            ERROR_PRINT(Cannot read group file %s.txt. Searched in %s and %s, groupname, bdata(fullpath), bdata(homepath));
+            bdestroy(fullpath);
+            bdestroy(homepath);
+            return -EACCES;
+        }
+        /* Release the first path before replacing it with the home path. */
+        bdestroy(fullpath);
+        fullpath = bstrcpy(homepath);
+        if (fullpath == NULL)
+        {
+            bdestroy(homepath);
+            return -ENOMEM;
+        }
+    }
+
+    DEBUG_PRINT(DEBUGLEV_INFO, Reading group %s from %s, groupname, bdata(fullpath));
+
+    ginfo->groupname = NULL;
+    ginfo->shortinfo = NULL;
+    ginfo->nevents = 0;
+    ginfo->events = NULL;
+    ginfo->counters = NULL;
+    ginfo->nmetrics = 0;
+    ginfo->metricformulas = NULL;
+    ginfo->metricnames = NULL;
+    ginfo->longinfo = NULL;
+    ginfo->groupname = (char*)malloc((strlen(groupname)+1)*sizeof(char));
+    if (ginfo->groupname == NULL)
+    {
+        err = -ENOMEM;
+        goto cleanup;
+    }
+    strcpy(ginfo->groupname, groupname);
+
+    fp = fopen(bdata(fullpath), "r");
+    if (fp == NULL)
+    {
+        err = -EACCES;
+        goto cleanup;
+    }
+    while (fgets (buf, sizeof(buf), fp)) {
+        if ((strlen(buf) == 0) || (buf[0] == '#'))
+            continue;
+
+        if (strncmp(groupFileSectionNames[GROUP_SHORT], buf, strlen(groupFileSectionNames[GROUP_SHORT])) == 0)
+        {
+            char* sinfo = NULL;
+            sec = GROUP_SHORT;
+            /* Skip the blanks between the section name and the text. */
+            i = strlen(groupFileSectionNames[GROUP_SHORT]);
+            while (buf[i] == ' ')
+                i++;
+            /* Drop the trailing newline, if there is one. */
+            s = strlen(&(buf[i]));
+            if ((s > 0) && (buf[i + s - 1] == '\n'))
+                s--;
+            sinfo = malloc((s + 1) * sizeof(char));
+            if (sinfo == NULL)
+            {
+                err = -ENOMEM;
+                goto cleanup;
+            }
+            memcpy(sinfo, &(buf[i]), s);
+            sinfo[s] = '\0';
+            /* A repeated SHORT line replaces the previous text. */
+            free(ginfo->shortinfo);
+            ginfo->shortinfo = sinfo;
+            continue;
+        }
+        else if (strncmp(groupFileSectionNames[GROUP_EVENTSET], buf, strlen(groupFileSectionNames[GROUP_EVENTSET])) == 0)
+        {
+            sec = GROUP_EVENTSET;
+            continue;
+        }
+        else if (strncmp(groupFileSectionNames[GROUP_METRICS], buf, strlen(groupFileSectionNames[GROUP_METRICS])) == 0)
+        {
+            sec = GROUP_METRICS;
+            continue;
+        }
+        else if (strncmp(groupFileSectionNames[GROUP_LONG], buf, strlen(groupFileSectionNames[GROUP_LONG])) == 0)
+        {
+            sec = GROUP_LONG;
+            continue;
+        }
+        if (sec == GROUP_NONE)
+            continue;
+        if (sec == GROUP_EVENTSET)
+        {
+            int evidx = -1;
+            bbuf = bfromcstr(buf);
+            if (bbuf == NULL)
+            {
+                err = -ENOMEM;
+                goto cleanup;
+            }
+            btrimws(bbuf);
+            if (blength(bbuf) == 0)
+            {
+                /* An empty line terminates the section. */
+                bdestroy(bbuf);
+                bbuf = NULL;
+                sec = GROUP_NONE;
+                continue;
+            }
+            linelist = bsplit(bbuf, ' ');
+            bdestroy(bbuf);
+            bbuf = NULL;
+            if (linelist == NULL)
+            {
+                err = -ENOMEM;
+                goto cleanup;
+            }
+            for (i=0; i<linelist->qty; i++)
+                btrimws(linelist->entry[i]);
+            /* Runs of blanks yield empty entries: the event name is the
+             * first non-empty entry after the counter name. */
+            for (i=1; i<linelist->qty; i++)
+            {
+                if (blength(linelist->entry[i]) > 0)
+                {
+                    evidx = i;
+                    break;
+                }
+            }
+            if (evidx < 0)
+            {
+                ERROR_PRINT(Malformed event line in group file %s, bdata(fullpath));
+                err = -EINVAL;
+                goto cleanup;
+            }
+            first = read_group_dup(linelist->entry[0]);
+            second = read_group_dup(linelist->entry[evidx]);
+            if ((first == NULL) || (second == NULL) ||
+                (read_group_grow(&(ginfo->events), ginfo->nevents) != 0) ||
+                (read_group_grow(&(ginfo->counters), ginfo->nevents) != 0))
+            {
+                err = -ENOMEM;
+                goto cleanup;
+            }
+            ginfo->counters[ginfo->nevents] = first;
+            ginfo->events[ginfo->nevents] = second;
+            first = NULL;
+            second = NULL;
+            ginfo->nevents++;
+            bstrListDestroy(linelist);
+            linelist = NULL;
+            continue;
+        }
+        else if (sec == GROUP_METRICS)
+        {
+            int last = -1;
+            bbuf = bfromcstr(buf);
+            if (bbuf == NULL)
+            {
+                err = -ENOMEM;
+                goto cleanup;
+            }
+            btrimws(bbuf);
+            if (blength(bbuf) == 0)
+            {
+                /* An empty line terminates the section. */
+                bdestroy(bbuf);
+                bbuf = NULL;
+                sec = GROUP_NONE;
+                continue;
+            }
+            linelist = bsplit(bbuf, ' ');
+            bdestroy(bbuf);
+            bbuf = NULL;
+            if (linelist == NULL)
+            {
+                err = -ENOMEM;
+                goto cleanup;
+            }
+            for (i=0; i<linelist->qty; i++)
+                btrimws(linelist->entry[i]);
+            /* The formula is the last non-empty word of the line. */
+            for (i=1; i<linelist->qty; i++)
+            {
+                if (blength(linelist->entry[i]) > 0)
+                    last = i;
+            }
+            if (last < 0)
+            {
+                ERROR_PRINT(Malformed metric line in group file %s, bdata(fullpath));
+                err = -EINVAL;
+                goto cleanup;
+            }
+            /* The name is every word before the formula, joined by single
+             * blanks. */
+            bbuf = bstrcpy(linelist->entry[0]);
+            if (bbuf == NULL)
+            {
+                err = -ENOMEM;
+                goto cleanup;
+            }
+            for (i=1; i<last; i++)
+            {
+                if (blength(linelist->entry[i]) > 0)
+                {
+                    bstring tmp = bformat(" %s", bdata(linelist->entry[i]));
+                    int cerr = bconcat(bbuf, tmp);
+                    bdestroy(tmp);
+                    if (cerr == BSTR_ERR)
+                    {
+                        err = -ENOMEM;
+                        goto cleanup;
+                    }
+                }
+            }
+            first = read_group_dup(bbuf);
+            second = read_group_dup(linelist->entry[last]);
+            bdestroy(bbuf);
+            bbuf = NULL;
+            if ((first == NULL) || (second == NULL) ||
+                (read_group_grow(&(ginfo->metricnames), ginfo->nmetrics) != 0) ||
+                (read_group_grow(&(ginfo->metricformulas), ginfo->nmetrics) != 0))
+            {
+                err = -ENOMEM;
+                goto cleanup;
+            }
+            ginfo->metricnames[ginfo->nmetrics] = first;
+            ginfo->metricformulas[ginfo->nmetrics] = second;
+            first = NULL;
+            second = NULL;
+            ginfo->nmetrics++;
+            bstrListDestroy(linelist);
+            linelist = NULL;
+            continue;
+        }
+        else if (sec == GROUP_LONG)
+        {
+            char *tmp;
+            s = (ginfo->longinfo == NULL ? 0 : strlen(ginfo->longinfo));
+            tmp = realloc(ginfo->longinfo, (s + strlen(buf) + 1) * sizeof(char));
+            if (tmp == NULL)
+            {
+                /* The old text is still owned by ginfo; cleanup frees it. */
+                err = -ENOMEM;
+                goto cleanup;
+            }
+            ginfo->longinfo = tmp;
+            strcpy(&(ginfo->longinfo[s]), buf);
+            continue;
+        }
+    }
+    fclose(fp);
+    bdestroy(homepath);
+    bdestroy(fullpath);
+    return 0;
+cleanup:
+    if (fp != NULL)
+        fclose(fp);
+    free(first);
+    free(second);
+    if (bbuf != NULL)
+        bdestroy(bbuf);
+    if (linelist != NULL)
+        bstrListDestroy(linelist);
+    bdestroy(homepath);
+    bdestroy(fullpath);
+    read_group_release(ginfo);
+    return err;
+}
+
+/*
+ * Put ginfo into the empty state: all string and array pointers are NULL
+ * and both element counts are zero.
+ * Returns 0 on success or -EINVAL if ginfo is NULL.
+ */
+int new_group(GroupInfo* ginfo)
+{
+    if (ginfo == NULL)
+    {
+        return -EINVAL;
+    }
+
+    /* descriptive strings */
+    ginfo->groupname = NULL;
+    ginfo->shortinfo = NULL;
+    ginfo->longinfo = NULL;
+
+    /* event/counter pairs */
+    ginfo->nevents = 0;
+    ginfo->events = NULL;
+    ginfo->counters = NULL;
+
+    /* derived metrics */
+    ginfo->nmetrics = 0;
+    ginfo->metricnames = NULL;
+    ginfo->metricformulas = NULL;
+
+    return 0;
+}
+
+/*
+ * Build the string "<event>:<counter>,<event>:<counter>,..." from all
+ * event/counter pairs stored in ginfo.
+ * Returns a newly malloc'ed string, or NULL if ginfo is NULL, holds no
+ * events, or the allocation fails.
+ */
+char* get_eventStr(GroupInfo* ginfo)
+{
+    int idx;
+    int lastIdx;
+    int needed = 0;
+    int written = 0;
+    char* out = NULL;
+
+    if ((ginfo == NULL) || (ginfo->nevents == 0))
+    {
+        return NULL;
+    }
+    lastIdx = ginfo->nevents - 1;
+
+    /* Every pair but the last needs room for ':' and ','. */
+    for (idx = 0; idx < lastIdx; idx++)
+    {
+        needed += strlen(ginfo->events[idx]) + strlen(ginfo->counters[idx]) + 2;
+    }
+    /* Last pair: ':' plus two bytes of slack (one of them for the NUL). */
+    needed += strlen(ginfo->events[lastIdx]) + strlen(ginfo->counters[lastIdx]) + 3;
+
+    out = malloc(needed * sizeof(char));
+    if (out == NULL)
+    {
+        return NULL;
+    }
+
+    for (idx = 0; idx < lastIdx; idx++)
+    {
+        written += sprintf(&(out[written]), "%s:%s,", ginfo->events[idx], ginfo->counters[idx]);
+    }
+    written += sprintf(&(out[written]), "%s:%s", ginfo->events[lastIdx], ginfo->counters[lastIdx]);
+    out[written] = '\0';
+    return out;
+}
+
+/*
+ * Free a string passed in eventset. Passing NULL is allowed and does
+ * nothing. The caller's own pointer variable is not modified.
+ */
+void put_eventStr(char* eventset)
+{
+    /* free() ignores NULL, so no explicit check is required. */
+    free(eventset);
+}
+
+/*
+ * Append a copy of the event/counter pair to ginfo.
+ *
+ * Returns 0 on success, -EINVAL if any argument is NULL and -ENOMEM if an
+ * allocation fails. On failure ginfo still holds its previous, valid
+ * contents (nevents is unchanged and no pointer to existing data is lost).
+ */
+int add_event(GroupInfo* ginfo, char* event, char* counter)
+{
+    char** tmp = NULL;
+    char* evcopy = NULL;
+    char* ctrcopy = NULL;
+    if ((!ginfo) || (!event) || (!counter))
+        return -EINVAL;
+
+    /* Allocate the copies first so a failure cannot leave a half-filled slot. */
+    evcopy = malloc((strlen(event) + 1) * sizeof(char));
+    ctrcopy = malloc((strlen(counter) + 1) * sizeof(char));
+    if ((!evcopy) || (!ctrcopy))
+    {
+        free(evcopy);
+        free(ctrcopy);
+        return -ENOMEM;
+    }
+
+    /* Grow through a temporary: realloc() keeps the old block on failure. */
+    tmp = realloc(ginfo->events, (ginfo->nevents + 1) * sizeof(char*));
+    if (!tmp)
+    {
+        free(evcopy);
+        free(ctrcopy);
+        return -ENOMEM;
+    }
+    ginfo->events = tmp;
+    tmp = realloc(ginfo->counters, (ginfo->nevents + 1) * sizeof(char*));
+    if (!tmp)
+    {
+        free(evcopy);
+        free(ctrcopy);
+        return -ENOMEM;
+    }
+    ginfo->counters = tmp;
+
+    strcpy(evcopy, event);
+    strcpy(ctrcopy, counter);
+    ginfo->events[ginfo->nevents] = evcopy;
+    ginfo->counters[ginfo->nevents] = ctrcopy;
+    ginfo->nevents++;
+    return 0;
+}
+
+/*
+ * Append a copy of the metric name mname and its formula mcalc to ginfo.
+ *
+ * Returns 0 on success, -EINVAL if any argument is NULL and -ENOMEM if an
+ * allocation fails. On failure ginfo still holds its previous, valid
+ * contents (nmetrics is unchanged and no pointer to existing data is lost).
+ */
+int add_metric(GroupInfo* ginfo, char* mname, char* mcalc)
+{
+    char** tmp = NULL;
+    char* namecopy = NULL;
+    char* calccopy = NULL;
+    if ((!ginfo) || (!mname) || (!mcalc))
+        return -EINVAL;
+
+    /* Allocate the copies first so a failure cannot leave a half-filled slot. */
+    namecopy = malloc((strlen(mname) + 1) * sizeof(char));
+    calccopy = malloc((strlen(mcalc) + 1) * sizeof(char));
+    if ((!namecopy) || (!calccopy))
+    {
+        free(namecopy);
+        free(calccopy);
+        return -ENOMEM;
+    }
+
+    /* Grow through a temporary: realloc() keeps the old block on failure. */
+    tmp = realloc(ginfo->metricnames, (ginfo->nmetrics + 1) * sizeof(char*));
+    if (!tmp)
+    {
+        free(namecopy);
+        free(calccopy);
+        return -ENOMEM;
+    }
+    ginfo->metricnames = tmp;
+    tmp = realloc(ginfo->metricformulas, (ginfo->nmetrics + 1) * sizeof(char*));
+    if (!tmp)
+    {
+        free(namecopy);
+        free(calccopy);
+        return -ENOMEM;
+    }
+    ginfo->metricformulas = tmp;
+
+    strcpy(namecopy, mname);
+    strcpy(calccopy, mcalc);
+    ginfo->metricnames[ginfo->nmetrics] = namecopy;
+    ginfo->metricformulas[ginfo->nmetrics] = calccopy;
+    ginfo->nmetrics++;
+    return 0;
+}
+
+
+/*
+ * Return a newly malloc'ed copy of ginfo->groupname.
+ * Returns NULL if ginfo or its group name is NULL, or if the allocation
+ * fails. The caller frees the returned string.
+ */
+char* get_groupName(GroupInfo* ginfo)
+{
+    if ((ginfo != NULL) && (ginfo->groupname != NULL))
+    {
+        int size = strlen(ginfo->groupname)+1;
+        char* gstr = malloc(size * sizeof(char));
+        /* Do not write through a failed allocation. */
+        if (gstr == NULL)
+            return NULL;
+        memcpy(gstr, ginfo->groupname, size);
+        return gstr;
+    }
+    return NULL;
+}
+
+/*
+ * Replace ginfo->groupname with a copy of groupName.
+ * Returns 0 on success, -EINVAL if an argument is NULL and -ENOMEM if the
+ * allocation fails; in the failure cases the previous name is kept.
+ */
+int set_groupName(GroupInfo* ginfo, char* groupName)
+{
+    char* tmp = NULL;
+    if ((ginfo == NULL) || (groupName == NULL))
+        return -EINVAL;
+    int size = strlen(groupName)+1;
+    /* Resize through a temporary so the old string is not lost on failure. */
+    tmp = realloc(ginfo->groupname, size * sizeof(char));
+    if (tmp == NULL)
+        return -ENOMEM;
+    ginfo->groupname = tmp;
+    sprintf(ginfo->groupname, "%s", groupName);
+    return 0;
+}
+
+/*
+ * Return a newly malloc'ed copy of ginfo->shortinfo.
+ * Returns NULL if ginfo or its short info is NULL, or if the allocation
+ * fails. The copy can be released with put_shortInfo().
+ */
+char* get_shortInfo(GroupInfo* ginfo)
+{
+    if ((ginfo != NULL) && (ginfo->shortinfo != NULL))
+    {
+        int size = strlen(ginfo->shortinfo)+1;
+        char* sstr = malloc(size * sizeof(char));
+        /* Do not write through a failed allocation. */
+        if (sstr == NULL)
+            return NULL;
+        memcpy(sstr, ginfo->shortinfo, size);
+        return sstr;
+    }
+    return NULL;
+}
+
+/*
+ * Free a string passed in sinfo. Passing NULL is allowed and does
+ * nothing. The caller's own pointer variable is not modified.
+ */
+void put_shortInfo(char* sinfo)
+{
+    /* free() ignores NULL, so no explicit check is required. */
+    free(sinfo);
+}
+
+/*
+ * Replace ginfo->shortinfo with a copy of shortInfo.
+ * Returns 0 on success, -EINVAL if an argument is NULL and -ENOMEM if the
+ * allocation fails; in the failure cases the previous text is kept.
+ */
+int set_shortInfo(GroupInfo* ginfo, char* shortInfo)
+{
+    char* tmp = NULL;
+    if ((ginfo == NULL) || (shortInfo == NULL))
+        return -EINVAL;
+    int size = strlen(shortInfo)+1;
+    /* Resize through a temporary so the old string is not lost on failure. */
+    tmp = realloc(ginfo->shortinfo, size * sizeof(char));
+    if (tmp == NULL)
+        return -ENOMEM;
+    ginfo->shortinfo = tmp;
+    sprintf(ginfo->shortinfo, "%s", shortInfo);
+    return 0;
+}
+
+/*
+ * Return a newly malloc'ed copy of ginfo->longinfo.
+ * Returns NULL if ginfo or its long info is NULL, or if the allocation
+ * fails. The copy can be released with put_longInfo().
+ */
+char* get_longInfo(GroupInfo* ginfo)
+{
+    if ((ginfo != NULL) && (ginfo->longinfo != NULL))
+    {
+        int size = strlen(ginfo->longinfo)+1;
+        char* lstr = malloc(size * sizeof(char));
+        /* Do not write through a failed allocation. */
+        if (lstr == NULL)
+            return NULL;
+        memcpy(lstr, ginfo->longinfo, size);
+        return lstr;
+    }
+    return NULL;
+}
+
+/*
+ * Free a string passed in linfo. Passing NULL is allowed and does
+ * nothing. The caller's own pointer variable is not modified.
+ */
+void put_longInfo(char* linfo)
+{
+    /* free() ignores NULL, so no explicit check is required. */
+    free(linfo);
+}
+
+/*
+ * Replace ginfo->longinfo with a copy of longInfo.
+ * Returns 0 on success, -EINVAL if an argument is NULL and -ENOMEM if the
+ * allocation fails; in the failure cases the previous text is kept.
+ */
+int set_longInfo(GroupInfo* ginfo, char* longInfo)
+{
+    char* tmp = NULL;
+    if ((ginfo == NULL) || (longInfo == NULL))
+        return -EINVAL;
+    int size = strlen(longInfo)+1;
+    /* Resize through a temporary so the old string is not lost on failure. */
+    tmp = realloc(ginfo->longinfo, size * sizeof(char));
+    if (tmp == NULL)
+        return -ENOMEM;
+    ginfo->longinfo = tmp;
+    sprintf(ginfo->longinfo, "%s", longInfo);
+    return 0;
+}
+
+/*
+ * Free everything owned by ginfo (name, short/long info, the event, counter,
+ * metric name and metric formula strings and the arrays holding them) and
+ * reset all fields to NULL/0. The GroupInfo structure itself is not freed.
+ * A NULL ginfo is ignored.
+ */
+void return_group(GroupInfo* ginfo)
+{
+    int i;
+    if (ginfo == NULL)
+        return;
+    /* free() accepts NULL, so unset strings need no special handling. */
+    free(ginfo->groupname);
+    free(ginfo->shortinfo);
+    free(ginfo->longinfo);
+    /* The arrays are released even when the count is 0, so an array that
+     * was allocated without an entry ever being stored is not leaked. */
+    if (ginfo->counters != NULL)
+    {
+        for(i=0;i<ginfo->nevents; i++)
+            free(ginfo->counters[i]);
+        free(ginfo->counters);
+    }
+    if (ginfo->events != NULL)
+    {
+        for(i=0;i<ginfo->nevents; i++)
+            free(ginfo->events[i]);
+        free(ginfo->events);
+    }
+    if (ginfo->metricformulas != NULL)
+    {
+        for(i=0;i<ginfo->nmetrics; i++)
+            free(ginfo->metricformulas[i]);
+        free(ginfo->metricformulas);
+    }
+    if (ginfo->metricnames != NULL)
+    {
+        for(i=0;i<ginfo->nmetrics; i++)
+            free(ginfo->metricnames[i]);
+        free(ginfo->metricnames);
+    }
+    ginfo->groupname = NULL;
+    ginfo->shortinfo = NULL;
+    ginfo->longinfo = NULL;
+    ginfo->counters = NULL;
+    ginfo->events = NULL;
+    ginfo->metricformulas = NULL;
+    ginfo->metricnames = NULL;
+    ginfo->nevents = 0;
+    ginfo->nmetrics = 0;
+}
+
+/*
+ * Put clist into the empty state: no counters, no name array and no value
+ * array. A NULL clist is ignored, matching the NULL handling of
+ * add_to_clist() and destroy_clist().
+ */
+void init_clist(CounterList* clist)
+{
+    if (clist == NULL)
+        return;
+    clist->counters = 0;
+    clist->cnames = NULL;
+    clist->cvalues = NULL;
+}
+
+/*
+ * Append the pair (counter name, result value) to clist. The name is copied.
+ * Returns 0 on success, -EINVAL if clist or counter is NULL and -ENOMEM if
+ * an allocation fails; the entry count only changes on success.
+ */
+int add_to_clist(CounterList* clist, char* counter, double result)
+{
+    char** grownNames = NULL;
+    double* grownValues = NULL;
+    char* nameCopy = NULL;
+    int slot;
+
+    if ((clist == NULL)||(counter == NULL))
+    {
+        return -EINVAL;
+    }
+    slot = clist->counters;
+
+    /* Make room for one more name ... */
+    grownNames = realloc(clist->cnames, (slot + 1) * sizeof(char*));
+    if (grownNames == NULL)
+    {
+        return -ENOMEM;
+    }
+    clist->cnames = grownNames;
+
+    /* ... and one more value. */
+    grownValues = realloc(clist->cvalues, (slot + 1) * sizeof(double));
+    if (grownValues == NULL)
+    {
+        return -ENOMEM;
+    }
+    clist->cvalues = grownValues;
+
+    nameCopy = malloc((strlen(counter)+2)*sizeof(char));
+    clist->cnames[slot] = nameCopy;
+    if (nameCopy == NULL)
+    {
+        return -ENOMEM;
+    }
+    sprintf(nameCopy,"%s", counter);
+    clist->cvalues[slot] = result;
+    clist->counters = slot + 1;
+    return 0;
+}
+
+/*
+ * Free all counter names, the name array and the value array of clist and
+ * reset the list to the empty state, so that a repeated call or a later
+ * add_to_clist() does not touch freed memory. The CounterList structure
+ * itself is not freed. A NULL clist is ignored.
+ */
+void destroy_clist(CounterList* clist)
+{
+    int i;
+    if (clist != NULL)
+    {
+        if (clist->cnames != NULL)
+        {
+            for (i=0;i<clist->counters;i++)
+            {
+                free(clist->cnames[i]);
+            }
+        }
+        free(clist->cnames);
+        free(clist->cvalues);
+        clist->counters = 0;
+        clist->cnames = NULL;
+        clist->cvalues = NULL;
+    }
+}
+
+
+/*
+ * Evaluate formula after replacing every counter name found in clist by its
+ * value (printed with "%.20f") and store the outcome in *result.
+ *
+ * Longer counter names are substituted before shorter ones, so a name that
+ * is a prefix of another name does not clobber the longer one. If any of
+ * the characters in the check string below is still present after the
+ * substitution, the formula is reported on stderr and -EINVAL is returned.
+ *
+ * Returns -EINVAL if formula, clist or result is NULL, -ENOMEM if the
+ * working copy of the formula cannot be allocated, otherwise the return
+ * value of calculate_infix(). *result is set to 0.0 before any evaluation.
+ */
+int calc_metric(char* formula, CounterList* clist, double *result)
+{
+    int i=0;
+    int fail = 0;
+    int maxstrlen = 0, minstrlen = 10000;
+
+    /* result must be checked before it is written to. */
+    if (result == NULL)
+        return -EINVAL;
+    *result = 0.0;
+
+    if ((formula == NULL) || (clist == NULL))
+        return -EINVAL;
+
+    bstring f = bfromcstr(formula);
+    if (f == NULL)
+        return -ENOMEM;
+    /* Determine the range of counter name lengths. */
+    for(i=0;i<clist->counters;i++)
+    {
+        int len = strlen(clist->cnames[i]);
+        if (len > maxstrlen)
+            maxstrlen = len;
+        if (len < minstrlen)
+            minstrlen = len;
+    }
+
+    // try to replace each counter name in clist, longest names first
+    while (maxstrlen >= minstrlen)
+    {
+        for(i=0;i<clist->counters;i++)
+        {
+            if (strlen(clist->cnames[i]) != maxstrlen)
+                continue;
+            // if we find the counter name, replace it with the value
+            bstring c = bfromcstr(clist->cnames[i]);
+            bstring v = bformat("%.20f", clist->cvalues[i]);
+            bfindreplace(f, c, v, 0);
+            bdestroy(c);
+            bdestroy(v);
+        }
+        maxstrlen--;
+    }
+    // any of these characters left over means a name was not substituted
+    bstring test = bfromcstr("aAbBcCdDfFgGhHiIjJkKlLmMnNoOpPqQrRsStTuUvVwWxXyYzZ,_:;!'§$&=?°´`#<>");
+    if (binchr(f, 0, test) != BSTR_ERR)
+    {
+        fprintf(stderr, "Not all counter names in formula can be substituted\n");
+        fprintf(stderr, "%s\n", bdata(f));
+        i = -EINVAL;
+        fail = 1;
+    }
+    bdestroy(test);
+    // now we can calculate the formula
+    if (!fail)
+        i = calculate_infix(bdata(f), result);
+    bdestroy(f);
+    return i;
+}
diff --git a/src/perfmon.c b/src/perfmon.c
index 30cacba..ee4f80f 100644
--- a/src/perfmon.c
+++ b/src/perfmon.c
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon.c
  *
- *      Description:  Implementation of perfmon Module.
+ *      Description:  Main implementation of the performance monitoring module
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -28,8 +29,6 @@
  * =======================================================================================
  */
 
-/* #####   HEADER FILE INCLUDES   ######################################### */
-
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
@@ -37,100 +36,18 @@
 #include <float.h>
 #include <unistd.h>
 #include <sys/types.h>
-#include <assert.h>
+
 
 #include <types.h>
+#include <likwid.h>
 #include <bitUtil.h>
-#include <bstrlib.h>
-#include <strUtil.h>
-#include <bitUtil.h>
-#include <error.h>
 #include <timer.h>
-#include <accessClient.h>
-#include <msr.h>
-#include <pci.h>
 #include <lock.h>
-#include <cpuid.h>
-#include <affinity.h>
-#include <tree.h>
-#include <power.h>
-#include <thermal.h>
 #include <perfmon.h>
-#include <asciiTable.h>
 #include <registers.h>
-
-
-/* #####   EXPORTED VARIABLES   ########################################### */
-
-int perfmon_verbose = 0;
-int perfmon_csvoutput = 0;
-
-/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
-
-static PerfmonGroup groupSet = _NOGROUP;
-static PerfmonEvent* eventHash;
-static PerfmonCounterMap* counter_map;
-static PerfmonGroupMap* group_map;
-static PerfmonGroupHelp* group_help;
-static EventSetup * eventSetup;
-
-static TimerData timeData;
-static double rdtscTime;
-static PerfmonEventSet perfmon_set;
-static int perfmon_numGroups;
-static int perfmon_numCounters;
-static int perfmon_numArchEvents;
-static int perfmon_numThreads;
-static int perfmon_numRegions;
-static FILE* OUTSTREAM;
-static double** perfmon_threadState;
-static PerfmonThread* perfmon_threadData;
-
-static int socket_fd = -1;
-static int socket_lock[MAX_NUM_NODES];
-
-/* #####   PROTOTYPES  -  LOCAL TO THIS SOURCE FILE   ##################### */
-
-static void initResultTable(PerfmonResultTable* tableData,
-        bstrList* firstColumn,
-        int numRows,
-        int numColumns);
-
-static void initStatisticTable(PerfmonResultTable* tableData,
-        bstrList* firstColumn,
-        int numRows);
-
-static void printResultTable(PerfmonResultTable* tableData);
-static void freeResultTable(PerfmonResultTable* tableData);
-static void initThread(int , int );
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-#define CHECKERROR \
-        if (ret == EOF) \
-        { \
-            fprintf (stderr, "sscanf: Failed to read marker file!\n" ); \
-            exit (EXIT_FAILURE);}
-
-#define bstrListAdd(bl,id,name) \
-    label = bfromcstr(#name);  \
-    (bl)->entry[id] = bstrcpy(label);  \
-    (bl)->qty++; \
-    bdestroy(label);
-
-#define INIT_EVENTS   \
-    fc = bstrListCreate(); \
-    bstrListAlloc(fc, numRows+1); \
-    bstrListAdd(fc,0,Event); \
-    for (i=0; i<numRows; i++) \
-    { \
-        fc->entry[1+i] = \
-           bfromcstr(perfmon_set.events[i].event.name); }
-
-#define INIT_BASIC  \
-    fc = bstrListCreate(); \
-    bstrListAlloc(fc, numRows+1); \
-    bstrListAdd(fc,0,Metric);
+#include <topology.h>
+#include <access.h>
+#include <perfgroup.h>
 
 #include <perfmon_pm.h>
 #include <perfmon_atom.h>
@@ -148,318 +65,176 @@ static void initThread(int , int );
 #include <perfmon_interlagos.h>
 #include <perfmon_kabini.h>
 #include <perfmon_silvermont.h>
+#include <perfmon_broadwell.h>
+#include <perfmon_skylake.h>
+
+
+PerfmonEvent* eventHash = NULL;
+RegisterMap* counter_map = NULL;
+BoxMap* box_map = NULL;
+PciDevice* pci_devices = NULL;
+int perfmon_numCounters = 0;
+int perfmon_numCoreCounters = 0;
+int perfmon_numArchEvents = 0;
+int perfmon_initialized = 0;
+int perfmon_verbosity = DEBUGLEV_ONLY_ERROR;
+uint64_t currentConfig[MAX_NUM_THREADS][NUM_PMC] = { 0 };
+
+PerfmonGroupSet* groupSet = NULL;
+LikwidResults* markerResults = NULL;
+int markerRegions = 0;
+
+int (*perfmon_startCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+int (*perfmon_stopCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+int (*perfmon_readCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+int (*perfmon_setupCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+int (*perfmon_finalizeCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+
+int (*initThreadArch) (int cpu_id);
+
+void perfmon_delEventSet(int groupID);
+
+char* eventOptionTypeName[NUM_EVENT_OPTIONS] = {
+    "NONE",
+    "OPCODE",
+    "MATCH0",
+    "MATCH1",
+    "MATCH2",
+    "MATCH3",
+    "MASK0",
+    "MASK1",
+    "MASK2",
+    "MASK3",
+    "NID",
+    "TID",
+    "STATE",
+    "EDGEDETECT",
+    "THRESHOLD",
+    "INVERT",
+    "KERNEL",
+    "ANYTHREAD",
+    "OCCUPANCY",
+    "OCCUPANCY_FILTER",
+    "OCCUPANCY_EDGEDETECT",
+    "OCCUPANCY_INVERT",
+    "IN_TRANSACTION",
+    "IN_TRANSACTION_ABORTED"
+};
 
-/* #####  EXPORTED  FUNCTION POINTERS   ################################### */
-void (*perfmon_startCountersThread) (int thread_id);
-void (*perfmon_stopCountersThread) (int thread_id);
-void (*perfmon_readCountersThread) (int thread_id);
-void (*perfmon_setupCounterThread) (int thread_id,
-        PerfmonEvent* event, PerfmonCounterIndex index);
-void (*printDerivedMetrics) (PerfmonGroup group);
-void (*logDerivedMetrics) (PerfmonGroup group, double time, double timeStamp);
-void (*perfmon_getDerivedCounterValuesArch)(PerfmonGroup group, float * values, float * out_max, float * out_min);
-
-
-/* #####   FUNCTION POINTERS  -  LOCAL TO THIS SOURCE FILE ################ */
-
-static void (*initThreadArch) (PerfmonThread *thread);
-
-/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-
-static int getIndex (bstring reg, PerfmonCounterIndex* index)
+static int
+getIndexAndType (bstring reg, RegisterIndex* index, RegisterType* type, int force)
 {
-    int ret = FALSE;
     int err = 0;
-    uint64_t tmp;
+    int ret = FALSE;
+    uint64_t tmp = 0x0ULL;
+    int (*ownstrcmp)(const char*, const char*);
+    ownstrcmp = &strcmp;
+    int testcpu = groupSet->threads[0].processorId;
     for (int i=0; i< perfmon_numCounters; i++)
     {
         if (biseqcstr(reg, counter_map[i].key))
         {
             *index = counter_map[i].index;
+            *type = counter_map[i].type;
             ret = TRUE;
+            break;
         }
     }
-    if ((ret) && (counter_map[*index].type != THERMAL) && (counter_map[*index].type != POWER))
-    {
-        if (counter_map[*index].device == 0)
-        {
-            tmp = msr_read(0, counter_map[*index].configRegister);
-            msr_write(0, counter_map[*index].configRegister,0x0ULL);
-        }
-        else
-        {
-            tmp = pci_read(0, counter_map[*index].device, counter_map[*index].configRegister);
-            pci_write(0, counter_map[*index].device, counter_map[*index].configRegister, 0x0U);
-        }
-    }
-    else if ((ret) && (counter_map[*index].type == POWER))
-    {
-        tmp = msr_read(0, counter_map[*index].counterRegister);
-    }
-
-    return ret;
-}
-
-
-static int
-getEvent(bstring event_str, PerfmonEvent* event)
-{
-    for (int i=0; i< perfmon_numArchEvents; i++)
-    {
-        if (biseqcstr(event_str, eventHash[i].name))
-        {
-            *event = eventHash[i];
-
-            if (perfmon_verbose)
-            {
-                fprintf(OUTSTREAM,"Found event %s : \
-                    Event_id 0x%02X Umask 0x%02X CfgBits 0x%02X Cmask 0x%02X \n",
-                        bdata( event_str),
-                        event->eventId,
-                        event->umask,
-                        event->cfgBits,
-                        event->cmask);
-            }
-            return TRUE;
-        }
-    }
-
-    return FALSE;
-}
-
-static void
-initThread(int thread_id, int cpu_id)
-{
-    for (int i=0; i<NUM_PMC; i++)
+    if (ret == FALSE)
     {
-        perfmon_threadData[thread_id].counters[i].init = FALSE;
+        fprintf(stderr, "ERROR: Counter %s not available\n",bdata(reg));
+        *type = NOTYPE;
+        return FALSE;
     }
-
-    perfmon_threadData[thread_id].processorId = cpu_id;
-    initThreadArch(&perfmon_threadData[thread_id]);
-}
-
-struct cbsScan{
-    /* Parse state */
-    bstring src;
-    int line;
-    LikwidResults* results;
-};
-
-static int lineCb (void* parm, int ofs, int len)
-{
-    int ret;
-    struct cbsScan* st = (struct cbsScan*) parm;
-    struct bstrList* strList;
-    bstring line;
-
-    if (!len) return 1;
-    strList = bstrListCreate();
-
-    line = blk2bstr (st->src->data + ofs, len);
-
-    if (st->line < perfmon_numRegions)
+    if (ret && (ownstrcmp(bdata(reg), counter_map[*index].key) != 0))
     {
-        int id;
-        strList = bsplit(line,':');
-
-        if( strList->qty < 2 )
-        {
-            ERROR_PLAIN_PRINT(Failed to read marker file);
-        }
-        ret = sscanf (bdata(strList->entry[0]), "%d", &id); CHECKERROR;
-        st->results[id].tag = bstrcpy(line);
-	 bdelete(st->results[id].tag, 0, blength(strList->entry[0])+1);
+        *type = NOTYPE;
+        return FALSE;
     }
-    else
+    err = HPMcheck(counter_map[*index].device, 0);
+    if (!err)
     {
-        int tagId;
-        int threadId;
-
-        strList = bsplit(line,32);
-
-        if( strList->qty < (3+NUM_PMC))
-        {
-            ERROR_PLAIN_PRINT(Failed to read marker file);
-        }
-
-        ret = sscanf(bdata(strList->entry[0]), "%d", &tagId); CHECKERROR;
-        ret = sscanf(bdata(strList->entry[1]), "%d", &threadId); CHECKERROR;
-        ret = sscanf(bdata(strList->entry[2]), "%u", &st->results[tagId].count[threadId]); CHECKERROR;
-        ret = sscanf(bdata(strList->entry[3]), "%lf", &st->results[tagId].time[threadId]); CHECKERROR;
-
-        for (int i=0;i<NUM_PMC; i++)
-        {
-            ret = sscanf(bdata(strList->entry[4+i]), "%lf", &st->results[tagId].counters[threadId][i]); CHECKERROR;
-        }
+        *type = NOTYPE;
+        return FALSE;
     }
-
-    bstrListDestroy(strList);
-    st->line++;
-    bdestroy(line);
-    return 1;
-}
-
-static void
-readMarkerFile(bstring filename, LikwidResults** resultsRef)
-{
-    int numberOfThreads=0;
-    int ret;
-    int i,j,k;
-    struct cbsScan sl;
-    FILE * fp;
-    LikwidResults* results = *resultsRef;
-
-    if (NULL != (fp = fopen (bdata(filename), "r")))
+    if ((ret) && (*type != THERMAL) && (*type != POWER) && (*type != WBOX0FIX))
     {
-        bstring src = bread ((bNread) fread, fp);
-
-        /* read header info */
-        ret = sscanf (bdata(src), "%d %d", &numberOfThreads, &perfmon_numRegions); CHECKERROR;
-        results = (LikwidResults*) malloc(perfmon_numRegions * sizeof(LikwidResults));
-
-        if (perfmon_numRegions == 0)
-        {
-            fprintf(OUTSTREAM,"ERROR: No region results are listed in marker file\n");
-            ERROR_PLAIN_PRINT(No region results in marker file);
-        }
-        else if (numberOfThreads != perfmon_numThreads)
+        int check_settings = 1;
+        uint32_t reg = counter_map[*index].configRegister;
+        if (reg == 0x0)
         {
-            fprintf(OUTSTREAM,"ERROR: Is the number of threads for likwid-perfctr equal to the number in the measured application?\n");
-            fprintf(OUTSTREAM,"likwid_markerInit and likwid_markerClose must be called in serial region.\n");
-
-            ERROR_PRINT(Number of threads %d in marker file unequal to number of threads in likwid-perfCtr %d,numberOfThreads,perfmon_numThreads);
+            reg = counter_map[*index].counterRegister;
+            check_settings = 0;
         }
-
-        /* allocate  LikwidResults struct */
-        for (i=0;i<perfmon_numRegions; i++)
+        err = HPMread(testcpu, counter_map[*index].device, reg, &tmp);
+        if (err != 0)
         {
-            results[i].time = (double*) malloc(numberOfThreads * sizeof(double));
-            results[i].count = (uint32_t*) malloc(numberOfThreads * sizeof(uint32_t));
-            results[i].counters = (double**) malloc(numberOfThreads * sizeof(double*));
-
-            for (j=0;j<numberOfThreads; j++)
+            if (err == -ENODEV)
             {
-                results[i].time[j] = 0.0;
-                results[i].counters[j] = (double*) malloc(NUM_PMC * sizeof(double));
-
-                for (k=0;k<NUM_PMC; k++)
-                {
-                        results[i].counters[j][k] = 0.0;
-                }
+                DEBUG_PRINT(DEBUGLEV_DETAIL, Device %s not accessible on this machine,
+                                         pci_devices[box_map[*type].device].name);
             }
+            else
+            {
+                DEBUG_PRINT(DEBUGLEV_DETAIL, Counter %s not readable on this machine,
+                                             counter_map[*index].key);
+            }
+            *type = NOTYPE;
+            ret = FALSE;
         }
-
-        sl.src = src;
-        sl.line = 0;
-        sl.results = results;
-        bsplitcb (src, (char) '\n', bstrchr(src,10)+1, lineCb, &sl);
-
-        fclose (fp);
-        bdestroy (src);
-    }
-    else
-    {
-        fprintf(OUTSTREAM,"ERROR: The marker result file could not be found!\n");
-        fprintf(OUTSTREAM,"Did you call likwid_markerClose() at the end of your measurement?\n");
-        ERROR;
-    }
-
-    *resultsRef = results;
-    bstring exeString = bformat("rm  -f %s",bdata(filename));
-    ret = system(bdata(exeString));
-
-    if (ret == EOF)
-    {
-        ERROR;
-    }
-
-    bdestroy(exeString);
-}
-
-static void
-printResultTable(PerfmonResultTable * tableData)
-{
-    if (perfmon_csvoutput)
-    {
-        int r, c;
-        for (c = 0; c < tableData->header->qty; c++)
-        {
-            fprintf(OUTSTREAM, "%s%s", ((c == 0) ? "\n" : ","), tableData->header->entry[c]->data);
-        }
-        fprintf(OUTSTREAM, "%s", "\n");
-
-        for (r = 0; r < tableData->numRows; r++)
+        else if (tmp == 0x0ULL)
         {
-            fprintf(OUTSTREAM, "%s", tableData->rows[r].label->data);
-
-            for (c = 0; c < tableData->numColumns; c++)
+            err = HPMwrite(testcpu, counter_map[*index].device, reg, 0x0ULL);
+            if (err != 0)
             {
-                if (!isnan(tableData->rows[r].value[c]))
+                if (err == -ENODEV)
                 {
-                    fprintf(OUTSTREAM, ",%lf", tableData->rows[r].value[c]);
+                    DEBUG_PRINT(DEBUGLEV_DETAIL, Device %s not accessible on this machine,
+                                             pci_devices[box_map[*type].device].name);
                 }
                 else
                 {
-                    fprintf(OUTSTREAM, ",%s", "nan");
+                    DEBUG_PRINT(DEBUGLEV_DETAIL, Counter %s not writeable on this machine,
+                                             counter_map[*index].key);
                 }
+                *type = NOTYPE;
+                ret = FALSE;
             }
-            fprintf(OUTSTREAM, "%s", "\n");
+            check_settings = 0;
         }
-        fprintf(OUTSTREAM, "%s", "\n");
-    }
-    else
-    {
-        int i,j;
-        TableContainer* table;
-        bstrList* labelStrings = NULL;
-        bstring label = bfromcstr("NO");
-
-        table = asciiTable_allocate(tableData->numRows,
-                tableData->numColumns+1,
-                tableData->header);
-        asciiTable_setOutput(OUTSTREAM);
-
-        labelStrings = bstrListCreate();
-        bstrListAlloc(labelStrings, tableData->numColumns+1);
-
-        for (i=0; i<tableData->numRows; i++)
+        if ((check_settings) && (tmp != 0x0ULL))
         {
-            labelStrings->qty = 0;
-            labelStrings->entry[0] = bstrcpy(tableData->rows[i].label);
-            labelStrings->qty++;
-
-            for (j=0; j<(tableData->numColumns);j++)
+            if (force == 1)
+            {
+                DEBUG_PRINT(DEBUGLEV_DETAIL, Counter %s has bits set (0x%llx) but we are forced to overwrite them,
+                                             counter_map[*index].key, tmp);
+                err = HPMwrite(testcpu, counter_map[*index].device, reg, 0x0ULL);
+            }
+            else if ((force == 0) && ((*type != FIXED)&&(*type != THERMAL)&&(*type != POWER)&&(*type != WBOX0FIX)))
             {
-                label = bformat("%g", tableData->rows[i].value[j]);
-                labelStrings->entry[1+j] = bstrcpy(label);
-                labelStrings->qty++;
+                fprintf(stderr, "ERROR: The selected register %s is in use.\n", counter_map[*index].key);
+                fprintf(stderr, "Please run likwid with force option (-f, --force) to overwrite settings\n");
+                exit(EXIT_SUCCESS);
             }
-            asciiTable_appendRow(table,labelStrings);
         }
-
-        asciiTable_print(table);
-        bdestroy(label);
-        bstrListDestroy(labelStrings);
-        asciiTable_free(table);
     }
-}
-
-static int
-getGroupId(bstring groupStr,PerfmonGroup* group)
-{
-    *group = _NOGROUP;
-
-    for (int i=0; i<perfmon_numGroups; i++)
+    else if ((ret) && ((*type == POWER) || (*type == WBOX0FIX) || (*type == THERMAL)))
     {
-        if (biseqcstr(groupStr,group_map[i].key))
+        err = HPMread(testcpu, MSR_DEV, counter_map[*index].counterRegister, &tmp);
+        if (err != 0)
         {
-            *group = group_map[i].index;
-            return i;
+            DEBUG_PRINT(DEBUGLEV_DETAIL, Counter %s not readable on this machine,
+                                         counter_map[*index].key);
+            *type = NOTYPE;
+            ret = FALSE;
         }
     }
-
-    return -1;
+    else
+    {
+        *type = NOTYPE;
+        ret = FALSE;
+    }
+    return ret;
 }
 
 static int
@@ -467,795 +242,377 @@ checkCounter(bstring counterName, const char* limit)
 {
     int i;
     struct bstrList* tokens;
-    int value = FALSE;
+    int ret = FALSE;
     bstring limitString = bfromcstr(limit);
 
-    tokens = bstrListCreate();
     tokens = bsplit(limitString,'|');
-
     for(i=0; i<tokens->qty; i++)
     {
         if(bstrncmp(counterName, tokens->entry[i], blength(tokens->entry[i])))
         {
-            value = FALSE;
+            ret = FALSE;
         }
         else
         {
-            value = TRUE;
+            ret = TRUE;
             break;
         }
     }
-
     bdestroy(limitString);
     bstrListDestroy(tokens);
-    return value;
+    return ret;
 }
 
-static void
-freeResultTable(PerfmonResultTable* tableData)
+static int
+getEvent(bstring event_str, bstring counter_str, PerfmonEvent* event)
 {
-    int i;
-
-    bstrListDestroy(tableData->header);
-
-    for (i=0; i<tableData->numRows; i++)
+    int ret = FALSE;
+    int (*ownstrncmp)(const char *, const char *, size_t);
+    ownstrncmp = &strncmp;
+    for (int i=0; i< perfmon_numArchEvents; i++)
     {
-        free(tableData->rows[i].value);
+        if (biseqcstr(event_str, eventHash[i].name))
+        {
+            if (!checkCounter(counter_str, eventHash[i].limit))
+            {
+                continue;
+            }
+            *event = eventHash[i];
+            ret = TRUE;
+            break;
+        }
     }
 
-    free(tableData->rows);
+    return ret;
 }
 
-static void
-initResultTable(PerfmonResultTable* tableData,
-        bstrList* firstColumn,
-        int numRows,
-        int numColumns)
+static int
+assignOption(PerfmonEvent* event, bstring entry, int index, EventOptionType type, int zero_value)
 {
-    int i;
-    bstrList* header;
-    bstring label;
-
-    header = bstrListCreate();
-    bstrListAlloc(header, numColumns+1);
-    header->entry[0] = bstrcpy(firstColumn->entry[0]); header->qty++;
-
-    for (i=0; i<perfmon_numThreads;i++)
+    int found_double = -1;
+    int return_index = index;
+    long long unsigned int value;
+    for (int k = 0; k < index; k++)
     {
-        label = bformat("core %d",perfmon_threadData[i].processorId);
-        header->entry[1+i] = bstrcpy(label); header->qty++;
+        if (event->options[k].type == type)
+        {
+            found_double = k;
+            break;
+        }
     }
-
-    tableData->numRows = numRows;
-    tableData->numColumns = numColumns;
-    tableData->header = header;
-    tableData->rows = (PerfmonResult*) malloc(numRows*sizeof(PerfmonResult));
-
-    for (i=0; i<numRows; i++)
+    if (found_double >= 0)
     {
-        tableData->rows[i].label = firstColumn->entry[1+i];
-        tableData->rows[i].value =
-            (double*) malloc((numColumns)*sizeof(double));
+        index = found_double;
     }
-}
-
-static void
-initStatisticTable(PerfmonResultTable* tableData,
-        bstrList* firstColumn,
-        int numRows)
-{
-    int i;
-    int numColumns = 4;
-    bstrList* header;
-    bstring label;
-
-    header = bstrListCreate();
-    bstrListAlloc(header, numColumns+1);
-    header->entry[0] = bstrcpy(firstColumn->entry[0]); header->qty++;
-
-    label = bformat("Sum");
-    header->entry[1] = bstrcpy(label); header->qty++;
-    label = bformat("Max");
-    header->entry[2] = bstrcpy(label); header->qty++;
-    label = bformat("Min");
-    header->entry[3] = bstrcpy(label); header->qty++;
-    label = bformat("Avg");
-    header->entry[4] = bstrcpy(label); header->qty++;
-
-    tableData->numRows = numRows;
-    tableData->numColumns = numColumns;
-    tableData->header = header;
-    tableData->rows = (PerfmonResult*) malloc(numRows*sizeof(PerfmonResult));
-
-    for (i=0; i<numRows; i++)
+    else
     {
-        tableData->rows[i].label = firstColumn->entry[1+i];
-        bcatcstr(tableData->rows[i].label," STAT");
-        tableData->rows[i].value =
-            (double*) malloc((numColumns)*sizeof(double));
+        return_index++;
     }
-}
-
-static void printDerivedMetricsFixed(void)
-{
-    int threadId;
-    double time = rdtscTime;
-    double inverseClock = 1.0 /(double) timer_getCpuClock();
-    PerfmonResultTable tableData;
-    int numRows;
-    int numColumns = perfmon_numThreads;
-    bstring label;
-    bstrList* fc;
-    double tmpValue;
-
-    numRows = 4;
-    INIT_BASIC;
-
-    bstrListAdd(fc,1,Runtime (RDTSC) [s]);
-    bstrListAdd(fc,2,Runtime unhalted [s]);
-    bstrListAdd(fc,3,Clock [MHz]);
-    bstrListAdd(fc,4,CPI);
-
-    initResultTable(&tableData, fc, numRows, numColumns);
-
-    for(threadId=0; threadId < perfmon_numThreads; threadId++)
-    {
-        tmpValue = time;
-        if (!isnan(tmpValue))
-        {
-            tableData.rows[0].value[threadId] = tmpValue;
-        }
-        else
-        {
-            tableData.rows[0].value[threadId] = 0.0;
-        }
-
-        tmpValue = perfmon_getResult(threadId,"FIXC1")*inverseClock;
-        if (!isnan(tmpValue))
-        {
-            tableData.rows[1].value[threadId] = tmpValue;
-        }
-        else
-        {
-            tableData.rows[1].value[threadId] = 0.0;
-        }
-
-        tmpValue = 1.E-06*(perfmon_getResult(threadId,"FIXC1")/perfmon_getResult(threadId,"FIXC2"))/inverseClock;
-        if (!isnan(tmpValue))
-        {
-            tableData.rows[2].value[threadId] = tmpValue;
-        }
-        else
-        {
-            tableData.rows[2].value[threadId] = 0.0;
-        }
-
-        tmpValue = perfmon_getResult(threadId,"FIXC1")/perfmon_getResult(threadId,"FIXC0");
-        if (!isnan(tmpValue))
-        {
-            tableData.rows[3].value[threadId] = tmpValue;
-        }
-        else
-        {
-            tableData.rows[3].value[threadId] = 0.0;
-        }
-
+    event->options[index].type = type;
+    if (zero_value)
+    {
+        event->options[index].value = 0;
     }
-    printResultTable(&tableData);
-    freeResultTable(&tableData);
-}
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-void
-perfmon_setCSVMode(int v)
-{
-    perfmon_csvoutput = v;
-}
-
-void
-perfmon_printCounters(void)
-{
-    fprintf(OUTSTREAM,"This architecture has %d counters.\n", perfmon_numCounters);
-    fprintf(OUTSTREAM,"Counters names:  ");
-
-    for (int i=0; i<perfmon_numCounters; i++)
+    else
     {
-        fprintf(OUTSTREAM,"%s\t",counter_map[i].key);
+        value = 0;
+        sscanf(bdata(entry), "%llx", &value);
+        event->options[index].value = value;
     }
-    fprintf(OUTSTREAM,".\n");
+    return return_index;
 }
 
-void
-perfmon_printEvents(void)
+static int
+parseOptions(struct bstrList* tokens, PerfmonEvent* event, RegisterIndex index)
 {
-    int i;
+    int i,j;
+    struct bstrList* subtokens;
 
-    fprintf(OUTSTREAM,"This architecture has %d events.\n", perfmon_numArchEvents);
-    fprintf(OUTSTREAM,"Event tags (tag, id, umask, counters):\n");
-
-    for (i=0; i<perfmon_numArchEvents; i++)
+    for (i = event->numberOfOptions; i < MAX_EVENT_OPTIONS; i++)
     {
-        fprintf(OUTSTREAM,"%s, 0x%X, 0x%X, %s \n",
-                eventHash[i].name,
-                eventHash[i].eventId,
-                eventHash[i].umask,
-                eventHash[i].limit);
+        event->options[i].type = EVENT_OPTION_NONE;
     }
-}
-
-
-double
-perfmon_getResult(int threadId, char* counterString)
-{
-    bstring counter = bfromcstr(counterString);
-    PerfmonCounterIndex  index;
-
-   if (getIndex(counter,&index))
-   {
-           return perfmon_threadData[threadId].counters[index].counterData;
-   }
-
-   fprintf (stderr, "perfmon_getResult: Failed to get counter Index!\n" );
-   return 0.0;
-}
-
-
-void
-perfmon_initEventSet(StrUtilEventSet* eventSetConfig, PerfmonEventSet* set)
-{
-    set->numberOfEvents = eventSetConfig->numberOfEvents;
-    set->events = (PerfmonEventSetEntry*)
-        malloc(set->numberOfEvents * sizeof(PerfmonEventSetEntry));
 
-    for (int i=0; i<set->numberOfEvents; i++)
+    if (tokens->qty-2 > MAX_EVENT_OPTIONS)
     {
-        /* get register index */
-        if (!getIndex(eventSetConfig->events[i].counterName,
-                    &set->events[i].index))
-        {
-            ERROR_PRINT(Counter register %s not supported,bdata(
-                  eventSetConfig->events[i].counterName));
-        }
-
-        /* setup event */
-        if (!getEvent(eventSetConfig->events[i].eventName,
-                    &set->events[i].event))
-        {
-            ERROR_PRINT(Event %s not found for current architecture,
-                bdata(eventSetConfig->events[i].eventName));
-        }
-
-        /* is counter allowed for event */
-        if (!checkCounter(eventSetConfig->events[i].counterName,
-                    set->events[i].event.limit))
-        {
-            ERROR_PRINT(Register not allowed  for event  %s,
-                bdata(eventSetConfig->events[i].eventName));
-        }
+        return -ERANGE;
     }
-}
 
-void
-perfmon_printMarkerResults(bstring filepath)
-{
-    int i;
-    int j;
-    int region;
-    LikwidResults* results = NULL;
-    PerfmonResultTable tableData;
-    PerfmonResultTable regionData;
-    int numRows = perfmon_set.numberOfEvents;
-    int numColumns = perfmon_numThreads;
-    bstrList* fc;
-    bstrList* regionLabels;
-    bstring label;
-    INIT_EVENTS;
-
-    readMarkerFile(filepath, &results);
-    initResultTable(&tableData, fc, numRows, numColumns);
-    regionLabels = bstrListCreate();
-    bstrListAlloc(regionLabels, 3);
-    bstrListAdd(regionLabels, 0, Region Info);
-    bstrListAdd(regionLabels, 1, RDTSC Runtime [s]);
-    bstrListAdd(regionLabels, 2, call count);
-
-    for (region=0; region<perfmon_numRegions; region++)
-    {
-        initResultTable(&tableData, fc, numRows, numColumns);
-        fprintf(OUTSTREAM,"\n=====================\n");
-        fprintf(OUTSTREAM,"Region: %s \n", bdata(results[region].tag));
-        fprintf(OUTSTREAM,"=====================\n");
-        initResultTable(&regionData, regionLabels, 2, numColumns);
-
-        for (j=0; j<numColumns; j++)
-        {
-            regionData.rows[0].value[j] = results[region].time[j];
-            regionData.rows[1].value[j] = (double) results[region].count[j];
-        }
-        printResultTable(&regionData);
 
-        for (i=0; i<numRows; i++)
+    for (i=2;i<tokens->qty;i++)
+    {
+        subtokens = bsplit(tokens->entry[i],'=');
+        btolower(subtokens->entry[0]);
+        if (subtokens->qty == 1)
         {
-            for (j=0; j<numColumns; j++)
+            if (biseqcstr(subtokens->entry[0], "edgedetect") == 1)
             {
-                tableData.rows[i].value[j] =
-                    results[region].counters[j][perfmon_set.events[i].index];
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_EDGE, 1);
             }
-        }
-
-        printResultTable(&tableData);
-
-        for (j=0; j<numColumns; j++)
-        {
-            for (i=0; i<numRows; i++)
+            else if (biseqcstr(subtokens->entry[0], "invert") == 1)
             {
-                perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData =
-                    results[region].counters[j][perfmon_set.events[i].index];
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_INVERT, 1);
             }
+            else if (biseqcstr(subtokens->entry[0], "kernel") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_COUNT_KERNEL, 1);
+            }
+            else if (biseqcstr(subtokens->entry[0], "anythread") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_ANYTHREAD, 1);
+            }
+            else if (biseqcstr(subtokens->entry[0], "occ_edgedetect") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_OCCUPANCY_EDGE, 1);
+            }
+            else if (biseqcstr(subtokens->entry[0], "occ_invert") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_OCCUPANCY_INVERT, 1);
+            }
+            else if (biseqcstr(subtokens->entry[0], "in_trans") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_IN_TRANS, 1);
+            }
+            else if (biseqcstr(subtokens->entry[0], "in_trans_aborted") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_IN_TRANS_ABORT, 1);
+            }
+            else
+            {
+                continue;
+            }
+            event->options[event->numberOfOptions].value = 0;
         }
-        rdtscTime = results[region].time[0];
-        if (groupSet != _NOGROUP)
-        {
-            printDerivedMetrics(groupSet);
-        }
-        else if ( cpuid_info.family == P6_FAMILY )
+        else if (subtokens->qty == 2)
         {
-            printDerivedMetricsFixed();
-        }
-    }
-
-    for (i=0;i<perfmon_numRegions; i++)
-    {
-        for (j=0;j<perfmon_numThreads; j++)
-        {
-            free(results[i].counters[j]);
-        }
-
-        free(results[i].counters);
-        free(results[i].time);
-    }
-
-    freeResultTable(&tableData);
-    freeResultTable(&regionData);
-    bstrListDestroy(fc);
-    bstrListDestroy(regionLabels);
-}
-
-void
-perfmon_logCounterResults(double time)
-{
-    int i;
-    int j;
-    double tmp;
-    static double timeStamp = 0.0;
-
-    timeStamp += time;
-
-    for (i=0; i<perfmon_set.numberOfEvents; i++)
-    {
-        fprintf(OUTSTREAM, "%s %e ", perfmon_set.events[i].event.name, timeStamp);
-        for (j=0; j<perfmon_numThreads; j++)
-        {
-            fprintf(OUTSTREAM, "%e ",
-                    (double) (perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData) - perfmon_threadState[j][perfmon_set.events[i].index]);
-            perfmon_threadState[j][perfmon_set.events[i].index] = perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData;
-        }
-        fprintf(OUTSTREAM,"\n");
-    }
-
-    if (groupSet != _NOGROUP)
-    {
-        logDerivedMetrics(groupSet, time, timeStamp);
-    }
-
-    fflush(OUTSTREAM);
-}
-
-void
-perfmon_printCounterResults()
-{
-    int i;
-    int j;
-    PerfmonResultTable tableData;
-    int numRows = perfmon_set.numberOfEvents;
-    int numColumns = perfmon_numThreads;
-    double stat[perfmon_set.numberOfEvents][4]; /* 0:sum, 1:max, 2:min, 3:avg */
-    bstrList* fc;
-    bstring label;
-    INIT_EVENTS;
-
-    for (i=0; i<numRows; i++)
-    {
-        stat[i][0] = 0;
-        stat[i][1] = 0;
-        stat[i][2] = DBL_MAX;
-    }
-
-    initResultTable(&tableData, fc, numRows, numColumns);
-
-    /* print raw event data */
-    for (i=0; i<numRows; i++)
-    {
-        for (j=0; j<numColumns; j++)
-        {
-            tableData.rows[i].value[j] =
-                (double) perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData;
-            stat[i][0] +=
-                (double) perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData;
-            stat[i][1] =  MAX(stat[i][1],
-                    (double) perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData);
-            stat[i][2] =  MIN(stat[i][2],
-                    (double) perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData);
-        }
-    }
-    printResultTable(&tableData);
-    freeResultTable(&tableData);
-
-
-    /* for threaded results print sum, max, min and avg */
-    if (perfmon_numThreads > 1)
-    {
-        initStatisticTable(&tableData, fc, numRows);
-
-        for (i=0; i<numRows; i++)
-        {
-            stat[i][3] =  stat[i][0]/perfmon_numThreads;
-
-            for (j=0; j<4; j++)
+            if (biseqcstr(subtokens->entry[0], "opcode") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_OPCODE, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "match0") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_MATCH0, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "match1") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_MATCH1, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "match2") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_MATCH2, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "match3") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_MATCH3, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "mask0") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_MASK0, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "mask1") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_MASK1, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "mask2") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_MASK2, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "mask3") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_MASK3, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "nid") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_NID, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "tid") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_TID, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "state") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_STATE, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "threshold") == 1)
             {
-                tableData.rows[i].value[j] = stat[i][j];
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_THRESHOLD, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "occupancy") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_OCCUPANCY, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "occ_filter") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_OCCUPANCY_FILTER, 0);
+            }
+            else
+            {
+                continue;
             }
         }
-        printResultTable(&tableData);
-        freeResultTable(&tableData);
-    }
-
-    if (groupSet != _NOGROUP)
-    {
-        /* print derived metrics */
-        printDerivedMetrics(groupSet);
+        bstrListDestroy(subtokens);
     }
-    else if ( cpuid_info.family == P6_FAMILY )
+    for(i=event->numberOfOptions-1;i>=0;i--)
     {
-        printDerivedMetricsFixed();
-    }
-}
-
-double
-perfmon_getEventResult(int thread, int index)
-{
-    return (double) perfmon_threadData[thread].counters[perfmon_set.events[index].index].counterData;
-}
-
-EventSetup perfmon_prepareEventSetup(char* eventGroupString){
-     EventSetup setup;
-     bstring eventString = bfromcstr(eventGroupString);
-
-     setup.eventSetConfig = malloc(sizeof(setup.eventSetConfig));
-     setup.perfmon_set = malloc(sizeof(setup.perfmon_set));
-
-     int groupId = getGroupId(eventString, & setup.groupSet);
-     setup.groupName = strdup(eventGroupString);
-     setup.groupIndex = groupId;
-     if (setup.groupSet == _NOGROUP)
-     {
-        /* eventString is a custom eventSet */
-        bstr_to_eventset(setup.eventSetConfig, eventString);
-     }
-     else
-     {
-        /* eventString is a group */
-        eventString = bfromcstr(group_map[groupId].config);
-        bstr_to_eventset(setup.eventSetConfig, eventString);
-     }
-
-     perfmon_initEventSet(setup.eventSetConfig, setup.perfmon_set);
-     bdestroy(eventString);
-
-     setup.eventNames = (const char**) malloc(setup.perfmon_set->numberOfEvents * sizeof(const char*));
-
-     setup.numberOfEvents = setup.perfmon_set->numberOfEvents;
-     for (int i=0; i< setup.perfmon_set->numberOfEvents; i++)
-     {
-        setup.eventNames[i] = setup.perfmon_set->events[i].event.name;
-     }
-
-     setup.numberOfDerivedCounters = group_map[groupId].derivedCounters;
-     setup.derivedNames = (const char**) malloc(setup.numberOfDerivedCounters * sizeof(const char*));
-
-     for(int i=0; i < group_map[groupId].derivedCounters; i++){
-        setup.derivedNames[i] = group_map[groupId].derivedCounterNames[i];
-     }
-
-     return setup;
-}
-
-
-void perfmon_setupCountersForEventSet(EventSetup * setup){
-    perfmon_set = *setup->perfmon_set;
-    groupSet = setup->groupSet;
-    eventSetup = setup;
-    perfmon_setupCounters();
-}
-
-void perfmon_getEventCounterValues(uint64_t * values, uint64_t * out_max, uint64_t * out_min){
-
-    for(int e = 0; e < perfmon_set.numberOfEvents; e++ ){
-        uint64_t sum = 0;
-        uint64_t min = (uint64_t) -1;
-        uint64_t max = 0;
-
-        for(int i = 0; i < perfmon_numThreads; i++){
-            uint64_t cur = perfmon_threadData[i].counters[e].counterData;
-            sum += cur;
-            max = max > cur ? max : cur;
-            min = min < cur ? min : cur;
+        if (!(OPTIONS_TYPE_MASK(event->options[i].type) & (counter_map[index].optionMask|event->optionMask)))
+        {
+            DEBUG_PRINT(DEBUGLEV_INFO,Removing Option %s not valid for register %s,
+                        eventOptionTypeName[event->options[i].type],
+                        counter_map[index].key);
+            event->options[i].type = EVENT_OPTION_NONE;
+            event->numberOfOptions--;
         }
-        values[e] = sum / perfmon_numThreads;
-        out_min[e] = min;
-        out_max[e] = max;
-    }
-}
-
-void perfmon_getDerivedCounterValues(float * values, float * out_max, float * out_min){
-    perfmon_getDerivedCounterValuesArch(eventSetup->groupSet, values, out_max, out_min);
-}
-
-int
-perfmon_setupEventSetC(char* eventCString, const char*** eventnames)
-{
-     int i;
-     bstring eventString = bfromcstr(eventCString);
-     StrUtilEventSet eventSetConfig;
-     int groupId;
-
-     groupId = getGroupId(eventString, &groupSet);
-     if (groupSet == _NOGROUP)
-     {
-        /* eventString is a custom eventSet */
-        bstr_to_eventset(&eventSetConfig, eventString);
-     }
-     else
-     {
-        /* eventString is a group */
-        eventString = bfromcstr(group_map[groupId].config);
-        bstr_to_eventset(&eventSetConfig, eventString);
-     }
-
-     perfmon_initEventSet(&eventSetConfig, &perfmon_set);
-     perfmon_setupCounters();
-     bdestroy(eventString);
-
-     (*eventnames) = (const char**) malloc(perfmon_set.numberOfEvents * sizeof(const char*));
-
-     for (i=0; i<perfmon_set.numberOfEvents; i++)
-     {
-         (*eventnames)[i] = perfmon_set.events[i].event.name;
-     }
-
-     return perfmon_set.numberOfEvents;
-}
-
-void
-perfmon_setupEventSet(bstring eventString, BitMask* counterMask)
-{
-    int groupId;
-    int eventBool = FALSE;
-    StrUtilEventSet eventSetConfig;
-    PerfmonEvent eventSet;
-    struct bstrList* subStr;
-
-    groupId = getGroupId(eventString, &groupSet);
-
-    if (groupSet == _NOGROUP)
-    {
-        subStr = bstrListCreate();
-        subStr = bsplit(eventString,':');
-        eventBool = getEvent(subStr->entry[0], &eventSet);
-        bstrListDestroy(subStr);
     }
 
-    if (groupSet == _NOGROUP && eventBool != FALSE)
+    for(i=0;i<event->numberOfOptions;i++)
     {
-        /* eventString is a custom eventSet */
-        /* append fixed counters for Intel processors */
-        if ( cpuid_info.family == P6_FAMILY )
+        if (event->options[i].type == EVENT_OPTION_EDGE)
         {
-            if (cpuid_info.perf_num_fixed_ctr > 0)
+            int threshold_set = FALSE;
+            for (j=0;j<event->numberOfOptions;j++)
             {
-                bcatcstr(eventString,",INSTR_RETIRED_ANY:FIXC0");
+                if (event->options[i].type == EVENT_OPTION_THRESHOLD)
+                {
+                    threshold_set = TRUE;
+                    break;
+                }
             }
-            if (cpuid_info.perf_num_fixed_ctr > 1)
+            if ((threshold_set == FALSE) && (event->numberOfOptions < MAX_EVENT_OPTIONS))
             {
-                bcatcstr(eventString,",CPU_CLK_UNHALTED_CORE:FIXC1");
+                event->options[event->numberOfOptions].type = EVENT_OPTION_THRESHOLD;
+                event->options[event->numberOfOptions].value = 0x1;
+                event->numberOfOptions++;
             }
-            if (cpuid_info.perf_num_fixed_ctr > 2)
+            else
             {
-                bcatcstr(eventString,",CPU_CLK_UNHALTED_REF:FIXC2");
+                ERROR_PLAIN_PRINT(Cannot set threshold option to default. no more space in options list);
             }
         }
-        bstr_to_eventset(&eventSetConfig, eventString);
-    }
-    else if (groupId < 0 && eventBool == FALSE)
-    {
-        ERROR_PLAIN_PRINT(Unsupported group or event for this architecture!);
-        exit(EXIT_FAILURE);
-    }
-    else
-    {
-        if ( group_map[groupId].isUncore )
+        else if (event->options[i].type == EVENT_OPTION_OCCUPANCY)
         {
-            if ( (cpuid_info.model != SANDYBRIDGE_EP) &&
-                    (cpuid_info.model != IVYBRIDGE_EP) &&
-                    (cpuid_info.model != WESTMERE_EX) &&
-                    (cpuid_info.model != NEHALEM_EX))
+            int threshold_set = FALSE;
+            int edge_set = FALSE;
+            int invert_set = FALSE;
+            for (j=0;j<event->numberOfOptions;j++)
+            {
+                if (event->options[i].type == EVENT_OPTION_THRESHOLD)
+                {
+                    threshold_set = TRUE;
+                    break;
+                }
+                if (event->options[i].type == EVENT_OPTION_EDGE)
+                {
+                    edge_set = TRUE;
+                    break;
+                }
+                if (event->options[i].type == EVENT_OPTION_INVERT)
+                {
+                    invert_set = TRUE;
+                    break;
+                }
+            }
+            if ((threshold_set == FALSE) && (event->numberOfOptions < MAX_EVENT_OPTIONS) &&
+                (edge_set == TRUE || invert_set == TRUE ))
             {
-                ERROR_PLAIN_PRINT(Uncore not supported on Desktop processors!);
-                exit(EXIT_FAILURE);
+                event->options[event->numberOfOptions].type = EVENT_OPTION_THRESHOLD;
+                event->options[event->numberOfOptions].value = 0x1;
+                event->numberOfOptions++;
             }
-        }
-
-        fprintf(OUTSTREAM,"Measuring group %s\n", group_map[groupId].key);
-        /* eventString is a group */
-        eventString = bfromcstr(group_map[groupId].config);
-        bstr_to_eventset(&eventSetConfig, eventString);
-    }
-
-    perfmon_initEventSet(&eventSetConfig, &perfmon_set);
-    perfmon_setupCounters();
-
-    if ( counterMask != NULL )
-    {
-        bitMask_init((*counterMask));
-        /* Extract counter mask from first thread */
-        for (int index=0; index<perfmon_numCounters; index++)
-        {
-            if ( perfmon_threadData[0].counters[index].init == TRUE )
+            else
             {
-                bitMask_set((*counterMask),index);
+                ERROR_PLAIN_PRINT(Cannot set threshold option to default. no more space in options list);
             }
         }
     }
-}
 
+    
+    return event->numberOfOptions;
+}
 
-void
-perfmon_setupCounters()
+static double
+calculateResult(int groupId, int eventId, int threadId)
 {
-    for (int j=0; j<perfmon_set.numberOfEvents; j++)
+    PerfmonEventSetEntry* event;
+    PerfmonCounter* counter;
+    int cpu_id;
+    double result = 0.0;
+    if (groupSet->groups[groupId].events[eventId].type == NOTYPE)
+        return result;
+
+    event = &(groupSet->groups[groupId].events[eventId]);
+    counter = &(event->threadCounter[threadId]);
+    if (counter->overflows == 0)
     {
-        for (int i=0; i<perfmon_numThreads; i++)
-        {
-            perfmon_setupCounterThread(i,
-                    &perfmon_set.events[j].event,
-                    perfmon_set.events[j].index);
-        }
+        result = (double) (counter->counterData - counter->startData);
     }
-}
-
-void
-perfmon_startCounters(void)
-{
-    for (int i=0;i<perfmon_numThreads;i++)
+    else if (counter->overflows > 0)
     {
-        perfmon_startCountersThread(i);
+        result += (double) ((perfmon_getMaxCounterValue(counter_map[event->index].type) - counter->startData) + counter->counterData);
+        counter->overflows--;
     }
-
-    timer_start(&timeData);
-}
-
-void
-perfmon_stopCounters(void)
-{
-    int i;
-
-    timer_stop(&timeData);
-
-    for (i=0;i<perfmon_numThreads;i++)
+    result += (double) (counter->overflows * perfmon_getMaxCounterValue(counter_map[event->index].type));
+    if (counter_map[event->index].type == POWER)
     {
-        perfmon_stopCountersThread(i);
+        result *= power_getEnergyUnit(getCounterTypeOffset(event->index));
     }
-
-    rdtscTime = timer_print(&timeData);
-}
-
-void
-perfmon_readCounters(void)
-{
-    int i;
-
-    for (i=0;i<perfmon_numThreads;i++)
+    else if (counter_map[event->index].type == THERMAL)
     {
-        perfmon_readCountersThread(i);
+        result = (double)counter->counterData;
     }
+    return result;
 }
 
-
-void
-perfmon_printAvailableGroups()
+int
+getCounterTypeOffset(int index)
 {
-    int i;
-
-    fprintf(OUTSTREAM,"Available groups on %s:\n",cpuid_info.name);
-
-    for(i=0; i<perfmon_numGroups; i++)
+    int off = 0;
+    for (int j=index-1;j>=0;j--)
     {
-        if ( group_map[i].isUncore )
+        if (counter_map[index].type == counter_map[j].type)
         {
-            if ( (cpuid_info.model == SANDYBRIDGE_EP) ||
-                 (cpuid_info.model == IVYBRIDGE_EP) ||
-                 (cpuid_info.model == WESTMERE_EX) ||
-                 (cpuid_info.model == NEHALEM_EX))
-            {
-                fprintf(OUTSTREAM,"%s: %s\n",group_map[i].key,
-                        group_map[i].info);
-            }
+            off++;
         }
         else
         {
-            fprintf(OUTSTREAM,"%s: %s\n",group_map[i].key,
-                    group_map[i].info);
+            break;
         }
     }
+    return off;
 }
 
-void
-perfmon_printGroupHelp(bstring group)
+void perfmon_setVerbosity(int level)
 {
-    int i;
-    PerfmonGroup groupDummy;
-
-    if ((i = getGroupId(group,&groupDummy))<0)
-    {
-        ERROR_PLAIN_PRINT(Group not found);
-    }
-    else
-    {
-        fprintf(OUTSTREAM,"Group %s:\n",bdata(group));
-        fprintf(OUTSTREAM,"%s",group_help[i].msg);
-    }
+    if ((level >= DEBUGLEV_ONLY_ERROR) && (level <= DEBUGLEV_DEVELOP))
+        perfmon_verbosity = level;
 }
 
-
-
 void
-perfmon_init(int numThreads_local, int threads[], FILE* outstream)
+perfmon_init_maps(void)
 {
-    if (!lock_check())
-    {
-        fprintf(stderr,"Access to performance counters is locked.\n");
-        exit(EXIT_FAILURE);
-    }
-
-    perfmon_numThreads = numThreads_local;
-    perfmon_threadData = (PerfmonThread*)
-        malloc(perfmon_numThreads * sizeof(PerfmonThread));
-    /* This is specific for daemon mode. */
-    perfmon_threadState = (double**)
-        malloc(perfmon_numThreads * sizeof(double*));
-
-    for (int i=0; i<perfmon_numThreads; i++)
-    {
-        perfmon_threadState[i] = (double*)
-            malloc(NUM_PMC * sizeof(double));
-        for(int j=0; j<NUM_PMC;j++)
-        {
-            perfmon_threadState[i][j] = 0.0;
-        }
-    }
-
-    OUTSTREAM = outstream;
-
-    for(int i=0; i<MAX_NUM_NODES; i++) socket_lock[i] = LOCK_INIT;
-
-    if (accessClient_mode != DAEMON_AM_DIRECT)
-    {
-        accessClient_init(&socket_fd);
-    }
-
-    msr_init(socket_fd);
-
+    box_map = NULL;
     switch ( cpuid_info.family )
     {
         case P6_FAMILY:
@@ -1263,75 +620,37 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
             switch ( cpuid_info.model )
             {
                 case PENTIUM_M_BANIAS:
-
                 case PENTIUM_M_DOTHAN:
-
                     eventHash = pm_arch_events;
                     perfmon_numArchEvents = perfmon_numArchEvents_pm;
-
-                    group_map = pm_group_map;
-                 //   group_help = pm_group_help;
-                    perfmon_numGroups = perfmon_numGroups_pm;
-
                     counter_map = pm_counter_map;
+                    box_map = pm_box_map;
                     perfmon_numCounters = perfmon_numCounters_pm;
-
-                    initThreadArch = perfmon_init_pm;
-                    printDerivedMetrics = perfmon_printDerivedMetrics_pm;
-                    assert(FALSE && "NOT SUPPORTED");
-                    perfmon_startCountersThread = perfmon_startCountersThread_pm;
-                    perfmon_stopCountersThread = perfmon_stopCountersThread_pm;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_pm;
                     break;
 
                 case ATOM_45:
-
                 case ATOM_32:
-
                 case ATOM_22:
-
                 case ATOM:
-
                     eventHash = atom_arch_events;
                     perfmon_numArchEvents = perfmon_numArchEventsAtom;
-
-                    group_map = atom_group_map;
-                    group_help = atom_group_help;
-                    perfmon_numGroups = perfmon_numGroupsAtom;
-
                     counter_map = core2_counter_map;
                     perfmon_numCounters = perfmon_numCountersCore2;
-
-                    initThreadArch = perfmon_init_core2;
-                    printDerivedMetrics = perfmon_printDerivedMetricsAtom;
-                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesAtom;
-                    perfmon_startCountersThread = perfmon_startCountersThread_core2;
-                    perfmon_stopCountersThread = perfmon_stopCountersThread_core2;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_core2;
+                    box_map = core2_box_map;
                     break;
 
-                case ATOM_SILVERMONT_C:
                 case ATOM_SILVERMONT_E:
-                case ATOM_SILVERMONT_F1:
-                case ATOM_SILVERMONT_F2:
-                case ATOM_SILVERMONT_F3:
-                    power_init(0);
-                    thermal_init(0);
+                case ATOM_SILVERMONT_C:
+                case ATOM_SILVERMONT_Z1:
+                case ATOM_SILVERMONT_Z2:
+                case ATOM_SILVERMONT_F:
+                case ATOM_SILVERMONT_AIR:
                     eventHash = silvermont_arch_events;
                     perfmon_numArchEvents = perfmon_numArchEventsSilvermont;
-
-                    group_map = silvermont_group_map;
-                    group_help = silvermont_group_help;
-                    perfmon_numGroups = perfmon_numGroupsSilvermont;
-
                     counter_map = silvermont_counter_map;
+                    box_map = silvermont_box_map;
                     perfmon_numCounters = perfmon_numCountersSilvermont;
-
-                    initThreadArch = perfmon_init_silvermont;
-                    printDerivedMetrics = perfmon_printDerivedMetricsSilvermont;
-                    perfmon_startCountersThread = perfmon_startCountersThread_silvermont;
-                    perfmon_stopCountersThread = perfmon_stopCountersThread_silvermont;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_silvermont;
+                    perfmon_numCoreCounters = perfmon_numCoreCountersSilvermont;
                     break;
 
                 case CORE_DUO:
@@ -1339,216 +658,370 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
                     break;
 
                 case XEON_MP:
-
                 case CORE2_65:
-
                 case CORE2_45:
-
                     eventHash = core2_arch_events;
                     perfmon_numArchEvents = perfmon_numArchEventsCore2;
-
-                    group_map = core2_group_map;
-                    group_help = core2_group_help;
-                    perfmon_numGroups = perfmon_numGroupsCore2;
-
                     counter_map = core2_counter_map;
                     perfmon_numCounters = perfmon_numCountersCore2;
-
-                    initThreadArch = perfmon_init_core2;
-                    printDerivedMetrics = perfmon_printDerivedMetricsCore2;
-                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesCore2;
-
-                    logDerivedMetrics = perfmon_logDerivedMetricsCore2;
-                    perfmon_startCountersThread = perfmon_startCountersThread_core2;
-                    perfmon_stopCountersThread = perfmon_stopCountersThread_core2;
-                    perfmon_readCountersThread = perfmon_readCountersThread_core2;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_core2;
+                    box_map = core2_box_map;
                     break;
 
                 case NEHALEM_EX:
-
                     eventHash = nehalemEX_arch_events;
                     perfmon_numArchEvents = perfmon_numArchEventsNehalemEX;
-
-                    group_map = nehalemEX_group_map;
-                    group_help = nehalemEX_group_help;
-                    perfmon_numGroups = perfmon_numGroupsNehalemEX;
-
-                    counter_map = westmereEX_counter_map;
-                    perfmon_numCounters = perfmon_numCountersWestmereEX;
-
-                    initThreadArch = perfmon_init_nehalemEX;
-                    printDerivedMetrics = perfmon_printDerivedMetricsNehalemEX;
-                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesNehalemEX;
-                    logDerivedMetrics = perfmon_logDerivedMetricsNehalemEX;
-                    perfmon_startCountersThread = perfmon_startCountersThread_nehalemEX;
-                    perfmon_stopCountersThread = perfmon_stopCountersThread_nehalemEX;
-                    perfmon_readCountersThread = perfmon_readCountersThread_nehalemEX;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_nehalemEX;
+                    counter_map = nehalemEX_counter_map;
+                    perfmon_numCounters = perfmon_numCountersNehalemEX;
+                    box_map = nehalemEX_box_map;
                     break;
 
                 case WESTMERE_EX:
-
                     eventHash = westmereEX_arch_events;
                     perfmon_numArchEvents = perfmon_numArchEventsWestmereEX;
-
-                    group_map = westmereEX_group_map;
-                    group_help = westmereEX_group_help;
-                    perfmon_numGroups = perfmon_numGroupsWestmereEX;
-
                     counter_map = westmereEX_counter_map;
                     perfmon_numCounters = perfmon_numCountersWestmereEX;
-
-                    initThreadArch = perfmon_init_westmereEX;
-                    printDerivedMetrics = perfmon_printDerivedMetricsWestmereEX;
-                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesWestmereEX;
-                    logDerivedMetrics = perfmon_logDerivedMetricsWestmereEX;
-                    perfmon_startCountersThread = perfmon_startCountersThread_westmereEX;
-                    perfmon_stopCountersThread = perfmon_stopCountersThread_westmereEX;
-                    perfmon_readCountersThread = perfmon_readCountersThread_westmereEX;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_westmereEX;
+                    box_map = westmereEX_box_map;
                     break;
 
                 case NEHALEM_BLOOMFIELD:
-
                 case NEHALEM_LYNNFIELD:
-
-                    thermal_init(0);
-
+                case NEHALEM_LYNNFIELD_M:
                     eventHash = nehalem_arch_events;
                     perfmon_numArchEvents = perfmon_numArchEventsNehalem;
-
-                    group_map = nehalem_group_map;
-                    group_help = nehalem_group_help;
-                    perfmon_numGroups = perfmon_numGroupsNehalem;
-
                     counter_map = nehalem_counter_map;
                     perfmon_numCounters = perfmon_numCountersNehalem;
+                    box_map = nehalem_box_map;
+                    break;
 
-                    initThreadArch = perfmon_init_nehalem;
-                    printDerivedMetrics = perfmon_printDerivedMetricsNehalem;
-                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesNehalem;
+                case NEHALEM_WESTMERE_M:
+                case NEHALEM_WESTMERE:
+                    eventHash = westmere_arch_events;
+                    perfmon_numArchEvents = perfmon_numArchEventsWestmere;
+                    counter_map = nehalem_counter_map;
+                    perfmon_numCounters = perfmon_numCountersNehalem;
+                    box_map = nehalem_box_map;
+                    break;
 
-                    logDerivedMetrics = perfmon_logDerivedMetricsNehalem;
-                    perfmon_startCountersThread = perfmon_startCountersThread_nehalem;
-                    perfmon_stopCountersThread = perfmon_stopCountersThread_nehalem;
-                    perfmon_readCountersThread = perfmon_readCountersThread_nehalem;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_nehalem;
+                case IVYBRIDGE_EP:
+                    pci_devices = ivybridgeEP_pci_devices;
+                    box_map = ivybridgeEP_box_map;
+                    eventHash = ivybridgeEP_arch_events;
+                    perfmon_numArchEvents = perfmon_numArchEventsIvybridgeEP;
+                    counter_map = ivybridgeEP_counter_map;
+                    perfmon_numCounters = perfmon_numCountersIvybridgeEP;
+                    perfmon_numCoreCounters = perfmon_numCoreCountersIvybridgeEP;
+                    break;
+                case IVYBRIDGE:
+                    eventHash = ivybridge_arch_events;
+                    box_map = ivybridge_box_map;
+                    perfmon_numArchEvents = perfmon_numArchEventsIvybridge;
+                    counter_map = ivybridge_counter_map;
+                    perfmon_numCounters = perfmon_numCountersIvybridge;
+                    perfmon_numCoreCounters = perfmon_numCoreCountersIvybridge;
                     break;
 
-                case NEHALEM_WESTMERE_M:
+                case HASWELL_EP:
+                    eventHash = haswellEP_arch_events;
+                    perfmon_numArchEvents = perfmon_numArchEventsHaswellEP;
+                    counter_map = haswellEP_counter_map;
+                    perfmon_numCounters = perfmon_numCountersHaswellEP;
+                    perfmon_numCoreCounters = perfmon_numCoreCountersHaswellEP;
+                    box_map = haswellEP_box_map;
+                    pci_devices = haswellEP_pci_devices;
+                    break;
+                case HASWELL:
+                case HASWELL_M1:
+                case HASWELL_M2:
+                    eventHash = haswell_arch_events;
+                    perfmon_numArchEvents = perfmon_numArchEventsHaswell;
+                    counter_map = haswell_counter_map;
+                    perfmon_numCounters = perfmon_numCountersHaswell;
+                    perfmon_numCoreCounters = perfmon_numCoreCountersHaswell;
+                    box_map = haswell_box_map;
+                    break;
 
-                case NEHALEM_WESTMERE:
+                case SANDYBRIDGE_EP:
+                    pci_devices = sandybridgeEP_pci_devices;
+                    box_map = sandybridgeEP_box_map;
+                    eventHash = sandybridgeEP_arch_events;
+                    perfmon_numArchEvents = perfmon_numArchEventsSandybridgeEP;
+                    counter_map = sandybridgeEP_counter_map;
+                    perfmon_numCounters = perfmon_numCountersSandybridgeEP;
+                    perfmon_numCoreCounters = perfmon_numCoreCountersSandybridgeEP;
+                    break;
+                case SANDYBRIDGE:
+                    box_map = sandybridge_box_map;
+                    eventHash = sandybridge_arch_events;
+                    perfmon_numArchEvents = perfmon_numArchEventsSandybridge;
+                    counter_map = sandybridge_counter_map;
+                    perfmon_numCounters = perfmon_numCountersSandybridge;
+                    perfmon_numCoreCounters = perfmon_numCoreCountersSandybridge;
+                    break;
 
-                    thermal_init(0);
+                case BROADWELL:
+                    box_map = broadwell_box_map;
+                    eventHash = broadwell_arch_events;
+                    counter_map = broadwell_counter_map;
+                    perfmon_numArchEvents = perfmon_numArchEventsBroadwell;
+                    perfmon_numCounters = perfmon_numCountersBroadwell;
+                    perfmon_numCoreCounters = perfmon_numCoreCountersBroadwell;
+                    break;
+                case BROADWELL_D:
+                    box_map = broadwelld_box_map;
+                    eventHash = broadwelld_arch_events;
+                    counter_map = broadwelld_counter_map;
+                    perfmon_numArchEvents = perfmon_numArchEventsBroadwellD;
+                    perfmon_numCounters = perfmon_numCountersBroadwellD;
+                    perfmon_numCoreCounters = perfmon_numCoreCountersBroadwellD;
+                    break;
+                case BROADWELL_E:
+                    box_map = broadwellEP_box_map;
+                    eventHash = broadwellEP_arch_events;
+                    counter_map = broadwellEP_counter_map;
+                    perfmon_numArchEvents = perfmon_numArchEventsBroadwellEP;
+                    perfmon_numCounters = perfmon_numCountersBroadwellEP;
+                    perfmon_numCoreCounters = perfmon_numCoreCountersBroadwellEP;
+                    break;
 
-                    eventHash = westmere_arch_events;
-                    perfmon_numArchEvents = perfmon_numArchEventsWestmere;
+                case SKYLAKE1:
+                case SKYLAKE2:
+                    box_map = skylake_box_map;
+                    eventHash = skylake_arch_events;
+                    counter_map = skylake_counter_map;
+                    perfmon_numArchEvents = perfmon_numArchEventsSkylake;
+                    perfmon_numCounters = perfmon_numCountersSkylake;
+                    perfmon_numCoreCounters = perfmon_numCoreCountersSkylake;
+                    break;
 
-                    group_map = westmere_group_map;
-                    group_help = westmere_group_help;
-                    perfmon_numGroups = perfmon_numGroupsWestmere;
+                default:
+                    ERROR_PLAIN_PRINT(Unsupported Processor);
+                    break;
+            }
+            break;
 
-                    counter_map = nehalem_counter_map;
-                    perfmon_numCounters = perfmon_numCountersNehalem;
+        case MIC_FAMILY:
 
-                    initThreadArch = perfmon_init_nehalem;
-                    printDerivedMetrics = perfmon_printDerivedMetricsWestmere;
-                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesWestmere;
+            switch ( cpuid_info.model )
+            {
+                case XEON_PHI:
+                    eventHash = phi_arch_events;
+                    perfmon_numArchEvents = perfmon_numArchEventsPhi;
+                    counter_map = phi_counter_map;
+                    box_map = phi_box_map;
+                    perfmon_numCounters = perfmon_numCountersPhi;
+                    break;
 
-                    logDerivedMetrics = perfmon_logDerivedMetricsWestmere;
-                    perfmon_startCountersThread = perfmon_startCountersThread_nehalem;
-                    perfmon_stopCountersThread = perfmon_stopCountersThread_nehalem;
-                    perfmon_readCountersThread = perfmon_readCountersThread_nehalem;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_nehalem;
+                default:
+                    ERROR_PLAIN_PRINT(Unsupported Processor);
                     break;
+            }
+            break;
 
-                case IVYBRIDGE:
+        case K8_FAMILY:
+            eventHash = k8_arch_events;
+            perfmon_numArchEvents = perfmon_numArchEventsK8;
+            counter_map = k10_counter_map;
+            box_map = k10_box_map;
+            perfmon_numCounters = perfmon_numCountersK10;
+            break;
 
-                case IVYBRIDGE_EP:
+        case K10_FAMILY:
+            eventHash = k10_arch_events;
+            perfmon_numArchEvents = perfmon_numArchEventsK10;
+            counter_map = k10_counter_map;
+            box_map = k10_box_map;
+            perfmon_numCounters = perfmon_numCountersK10;
+            break;
 
-                    power_init(0); /* FIXME Static coreId is dangerous */
-                    thermal_init(0);
-                    pci_init(socket_fd);
+        case K15_FAMILY:
+            eventHash = interlagos_arch_events;
+            perfmon_numArchEvents = perfmon_numArchEventsInterlagos;
+            counter_map = interlagos_counter_map;
+            box_map = interlagos_box_map;
+            perfmon_numCounters = perfmon_numCountersInterlagos;
+            break;
 
-                    eventHash = ivybridge_arch_events;
-                    perfmon_numArchEvents = perfmon_numArchEventsIvybridge;
+        case K16_FAMILY:
+            eventHash = kabini_arch_events;
+            perfmon_numArchEvents = perfmon_numArchEventsKabini;
+            counter_map = kabini_counter_map;
+            box_map = kabini_box_map;
+            perfmon_numCounters = perfmon_numCountersKabini;
+           break;
 
-                    group_map = ivybridge_group_map;
-                    group_help = ivybridge_group_help;
-                    perfmon_numGroups = perfmon_numGroupsIvybridge;
+        default:
+            ERROR_PLAIN_PRINT(Unsupported Processor);
+            break;
+    }
+    return;
+}
 
-                    counter_map = ivybridge_counter_map;
-                    perfmon_numCounters = perfmon_numCountersIvybridge;
+void
+perfmon_init_funcs(int* init_power, int* init_temp)
+{
+    int initialize_power = FALSE;
+    int initialize_thermal = FALSE;
+    switch ( cpuid_info.family )
+    {
+        case P6_FAMILY:
 
-                    initThreadArch = perfmon_init_ivybridge;
-                    printDerivedMetrics = perfmon_printDerivedMetricsIvybridge;
-                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesIvybridge;
+            switch ( cpuid_info.model )
+            {
+                case PENTIUM_M_BANIAS:
+                case PENTIUM_M_DOTHAN:
+                    initThreadArch = perfmon_init_pm;
+                    perfmon_startCountersThread = perfmon_startCountersThread_pm;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_pm;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_pm;
+                    perfmon_readCountersThread = perfmon_readCountersThread_pm;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_pm;
+                    break;
 
-                    logDerivedMetrics = perfmon_logDerivedMetricsIvybridge;
-                    perfmon_startCountersThread = perfmon_startCountersThread_ivybridge;
-                    perfmon_stopCountersThread = perfmon_stopCountersThread_ivybridge;
-                    perfmon_readCountersThread = perfmon_readCountersThread_ivybridge;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_ivybridge;
+                case ATOM_45:
+                case ATOM_32:
+                case ATOM_22:
+                case ATOM:
+                    initThreadArch = perfmon_init_core2;
+                    perfmon_startCountersThread = perfmon_startCountersThread_core2;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_core2;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_core2;
+                    perfmon_readCountersThread = perfmon_readCountersThread_core2;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_core2;
                     break;
 
-                case HASWELL:
+                case ATOM_SILVERMONT_E:
+                case ATOM_SILVERMONT_C:
+                case ATOM_SILVERMONT_Z1:
+                case ATOM_SILVERMONT_Z2:
+                case ATOM_SILVERMONT_F:
+                case ATOM_SILVERMONT_AIR:
+                    initialize_power = TRUE;
+                    initialize_thermal = TRUE;
+                    initThreadArch = perfmon_init_silvermont;
+                    perfmon_startCountersThread = perfmon_startCountersThread_silvermont;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_silvermont;
+                    perfmon_setupCountersThread = perfmon_setupCountersThread_silvermont;
+                    perfmon_readCountersThread = perfmon_readCountersThread_silvermont;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_silvermont;
+                    break;
 
-                case HASWELL_EX:
 
-                case HASWELL_M1:
+                case CORE_DUO:
+                    ERROR_PLAIN_PRINT(Unsupported Processor);
+                    break;
 
-                case HASWELL_M2:
+                case XEON_MP:
+                case CORE2_65:
+                case CORE2_45:
+                    initThreadArch = perfmon_init_core2;
+                    perfmon_startCountersThread = perfmon_startCountersThread_core2;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_core2;
+                    perfmon_readCountersThread = perfmon_readCountersThread_core2;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_core2;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_core2;
+                    break;
 
-                    power_init(0); /* FIXME Static coreId is dangerous */
-                    thermal_init(0);
+                case NEHALEM_EX:
+                    initThreadArch = perfmon_init_nehalemEX;
+                    perfmon_startCountersThread = perfmon_startCountersThread_nehalemEX;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_nehalemEX;
+                    perfmon_readCountersThread = perfmon_readCountersThread_nehalemEX;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_nehalemEX;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_nehalemEX;
+                    break;
 
-                    eventHash = haswell_arch_events;
-                    perfmon_numArchEvents = perfmon_numArchEventsHaswell;
+                case WESTMERE_EX:
+                    initThreadArch = perfmon_init_westmereEX;
+                    perfmon_startCountersThread = perfmon_startCountersThread_westmereEX;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_westmereEX;
+                    perfmon_readCountersThread = perfmon_readCountersThread_westmereEX;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_westmereEX;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_westmereEX;
+                    break;
 
-                    group_map = haswell_group_map;
-                    group_help = haswell_group_help;
-                    perfmon_numGroups = perfmon_numGroupsHaswell;
+                case NEHALEM_BLOOMFIELD:
+                case NEHALEM_LYNNFIELD:
+                    initialize_thermal = TRUE;
+                    initThreadArch = perfmon_init_nehalem;
+                    perfmon_startCountersThread = perfmon_startCountersThread_nehalem;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_nehalem;
+                    perfmon_readCountersThread = perfmon_readCountersThread_nehalem;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_nehalem;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_nehalem;
+                    break;
 
-                    counter_map = haswell_counter_map;
-                    perfmon_numCounters = perfmon_numCountersHaswell;
+                case NEHALEM_WESTMERE_M:
+                case NEHALEM_WESTMERE:
+                    initialize_thermal = TRUE;
+                    initThreadArch = perfmon_init_nehalem;
+                    perfmon_startCountersThread = perfmon_startCountersThread_nehalem;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_nehalem;
+                    perfmon_readCountersThread = perfmon_readCountersThread_nehalem;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_nehalem;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_nehalem;
+                    break;
+
+                case IVYBRIDGE_EP:
+                case IVYBRIDGE:
+                    initialize_power = TRUE;
+                    initialize_thermal = TRUE;
+                    initThreadArch = perfmon_init_ivybridge;
+                    perfmon_startCountersThread = perfmon_startCountersThread_ivybridge;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_ivybridge;
+                    perfmon_readCountersThread = perfmon_readCountersThread_ivybridge;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_ivybridge;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_ivybridge;
+                    break;
 
+                case HASWELL_EP:
+                case HASWELL:
+                case HASWELL_M1:
+                case HASWELL_M2:
+                    initialize_power = TRUE;
+                    initialize_thermal = TRUE;
                     initThreadArch = perfmon_init_haswell;
-                    printDerivedMetrics = perfmon_printDerivedMetricsHaswell;
-                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesHaswell;
-                    logDerivedMetrics = perfmon_logDerivedMetricsHaswell;
                     perfmon_startCountersThread = perfmon_startCountersThread_haswell;
                     perfmon_stopCountersThread = perfmon_stopCountersThread_haswell;
                     perfmon_readCountersThread = perfmon_readCountersThread_haswell;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_haswell;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_haswell;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_haswell;
                     break;
 
-                case SANDYBRIDGE:
-
                 case SANDYBRIDGE_EP:
-
-                    power_init(0); /* FIXME Static coreId is dangerous */
-                    thermal_init(0);
-                    pci_init(socket_fd);
-
-                    eventHash = sandybridge_arch_events;
-                    perfmon_numArchEvents = perfmon_numArchEventsSandybridge;
-
-                    group_map = sandybridge_group_map;
-                    group_help = sandybridge_group_help;
-                    perfmon_numGroups = perfmon_numGroupsSandybridge;
-
-                    counter_map = sandybridge_counter_map;
-                    perfmon_numCounters = perfmon_numCountersSandybridge;
-
+                case SANDYBRIDGE:
+                    initialize_power = TRUE;
+                    initialize_thermal = TRUE;
                     initThreadArch = perfmon_init_sandybridge;
-                    printDerivedMetrics = perfmon_printDerivedMetricsSandybridge;
-                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesSandybridge;
-                    logDerivedMetrics = perfmon_logDerivedMetricsSandybridge;
                     perfmon_startCountersThread = perfmon_startCountersThread_sandybridge;
                     perfmon_stopCountersThread = perfmon_stopCountersThread_sandybridge;
                     perfmon_readCountersThread = perfmon_readCountersThread_sandybridge;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_sandybridge;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_sandybridge;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_sandybridge;
+                    break;
+
+                case BROADWELL:
+                case BROADWELL_E:
+                case BROADWELL_D:
+                    initialize_power = TRUE;
+                    initialize_thermal = TRUE;
+                    initThreadArch = perfmon_init_broadwell;
+                    perfmon_startCountersThread = perfmon_startCountersThread_broadwell;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_broadwell;
+                    perfmon_readCountersThread = perfmon_readCountersThread_broadwell;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_broadwell;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_broadwell;
+                    break;
+
+                case SKYLAKE1:
+                case SKYLAKE2:
+                    initialize_power = TRUE;
+                    initialize_thermal = TRUE;
+                    initThreadArch = perfmon_init_skylake;
+                    perfmon_startCountersThread = perfmon_startCountersThread_skylake;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_skylake;
+                    perfmon_readCountersThread = perfmon_readCountersThread_skylake;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_skylake;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_skylake;
                     break;
 
                 default:
@@ -1562,25 +1035,12 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
             switch ( cpuid_info.model )
             {
                 case XEON_PHI:
-
-                    eventHash = phi_arch_events;
-                    perfmon_numArchEvents = perfmon_numArchEventsPhi;
-
-                    group_map = phi_group_map;
-                    group_help = phi_group_help;
-                    perfmon_numGroups = perfmon_numGroupsPhi;
-
-                    counter_map = phi_counter_map;
-                    perfmon_numCounters = perfmon_numCountersPhi;
-
                     initThreadArch = perfmon_init_phi;
-                    printDerivedMetrics = perfmon_printDerivedMetricsPhi;
-                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesPhi;
-                    logDerivedMetrics = perfmon_logDerivedMetricsPhi;
                     perfmon_startCountersThread = perfmon_startCountersThread_phi;
                     perfmon_stopCountersThread = perfmon_stopCountersThread_phi;
                     perfmon_readCountersThread = perfmon_readCountersThread_phi;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_phi;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_phi;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_phi;
                     break;
 
                 default:
@@ -1590,115 +1050,1640 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
             break;
 
         case K8_FAMILY:
-            eventHash = k8_arch_events;
-            perfmon_numArchEvents = perfmon_numArchEventsK8;
-
-            group_map = k8_group_map;
-            group_help = k8_group_help;
-            perfmon_numGroups = perfmon_numGroupsK8;
-
-            counter_map = k10_counter_map;
-            perfmon_numCounters = perfmon_numCountersK10;
-
             initThreadArch = perfmon_init_k10;
-            printDerivedMetrics = perfmon_printDerivedMetricsK8;
-            perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesK8;
-            logDerivedMetrics = perfmon_logDerivedMetricsK8;
             perfmon_startCountersThread = perfmon_startCountersThread_k10;
             perfmon_stopCountersThread = perfmon_stopCountersThread_k10;
             perfmon_readCountersThread = perfmon_readCountersThread_k10;
-            perfmon_setupCounterThread = perfmon_setupCounterThread_k10;
+            perfmon_setupCountersThread = perfmon_setupCounterThread_k10;
+            perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_k10;
             break;
 
         case K10_FAMILY:
-            eventHash = k10_arch_events;
-            perfmon_numArchEvents = perfmon_numArchEventsK10;
-
-            group_map = k10_group_map;
-            group_help = k10_group_help;
-            perfmon_numGroups = perfmon_numGroupsK10;
-
-            counter_map = k10_counter_map;
-            perfmon_numCounters = perfmon_numCountersK10;
-
             initThreadArch = perfmon_init_k10;
-            printDerivedMetrics = perfmon_printDerivedMetricsK10;
-            perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesK10;
-            logDerivedMetrics = perfmon_logDerivedMetricsK10;
             perfmon_startCountersThread = perfmon_startCountersThread_k10;
             perfmon_stopCountersThread = perfmon_stopCountersThread_k10;
             perfmon_readCountersThread = perfmon_readCountersThread_k10;
-            perfmon_setupCounterThread = perfmon_setupCounterThread_k10;
+            perfmon_setupCountersThread = perfmon_setupCounterThread_k10;
+            perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_k10;
             break;
 
         case K15_FAMILY:
-            eventHash = interlagos_arch_events;
-            perfmon_numArchEvents = perfmon_numArchEventsInterlagos;
-
-            group_map = interlagos_group_map;
-            group_help = interlagos_group_help;
-            perfmon_numGroups = perfmon_numGroupsInterlagos;
-
-            counter_map = interlagos_counter_map;
-            perfmon_numCounters = perfmon_numCountersInterlagos;
-
             initThreadArch = perfmon_init_interlagos;
-            printDerivedMetrics = perfmon_printDerivedMetricsInterlagos;
-            perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesInterlagos;
-            logDerivedMetrics = perfmon_logDerivedMetricsInterlagos;
             perfmon_startCountersThread = perfmon_startCountersThread_interlagos;
             perfmon_stopCountersThread = perfmon_stopCountersThread_interlagos;
             perfmon_readCountersThread = perfmon_readCountersThread_interlagos;
-            perfmon_setupCounterThread = perfmon_setupCounterThread_interlagos;
+            perfmon_setupCountersThread = perfmon_setupCounterThread_interlagos;
+            perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_interlagos;
             break;
 
         case K16_FAMILY:
-            eventHash = kabini_arch_events;
-            perfmon_numArchEvents = perfmon_numArchEventsKabini;
-
-            group_map = kabini_group_map;
-            group_help = kabini_group_help;
-            perfmon_numGroups = perfmon_numGroupsKabini;
-
-            counter_map = kabini_counter_map;
-            perfmon_numCounters = perfmon_numCountersKabini;
-
             initThreadArch = perfmon_init_kabini;
-            printDerivedMetrics = perfmon_printDerivedMetricsKabini;
-            perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesKabini;
-            logDerivedMetrics = perfmon_logDerivedMetricsKabini;
             perfmon_startCountersThread = perfmon_startCountersThread_kabini;
             perfmon_stopCountersThread = perfmon_stopCountersThread_kabini;
             perfmon_readCountersThread = perfmon_readCountersThread_kabini;
-            perfmon_setupCounterThread = perfmon_setupCounterThread_kabini;
+            perfmon_setupCountersThread = perfmon_setupCounterThread_kabini;
+            perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_kabini;
            break;
 
         default:
             ERROR_PLAIN_PRINT(Unsupported Processor);
             break;
     }
-
-
-    for (int i=0; i<perfmon_numThreads; i++)
-    {
-        initThread(i,threads[i]);
-    }
+    *init_power = initialize_power;
+    *init_temp = initialize_thermal;
 }
 
-void
-perfmon_finalize()
+
+int /* returns 0 on success or when already initialized, otherwise a negative errno-style code */
+perfmon_init(int nrThreads, int threadsToCpu[]) /* threadsToCpu[i] becomes the processorId of thread i */
 {
     int i;
+    int ret;
+    int initialize_power = FALSE;
+    int initialize_thermal = FALSE;
 
-    free(perfmon_threadData);
+    if (perfmon_initialized == 1) /* a repeated call is a no-op */
+    {
+        return 0;
+    }
 
-    for (i=0; i<perfmon_numThreads; i++)
+    if (nrThreads <= 0)
     {
-        free(perfmon_threadState[i]);
+        ERROR_PRINT(Number of threads must be greater than 0 but only %d given,nrThreads);
+        return -EINVAL;
     }
-    free(perfmon_threadState);
-    msr_finalize();
-    pci_finalize();
-    accessClient_finalize(socket_fd);
-}
 
+    if (!lock_check())
+    {
+        ERROR_PLAIN_PRINT(Access to performance monitoring registers locked);
+        return -EINVAL;
+    }
+
+    if ((cpuid_info.family == 0) && (cpuid_info.model == 0))
+    {
+        ERROR_PLAIN_PRINT(Topology module not inialized. Needed to determine current CPU type);
+        return -ENODEV;
+    }
+
+    /* A group set that still exists at this point is reported as -EEXIST */
+    if (groupSet != NULL)
+    {
+        /* TODO: Decision whether setting new thread count and adjust processorIds
+         *          or just exit like implemented now
+         */
+        return -EEXIST;
+    }
+
+    groupSet = (PerfmonGroupSet*) malloc(sizeof(PerfmonGroupSet));
+    if (groupSet == NULL)
+    {
+        ERROR_PLAIN_PRINT(Cannot allocate group descriptor);
+        return -ENOMEM;
+    }
+    groupSet->threads = (PerfmonThread*) malloc(nrThreads * sizeof(PerfmonThread));
+    if (groupSet->threads == NULL)
+    {
+        ERROR_PLAIN_PRINT(Cannot allocate set of threads);
+        free(groupSet);
+        return -ENOMEM;
+    }
+    groupSet->numberOfThreads = nrThreads;
+    groupSet->numberOfGroups = 0;
+    groupSet->numberOfActiveGroups = 0;
+    groupSet->groups = NULL;
+    groupSet->activeGroup = -1; /* -1 marks "no group active yet" */
+
+    for(i=0; i<MAX_NUM_NODES; i++) socket_lock[i] = LOCK_INIT;
+    for(i=0; i<MAX_NUM_THREADS; i++) tile_lock[i] = LOCK_INIT;
+
+    /* Initialize maps pointer to current architecture maps */
+    perfmon_init_maps();
+
+    /* Initialize access interface */
+    ret = HPMinit();
+    if (ret)
+    {
+        ERROR_PLAIN_PRINT(Cannot set access functions);
+        free(groupSet->threads);
+        free(groupSet);
+        exit(EXIT_FAILURE); /* NOTE(review): terminates the process, so the return below is never reached */
+        return ret;
+    }
+    timer_init();
+
+
+    /* Initialize function pointer to current architecture functions */
+    perfmon_init_funcs(&initialize_power, &initialize_thermal);
+
+    /* Store thread information and reset counters for processor*/
+    /* If the arch supports it, initialize power and thermal measurements */
+    for(i=0;i<nrThreads;i++)
+    {
+        if (HPMaddThread(threadsToCpu[i]) != 0)
+        {
+            ERROR_PLAIN_PRINT(Cannot get access to performance counters); /* NOTE(review): no return here, the loop carries on */
+        }
+        groupSet->threads[i].thread_id = i;
+        groupSet->threads[i].processorId = threadsToCpu[i];
+
+        if (HPMcheck(MSR_DEV, threadsToCpu[i]) == 0) /* a zero result is treated as "no MSR access" */
+        {
+            fprintf(stderr, "Cannot get access to MSRs. Please check permissions to the MSRs\n");
+            exit(EXIT_FAILURE);
+        }
+        if (initialize_power == TRUE)
+        {
+            power_init(threadsToCpu[i]);
+        }
+        if (initialize_thermal == TRUE)
+        {
+            thermal_init(threadsToCpu[i]);
+        }
+        initThreadArch(threadsToCpu[i]); /* architecture-specific init, called with the CPU ID */
+    }
+    perfmon_initialized = 1;
+    return 0;
+}
+
+/* Tear down the perfmon module: finalize and free every active group, free the
+ * thread list and the group set, drop marker results, then call power_finalize()
+ * and HPMfinalize(). Does nothing when not initialized or no group set exists. */
+void
+perfmon_finalize(void)
+{
+    int grp, evt, thr;
+    PerfmonEventSet* evset;
+    if ((perfmon_initialized == 0) || (groupSet == NULL))
+    {
+        return;
+    }
+    grp = 0;
+    while (grp < groupSet->numberOfActiveGroups)
+    {
+        evset = &(groupSet->groups[grp]);
+        /* The per-thread finalize hook runs before any memory of the group is freed */
+        for (thr = 0; thr < groupSet->numberOfThreads; thr++)
+        {
+            perfmon_finalizeCountersThread(thr, evset);
+        }
+        /* free(NULL) is a no-op, so explicit NULL checks are not needed */
+        for (evt = 0; evt < evset->numberOfEvents; evt++)
+        {
+            free(evset->events[evt].threadCounter);
+        }
+        free(evset->events);
+        perfmon_delEventSet(grp);
+        evset->state = STATE_NONE;
+        grp++;
+    }
+    /* Group and thread arrays are released before the descriptor that holds them */
+    free(groupSet->groups);
+    free(groupSet->threads);
+    groupSet->activeGroup = -1;
+    free(groupSet);
+    /* Zero currentConfig for all MAX_NUM_THREADS slots */
+    for (thr = 0; thr < MAX_NUM_THREADS; thr++)
+    {
+        memset(currentConfig[thr], 0, NUM_PMC * sizeof(uint64_t));
+    }
+    /* groupSet keeps its stale address until the reset at the very end */
+    if (markerResults != NULL)
+    {
+        perfmon_destroyMarkerResults();
+    }
+    power_finalize();
+    HPMfinalize();
+    perfmon_initialized = 0;
+    groupSet = NULL;
+}
+
+/* Turn an event string or performance group name into a new event set.
+ * A NULL eventCString falls back to the LIKWID_EVENTS environment variable.
+ * Returns the ID of the new group, or a negative error code on failure. */
+int perfmon_addEventSet(char* eventCString)
+{
+    int i, j, err;
+    bstring eventBString;
+    struct bstrList* eventtokens;
+    PerfmonEventSet* eventSet;
+    PerfmonEventSetEntry* event;
+    Configuration_t config;
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    config = get_configuration();
+    if (eventCString == NULL)
+    {
+        DEBUG_PLAIN_PRINT(DEBUGLEV_INFO, Event string is empty. Trying environment variable LIKWID_EVENTS);
+        eventCString = getenv("LIKWID_EVENTS");
+        if (eventCString == NULL)
+        {
+            ERROR_PLAIN_PRINT(Cannot read event string. Also event string from environment variable is empty);
+            return -EINVAL;
+        }
+    }
+    if (strchr(eventCString, '-') != NULL)
+    {
+        ERROR_PLAIN_PRINT(Event string contains invalid character -);
+        return -EINVAL;
+    }
+    if (strchr(eventCString, '.') != NULL)
+    {
+        ERROR_PLAIN_PRINT(Event string contains invalid character .);
+        return -EINVAL;
+    }
+    if (groupSet->numberOfActiveGroups == 0)
+    {
+        groupSet->groups = (PerfmonEventSet*) malloc(sizeof(PerfmonEventSet));
+        if (groupSet->groups == NULL)
+        {
+            ERROR_PLAIN_PRINT(Cannot allocate initialize of event group list);
+            return -ENOMEM;
+        }
+        groupSet->numberOfGroups = 1;
+        groupSet->numberOfActiveGroups = 0;
+        groupSet->activeGroup = -1;
+
+        /* Only one group exists by now */
+        groupSet->groups[0].rdtscTime = 0;
+        groupSet->groups[0].runTime = 0;
+        groupSet->groups[0].numberOfEvents = 0;
+    }
+    /* Grow through a temporary so the list and its count stay intact if realloc fails */
+    if ((groupSet->numberOfActiveGroups > 0) && (groupSet->numberOfActiveGroups == groupSet->numberOfGroups))
+    {
+        PerfmonEventSet* grown = (PerfmonEventSet*)realloc(groupSet->groups, (groupSet->numberOfGroups+1)*sizeof(PerfmonEventSet));
+        if (grown == NULL)
+        {
+            ERROR_PLAIN_PRINT(Cannot allocate additional group);
+            return -ENOMEM;
+        }
+        groupSet->groups = grown;
+        groupSet->numberOfGroups++;
+        groupSet->groups[groupSet->numberOfActiveGroups].rdtscTime = 0;
+        groupSet->groups[groupSet->numberOfActiveGroups].runTime = 0;
+        groupSet->groups[groupSet->numberOfActiveGroups].numberOfEvents = 0;
+        DEBUG_PLAIN_PRINT(DEBUGLEV_INFO, Allocating new group structure for group.);
+    }
+    DEBUG_PRINT(DEBUGLEV_INFO, Currently %d groups of %d active,
+                    groupSet->numberOfActiveGroups+1,
+                    groupSet->numberOfGroups+1);
+
+    if (strchr(eventCString, ':') == NULL)
+    {
+        err = read_group(config->groupPath, cpuid_info.short_name,
+                         eventCString,
+                         &groupSet->groups[groupSet->numberOfActiveGroups].group);
+        if (err)
+        {
+            ERROR_PRINT(Cannot read performance group %s, eventCString);
+            return err;
+        }
+    }
+    else
+    {
+        err = custom_group(eventCString, &groupSet->groups[groupSet->numberOfActiveGroups].group);
+        if (err)
+        {
+            ERROR_PRINT(Cannot transform %s to performance group, eventCString);
+            return err;
+        }
+    }
+    char * evstr = get_eventStr(&groupSet->groups[groupSet->numberOfActiveGroups].group);
+    eventBString = bfromcstr(evstr);
+    eventtokens = bsplit(eventBString,',');
+    free(evstr);
+    bdestroy(eventBString);
+
+    eventSet = &(groupSet->groups[groupSet->numberOfActiveGroups]);
+    eventSet->events = (PerfmonEventSetEntry*) malloc(eventtokens->qty * sizeof(PerfmonEventSetEntry));
+    if (eventSet->events == NULL)
+    {
+        ERROR_PRINT(Cannot allocate event list for group %d\n, groupSet->numberOfActiveGroups);
+        bstrListDestroy(eventtokens); /* the token list is not needed on this error path */
+        return -ENOMEM;
+    }
+    eventSet->numberOfEvents = 0;
+    eventSet->regTypeMask = 0x0ULL;
+
+    int forceOverwrite = 0;
+    if (getenv("LIKWID_FORCE") != NULL)
+    {
+        forceOverwrite = atoi(getenv("LIKWID_FORCE"));
+    }
+    for(i=0;i<eventtokens->qty;i++)
+    {
+        event = &(eventSet->events[i]);
+        struct bstrList* subtokens = bsplit(eventtokens->entry[i],':');
+        if (subtokens->qty < 2)
+        {
+            ERROR_PRINT(Cannot parse event descriptor %s, bdata(eventtokens->entry[i]));
+            bstrListDestroy(subtokens);
+            continue;
+        }
+        else
+        {
+            if (!getIndexAndType(subtokens->entry[1], &event->index, &event->type, forceOverwrite))
+            {
+                DEBUG_PRINT(DEBUGLEV_INFO, Counter register %s not supported or PCI device not available,
+                            bdata(subtokens->entry[1]));
+                event->type = NOTYPE;
+                goto past_checks;
+            }
+
+            if (!getEvent(subtokens->entry[0], subtokens->entry[1], &event->event))
+            {
+                DEBUG_PRINT(DEBUGLEV_INFO, Event %s not found for current architecture,
+                     bdata(subtokens->entry[0]));
+                event->type = NOTYPE;
+                goto past_checks;
+            }
+
+            if (!checkCounter(subtokens->entry[1], event->event.limit))
+            {
+                DEBUG_PRINT(DEBUGLEV_INFO, Register %s not allowed for event %s,
+                     bdata(subtokens->entry[1]),bdata(subtokens->entry[0]));
+                event->type = NOTYPE;
+                goto past_checks;
+            }
+            if (parseOptions(subtokens, &event->event, event->index) < 0)
+            {
+                DEBUG_PRINT(DEBUGLEV_INFO, Cannot parse options in %s, bdata(eventtokens->entry[i]));
+                event->type = NOTYPE;
+                goto past_checks;
+            }
+
+            eventSet->regTypeMask |= REG_TYPE_MASK(event->type);
+past_checks:
+            event->threadCounter = (PerfmonCounter*) malloc(
+                groupSet->numberOfThreads * sizeof(PerfmonCounter));
+
+            if (event->threadCounter == NULL)
+            {
+                ERROR_PRINT(Cannot allocate counter for all threads in group %d,groupSet->numberOfActiveGroups);
+                bstrListDestroy(subtokens); /* continue skips the destroy at the end of the loop body */
+                continue;
+            }
+            for(j=0;j<groupSet->numberOfThreads;j++)
+            {
+                event->threadCounter[j].counterData = 0;
+                event->threadCounter[j].startData = 0;
+                event->threadCounter[j].fullResult = 0.0;
+                event->threadCounter[j].lastResult = 0.0;
+                event->threadCounter[j].overflows = 0;
+                event->threadCounter[j].init = FALSE;
+            }
+            eventSet->numberOfEvents++;
+
+            if (event->type != NOTYPE)
+            {
+                DEBUG_PRINT(DEBUGLEV_INFO,
+                        Added event %s for counter %s to group %d,
+                        event->event.name,
+                        counter_map[event->index].key,
+                        groupSet->numberOfActiveGroups);
+            }
+        }
+        bstrListDestroy(subtokens);
+    }
+    bstrListDestroy(eventtokens);
+    if ((eventSet->numberOfEvents > 0) && (eventSet->regTypeMask != 0x0ULL))
+    {
+        eventSet->state = STATE_NONE;
+        groupSet->numberOfActiveGroups++;
+        return groupSet->numberOfActiveGroups-1;
+    }
+    else
+    {
+        fprintf(stderr,"No event in given event string can be configured\n");
+        return -EINVAL;
+    }
+}
+
+/* Pass the group description of groupID to return_group(); IDs outside the group list are ignored. */
+void perfmon_delEventSet(int groupID)
+{
+    if ((groupID < groupSet->numberOfGroups) && (groupID >= 0))
+    {
+        return_group(&groupSet->groups[groupID].group);
+    }
+}
+
+/* Run the architecture-specific counter setup for one thread and mark groupId as active.
+ * Returns 0 on success and -ENOENT when groupId does not name an existing group. */
+int
+__perfmon_setupCountersThread(int thread_id, int groupId)
+{
+    /* Negative IDs are rejected too, otherwise groups[groupId] is indexed out of bounds */
+    if ((groupId < 0) || (groupId >= groupSet->numberOfActiveGroups))
+    {
+        ERROR_PRINT(Group %d does not exist in groupSet, groupId);
+        return -ENOENT;
+    }
+    CHECK_AND_RETURN_ERROR(perfmon_setupCountersThread(thread_id, &groupSet->groups[groupId]),
+            Setup of counters failed);
+    groupSet->activeGroup = groupId;
+    return 0;
+}
+
+/* Set up the counters of group groupId on all threads; 0 on success, negative error code otherwise. */
+int perfmon_setupCounters(int groupId)
+{
+    int i;
+    int ret = 0;
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    if (unlikely(groupSet == NULL))
+    {
+        return -EINVAL;
+    }
+    if ((groupId < 0) || (groupId >= groupSet->numberOfActiveGroups))
+    {
+        ERROR_PRINT(Group %d does not exist in groupSet, groupId);
+        return -ENOENT;
+    }
+    /* Stop at the first thread whose setup fails and hand its error code to the caller */
+    for(i=0;i<groupSet->numberOfThreads;i++)
+    {
+        ret = __perfmon_setupCountersThread(groupSet->threads[i].thread_id, groupId);
+        if (ret != 0)
+        {
+            return ret;
+        }
+    }
+    groupSet->groups[groupId].state = STATE_SETUP;
+    return 0;
+}
+
+/* Start the counters of group groupId on every thread; the group must be in STATE_SETUP.
+ * Returns 0, -EINVAL for a wrong state, or -(thread_id+1) of the first thread that failed. */
+int __perfmon_startCounters(int groupId)
+{
+    int thr;
+    PerfmonEventSet* evset = &groupSet->groups[groupId];
+    if (evset->state != STATE_SETUP)
+    {
+        return -EINVAL;
+    }
+    for (thr = 0; thr < groupSet->numberOfThreads; thr++)
+    {
+        if (perfmon_startCountersThread(groupSet->threads[thr].thread_id, evset) != 0)
+        {
+            return -groupSet->threads[thr].thread_id-1;
+        }
+    }
+    evset->state = STATE_START;
+    timer_start(&evset->timer);
+    return 0;
+}
+
+/* Start counting on the currently active group.
+ * Returns -EINVAL when the module is not initialized or no group is active,
+ * otherwise the result of __perfmon_startCounters(). */
+int perfmon_startCounters(void)
+{
+    int active;
+    if ((perfmon_initialized != 1) || unlikely(groupSet == NULL))
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    active = groupSet->activeGroup;
+    if (active < 0)
+    {
+        ERROR_PLAIN_PRINT(Cannot find group to start);
+        return -EINVAL;
+    }
+    return __perfmon_startCounters(active);
+}
+
+/* Start counting on group groupId. A valid groupId is used as given, an out-of-range one
+ * selects the active group; -EINVAL if not initialized or no group can be selected. */
+int perfmon_startGroupCounters(int groupId)
+{
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    if (unlikely(groupSet == NULL))
+    {
+        return -EINVAL;
+    }
+    if ((groupId < 0) || (groupId >= groupSet->numberOfActiveGroups))
+        groupId = groupSet->activeGroup;
+    if (groupId < 0)
+    {
+        ERROR_PLAIN_PRINT(Cannot find group to start);
+        return -EINVAL;
+    }
+    return __perfmon_startCounters(groupId);
+}
+
+/* Stop the counters of group groupId on every thread and fold the new readings into the results.
+ * Returns 0 on success or -(thread_id+1) of the first thread whose stop call failed. */
+int __perfmon_stopCounters(int groupId)
+{
+    int evt, thr;
+    double delta;
+    PerfmonEventSet* evset = &groupSet->groups[groupId];
+    PerfmonCounter* slot;
+
+    /* The timer is stopped before any counter is touched */
+    timer_stop(&evset->timer);
+    for (thr = 0; thr < groupSet->numberOfThreads; thr++)
+    {
+        if (perfmon_stopCountersThread(groupSet->threads[thr].thread_id, evset) != 0)
+        {
+            return -groupSet->threads[thr].thread_id-1;
+        }
+    }
+    /* lastResult is overwritten with the new value, fullResult adds it up */
+    for (evt = 0; evt < perfmon_getNumberOfEvents(groupId); evt++)
+    {
+        for (thr = 0; thr < perfmon_getNumberOfThreads(); thr++)
+        {
+            delta = calculateResult(groupId, evt, thr);
+            slot = &evset->events[evt].threadCounter[thr];
+            slot->lastResult = delta;
+            slot->fullResult += delta;
+        }
+    }
+    evset->state = STATE_SETUP;
+    evset->rdtscTime = timer_print(&evset->timer);
+    evset->runTime += evset->rdtscTime;
+    return 0;
+}
+
+/* Stop counting on the active group, which must currently be in STATE_START.
+ * Returns -EINVAL for any failed precondition, else the result of __perfmon_stopCounters(). */
+int perfmon_stopCounters(void)
+{
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    if (unlikely(groupSet == NULL))
+        return -EINVAL;
+    if (groupSet->activeGroup < 0)
+    {
+        ERROR_PLAIN_PRINT(Cannot find group to stop);
+        return -EINVAL;
+    }
+    if (groupSet->groups[groupSet->activeGroup].state != STATE_START)
+    {
+        return -EINVAL;
+    }
+    return __perfmon_stopCounters(groupSet->activeGroup);
+}
+
+/* Stop counting on group groupId. A valid groupId is used as given, an out-of-range one
+ * selects the active group; -EINVAL if nothing can be selected or the group is not started. */
+int perfmon_stopGroupCounters(int groupId)
+{
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    if (unlikely(groupSet == NULL))
+    {
+        return -EINVAL;
+    }
+    if ((groupId < 0) || (groupId >= groupSet->numberOfActiveGroups))
+        groupId = groupSet->activeGroup;
+    if (groupId < 0)
+    {
+        ERROR_PLAIN_PRINT(Cannot find group to stop);
+        return -EINVAL;
+    }
+    if (groupSet->groups[groupId].state != STATE_START)
+    {
+        return -EINVAL;
+    }
+    return __perfmon_stopCounters(groupId);
+}
+
+/* Read the counters of a started group and update the per-event results of each thread read.
+ * An out-of-range groupId selects the active group; threadId -1 reads all threads.
+ * Returns 0, -EINVAL on a bad group or state, or -(threadId+1) for a failed read. */
+int __perfmon_readCounters(int groupId, int threadId)
+{
+    int j = 0;
+    int firstThread = 0, endThread = 0; /* empty range: an unknown threadId reads nothing */
+    double result = 0.0;
+    PerfmonEventSet* eventSet = NULL;
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    if (((groupId < 0) || (groupId >= groupSet->numberOfActiveGroups)) && (groupSet->activeGroup >= 0))
+    {
+        groupId = groupSet->activeGroup;
+    }
+    if ((groupId < 0) || (groupId >= groupSet->numberOfActiveGroups))
+    {
+        return -EINVAL; /* no active group to fall back on; never index groups[] with a bad ID */
+    }
+    eventSet = &groupSet->groups[groupId];
+    if (eventSet->state != STATE_START)
+    {
+        return -EINVAL;
+    }
+    timer_stop(&eventSet->timer);
+    eventSet->rdtscTime = timer_print(&eventSet->timer);
+    eventSet->runTime += eventSet->rdtscTime;
+    if (threadId == -1)
+    {
+        endThread = groupSet->numberOfThreads;
+    }
+    else if ((threadId >= 0) && (threadId < groupSet->numberOfThreads))
+    {
+        firstThread = threadId;
+        endThread = threadId + 1;
+    }
+    for (threadId = firstThread; threadId < endThread; threadId++)
+    {
+        if (perfmon_readCountersThread(threadId, eventSet))
+        {
+            return -threadId-1;
+        }
+        for (j=0; j < eventSet->numberOfEvents; j++)
+        {
+            /* Keep the fresh value in result so both fields below are updated from it */
+            result = (double)calculateResult(groupId, j, threadId);
+            eventSet->events[j].threadCounter[threadId].lastResult = result;
+            eventSet->events[j].threadCounter[threadId].fullResult += result;
+        }
+    }
+    timer_start(&eventSet->timer);
+    return 0;
+}
+
+/*
+ * Read the counters with both selectors of __perfmon_readCounters() set to
+ * -1 and hand back its return value.
+ */
+int perfmon_readCounters(void)
+{
+    const int groupSelector = -1;
+    const int threadSelector = -1;
+    return __perfmon_readCounters(groupSelector, threadSelector);
+}
+
+/*
+ * Read the counters of the active group for the thread registered for a CPU.
+ *
+ * cpu_id: processor ID that is searched in groupSet->threads.
+ *
+ * Returns the result of __perfmon_readCounters(), -EINVAL if the module is
+ * not initialized, or -thread_id if no matching thread was found.
+ */
+int perfmon_readCountersCpu(int cpu_id)
+{
+    int i;
+    int thread_id = -1;
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    if (unlikely(groupSet == NULL))
+    {
+        return -EINVAL;
+    }
+    for (i = 0; i < groupSet->numberOfThreads; i++)
+    {
+        if (groupSet->threads[i].processorId == cpu_id)
+        {
+            thread_id = groupSet->threads[i].thread_id;
+            break;
+        }
+    }
+    if (thread_id < 0)
+    {
+        ERROR_PRINT(Failed to read counters for CPU %d, cpu_id);
+        /* NOTE(review): thread_id is negative here, so the returned value
+         * is positive (1 when no thread matched). Kept for compatibility;
+         * confirm whether callers expect a negative error code. */
+        return -thread_id;
+    }
+    i = __perfmon_readCounters(groupSet->activeGroup, thread_id);
+    return i;
+}
+
+/*
+ * Read the counters of group groupId with the thread selector of
+ * __perfmon_readCounters() set to -1.
+ */
+int perfmon_readGroupCounters(int groupId)
+{
+    const int threadSelector = -1;
+    return __perfmon_readCounters(groupId, threadSelector);
+}
+/*
+ * Read the counters of group groupId for thread threadId. Both arguments
+ * are forwarded unchanged to __perfmon_readCounters().
+ */
+int perfmon_readGroupThreadCounters(int groupId, int threadId)
+{
+    int status = __perfmon_readCounters(groupId, threadId);
+    return status;
+}
+
+
+/*
+ * Get the result of one event for one thread.
+ *
+ * groupId:  group index; a negative value selects the active group.
+ * eventId:  index of the event inside the group.
+ * threadId: index of the thread.
+ *
+ * Returns the accumulated (full) result, or the last result while the full
+ * result is still 0. Returns 0 if the module is not initialized, no group
+ * exists or any of the indices is out of range.
+ */
+double
+perfmon_getResult(int groupId, int eventId, int threadId)
+{
+    if (unlikely(groupSet == NULL))
+    {
+        return 0;
+    }
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return 0;
+    }
+    if (groupSet->numberOfActiveGroups == 0)
+    {
+        return 0;
+    }
+    if ((groupId < 0) && (groupSet->activeGroup >= 0))
+    {
+        groupId = groupSet->activeGroup;
+    }
+    /* Reject everything that would index outside of the group list or
+     * below the start of the event and thread arrays. */
+    if ((groupId < 0) || (groupId >= groupSet->numberOfActiveGroups))
+    {
+        return 0;
+    }
+    if ((eventId < 0) || (threadId < 0))
+    {
+        return 0;
+    }
+    if (eventId >= groupSet->groups[groupId].numberOfEvents)
+    {
+        printf("ERROR: EventID greater than defined events\n");
+        return 0;
+    }
+    if (threadId >= groupSet->numberOfThreads)
+    {
+        printf("ERROR: ThreadID greater than defined threads\n");
+        return 0;
+    }
+
+    if (groupSet->groups[groupId].events[eventId].threadCounter[threadId].fullResult == 0)
+    {
+        return groupSet->groups[groupId].events[eventId].threadCounter[threadId].lastResult;
+    }
+    return groupSet->groups[groupId].events[eventId].threadCounter[threadId].fullResult;
+}
+
+/*
+ * Get the last result of one event for one thread.
+ *
+ * groupId:  group index; a negative value selects the active group.
+ * eventId:  index of the event inside the group.
+ * threadId: index of the thread.
+ *
+ * Returns the stored lastResult, or 0 if the module is not initialized, no
+ * group exists or any of the indices is out of range.
+ */
+double
+perfmon_getLastResult(int groupId, int eventId, int threadId)
+{
+    if (unlikely(groupSet == NULL))
+    {
+        return 0;
+    }
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return 0;
+    }
+    if (groupSet->numberOfActiveGroups == 0)
+    {
+        return 0;
+    }
+    if ((groupId < 0) && (groupSet->activeGroup >= 0))
+    {
+        groupId = groupSet->activeGroup;
+    }
+    /* Reject everything that would index outside of the group list or
+     * below the start of the event and thread arrays. */
+    if ((groupId < 0) || (groupId >= groupSet->numberOfActiveGroups))
+    {
+        return 0;
+    }
+    if ((eventId < 0) || (threadId < 0))
+    {
+        return 0;
+    }
+    if (eventId >= groupSet->groups[groupId].numberOfEvents)
+    {
+        printf("ERROR: EventID greater than defined events\n");
+        return 0;
+    }
+    if (threadId >= groupSet->numberOfThreads)
+    {
+        printf("ERROR: ThreadID greater than defined threads\n");
+        return 0;
+    }
+
+    return groupSet->groups[groupId].events[eventId].threadCounter[threadId].lastResult;
+}
+
+/*
+ * Evaluate one derived metric of a group from the results returned by
+ * perfmon_getResult().
+ *
+ * groupId:  group index; a negative value selects the active group.
+ * metricId: index of the metric formula inside the group.
+ * threadId: thread whose results are used.
+ *
+ * Besides the counter results the formula can use "time" (last measurement
+ * time of the group) and "inverseClock" (1/timer_getCycleClock()).
+ *
+ * Returns the metric value, or 0 if the module is not initialized, an index
+ * is out of range or the formula cannot be calculated.
+ */
+double
+perfmon_getMetric(int groupId, int metricId, int threadId)
+{
+    int e = 0;
+    double result = 0;
+    CounterList clist;
+    if (unlikely(groupSet == NULL))
+    {
+        return 0;
+    }
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return 0;
+    }
+    if (groupSet->numberOfActiveGroups == 0)
+    {
+        return 0;
+    }
+    if ((groupId < 0) && (groupSet->activeGroup >= 0))
+    {
+        groupId = groupSet->activeGroup;
+    }
+    if ((groupId < 0) || (groupId >= groupSet->numberOfActiveGroups))
+    {
+        return 0;
+    }
+    if (groupSet->groups[groupId].group.nmetrics == 0)
+    {
+        return 0.0;
+    }
+    if ((metricId < 0) || (metricId >= groupSet->groups[groupId].group.nmetrics))
+    {
+        return 0.0;
+    }
+    timer_init();
+    init_clist(&clist);
+    for (e = 0; e < groupSet->groups[groupId].numberOfEvents; e++)
+    {
+        add_to_clist(&clist, groupSet->groups[groupId].group.counters[e],
+                     perfmon_getResult(groupId, e, threadId));
+    }
+    add_to_clist(&clist, "time", perfmon_getLastTimeOfGroup(groupId));
+    add_to_clist(&clist, "inverseClock", 1.0/timer_getCycleClock());
+    e = calc_metric(groupSet->groups[groupId].group.metricformulas[metricId], &clist, &result);
+    if (e < 0)
+    {
+        result = 0.0;
+        ERROR_PRINT(Cannot calculate formula %s, groupSet->groups[groupId].group.metricformulas[metricId]);
+    }
+    destroy_clist(&clist);
+    return result;
+}
+
+/*
+ * Evaluate one derived metric of a group from the results returned by
+ * perfmon_getLastResult().
+ *
+ * groupId:  group index; a negative value selects the active group.
+ * metricId: index of the metric formula inside the group.
+ * threadId: thread whose results are used.
+ *
+ * Besides the counter results the formula can use "time" (last measurement
+ * time of the group) and "inverseClock" (1/timer_getCycleClock()).
+ *
+ * Returns the metric value, or 0 if the module is not initialized, an index
+ * is out of range or the formula cannot be calculated.
+ */
+double
+perfmon_getLastMetric(int groupId, int metricId, int threadId)
+{
+    int e = 0;
+    double result = 0;
+    CounterList clist;
+    if (unlikely(groupSet == NULL))
+    {
+        return 0;
+    }
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return 0;
+    }
+    if (groupSet->numberOfActiveGroups == 0)
+    {
+        return 0;
+    }
+    if ((groupId < 0) && (groupSet->activeGroup >= 0))
+    {
+        groupId = groupSet->activeGroup;
+    }
+    /* A group index that is still invalid must not be used as array index. */
+    if ((groupId < 0) || (groupId >= groupSet->numberOfActiveGroups))
+    {
+        return 0;
+    }
+    if (groupSet->groups[groupId].group.nmetrics == 0)
+    {
+        return 0.0;
+    }
+    if ((metricId < 0) || (metricId >= groupSet->groups[groupId].group.nmetrics))
+    {
+        return 0.0;
+    }
+    timer_init();
+    init_clist(&clist);
+    for (e = 0; e < groupSet->groups[groupId].numberOfEvents; e++)
+    {
+        add_to_clist(&clist, groupSet->groups[groupId].group.counters[e],
+                     perfmon_getLastResult(groupId, e, threadId));
+    }
+    add_to_clist(&clist, "time", perfmon_getLastTimeOfGroup(groupId));
+    add_to_clist(&clist, "inverseClock", 1.0/timer_getCycleClock());
+    e = calc_metric(groupSet->groups[groupId].group.metricformulas[metricId], &clist, &result);
+    if (e < 0)
+    {
+        result = 0.0;
+        ERROR_PRINT(Cannot calculate formula %s, groupSet->groups[groupId].group.metricformulas[metricId]);
+    }
+    destroy_clist(&clist);
+    return result;
+}
+
+
+/*
+ * Switch from the currently active group to new_group on behalf of one
+ * thread.
+ *
+ * thread_id: thread whose per-event init flags are cleared when the old
+ *            group is only set up.
+ * new_group: group handed to perfmon_setupCounters().
+ *
+ * Returns 0 on success, -EINVAL if the module is not initialized, or the
+ * error code of perfmon_setupCounters()/perfmon_startCounters().
+ */
+int __perfmon_switchActiveGroupThread(int thread_id, int new_group)
+{
+    int ret = 0;
+    int i = 0;
+    GroupState state;
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+
+    /* Account the elapsed time to the group that is active so far. */
+    timer_stop(&groupSet->groups[groupSet->activeGroup].timer);
+    groupSet->groups[groupSet->activeGroup].rdtscTime =
+                timer_print(&groupSet->groups[groupSet->activeGroup].timer);
+    groupSet->groups[groupSet->activeGroup].runTime += groupSet->groups[groupSet->activeGroup].rdtscTime;
+    /* Remember the state before it is changed by the calls below. */
+    state = groupSet->groups[groupSet->activeGroup].state;
+
+    if (state == STATE_START)
+    {
+        /* NOTE(review): this return value is overwritten by the
+         * perfmon_setupCounters() call below without being checked. */
+        ret = perfmon_stopCounters();
+    }
+
+    if (state == STATE_SETUP)
+    {
+        for(i=0; i<groupSet->groups[groupSet->activeGroup].numberOfEvents;i++)
+        {
+            groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].init = FALSE;
+        }
+    }
+    ret = perfmon_setupCounters(new_group);
+    if (ret != 0)
+    {
+        return ret;
+    }
+    /* NOTE(review): presumably perfmon_setupCounters() made new_group the
+     * active group, so this checks the new group's state - confirm. */
+    if (groupSet->groups[groupSet->activeGroup].state == STATE_SETUP)
+    {
+        ret = perfmon_startCounters();
+        if (ret != 0)
+        {
+            return ret;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Switch all registered threads to the group new_group.
+ *
+ * Calls __perfmon_switchActiveGroupThread() for every entry in
+ * groupSet->threads and stops at the first failure.
+ *
+ * Returns 0 on success, -EINVAL if the module is not initialized, or the
+ * first non-zero return value of __perfmon_switchActiveGroupThread().
+ */
+int
+perfmon_switchActiveGroup(int new_group)
+{
+    int i = 0;
+    int ret = 0;
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    /* The loop below dereferences groupSet, so it must exist. */
+    if (unlikely(groupSet == NULL))
+    {
+        return -EINVAL;
+    }
+    for (i = 0; i < groupSet->numberOfThreads; i++)
+    {
+        ret = __perfmon_switchActiveGroupThread(groupSet->threads[i].thread_id, new_group);
+        if (ret != 0)
+        {
+            return ret;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Return groupSet->numberOfActiveGroups, or -EINVAL if the module is not
+ * initialized.
+ */
+int
+perfmon_getNumberOfGroups(void)
+{
+    int ngroups = -EINVAL;
+    if (perfmon_initialized == 1)
+    {
+        ngroups = groupSet->numberOfActiveGroups;
+    }
+    else
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+    }
+    return ngroups;
+}
+
+/*
+ * Return groupSet->activeGroup, or -EINVAL if the module is not
+ * initialized.
+ */
+int
+perfmon_getIdOfActiveGroup(void)
+{
+    int active = -EINVAL;
+    if (perfmon_initialized == 1)
+    {
+        active = groupSet->activeGroup;
+    }
+    else
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+    }
+    return active;
+}
+
+/*
+ * Return groupSet->numberOfThreads, or -EINVAL if the module is not
+ * initialized.
+ */
+int
+perfmon_getNumberOfThreads(void)
+{
+    int nthreads = -EINVAL;
+    if (perfmon_initialized == 1)
+    {
+        nthreads = groupSet->numberOfThreads;
+    }
+    else
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+    }
+    return nthreads;
+}
+
+/*
+ * Return the number of events of a group; a negative groupId selects the
+ * active group. Returns -EINVAL if the module is not initialized.
+ */
+int
+perfmon_getNumberOfEvents(int groupId)
+{
+    int gid = groupId;
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    gid = (gid < 0) ? groupSet->activeGroup : gid;
+    return groupSet->groups[gid].numberOfEvents;
+}
+
+/*
+ * Return the accumulated run time (runTime) of a group; a negative groupId
+ * selects the active group. Returns -EINVAL if the module is not
+ * initialized.
+ */
+double
+perfmon_getTimeOfGroup(int groupId)
+{
+    int gid = groupId;
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    gid = (gid < 0) ? groupSet->activeGroup : gid;
+    return groupSet->groups[gid].runTime;
+}
+
+/*
+ * Return the time of the last measurement interval (rdtscTime) of a group;
+ * a negative groupId selects the active group. Returns -EINVAL if the
+ * module is not initialized.
+ */
+double
+perfmon_getLastTimeOfGroup(int groupId)
+{
+    int gid = groupId;
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    gid = (gid < 0) ? groupSet->activeGroup : gid;
+    return groupSet->groups[gid].rdtscTime;
+}
+
+/*
+ * Build a mask with the lowest <width> bits set for a register type.
+ * The width is box_map[type].regWidth if box_map exists and the entry is
+ * greater than zero, otherwise 48.
+ */
+uint64_t
+perfmon_getMaxCounterValue(RegisterType type)
+{
+    int bits = 48;
+    int pos = 0;
+    uint64_t mask = 0x0ULL;
+    if (box_map && (box_map[type].regWidth > 0))
+    {
+        bits = box_map[type].regWidth;
+    }
+    /* Set one bit after the other, starting at bit 0. */
+    while (pos < bits)
+    {
+        mask |= (1ULL << pos);
+        pos++;
+    }
+    return mask;
+}
+
+/*
+ * Return the event name stored at index eventId of a group.
+ *
+ * groupId: group index; a negative value selects the active group.
+ * eventId: index into the group's event name list, valid range is
+ *          [0, nevents).
+ *
+ * Returns NULL if the module is not initialized, no group exists or an
+ * index is out of range.
+ */
+char* perfmon_getEventName(int groupId, int eventId)
+{
+    if (unlikely(groupSet == NULL))
+    {
+        return NULL;
+    }
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return NULL;
+    }
+    if (groupSet->numberOfActiveGroups == 0)
+    {
+        return NULL;
+    }
+    if ((groupId < 0) && (groupSet->activeGroup >= 0))
+    {
+        groupId = groupSet->activeGroup;
+    }
+    if ((groupId < 0) || (groupId >= groupSet->numberOfActiveGroups))
+    {
+        return NULL;
+    }
+    /* eventId == nevents is already one past the last valid entry. */
+    if ((groupSet->groups[groupId].group.nevents == 0) ||
+        (eventId < 0) ||
+        (eventId >= groupSet->groups[groupId].group.nevents))
+    {
+        return NULL;
+    }
+    return groupSet->groups[groupId].group.events[eventId];
+}
+
+/*
+ * Return the counter name stored at index eventId of a group.
+ *
+ * groupId: group index; a negative value selects the active group.
+ * eventId: index into the group's counter name list, valid range is
+ *          [0, nevents).
+ *
+ * Returns NULL if the module is not initialized, no group exists or an
+ * index is out of range.
+ */
+char* perfmon_getCounterName(int groupId, int eventId)
+{
+    if (unlikely(groupSet == NULL))
+    {
+        return NULL;
+    }
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return NULL;
+    }
+    if (groupSet->numberOfActiveGroups == 0)
+    {
+        return NULL;
+    }
+    if ((groupId < 0) && (groupSet->activeGroup >= 0))
+    {
+        groupId = groupSet->activeGroup;
+    }
+    if ((groupId < 0) || (groupId >= groupSet->numberOfActiveGroups))
+    {
+        return NULL;
+    }
+    /* eventId == nevents is already one past the last valid entry. */
+    if ((groupSet->groups[groupId].group.nevents == 0) ||
+        (eventId < 0) ||
+        (eventId >= groupSet->groups[groupId].group.nevents))
+    {
+        return NULL;
+    }
+    return groupSet->groups[groupId].group.counters[eventId];
+}
+
+/*
+ * Return the name of metric metricId of a group.
+ *
+ * groupId:  group index; a negative value selects the active group.
+ * metricId: index into the group's metric name list, valid range is
+ *           [0, nmetrics).
+ *
+ * Returns NULL if the module is not initialized, no group exists, the group
+ * has no metrics or an index is out of range.
+ */
+char* perfmon_getMetricName(int groupId, int metricId)
+{
+    if (unlikely(groupSet == NULL))
+    {
+        return NULL;
+    }
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return NULL;
+    }
+    if (groupSet->numberOfActiveGroups == 0)
+    {
+        return NULL;
+    }
+    if ((groupId < 0) && (groupSet->activeGroup >= 0))
+    {
+        groupId = groupSet->activeGroup;
+    }
+    if ((groupId < 0) || (groupId >= groupSet->numberOfActiveGroups))
+    {
+        return NULL;
+    }
+    if (groupSet->groups[groupId].group.nmetrics == 0)
+    {
+        return NULL;
+    }
+    /* The metric index was used unchecked before. */
+    if ((metricId < 0) || (metricId >= groupSet->groups[groupId].group.nmetrics))
+    {
+        return NULL;
+    }
+    return groupSet->groups[groupId].group.metricnames[metricId];
+}
+
+/*
+ * Return the name (group.groupname) of a group; a negative groupId selects
+ * the active group if there is one. Returns NULL if groupSet is missing,
+ * the module is not initialized or no group exists.
+ */
+char* perfmon_getGroupName(int groupId)
+{
+    int gid = groupId;
+    if (unlikely(groupSet == NULL))
+    {
+        return NULL;
+    }
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return NULL;
+    }
+    if (groupSet->numberOfActiveGroups != 0)
+    {
+        if ((gid < 0) && (groupSet->activeGroup >= 0))
+        {
+            gid = groupSet->activeGroup;
+        }
+        return groupSet->groups[gid].group.groupname;
+    }
+    return NULL;
+}
+
+/*
+ * Return the short description (group.shortinfo) of a group; a negative
+ * groupId selects the active group if there is one. Returns NULL if
+ * groupSet is missing, the module is not initialized or no group exists.
+ */
+char* perfmon_getGroupInfoShort(int groupId)
+{
+    int gid = groupId;
+    if (unlikely(groupSet == NULL))
+    {
+        return NULL;
+    }
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return NULL;
+    }
+    if (groupSet->numberOfActiveGroups != 0)
+    {
+        if ((gid < 0) && (groupSet->activeGroup >= 0))
+        {
+            gid = groupSet->activeGroup;
+        }
+        return groupSet->groups[gid].group.shortinfo;
+    }
+    return NULL;
+}
+
+/*
+ * Return the long description (group.longinfo) of a group; a negative
+ * groupId selects the active group if there is one. Returns NULL if
+ * groupSet is missing, the module is not initialized or no group exists.
+ */
+char* perfmon_getGroupInfoLong(int groupId)
+{
+    int gid = groupId;
+    if (unlikely(groupSet == NULL))
+    {
+        return NULL;
+    }
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return NULL;
+    }
+    if (groupSet->numberOfActiveGroups != 0)
+    {
+        if ((gid < 0) && (groupSet->activeGroup >= 0))
+        {
+            gid = groupSet->activeGroup;
+        }
+        return groupSet->groups[gid].group.longinfo;
+    }
+    return NULL;
+}
+
+/*
+ * Initialize the configuration and pass its group path together with
+ * cpuid_info.short_name and the three output pointers to get_groups().
+ * Returns the value returned by get_groups().
+ */
+int perfmon_getGroups(char*** groups, char*** shortinfos, char*** longinfos)
+{
+    Configuration_t cfg;
+    init_configuration();
+    cfg = get_configuration();
+    return get_groups(cfg->groupPath, cpuid_info.short_name, groups, shortinfos, longinfos);
+}
+
+/*
+ * Hand the group lists and their count unchanged to return_groups().
+ */
+void perfmon_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos)
+{
+    return_groups(nrgroups,
+                  groups,
+                  shortinfos,
+                  longinfos);
+}
+
+/*
+ * Return the number of metrics (group.nmetrics) of a group; a negative
+ * groupId selects the active group. Returns -EINVAL if the module is not
+ * initialized.
+ */
+int perfmon_getNumberOfMetrics(int groupId)
+{
+    int gid = groupId;
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    gid = (gid < 0) ? groupSet->activeGroup : gid;
+    return groupSet->groups[gid].group.nmetrics;
+}
+
+/*
+ * Print all stored marker results to stdout: per region its tag and group,
+ * per thread its CPU, measurement time and call count, followed by one
+ * line per event value.
+ */
+void perfmon_printMarkerResults()
+{
+    int reg = 0, thr = 0, ev = 0;
+    for (reg = 0; reg < markerRegions; reg++)
+    {
+        LikwidResults* res = &markerResults[reg];
+        printf("Region %d : %s\n", reg, bdata(res->tag));
+        printf("Group %d\n", res->groupID);
+        for (thr = 0; thr < res->threadCount; thr++)
+        {
+            printf("Thread %d on CPU %d\n", thr, res->cpulist[thr]);
+            printf("\t Measurement time %f sec\n", res->time[thr]);
+            printf("\t Call count %d\n", res->count[thr]);
+            for (ev = 0; ev < res->eventCount; ev++)
+            {
+                printf("\t Event %d : %f\n", ev, res->counters[thr][ev]);
+            }
+        }
+    }
+}
+
+/*
+ * Return the number of stored marker regions, 0 if no marker results are
+ * loaded, or -EINVAL if the module is not initialized.
+ */
+int perfmon_getNumberOfRegions()
+{
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    return (markerResults != NULL) ? markerRegions : 0;
+}
+
+
+/*
+ * Return the group ID stored for a marker region. Returns -EINVAL if the
+ * module is not initialized or region is outside of [0, markerRegions),
+ * and 0 if no marker results are loaded.
+ */
+int perfmon_getGroupOfRegion(int region)
+{
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    if ((region >= 0) && (region < markerRegions))
+    {
+        if (markerResults != NULL)
+        {
+            return markerResults[region].groupID;
+        }
+        return 0;
+    }
+    return -EINVAL;
+}
+
+/*
+ * Return the tag string of a marker region (bdata() of the stored tag).
+ * Returns NULL if the module is not initialized, region is outside of
+ * [0, markerRegions) or no marker results are loaded.
+ */
+char* perfmon_getTagOfRegion(int region)
+{
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return NULL;
+    }
+    if ((region >= 0) && (region < markerRegions) && (markerResults != NULL))
+    {
+        return bdata(markerResults[region].tag);
+    }
+    return NULL;
+}
+
+
+/*
+ * Return the event count stored for a marker region. Returns -EINVAL if
+ * the module is not initialized or region is outside of
+ * [0, markerRegions), and 0 if no marker results are loaded.
+ */
+int perfmon_getEventsOfRegion(int region)
+{
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    if ((region >= 0) && (region < markerRegions))
+    {
+        if (markerResults != NULL)
+        {
+            return markerResults[region].eventCount;
+        }
+        return 0;
+    }
+    return -EINVAL;
+}
+
+/*
+ * Return the number of metrics of the group a marker region belongs to, as
+ * reported by perfmon_getNumberOfMetrics(). Returns -EINVAL if region is
+ * outside of [0, markerRegions) and 0 if no marker results are loaded.
+ */
+int perfmon_getMetricsOfRegion(int region)
+{
+    if ((region >= 0) && (region < markerRegions))
+    {
+        if (markerResults != NULL)
+        {
+            return perfmon_getNumberOfMetrics(markerResults[region].groupID);
+        }
+        return 0;
+    }
+    return -EINVAL;
+}
+
+
+/*
+ * Return the thread count stored for a marker region. Returns -EINVAL if
+ * the module is not initialized or region is outside of
+ * [0, markerRegions), and 0 if no marker results are loaded.
+ */
+int perfmon_getThreadsOfRegion(int region)
+{
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    if ((region >= 0) && (region < markerRegions))
+    {
+        if (markerResults != NULL)
+        {
+            return markerResults[region].threadCount;
+        }
+        return 0;
+    }
+    return -EINVAL;
+}
+
+/*
+ * Copy the CPU list of a marker region into cpulist.
+ *
+ * region:  region index in [0, markerRegions).
+ * count:   number of entries the caller's cpulist can take.
+ * cpulist: destination array.
+ *
+ * Returns MIN(count, thread count of the region), 0 if no marker results
+ * are loaded, or -EINVAL if the module is not initialized, region is out
+ * of range or cpulist is NULL.
+ */
+int perfmon_getCpulistOfRegion(int region, int count, int* cpulist)
+{
+    int idx = 0;
+    int ncopy = 0;
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    if ((region < 0) || (region >= markerRegions))
+    {
+        return -EINVAL;
+    }
+    if (markerResults == NULL)
+    {
+        return 0;
+    }
+    if (cpulist == NULL)
+    {
+        return -EINVAL;
+    }
+    ncopy = MIN(count, markerResults[region].threadCount);
+    while (idx < ncopy)
+    {
+        cpulist[idx] = markerResults[region].cpulist[idx];
+        idx++;
+    }
+    return ncopy;
+}
+
+
+/*
+ * Return the measurement time stored for one thread of a marker region.
+ * Returns -EINVAL if the module is not initialized, region is outside of
+ * [0, markerRegions) or thread is outside of [0, numberOfThreads), and
+ * 0.0 if no marker results or no time array are available.
+ */
+double perfmon_getTimeOfRegion(int region, int thread)
+{
+    int regionOk = 0;
+    int threadOk = 0;
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    regionOk = ((region >= 0) && (region < markerRegions));
+    if (!regionOk)
+    {
+        return -EINVAL;
+    }
+    threadOk = ((thread >= 0) && (thread < groupSet->numberOfThreads));
+    if (!threadOk)
+    {
+        return -EINVAL;
+    }
+    if ((markerResults != NULL) && (markerResults[region].time != NULL))
+    {
+        return markerResults[region].time[thread];
+    }
+    return 0.0;
+}
+
+/*
+ * Return the call count stored for one thread of a marker region.
+ * Returns -EINVAL if the module is not initialized, region is outside of
+ * [0, markerRegions) or thread is outside of [0, numberOfThreads), and 0
+ * if no marker results or no count array are available.
+ */
+int perfmon_getCountOfRegion(int region, int thread)
+{
+    int regionOk = 0;
+    int threadOk = 0;
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    regionOk = ((region >= 0) && (region < markerRegions));
+    if (!regionOk)
+    {
+        return -EINVAL;
+    }
+    threadOk = ((thread >= 0) && (thread < groupSet->numberOfThreads));
+    if (!threadOk)
+    {
+        return -EINVAL;
+    }
+    if ((markerResults != NULL) && (markerResults[region].count != NULL))
+    {
+        return markerResults[region].count[thread];
+    }
+    return 0;
+}
+
+/*
+ * Return the value of one event for one thread of a marker region.
+ * Returns -EINVAL if the module is not initialized or region, thread or
+ * event is out of range, and 0 if no marker results or no counter values
+ * for the thread are available.
+ */
+double perfmon_getResultOfRegionThread(int region, int event, int thread)
+{
+    double* values = NULL;
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    if ((region < 0) || (region >= markerRegions))
+    {
+        return -EINVAL;
+    }
+    if (markerResults == NULL)
+    {
+        return 0;
+    }
+    if ((thread < 0) || (thread >= markerResults[region].threadCount) ||
+        (event < 0) || (event >= markerResults[region].eventCount))
+    {
+        return -EINVAL;
+    }
+    values = markerResults[region].counters[thread];
+    return (values != NULL) ? values[event] : 0.0;
+}
+
+/*
+ * Evaluate one metric formula of the region's group from the marker
+ * results of one thread.
+ *
+ * The counter list holds the region results of all events plus "time"
+ * (region time of the thread) and "inverseClock"
+ * (1/timer_getCycleClock()).
+ *
+ * Returns the metric value, -EINVAL if the module is not initialized or an
+ * index is out of range, 0.0 if no marker results are loaded and 0 if a
+ * counter cannot be added to the list.
+ */
+double
+perfmon_getMetricOfRegionThread(int region, int metricId, int threadId)
+{
+    int idx = 0, status = 0;
+    int gid = 0;
+    double value = 0.0;
+    CounterList counters;
+    if (perfmon_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
+        return -EINVAL;
+    }
+    if ((region < 0) || (region >= markerRegions))
+    {
+        return -EINVAL;
+    }
+    if (markerResults == NULL)
+    {
+        return 0.0;
+    }
+    if ((threadId < 0) || (threadId >= markerResults[region].threadCount))
+    {
+        return -EINVAL;
+    }
+    gid = markerResults[region].groupID;
+    if ((metricId < 0) || (metricId >= groupSet->groups[gid].group.nmetrics))
+    {
+        return -EINVAL;
+    }
+    timer_init();
+    init_clist(&counters);
+    for (idx = 0; idx < markerResults[region].eventCount; idx++)
+    {
+        status = add_to_clist(&counters,
+                     groupSet->groups[gid].group.counters[idx],
+                     perfmon_getResultOfRegionThread(region, idx, threadId));
+        if (status)
+        {
+            printf("Cannot add counter %s to counter list for metric calculation\n",
+                    counter_map[groupSet->groups[gid].events[idx].index].key);
+            destroy_clist(&counters);
+            return 0;
+        }
+    }
+    add_to_clist(&counters, "time", perfmon_getTimeOfRegion(region, threadId));
+    add_to_clist(&counters, "inverseClock", 1.0/timer_getCycleClock());
+    status = calc_metric(groupSet->groups[gid].group.metricformulas[metricId], &counters, &value);
+    if (status < 0)
+    {
+        ERROR_PRINT(Cannot calculate formula %s, groupSet->groups[gid].group.metricformulas[metricId]);
+    }
+    destroy_clist(&counters);
+    return value;
+}
+
+/*
+ * Read a marker result file into markerResults.
+ *
+ * File layout as parsed here:
+ *   - first line:   "<cpus> <regions> <groups>"
+ *   - tag lines:    "<regionid>:<tag>-<groupid>" (recognized by the ':')
+ *   - result lines: "<regionid> <groupid> <cpu> <count> <time> <nevents>"
+ *                   followed by the event values separated by blanks
+ *
+ * Side effects: markerResults is (re)allocated for <regions> entries,
+ * markerRegions is set to <regions> and groupSet->numberOfThreads to
+ * <cpus>. Inner arrays of previously loaded results are not released
+ * here.
+ *
+ * Returns 0 on success, -EINVAL if filename is NULL, groupSet is missing,
+ * the file cannot be read or its first line is invalid, and -ENOMEM if an
+ * allocation fails. After -ENOMEM the partially filled results are left in
+ * a state that perfmon_destroyMarkerResults() can release.
+ */
+int perfmon_readMarkerFile(const char* filename)
+{
+    FILE* fp = NULL;
+    int r = 0;
+    int ret = 0;
+    char buf[2048];
+    char *ptr = NULL;
+    int cpus = 0, groups = 0, regions = 0;
+    int* regionCPUs = NULL;
+    LikwidResults* newResults = NULL;
+
+    if (filename == NULL)
+    {
+        return -EINVAL;
+    }
+    if (unlikely(groupSet == NULL))
+    {
+        return -EINVAL;
+    }
+    if (access(filename, R_OK))
+    {
+        return -EINVAL;
+    }
+    fp = fopen(filename, "r");
+    if (fp == NULL)
+    {
+        fprintf(stderr, "Error opening file %s\n", filename);
+        return -EINVAL;
+    }
+    buf[0] = '\0';
+    /* All sizes below derive from the first line, so it must be complete
+     * and positive. */
+    if ((fgets(buf, sizeof(buf), fp) == NULL) ||
+        (sscanf(buf, "%d %d %d", &cpus, &regions, &groups) != 3) ||
+        (cpus <= 0) || (regions <= 0))
+    {
+        fclose(fp);
+        return -EINVAL;
+    }
+    /* Keep the old pointer until realloc() succeeded. */
+    newResults = realloc(markerResults, regions * sizeof(LikwidResults));
+    if (newResults == NULL)
+    {
+        fprintf(stderr, "Failed to allocate %lu bytes for the marker results storage\n", (unsigned long)(regions * sizeof(LikwidResults)));
+        fclose(fp);
+        return -ENOMEM;
+    }
+    markerResults = newResults;
+    /* Zeroed entries (NULL pointers, thread count 0) can always be freed. */
+    memset(markerResults, 0, regions * sizeof(LikwidResults));
+    markerRegions = regions;
+    groupSet->numberOfThreads = cpus;
+    regionCPUs = (int*)calloc(regions, sizeof(int));
+    if (regionCPUs == NULL)
+    {
+        fprintf(stderr, "Failed to allocate %lu bytes for temporal cpu count storage\n", (unsigned long)(regions * sizeof(int)));
+        fclose(fp);
+        return -ENOMEM;
+    }
+    for (r = 0; r < regions; r++)
+    {
+        markerResults[r].time = (double*) malloc(cpus * sizeof(double));
+        if (!markerResults[r].time)
+        {
+            fprintf(stderr, "Failed to allocate %lu bytes for the time storage\n", (unsigned long)(cpus * sizeof(double)));
+            ret = -ENOMEM;
+            break;
+        }
+        markerResults[r].count = (uint32_t*) malloc(cpus * sizeof(uint32_t));
+        if (!markerResults[r].count)
+        {
+            fprintf(stderr, "Failed to allocate %lu bytes for the count storage\n", (unsigned long)(cpus * sizeof(uint32_t)));
+            ret = -ENOMEM;
+            break;
+        }
+        markerResults[r].cpulist = (int*) malloc(cpus * sizeof(int));
+        if (!markerResults[r].cpulist)
+        {
+            fprintf(stderr, "Failed to allocate %lu bytes for the cpulist storage\n", (unsigned long)(cpus * sizeof(int)));
+            ret = -ENOMEM;
+            break;
+        }
+        /* calloc() so that unused per-thread rows stay NULL. */
+        markerResults[r].counters = (double**) calloc(cpus, sizeof(double*));
+        if (!markerResults[r].counters)
+        {
+            fprintf(stderr, "Failed to allocate %lu bytes for the counter result storage\n", (unsigned long)(cpus * sizeof(double*)));
+            ret = -ENOMEM;
+            break;
+        }
+    }
+    while ((ret == 0) && fgets(buf, sizeof(buf), fp))
+    {
+        char* colon = strchr(buf, ':');
+        if (colon != NULL)
+        {
+            int regionid = 0, groupid = 0;
+            size_t taglen = 0;
+            char regiontag[100];
+            /* The group ID follows the last '-', the tag is everything
+             * between the ':' and that '-'. */
+            char* dash = strrchr(colon + 1, '-');
+            if ((dash == NULL) ||
+                (sscanf(buf, "%d:", &regionid) != 1) ||
+                (sscanf(dash + 1, "%d", &groupid) != 1) ||
+                (regionid < 0) || (regionid >= regions))
+            {
+                fprintf(stderr, "Line %s not a valid region description\n", buf);
+                continue;
+            }
+            taglen = (size_t)(dash - (colon + 1));
+            if (taglen > sizeof(regiontag) - 1)
+            {
+                taglen = sizeof(regiontag) - 1;
+            }
+            memcpy(regiontag, colon + 1, taglen);
+            regiontag[taglen] = '\0';
+            markerResults[regionid].groupID = groupid;
+            if (markerResults[regionid].tag != NULL)
+            {
+                bdestroy(markerResults[regionid].tag);
+            }
+            markerResults[regionid].tag = bfromcstr(regiontag);
+        }
+        else
+        {
+            int regionid = 0, groupid = 0, cpu = 0, count = 0, nevents = 0;
+            int cpuidx = 0, eventidx = 0;
+            double time = 0;
+            /* Same size as buf; the field width below is size - 1. */
+            char remain[2048];
+            remain[0] = '\0';
+            if (sscanf(buf, "%d %d %d %d %lf %d %2047[^\t\n]", &regionid, &groupid, &cpu, &count, &time, &nevents, remain) < 6)
+            {
+                /* Blank or incomplete line. */
+                continue;
+            }
+            if ((regionid < 0) || (regionid >= regions) || (cpu < 0) || (nevents < 0))
+            {
+                continue;
+            }
+            cpuidx = regionCPUs[regionid];
+            if (cpuidx >= cpus)
+            {
+                /* More result lines than announced CPUs for this region. */
+                continue;
+            }
+            markerResults[regionid].cpulist[cpuidx] = cpu;
+            markerResults[regionid].eventCount = nevents;
+            markerResults[regionid].time[cpuidx] = time;
+            markerResults[regionid].count[cpuidx] = count;
+            if (nevents > 0)
+            {
+                markerResults[regionid].counters[cpuidx] = (double*) calloc(nevents, sizeof(double));
+                if (markerResults[regionid].counters[cpuidx] == NULL)
+                {
+                    ret = -ENOMEM;
+                    break;
+                }
+            }
+
+            eventidx = 0;
+            ptr = strtok(remain, " ");
+            while (ptr != NULL && eventidx < nevents)
+            {
+                sscanf(ptr, "%lf", &(markerResults[regionid].counters[cpuidx][eventidx]));
+                ptr = strtok(NULL, " ");
+                eventidx++;
+            }
+            regionCPUs[regionid]++;
+        }
+    }
+    /* Only threads that really delivered a result line are counted. */
+    for (r = 0; r < regions; r++)
+    {
+        markerResults[r].threadCount = regionCPUs[r];
+    }
+    free(regionCPUs);
+    fclose(fp);
+    return ret;
+}
+
+/*
+ * Release all marker results: per region the time, count and cpulist
+ * arrays, the per-thread counter rows, the counter row table and the tag.
+ * Afterwards markerResults is NULL and markerRegions is 0, so the region
+ * getters and a later perfmon_readMarkerFile() do not touch freed memory.
+ */
+void perfmon_destroyMarkerResults()
+{
+    int i = 0, j = 0;
+    if (markerResults != NULL)
+    {
+        for (i = 0; i < markerRegions; i++)
+        {
+            free(markerResults[i].time);
+            free(markerResults[i].count);
+            free(markerResults[i].cpulist);
+            if (markerResults[i].counters != NULL)
+            {
+                for (j = 0; j < markerResults[i].threadCount; j++)
+                {
+                    free(markerResults[i].counters[j]);
+                }
+                free(markerResults[i].counters);
+            }
+            bdestroy(markerResults[i].tag);
+        }
+        free(markerResults);
+        markerResults = NULL;
+        markerRegions = 0;
+    }
+}
diff --git a/src/perfmon_perf.c b/src/perfmon_perf.c
new file mode 100644
index 0000000..17a56c0
--- /dev/null
+++ b/src/perfmon_perf.c
@@ -0,0 +1,260 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfmon_perf.c
+ *
+ *      Description:  Example perfmon module for software events through perf_event
+ *                    Currently not integrated in perfmon.
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <linux/perf_event.h>
+#include <asm/unistd.h>
+
+#include <topology.h>
+#include <error.h>
+#include <perfmon.h>
+#include <perfmon_perf.h>
+
+/* Per-CPU tables of perf_event file descriptors. An entry is allocated by
+ * init_perf_event() with MAX_SW_EVENTS slots that are all set to -1. */
+static int* cpu_event_fds[MAX_NUM_THREADS] = { NULL };
+
+/* perf_event software event configs. setup_perf_event() uses the event's
+ * umask as index into this table and into the file descriptor tables. */
+const uint64_t configList[MAX_SW_EVENTS] = {
+    [0x00] = PERF_COUNT_SW_CPU_CLOCK,
+    [0x01] = PERF_COUNT_SW_TASK_CLOCK,
+    [0x02] = PERF_COUNT_SW_PAGE_FAULTS,
+    [0x03] = PERF_COUNT_SW_CONTEXT_SWITCHES,
+    [0x04] = PERF_COUNT_SW_CPU_MIGRATIONS,
+    [0x05] = PERF_COUNT_SW_PAGE_FAULTS_MIN,
+    [0x06] = PERF_COUNT_SW_PAGE_FAULTS_MAJ,
+    [0x07] = PERF_COUNT_SW_ALIGNMENT_FAULTS,
+    [0x08] = PERF_COUNT_SW_EMULATION_FAULTS,
+};
+
+/*
+ * Thin wrapper around the perf_event_open system call. All arguments are
+ * passed on unchanged; the syscall result is returned.
+ */
+static long
+perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
+                int cpu, int group_fd, unsigned long flags)
+{
+    int fd = syscall(__NR_perf_event_open, hw_event, pid, cpu,
+                     group_fd, flags);
+    return fd;
+}
+
+/*
+ * Allocate the file descriptor table of a CPU if it does not exist yet.
+ * All MAX_SW_EVENTS slots are initialized to -1 (no event opened).
+ *
+ * Returns 0 on success, -EINVAL if cpu_id is outside of
+ * [0, MAX_NUM_THREADS) and -ENOMEM if the allocation fails.
+ */
+int init_perf_event(int cpu_id)
+{
+    /* cpu_event_fds has exactly MAX_NUM_THREADS entries. */
+    if ((cpu_id < 0) || (cpu_id >= MAX_NUM_THREADS))
+    {
+        return -EINVAL;
+    }
+    if (cpu_event_fds[cpu_id] == NULL)
+    {
+        cpu_event_fds[cpu_id] = (int*) malloc(MAX_SW_EVENTS * sizeof(int));
+        if (cpu_event_fds[cpu_id] == NULL)
+        {
+            return -ENOMEM;
+        }
+        for (int i = 0; i < MAX_SW_EVENTS; i++)
+        {
+            cpu_event_fds[cpu_id][i] = -1;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Open the software event described by event on a CPU.
+ *
+ * The event's umask selects the config from configList and the slot in the
+ * CPU's file descriptor table. An already opened event is left untouched.
+ * The event is created disabled, with inheritance, excluding hypervisor
+ * and (unless EVENT_OPTION_COUNT_KERNEL is given) kernel counts.
+ *
+ * Returns 0 on success, -EINVAL if event is NULL or cpu_id or the umask is
+ * out of range, and -EFAULT if the CPU was not initialized or the event
+ * cannot be opened.
+ */
+int setup_perf_event(int cpu_id, PerfmonEvent* event)
+{
+    struct perf_event_attr attr;
+    if (event == NULL)
+    {
+        return -EINVAL;
+    }
+    if ((cpu_id < 0) || (cpu_id >= MAX_NUM_THREADS))
+    {
+        return -EINVAL;
+    }
+    /* umask indexes configList and the fd table, both MAX_SW_EVENTS long. */
+    if (event->umask >= MAX_SW_EVENTS)
+    {
+        return -EINVAL;
+    }
+    if (cpu_event_fds[cpu_id] == NULL)
+    {
+        return -EFAULT;
+    }
+    if (cpu_event_fds[cpu_id][event->umask] != -1)
+    {
+        return 0;
+    }
+    memset(&attr, 0, sizeof(struct perf_event_attr));
+    attr.type = PERF_TYPE_SOFTWARE;
+    attr.size = sizeof(struct perf_event_attr);
+    attr.config = configList[event->umask];
+    attr.exclude_kernel = 1;
+    attr.exclude_hv = 1;
+    attr.disabled = 1;
+    attr.inherit = 1;
+    for (int j = 0; j < event->numberOfOptions; j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_COUNT_KERNEL:
+                attr.exclude_kernel = 0;
+                break;
+            default:
+                break;
+        }
+    }
+    cpu_event_fds[cpu_id][event->umask] = perf_event_open(&attr, 0, cpu_id, -1, 0);
+    if (cpu_event_fds[cpu_id][event->umask] < 0)
+    {
+        /* Keep the slot at the "not opened" marker used everywhere else. */
+        cpu_event_fds[cpu_id][event->umask] = -1;
+        printf("Setup of event %llu failed\n", (unsigned long long)event->umask);
+        return -EFAULT;
+    }
+    return 0;
+}
+
+/*
+ * Read the current value of an opened event of a CPU.
+ *
+ * data receives the counter value; it is set to 0 whenever the value
+ * cannot be read.
+ *
+ * Returns 0 on success, -EINVAL if data is NULL or cpu_id or eventID is
+ * out of range, -ENODEV if the event was not opened and -EIO if reading
+ * the file descriptor fails or delivers too few bytes.
+ */
+int read_perf_event(int cpu_id, uint64_t eventID, uint64_t *data)
+{
+    ssize_t ret = 0;
+    long long tmp = 0;
+    if (data == NULL)
+    {
+        return -EINVAL;
+    }
+    *data = 0x0ULL;
+    if ((cpu_id < 0) || (cpu_id >= MAX_NUM_THREADS) || (eventID >= MAX_SW_EVENTS))
+    {
+        return -EINVAL;
+    }
+    if ((cpu_event_fds[cpu_id] != NULL) && (cpu_event_fds[cpu_id][eventID] != -1))
+    {
+        ret = read(cpu_event_fds[cpu_id][eventID], &tmp, sizeof(long long));
+        if (ret != (ssize_t)sizeof(long long))
+        {
+            /* Do not report a failed read as a valid zero count. */
+            return -EIO;
+        }
+        *data = (uint64_t) tmp;
+    }
+    else
+    {
+        printf("FD for event %llu not initialized\n", (unsigned long long)eventID);
+        return -ENODEV;
+    }
+    return 0;
+}
+
+/*
+ * Disable one opened event of a CPU (PERF_EVENT_IOC_DISABLE).
+ * Returns 0, or -ENODEV if the CPU has no table or the event is not open.
+ */
+int stop_perf_event(int cpu_id, uint64_t eventID)
+{
+    if ((cpu_event_fds[cpu_id] == NULL) || (cpu_event_fds[cpu_id][eventID] == -1))
+    {
+        return -ENODEV;
+    }
+    ioctl(cpu_event_fds[cpu_id][eventID], PERF_EVENT_IOC_DISABLE, 0);
+    return 0;
+}
+
+/*
+ * Call stop_perf_event() for every opened event of a CPU. Always returns 0.
+ */
+int stop_all_perf_event(int cpu_id)
+{
+    int slot = 0;
+    if (cpu_event_fds[cpu_id] == NULL)
+    {
+        return 0;
+    }
+    for (slot = 0; slot < MAX_SW_EVENTS; slot++)
+    {
+        if (cpu_event_fds[cpu_id][slot] == -1)
+        {
+            continue;
+        }
+        stop_perf_event(cpu_id, slot);
+    }
+    return 0;
+}
+
+/*
+ * Reset one opened event of a CPU (PERF_EVENT_IOC_RESET).
+ * Returns 0, or -ENODEV if the CPU has no table or the event is not open.
+ */
+int clear_perf_event(int cpu_id, uint64_t eventID)
+{
+    if ((cpu_event_fds[cpu_id] == NULL) || (cpu_event_fds[cpu_id][eventID] == -1))
+    {
+        return -ENODEV;
+    }
+    ioctl(cpu_event_fds[cpu_id][eventID], PERF_EVENT_IOC_RESET, 0);
+    return 0;
+}
+
+/*
+ * Call clear_perf_event() for every opened event of a CPU. Always
+ * returns 0.
+ */
+int clear_all_perf_event(int cpu_id)
+{
+    int slot = 0;
+    if (cpu_event_fds[cpu_id] == NULL)
+    {
+        return 0;
+    }
+    for (slot = 0; slot < MAX_SW_EVENTS; slot++)
+    {
+        if (cpu_event_fds[cpu_id][slot] == -1)
+        {
+            continue;
+        }
+        clear_perf_event(cpu_id, slot);
+    }
+    return 0;
+}
+
+/*
+ * Enable one opened event of a CPU (PERF_EVENT_IOC_ENABLE).
+ * Returns 0, or -ENODEV if the CPU has no table or the event is not open.
+ */
+int start_perf_event(int cpu_id, uint64_t eventID)
+{
+    if ((cpu_event_fds[cpu_id] == NULL) || (cpu_event_fds[cpu_id][eventID] == -1))
+    {
+        return -ENODEV;
+    }
+    ioctl(cpu_event_fds[cpu_id][eventID], PERF_EVENT_IOC_ENABLE, 0);
+    return 0;
+}
+
+/*
+ * Call start_perf_event() for every opened event of a CPU. Always
+ * returns 0.
+ */
+int start_all_perf_event(int cpu_id)
+{
+    int slot = 0;
+    if (cpu_event_fds[cpu_id] == NULL)
+    {
+        return 0;
+    }
+    for (slot = 0; slot < MAX_SW_EVENTS; slot++)
+    {
+        if (cpu_event_fds[cpu_id][slot] == -1)
+        {
+            continue;
+        }
+        start_perf_event(cpu_id, slot);
+    }
+    return 0;
+}
+
+/*
+ * Close the file descriptor of one event of a CPU and mark its slot as
+ * unused (-1). Nothing happens if the CPU has no table or the event is not
+ * open. Always returns 0.
+ */
+int close_perf_event(int cpu_id, uint64_t eventID)
+{
+    if ((cpu_event_fds[cpu_id] == NULL) || (cpu_event_fds[cpu_id][eventID] == -1))
+    {
+        return 0;
+    }
+    close(cpu_event_fds[cpu_id][eventID]);
+    cpu_event_fds[cpu_id][eventID] = -1;
+    return 0;
+}
+
+/*
+ * Close all opened events of a CPU and release its file descriptor table.
+ * The table pointer is reset to NULL so that the other functions see the
+ * CPU as uninitialized and init_perf_event() can allocate a new table.
+ * Always returns 0.
+ */
+int finalize_perf_event(int cpu_id)
+{
+    if (cpu_event_fds[cpu_id] != NULL)
+    {
+        for (int i = 0; i < MAX_SW_EVENTS; i++)
+        {
+            if (cpu_event_fds[cpu_id][i] != -1)
+            {
+                close_perf_event(cpu_id, i);
+            }
+        }
+        free(cpu_event_fds[cpu_id]);
+        /* Without this the freed table would be used again later. */
+        cpu_event_fds[cpu_id] = NULL;
+    }
+
+    return 0;
+}
diff --git a/src/power.c b/src/power.c
index 3f4118c..d76c965 100644
--- a/src/power.c
+++ b/src/power.c
@@ -5,13 +5,14 @@
  *
  *      Description:  Module implementing Intel RAPL interface
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Authors:  Jan Treibig (jt), jan.treibig at gmail.com,
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -33,136 +34,476 @@
 
 #include <types.h>
 #include <power.h>
-#include <cpuid.h>
+#include <topology.h>
 
 /* #####   EXPORTED VARIABLES   ########################################### */
 
 PowerInfo power_info;
-const uint32_t power_regs[4] = {MSR_PKG_ENERGY_STATUS,
-                                MSR_PP0_ENERGY_STATUS,
-                                MSR_PP1_ENERGY_STATUS,
-                                MSR_DRAM_ENERGY_STATUS};
 
 /* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-
+static int power_initialized = 0;
 
 
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
-void
+int
 power_init(int cpuId)
 {
     uint64_t flags;
-    int hasRAPL = 0;
-    uint32_t info_register = 0x0;
+    int i;
+    int err;
 
     /* determine Turbo Mode features */
     double busSpeed;
+    if (power_initialized)
+    {
+        return 0;
+    }
+
+    power_info.baseFrequency = 0;
+    power_info.minFrequency = 0;
+    power_info.turbo.numSteps = 0;
+    power_info.turbo.steps = NULL;
+    power_info.powerUnit = 0;
+    power_info.timeUnit = 0;
+    power_info.hasRAPL = 0;
 
-    if ((cpuid_info.model == SANDYBRIDGE_EP) ||
-            (cpuid_info.model == SANDYBRIDGE) ||
-            (cpuid_info.model == HASWELL) ||
-            (cpuid_info.model == HASWELL_EX) ||
-            (cpuid_info.model == IVYBRIDGE_EP) ||
-            (cpuid_info.model == IVYBRIDGE))
+    switch (cpuid_info.model)
     {
-        hasRAPL = 1;
-        info_register = MSR_PKG_POWER_INFO;
+        case SANDYBRIDGE:
+        case IVYBRIDGE:
+        case HASWELL:
+        case SANDYBRIDGE_EP:
+        case IVYBRIDGE_EP:
+        case HASWELL_EP:
+        case ATOM_SILVERMONT_E:
+        case ATOM_SILVERMONT_Z1:
+        case ATOM_SILVERMONT_Z2:
+        case ATOM_SILVERMONT_F:
+        case BROADWELL:
+        case BROADWELL_E:
+        case BROADWELL_D:
+        case HASWELL_M1:
+        case HASWELL_M2:
+        case SKYLAKE1:
+        case SKYLAKE2:
+            power_info.hasRAPL = 1;
+            break;
+        case ATOM_SILVERMONT_C:
+            power_info.hasRAPL = 1;
+            /* The info_regs list needs an update for Silvermont Type C
+               because it uses another info register */
+            info_regs[PKG] = MSR_PKG_POWER_INFO_SILVERMONT;
+            break;
+        default:
+            DEBUG_PLAIN_PRINT(DEBUGLEV_INFO, NO RAPL SUPPORT);
+            return 0;
+            break;
     }
-    else if (cpuid_info.model == ATOM_SILVERMONT_C)
+
+    perfmon_init_maps();
+    if (!HPMinitialized())
     {
-        hasRAPL = 1;
-        info_register = MSR_PKG_POWER_INFO_SILVERMONT;
+        HPMinit();
+        HPMaddThread(cpuId);
     }
-    else if ((cpuid_info.model == ATOM_SILVERMONT_E) ||
-             (cpuid_info.model == ATOM_SILVERMONT_F1) ||
-             (cpuid_info.model == ATOM_SILVERMONT_F2) ||
-             (cpuid_info.model == ATOM_SILVERMONT_F3))
+    if ( power_info.hasRAPL )
     {
-        hasRAPL = 1;
+        busSpeed = 100.0;
+    }
+    else
+    {
+        busSpeed = 133.33;
     }
-
     if (cpuid_info.turbo)
     {
-        flags = msr_read(cpuId, MSR_PLATFORM_INFO);
-
-        if ( hasRAPL )
-        {
-            busSpeed = 100.0;
-        }
-        else 
+        err = HPMread(cpuId, MSR_DEV, MSR_PLATFORM_INFO, &flags);
+        if (err == 0)
         {
-            busSpeed = 133.33;
-        }
-
-        power_info.baseFrequency = busSpeed * (double) extractBitField(flags,8,8);
-        power_info.minFrequency  = busSpeed * (double) extractBitField((flags>>(32)),8,8);
+            power_info.baseFrequency = busSpeed * (double) extractBitField(flags,8,8);
+            power_info.minFrequency  = busSpeed * (double) extractBitField((flags>>(32)),8,8);
 
-        power_info.turbo.numSteps = cpuid_topology.numCoresPerSocket;
-        power_info.turbo.steps = (double*) malloc(power_info.turbo.numSteps * sizeof(double));
-
-        flags = msr_read(cpuId, MSR_TURBO_RATIO_LIMIT);
+            power_info.turbo.numSteps = cpuid_topology.numCoresPerSocket;
+            if (cpuid_info.model == WESTMERE_EX)
+            {
+                power_info.turbo.numSteps = 4;
+            }
+            power_info.turbo.steps = (double*) malloc(power_info.turbo.numSteps * sizeof(double));
+            if (!power_info.turbo.steps)
+            {
+                return -ENOMEM;
+            }
 
-        for (int i=0; i < power_info.turbo.numSteps; i++)
-        {
-            if (i < 8)
+            err = HPMread(cpuId, MSR_DEV, MSR_TURBO_RATIO_LIMIT, &flags);
+            if (err)
             {
-                power_info.turbo.steps[i] = busSpeed * (double) field64(flags,i*8, 8);
+                fprintf(stderr,"Cannot gather values from MSR_TURBO_RATIO_LIMIT,\n");
             }
             else
             {
-                power_info.turbo.steps[i] = power_info.turbo.steps[7];
+                for (int i=0; i < power_info.turbo.numSteps; i++)
+                {
+                    if (i < 8)
+                    {
+                        power_info.turbo.steps[i] = busSpeed * (double) field64(flags,i*8, 8);
+                    }
+                    else
+                    {
+                        power_info.turbo.steps[i] = power_info.turbo.steps[7];
+                    }
+                }
             }
+            //TODO: Haswell EP and possibly Broadwell EP support multiple turbo 
+            //      registers besides MSR_TURBO_RATIO_LIMIT:
+            //      MSR_TURBO_RATIO_LIMIT1 and MSR_TURBO_RATIO_LIMIT2
+        }
+        else
+        {
+            fprintf(stderr,"Cannot gather values from MSR_PLATFORM_INFO,\n");
         }
-    }
-    else
-    {
-        power_info.turbo.numSteps = 0;
     }
 
     /* determine RAPL parameters */
-    if ( hasRAPL )
+    if ( power_info.hasRAPL )
     {
-        flags = msr_read(cpuId, MSR_RAPL_POWER_UNIT);
-
-        power_info.powerUnit = pow(0.5,(double) extractBitField(flags,4,0));
-        power_info.energyUnit = pow(0.5,(double) extractBitField(flags,5,8));
-        power_info.timeUnit = pow(0.5,(double) extractBitField(flags,4,16));
-
-        if (info_register != 0x0)
+        err = HPMread(cpuId, MSR_DEV, MSR_RAPL_POWER_UNIT, &flags);
+        if (err == 0)
         {
-            flags = msr_read(cpuId, info_register);
-            power_info.tdp = (double) extractBitField(flags,15,0) * power_info.powerUnit;
-            if (cpuid_info.model != ATOM_SILVERMONT_C)
+            double energyUnit;
+            power_info.powerUnit = 1000000 / (1<<(flags & 0xF));
+            power_info.timeUnit = 1000000 / (1 << ((flags>>16) & 0xF));
+            if (cpuid_info.model != ATOM_SILVERMONT_E)
             {
-                power_info.minPower =  (double) extractBitField(flags,15,16) * power_info.powerUnit;
-                power_info.maxPower = (double) extractBitField(flags,15,32) * power_info.powerUnit;
-                power_info.maxTimeWindow = (double) extractBitField(flags,7,48) * power_info.timeUnit;
+                energyUnit = 1.0 / (1 << ((flags >> 8) & 0x1F));
             }
             else
             {
-                power_info.minPower = 0.0;
-                power_info.maxPower = 0.0;
-                power_info.maxTimeWindow = 0.0;
+                energyUnit = 1.0 * (1 << ((flags >> 8) & 0x1F)) / 1000000;
+            }
+            
+            for (i = 0; i < NUM_POWER_DOMAINS; i++)
+            {
+                power_info.domains[i].energyUnit = energyUnit;
+                power_info.domains[i].type = i;
+                power_info.domains[i].supportFlags = 0x0U;
+                power_info.domains[i].tdp = 0.0;
+                power_info.domains[i].minPower = 0.0;
+                power_info.domains[i].maxPower = 0.0;
+                power_info.domains[i].maxTimeWindow = 0.0;
+            }
+            
+            if ((cpuid_info.model == HASWELL_EP) ||
+                (cpuid_info.model == HASWELL_M1) ||
+                (cpuid_info.model == HASWELL_M2))
+            {
+                power_info.domains[DRAM].energyUnit = 15.3E-6;
+            }
+
+            for(i = 0; i < NUM_POWER_DOMAINS; i++)
+            {
+                err = HPMread(cpuId, MSR_DEV, power_regs[i], &flags);
+                if (err == 0)
+                {
+                    power_info.domains[i].supportFlags |= POWER_DOMAIN_SUPPORT_STATUS;
+                }
+                else
+                {
+                    DEBUG_PRINT(DEBUGLEV_DETAIL, RAPL domain %s not supported, power_names[i]);
+                    continue;
+                }
+                if (limit_regs[i] != 0x0)
+                {
+                    err = HPMread(cpuId, MSR_DEV, limit_regs[i], &flags);
+                    if (err == 0)
+                    {
+                        power_info.domains[i].supportFlags |= POWER_DOMAIN_SUPPORT_LIMIT;
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_DETAIL, Deactivating limit register for RAPL domain %s, power_names[i]);
+                        limit_regs[i] = 0x0;
+                    }
+                }
+                if (info_regs[i] != 0x0)
+                {
+                    err = HPMread(cpuId, MSR_DEV, info_regs[i], &flags);
+                    if (err == 0)
+                    {
+                        power_info.domains[i].supportFlags |= POWER_DOMAIN_SUPPORT_INFO;
+                        power_info.domains[i].tdp = (double) extractBitField(flags,15,0) * power_info.powerUnit;
+                        if (cpuid_info.model != ATOM_SILVERMONT_C)
+                        {
+                            power_info.domains[i].minPower = (double) extractBitField(flags,15,16) * power_info.powerUnit;
+                            power_info.domains[i].maxPower = (double) extractBitField(flags,15,32) * power_info.powerUnit;
+                            power_info.domains[i].maxTimeWindow = (double) extractBitField(flags,7,48) * power_info.timeUnit;
+                        }
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_DETAIL, Deactivating info register for RAPL domain %s, power_names[i]);
+                        info_regs[i] = 0x0;
+                    }
+                }
+                if (policy_regs[i] != 0x0)
+                {
+                    err = HPMread(cpuId, MSR_DEV, policy_regs[i], &flags);
+                    if (err == 0)
+                    {
+                        power_info.domains[i].supportFlags |= POWER_DOMAIN_SUPPORT_POLICY;
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_DETAIL, Deactivating policy register for RAPL domain %s, power_names[i]);
+                        policy_regs[i] = 0x0;
+                    }
+                }
+                if (perf_regs[i] != 0x0)
+                {
+                    err = HPMread(cpuId, MSR_DEV, perf_regs[i], &flags);
+                    if (err == 0)
+                    {
+                        power_info.domains[i].supportFlags |= POWER_DOMAIN_SUPPORT_PERF;
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_DETAIL, Deactivating perf register for RAPL domain %s, power_names[i]);
+                        perf_regs[i] = 0x0;
+                    }
+                }
             }
         }
         else
         {
-            power_info.tdp = 0;
-            power_info.minPower = 0.0;
-            power_info.maxPower = 0.0;
-            power_info.maxTimeWindow = 0.0;
+            fprintf(stderr,"Cannot gather values from MSR_RAPL_POWER_UNIT, deactivating RAPL support\n");
+            power_info.hasRAPL =  0;
         }
+        power_initialized = 1;
+        return power_info.hasRAPL;
     }
     else
     {
-        power_info.powerUnit = 0.0;
-        power_info.energyUnit = 0.0;
-        power_info.timeUnit = 0.0;
-        power_info.tdp = 0;
-        power_info.minPower = 0.0;
-        power_info.maxPower = 0.0;
-        power_info.maxTimeWindow = 0.0;
+        return power_info.hasRAPL;
+    }
+    return 0;
+}
+
+/* All functions below are experimental and probably don't work */
+/*
+ * Read the perf status register of a RAPL domain.
+ *
+ * cpuId   CPU whose MSR is read through HPMread()
+ * domain  RAPL domain, must be smaller than NUM_POWER_DOMAINS
+ * status  out: lower 32 bit of the register, 0 if nothing was read
+ *
+ * The register is only read when the domain advertises
+ * POWER_DOMAIN_SUPPORT_PERF. HPMread() stores a full 64 bit value, so the
+ * read goes into a local 64 bit variable first; passing the 32 bit output
+ * pointer directly would write past the caller's object.
+ *
+ * Returns 0 on success (also when the domain has no perf register),
+ * -EINVAL for an invalid domain and -EFAULT if the read fails.
+ */
+int power_perfGet(int cpuId, PowerType domain, uint32_t* status)
+{
+    int err = 0;
+    uint64_t raw = 0x0ULL;
+    *status = 0x0U;
+    if (domain >= NUM_POWER_DOMAINS)
+    {
+        return -EINVAL;
+    }
+    if (power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_PERF)
+    {
+        err = HPMread(cpuId, MSR_DEV, perf_regs[domain], &raw);
+        if (err)
+        {
+            ERROR_PRINT(Failed to get power perf value for domain %s on CPU %d,power_names[domain], cpuId);
+            return -EFAULT;
+        }
+        *status = (uint32_t)(raw & 0xFFFFFFFFULL);
+    }
+    return 0;
+}
+
+int power_limitSet(int cpuId, PowerType domain, double power, double time, int doClamping)
+{   /* Stub: checks the domain, prints "Not implemented" to stderr and returns 0 */
+    int err = 0;
+    if (domain >= NUM_POWER_DOMAINS)
+    {
+        return -EINVAL;
+    }
+    fprintf(stderr, "Not implemented\n");
+    return 0; /* early exit: everything below is an unreachable draft */
+
+    uint32_t X = (log(time) - log(power_info.timeUnit))/log(2); /* draft: log2(time/timeUnit), truncated */
+    uint32_t powerField = (uint32_t)(power/(power_info.domains[domain].energyUnit));
+    uint64_t flags = (powerField & 0xFFFF)|((X & (0x1F))<<17);
+    // Construct flags missing. How is timeField calculated?
+    if (doClamping)
+    {
+        flags |= (1ULL<<16); /* draft: bit 16 is set when clamping is requested */
+    }
+    if (power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_LIMIT)
+    {
+        err = HPMwrite(cpuId, MSR_DEV, limit_regs[domain], flags);
+        if (err)
+        {
+            fprintf(stderr, "Failed to set power limit for domain %s on CPU %d\n",power_names[domain], cpuId);
+            return -EFAULT;
+        }
+    }
+    return 0;
+}
 
+/*
+ * Read the power limit currently programmed for a RAPL domain.
+ *
+ * cpuId   CPU whose MSR is read through HPMread()
+ * domain  RAPL domain, must be smaller than NUM_POWER_DOMAINS
+ * power   out: 15 bit field at offset 0 of the limit register, scaled
+ * time    out: time window built from the 5 bit field at offset 17 (Y) and
+ *         the 2 bit field at offset 22 (Z) as 2^Y * (1 + Z/4) * timeUnit
+ *
+ * Both outputs are zeroed first and stay zero when the domain does not
+ * advertise POWER_DOMAIN_SUPPORT_LIMIT.
+ *
+ * Returns 0 on success, -EINVAL for an invalid domain and -EFAULT if the
+ * limit register cannot be read.
+ */
+int power_limitGet(int cpuId, PowerType domain, double* power, double* time)
+{
+    int err = 0;
+    *power = 0;
+    *time = 0;
+    unsigned int Y,Z;
+    if (domain >= NUM_POWER_DOMAINS)
+    {
+        return -EINVAL;
+    }
+    uint64_t flags = 0x0ULL;
+    if (power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_LIMIT)
+    {
+        err = HPMread(cpuId, MSR_DEV, limit_regs[domain], &flags);
+        if (err)
+        {
+            fprintf(stderr, "Failed to get power limit for domain %s on CPU %d\n",power_names[domain], cpuId);
+            return -EFAULT;
+        }
+        /* NOTE(review): the raw limit is scaled with the domain's energyUnit;
+         * confirm whether power_info.powerUnit was intended here. */
+        *power = ((double)extractBitField(flags, 15, 0)) * power_info.domains[domain].energyUnit;
+        Y = extractBitField(flags, 5, 17);
+        Z = extractBitField(flags, 2, 22);
+        *time = pow(2,((double)Y)) * (1.0 + (((double)Z)/4.0)) * power_info.timeUnit;
+    }
+    return 0;
+}
+
+/*
+ * Query whether the power limit of a RAPL domain is switched on.
+ *
+ * Reads the domain's limit register (only if the domain advertises
+ * POWER_DOMAIN_SUPPORT_LIMIT) and tests bit 15, the same bit that
+ * power_limitActivate() sets and power_limitDectivate() clears.
+ *
+ * Returns 1 if bit 15 is set, 0 if it is clear or the domain has no limit
+ * register, -EINVAL for an invalid domain and -EFAULT if the read fails.
+ */
+int power_limitState(int cpuId, PowerType domain)
+{
+    int err = 0;
+    if (domain >= NUM_POWER_DOMAINS)
+    {
+        return -EINVAL;
+    }
+    uint64_t flags = 0x0ULL;
+
+    if (power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_LIMIT)
+    {
+        err = HPMread(cpuId, MSR_DEV, limit_regs[domain], &flags);
+        if (err)
+        {
+            ERROR_PRINT(Failed to read power limit state for domain %s on CPU %d,power_names[domain], cpuId);
+            return -EFAULT;
+        }
+    }
+    if (flags & (1ULL<<15))
+    {
+        return 1;
+    }
+    return 0;
+}
+
+/*
+ * Switch on the power limit of a RAPL domain.
+ *
+ * Performs a read-modify-write of the domain's limit register that sets
+ * bit 15. Domains that do not advertise POWER_DOMAIN_SUPPORT_LIMIT are left
+ * untouched.
+ *
+ * Returns 0 on success or when there is nothing to do, -EINVAL for an
+ * invalid domain and -EFAULT if the register read or write fails.
+ */
+int power_limitActivate(int cpuId, PowerType domain)
+{
+    int rc = 0;
+    uint64_t regValue = 0x0ULL;
+
+    if (domain >= NUM_POWER_DOMAINS)
+    {
+        return -EINVAL;
+    }
+    if (!(power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_LIMIT))
+    {
+        return 0;
+    }
+
+    rc = HPMread(cpuId, MSR_DEV, limit_regs[domain], &regValue);
+    if (rc)
+    {
+        ERROR_PRINT(Failed to activate power limit for domain %s on CPU %d,power_names[domain], cpuId);
+        return -EFAULT;
+    }
+
+    regValue |= (1ULL<<15);
+    rc = HPMwrite(cpuId, MSR_DEV, limit_regs[domain], regValue);
+    if (rc)
+    {
+        ERROR_PRINT(Failed to activate power limit for domain %s on CPU %d,power_names[domain], cpuId);
+        return -EFAULT;
+    }
+    return 0;
+}
+
+/*
+ * Switch off the power limit of a RAPL domain.
+ *
+ * Performs a read-modify-write of the domain's limit register that clears
+ * bit 15. Domains that do not advertise POWER_DOMAIN_SUPPORT_LIMIT are left
+ * untouched.
+ *
+ * The domain is range-checked like in the sibling power_limit*() functions
+ * before it is used as an index into power_info.domains and limit_regs.
+ *
+ * Returns 0 on success or when there is nothing to do, -EINVAL for an
+ * invalid domain and -EFAULT if the register read or write fails.
+ */
+int power_limitDectivate(int cpuId, PowerType domain)
+{
+    int err = 0;
+    if (domain >= NUM_POWER_DOMAINS)
+    {
+        return -EINVAL;
+    }
+    uint64_t flags = 0x0ULL;
+
+    if (power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_LIMIT)
+    {
+        err = HPMread(cpuId, MSR_DEV, limit_regs[domain], &flags);
+        if (err)
+        {
+            ERROR_PRINT(Failed to deactivate power limit for domain %s on CPU %d,power_names[domain], cpuId);
+            return -EFAULT;
+        }
+        flags &= ~(1ULL<<15);
+        err = HPMwrite(cpuId, MSR_DEV, limit_regs[domain], flags);
+        if (err)
+        {
+            ERROR_PRINT(Failed to deactivate power limit for domain %s on CPU %d,power_names[domain], cpuId);
+            return -EFAULT;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Write the policy (priority) register of a RAPL domain.
+ *
+ * The given priority is reduced with extractBitField(priority, 5, 0) before
+ * it is written. Domains that do not advertise POWER_DOMAIN_SUPPORT_POLICY
+ * are left untouched.
+ *
+ * Returns 0 on success or when there is nothing to do, -EINVAL for an
+ * invalid domain and -EFAULT if the register write fails.
+ */
+int power_policySet(int cpuId, PowerType domain, uint32_t priority)
+{
+    int rc = 0;
+
+    if (domain >= NUM_POWER_DOMAINS)
+    {
+        return -EINVAL;
+    }
+
+    priority = extractBitField(priority, 5, 0);
+    if (!(power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_POLICY))
+    {
+        return 0;
+    }
+
+    rc = HPMwrite(cpuId, MSR_DEV, policy_regs[domain], priority);
+    if (rc)
+    {
+        ERROR_PRINT(Failed to set power policy for domain %s on CPU %d,power_names[domain], cpuId);
+        return -EFAULT;
+    }
+    return 0;
+}
+
+/*
+ * Read the policy (priority) register of a RAPL domain.
+ *
+ * cpuId     CPU whose MSR is read through HPMread()
+ * domain    RAPL domain, must be smaller than NUM_POWER_DOMAINS
+ * priority  out: lower 32 bit of the register, 0 if nothing was read
+ *
+ * The register is only read when the domain advertises
+ * POWER_DOMAIN_SUPPORT_POLICY. HPMread() stores a full 64 bit value, so the
+ * read goes into a local 64 bit variable first; passing the 32 bit output
+ * pointer directly would write past the caller's object.
+ *
+ * Returns 0 on success (also when the domain has no policy register),
+ * -EINVAL for an invalid domain and -EFAULT if the read fails.
+ */
+int power_policyGet(int cpuId, PowerType domain, uint32_t* priority)
+{
+    int err = 0;
+    uint64_t raw = 0x0ULL;
+    *priority = 0x0U;
+    if (domain >= NUM_POWER_DOMAINS)
+    {
+        return -EINVAL;
+    }
+    if (power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_POLICY)
+    {
+        err = HPMread(cpuId, MSR_DEV, policy_regs[domain], &raw);
+        if (err)
+        {
+            ERROR_PRINT(Failed to get power policy for domain %s on CPU %d,power_names[domain], cpuId);
+            return -EFAULT;
+        }
+        *priority = (uint32_t)(raw & 0xFFFFFFFFULL);
+    }
+    return 0;
+}
+
+
+/*
+ * Tear down the power module state set up by power_init().
+ *
+ * Does nothing unless power_initialized is set. Otherwise the turbo step
+ * table is freed, all scalar fields of power_info and the per-domain
+ * records are zeroed, and power_initialized is cleared again so that a
+ * later power_init() performs a real initialization instead of returning
+ * early on the zeroed data.
+ */
+void power_finalize(void)
+{
+    if (power_initialized == 0)
+    {
+        return;
+    }
+    if (power_info.turbo.steps != NULL)
+    {
+        free(power_info.turbo.steps);
+    }
+    power_info.turbo.steps = NULL;
+    power_info.baseFrequency = 0;
+    power_info.minFrequency = 0;
+    power_info.turbo.numSteps = 0;
+    power_info.powerUnit = 0;
+    power_info.timeUnit = 0;
+    power_info.hasRAPL = 0;
+    memset(power_info.domains, 0, NUM_POWER_DOMAINS*sizeof(PowerDomain));
+    power_initialized = 0;
+}
+
+/*
+ * Hand out the address of the module-wide power_info record.
+ */
+PowerInfo_t get_powerInfo(void)
+{
+    PowerInfo_t info = &power_info;
+    return info;
+}
diff --git a/src/pthread-overload/Makefile b/src/pthread-overload/Makefile
index 5f460a5..889d824 100644
--- a/src/pthread-overload/Makefile
+++ b/src/pthread-overload/Makefile
@@ -1,16 +1,18 @@
 # =======================================================================================
-#  
+#
 #      Filename:  Makefile
-# 
+#
 #      Description:  pthread-overload Makefile
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
+#
+#      Version:   4.1
+#      Released:  19.5.2016
+#
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#               Thomas Roehl (tr), thomas.roehl at googlemail.com
+#
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -28,14 +30,17 @@
 
 include  ../../config.mk
 include  ../../make/include_$(COMPILER).mk
+include  ../../make/config_checks.mk
+include  ../../make/config_defines.mk
+
 
-TARGET   = liblikwidpin.so
+TARGET   = $(PINLIB)
 
 ifneq ($(COLOR),NONE)
 DEFINES += -DCOLOR=$(COLOR)
 endif
 
-DEFINES  += -DMAX_NUM_THREADS=$(MAX_NUM_THREADS)
+DEFINES  += -DMAX_NUM_THREADS=$(MAX_NUM_THREADS) -D_GNU_SOURCE
 INCLUDES += -I../includes
 LIBS     += -ldl
 CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES) 
@@ -43,5 +48,5 @@ CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES)
 all: $(TARGET)
 
 $(TARGET): pthread-overload.c
-	$(CC) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $(INCLUDES) $(SHARED_CFLAGS) $(SHARED_LFLAGS) -o ../../$(TARGET) pthread-overload.c $(LIBS)
+	$(CC) -Wl,-soname,$(TARGET).$(VERSION).$(RELEASE) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $(INCLUDES) $(SHARED_CFLAGS) $(SHARED_LFLAGS) -o ../../$(TARGET) pthread-overload.c $(LIBS)
 
diff --git a/src/pthread-overload/pthread-overload.c b/src/pthread-overload/pthread-overload.c
index e9d5dcc..f076b08 100644
--- a/src/pthread-overload/pthread-overload.c
+++ b/src/pthread-overload/pthread-overload.c
@@ -3,16 +3,16 @@
  *
  *      Filename:  pthread-overload.c
  *
- *      Description:  Overloaded library for pthread_create call.
+ *      Description:  Overloaded library for pthread_create call. 
  *                    Implements pinning of threads together with likwid-pin.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -60,7 +60,20 @@ static char * sosearchpaths[] = {
     NULL
 };
 
-int
+
+#ifdef COLOR /* colored variant: printf wrapped in color_on()/color_reset() */
+#define color_print(format,...) do { \
+        color_on(BRIGHT, COLOR); \
+        printf(format, ##__VA_ARGS__); \
+        color_reset(); \
+    } while(0)
+#else /* COLOR not defined: color_print is a plain printf */
+#define color_print(format,...) do { \
+        printf(format, ##__VA_ARGS__); \
+    } while(0)
+#endif /* COLOR */
+
+int __attribute__ ((visibility ("default") ))
 pthread_create(pthread_t* thread,
         const pthread_attr_t* attr,
         void* (*start_routine)(void *),
@@ -74,28 +87,28 @@ pthread_create(pthread_t* thread,
     static int npinned = 0;
     static int ncalled = 0;
     static int overflow = 0;
+    static int overflowed = 0;
     static int silent = 0;
     static int pin_ids[MAX_NUM_THREADS];
-    static uint64_t skipMask = 0;
-    static int got_skipMask = 0;
+    static uint64_t skipMask = 0x0;
+    static int ncpus = 0;
 
 
     /* On first entry: Get Evironment Variable and initialize pin_ids */
     if (ncalled == 0)
     {
-        char *str = getenv("LIKWID_SKIP");
+        char *str;
         char *token, *saveptr;
         char *delimiter = ",";
         int i = 0;
-        int ncpus = 0;
+        cpu_set_t cpuset;
 
+        str = getenv("LIKWID_SKIP");
         if (str != NULL)
         {
-            skipMask = strtoul(str, &str, 10);
-            got_skipMask = 1;
+            skipMask = strtoul(str, &str, 16);
         }
-
-        if ( got_skipMask == 0 && skipMask == 0x0 )
+        else if ( skipMask == 0x0 )
         {
             dlerror();    /* Clear any existing error */
             dlsym(RTLD_DEFAULT,"__kmpc_begin");
@@ -104,18 +117,16 @@ pthread_create(pthread_t* thread,
                 skipMask = 0x1;
             }
         }
+
+
         if (getenv("LIKWID_SILENT") != NULL)
         {
             silent = 1;
         }
-        else
-        {
-            color_on(BRIGHT, COLOR);
-        }
 
         if (!silent)
         {
-            printf("[pthread wrapper] ");
+            color_print("[pthread wrapper] \n");
         }
 
         str = getenv("LIKWID_PIN");
@@ -132,35 +143,32 @@ pthread_create(pthread_t* thread,
                     pin_ids[i++] = strtoul(token, &token, 10);
                 }
             }
-            ncpus--; /* last ID is the first (the process was pinned to) */
+            CPU_ZERO(&cpuset);
+            CPU_SET(pin_ids[ncpus-1], &cpuset);
+            ret = sched_setaffinity(getpid(), sizeof(cpu_set_t), &cpuset);
+            if (!silent)
+            {
+                color_print("[pthread wrapper] MAIN -> %d\n",pin_ids[ncpus-1]);
+            }
+            //ncpus--; /* last ID is the first (the process was pinned to) */
         }
         else
         {
-            printf("[pthread wrapper] ERROR: Environment Variabel LIKWID_PIN not set!\n");
+            color_print("[pthread wrapper] ERROR: Environment Variabel LIKWID_PIN not set!\n");
         }
 
         if (!silent)
         {
-            printf("[pthread wrapper] PIN_MASK: ");
+            color_print("[pthread wrapper] PIN_MASK: ");
 
-            for (int i=0;i<ncpus;i++)
+            for (int i=0;i<ncpus-1;i++)
             {
-                printf("%d->%d  ",i,pin_ids[i]);
+                color_print("%d->%d  ",i,pin_ids[i]);
             }
-            printf("\n");
-            printf("[pthread wrapper] SKIP MASK: 0x%llX\n",LLU_CAST skipMask);
+            color_print("\n[pthread wrapper] SKIP MASK: 0x%llX\n",LLU_CAST skipMask);
         }
 
-        overflow = ncpus;
-    }
-    else
-    {
-#ifdef COLOR
-        if (!silent)
-        {
-            color_on(BRIGHT, COLOR);
-        }
-#endif
+        overflow = ncpus-1;
     }
 
     /* Handle dll related stuff */
@@ -171,7 +179,7 @@ pthread_create(pthread_t* thread,
         {
             break;
         }
-        if (sosearchpaths[reallpthrindex] != NULL)
+        if (sosearchpaths[reallpthrindex] != NULL) 
         {
             reallpthrindex++;
         }
@@ -181,7 +189,7 @@ pthread_create(pthread_t* thread,
 
     if (!handle)
     {
-        printf("%s\n", dlerror());
+        color_print("%s\n", dlerror());
         return -1;
     }
 
@@ -190,7 +198,7 @@ pthread_create(pthread_t* thread,
 
     if ((error = dlerror()) != NULL)
     {
-        printf("%s\n", error);
+        color_print("%s\n", error);
         return -2;
     }
 
@@ -205,39 +213,39 @@ pthread_create(pthread_t* thread,
         {
             if (!silent)
             {
-                printf("\tthreadid %lu -> SKIP \n", *thread);
+                color_print("\tthreadid %lu -> SKIP \n", *thread);
             }
         }
         else
         {
             CPU_ZERO(&cpuset);
-            CPU_SET(pin_ids[npinned], &cpuset);
+            CPU_SET(pin_ids[npinned%ncpus], &cpuset);
             pthread_setaffinity_np(*thread, sizeof(cpu_set_t), &cpuset);
-
-            if (npinned == overflow)
+            if ((npinned == overflow) && (!overflowed))
             {
                 if (!silent)
                 {
-                    printf("Roundrobin placement triggered\n");
-                    printf("\tthreadid %lu -> core %d - OK", *thread, pin_ids[npinned]);
+                    color_print("Roundrobin placement triggered\n\tthreadid %lu -> core %d - OK", *thread, pin_ids[npinned%ncpus]);
                 }
-                npinned = 0;
+                overflowed = 1;
+                npinned = (npinned+1)%ncpus;
             }
             else
             {
                 if (!silent)
                 {
-                    printf("\tthreadid %lu -> core %d - OK", *thread, pin_ids[npinned]);
+                    color_print("\tthreadid %lu -> core %d - OK", *thread, pin_ids[npinned%ncpus]);
                 }
                 npinned++;
+                if ((npinned >= ncpus) && (overflowed))
+                {
+                    npinned = 0;
+                }
             }
 
             if (!silent)
             {
-#ifdef COLOR
-                color_reset();
-#endif
-                printf("\n");
+                color_print("\n");
             }
         }
     }
diff --git a/src/strUtil.c b/src/strUtil.c
deleted file mode 100644
index cf37920..0000000
--- a/src/strUtil.c
+++ /dev/null
@@ -1,975 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  strUtil.c
- *
- *      Description:  Utility routines for strings. Depends on bstring lib.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <sched.h>
-
-#include <error.h>
-#include <types.h>
-#include <bstrlib.h>
-#include <strUtil.h>
-#include <affinity.h>
-#include <cpuid.h>
-#include <pci.h>
-
-/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-static int
-cpu_count(cpu_set_t* set)
-{
-    uint32_t i;
-    int s = 0;
-    const __cpu_mask *p = set->__bits;
-    const __cpu_mask *end = &set->__bits[sizeof(cpu_set_t) / sizeof (__cpu_mask)];
-
-    while (p < end)
-    {
-        __cpu_mask l = *p++;
-
-        if (l == 0)
-        {
-            continue;
-        }
-
-        for (i=0; i< (sizeof(__cpu_mask)*8); i++)
-        {
-            if (l&(1UL<<i))
-            {
-            s++;
-            }
-        }
-    }
-
-    return s;
-}
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-int str2int(const char* str)
-{
-    char* endptr;
-    errno = 0;
-    unsigned long val;
-    val = strtoul(str, &endptr, 10);
-    if ((errno == ERANGE && val == LONG_MAX )
-        || (errno != 0 && val == 0))
-    {
-        ERROR;
-    }
-
-    if (endptr == str)
-    {
-        ERROR_PRINT(Cannot parse string %s to digits, str);
-    }
-
-    return (int) val;
-}
-
-uint32_t
-bstr_to_cpuset_physical(uint32_t* threads,  const_bstring q)
-{
-    int i;
-    unsigned int rangeBegin;
-    unsigned int rangeEnd;
-    uint32_t numThreads=0;
-    struct bstrList* tokens;
-    struct bstrList* subtokens;
-
-    tokens = bsplit(q,',');
-
-    for (i=0;i<tokens->qty;i++)
-    {
-        subtokens = bsplit(tokens->entry[i],'-');
-
-        if( subtokens->qty == 1 )
-        {
-            threads[numThreads] = str2int((char *) bdata(subtokens->entry[0]));
-            numThreads++;
-        }
-        else if ( subtokens->qty == 2 )
-        {
-            rangeBegin = str2int((char*) bdata(subtokens->entry[0]));
-            rangeEnd = str2int((char*) bdata(subtokens->entry[1]));
-
-            if (!(rangeBegin <= rangeEnd))
-            {
-                ERROR_PRINT(Range End %d bigger than begin %d, rangeEnd, rangeBegin);
-            }
-
-            while (rangeBegin <= rangeEnd) {
-                threads[numThreads] = rangeBegin;
-                numThreads++;
-                rangeBegin++;
-            }
-        }
-        else
-        {
-            ERROR_PLAIN_PRINT(Parse Error);
-        }
-        bstrListDestroy(subtokens);
-    }
-    if (numThreads > MAX_NUM_THREADS)
-    {
-        ERROR_PRINT(Number Of threads %d too large, numThreads);
-    }
-
-    bstrListDestroy(tokens);
-
-    return numThreads;
-}
-
-uint32_t
-bstr_to_cpuset_logical(uint32_t* threads,  const_bstring q)
-{
-    int i;
-    uint32_t j;
-    int id;
-    uint32_t tmpThreads[MAX_NUM_THREADS];
-    int globalNumThreads=0;
-    uint32_t numThreads=0;
-    struct bstrList* tokens;
-    struct bstrList* subtokens;
-    const AffinityDomain* domain;
-
-    tokens = bsplit(q,'@');
-
-    for (i=0;i<tokens->qty;i++)
-    {
-        subtokens = bsplit(tokens->entry[i],':');
-
-        if ( subtokens->qty == 2 )
-        {
-            domain =  affinity_getDomain(subtokens->entry[0]);
-
-            if (!domain)
-            {
-                ERROR_PRINT(Unknown domain ##%s##,bdata(subtokens->entry[0]));
-            }
-
-            numThreads = bstr_to_cpuset_physical(tmpThreads, subtokens->entry[1]);
-
-            for (j=0; j<numThreads; j++)
-                {
-                if (! (tmpThreads[j] >= domain->numberOfProcessors))
-                {
-                    id = (tmpThreads[j]/domain->numberOfCores) +
-                        (tmpThreads[j]%domain->numberOfCores) * cpuid_topology.numThreadsPerCore;
-                    threads[globalNumThreads++] = domain->processorList[id];
-                }
-                else
-                {
-                    ERROR_PRINT(Too many threads requested. Avaialable 0-%d,domain->numberOfProcessors-1);
-                }
-            }
-        }
-        else
-        {
-            ERROR_PLAIN_PRINT(Parse Error);
-        }
-        bstrListDestroy(subtokens);
-    }
-
-    bstrListDestroy(tokens);
-
-    return globalNumThreads;
-}
-
-#define PRINT_EXPR_ERR printf("SYNTAX ERROR: Expression must have the format E:<thread domain>:<num threads>[:chunk size>:<stride>]\n")
-
-uint32_t
-bstr_to_cpuset_expression(uint32_t* threads,  const_bstring qi)
-{
-    int i;
-    uint32_t j;
-    bstring q = (bstring) qi;
-    int globalNumThreads=0;
-    uint32_t numThreads=0;
-    struct bstrList* tokens;
-    struct bstrList* subtokens;
-    const AffinityDomain* domain;
-
-    bdelete (q, 0, 2);
-    tokens = bsplit(q,'@');
-
-    for (i=0;i<tokens->qty;i++)
-    {
-        subtokens = bsplit(tokens->entry[i],':');
-
-        if ( subtokens->qty == 2 )
-        {
-            domain =  affinity_getDomain(subtokens->entry[0]);
-
-            if (!domain)
-            {
-                ERROR_PRINT(Unknown domain ##%s##,bdata(subtokens->entry[0]));
-            }
-
-            numThreads = str2int(bdata(subtokens->entry[1]));
-
-            if (numThreads > domain->numberOfProcessors)
-            {
-                ERROR_PRINT(Invalid processor id requested. Avaialable 0-%d,
-                            domain->numberOfProcessors-1);
-            }
-
-            for (j=0; j<numThreads; j++)
-            {
-                threads[globalNumThreads++] = domain->processorList[j];
-            }
-        }
-        else if ( subtokens->qty == 4 )
-        {
-            int counter;
-            int currentId = 0;
-            int startId = 0;
-            int chunksize =  str2int(bdata(subtokens->entry[2]));
-            int stride =  str2int(bdata(subtokens->entry[3]));
-            domain = affinity_getDomain(subtokens->entry[0]);
-
-            if (!domain)
-            {
-                ERROR_PRINT(Unknown domain ##%s##,bdata(subtokens->entry[0]));
-            }
-
-            numThreads = str2int(bdata(subtokens->entry[1]));
-
-            if (numThreads > domain->numberOfProcessors)
-            {
-                ERROR_PRINT(Invalid number of processors requested. Available 0-%d,
-                            domain->numberOfProcessors-1);
-            }
-
-
-            counter = 0;
-            for (j=0; j<numThreads; j+=chunksize)
-            {
-                for(i=0;i<chunksize && j+i<numThreads ;i++)
-                {
-                    threads[globalNumThreads++] = domain->processorList[counter+i];
-                }
-                counter += stride;
-                if (counter >= domain->numberOfProcessors)
-                {
-                    counter = 0;
-                }
-            }
-        }
-        else
-        {
-            PRINT_EXPR_ERR;
-            ERROR_PLAIN_PRINT(Parse Error);
-        }
-        bstrListDestroy(subtokens);
-    }
-
-    bstrListDestroy(tokens);
-
-    return globalNumThreads;
-}
-
-uint32_t
-bstr_to_cpuset_scatter(uint32_t* threads,  const_bstring qi)
-{
-    int domainId = 0;
-    int id = 0;
-    int threadId = 0;
-    bstring q = (bstring) qi;
-    bstring domaintag;
-    int globalNumThreads=0;
-    struct bstrList* subtokens;
-    int numberOfDomains = 0;
-    AffinityDomain* domain;
-    AffinityDomain* tmpDomainPtr;
-
-    domain = (AffinityDomain*) malloc(cpuid_topology.numHWThreads * sizeof(AffinityDomain));
-
-    subtokens = bsplit(q,':');
-
-    if ( subtokens->qty == 2 )
-    {
-        for(int i =0;;i++)
-        {
-            domaintag = bformat("%s%d",bdata(subtokens->entry[0]),i);
-            tmpDomainPtr = (AffinityDomain*) affinity_getDomain(domaintag);
-
-            if (tmpDomainPtr == NULL)
-            {
-                break;
-            }
-            else
-            {
-                memcpy(domain+i,tmpDomainPtr,sizeof(AffinityDomain));
-                numberOfDomains++;
-            }
-        }
-
-        threads[globalNumThreads++] = domain[domainId].processorList[0];
-
-        for (uint32_t i=1; i<cpuid_topology.numHWThreads; i++)
-        {
-            domainId = i%numberOfDomains;
-
-            if (domainId == 0)
-            {
-                threadId++;
-            }
-
-            id = (threadId/domain->numberOfCores) +
-                (threadId%domain->numberOfCores) * cpuid_topology.numThreadsPerCore;
-
-            threads[globalNumThreads++] = domain[domainId].processorList[id];
-        }
-    }
-    else
-    {
-        PRINT_EXPR_ERR;
-        ERROR_PLAIN_PRINT(Parse Error);
-    }
-    bstrListDestroy(subtokens);
-    free(domain);
-
-    return globalNumThreads;
-}
-
-
-
-#define CPUSET_ERROR  \
-    if (cpuid_isInCpuset()) {  \
-        ERROR_PLAIN_PRINT(You are running inside a cpuset. In cpusets only logical pinning inside set is allowed!);  \
-    }
-
-
-
-int
-bstr_to_cpuset(int* threadsIN,  const_bstring q)
-{
-    uint32_t i;
-    int num=0;
-    int cpuMapping[cpuid_topology.numHWThreads];
-    cpu_set_t cpu_set;
-    uint32_t numThreads;
-    bstring domainStr = bformat("NSCM");
-    const_bstring  scatter = bformat("scatter");
-    struct bstrList* tokens;
-    CPU_ZERO(&cpu_set);
-    sched_getaffinity(0,sizeof(cpu_set_t), &cpu_set);
-    uint32_t* threads = (uint32_t*) threadsIN;
-
-    if (binchr (q, 0, domainStr) !=  BSTR_ERR)
-    {
-        CPUSET_ERROR;
-
-        if (binstr (q, 0 , scatter ) !=  BSTR_ERR)
-        {
-          numThreads =  bstr_to_cpuset_scatter(threads,q);
-        }
-        else if (bstrchr (q, 'E') !=  BSTR_ERR)
-        {
-          numThreads =  bstr_to_cpuset_expression(threads,q);
-        }
-        else
-        {
-          numThreads =  bstr_to_cpuset_logical(threads,q);
-        }
-    }
-    else if (bstrchr (q, 'L') !=  BSTR_ERR)
-    {
-        uint32_t count = cpu_count(&cpu_set);
-
-        tokens = bsplit(q,':');
-        numThreads = bstr_to_cpuset_physical(threads,tokens->entry[1]);
-
-        for (i=0; i <  cpuid_topology.numHWThreads; i++)
-        {
-            if (CPU_ISSET(i,&cpu_set))
-            {
-                cpuMapping[num++]=i;
-            }
-        }
-
-        for (i=0; i < numThreads; i++)
-        {
-            if (!(threads[i] >= count))
-            {
-                threads[i] = cpuMapping[threads[i]];
-            }
-            else
-            {
-                fprintf(stderr, "Available CPUs: ");
-                for (int j=0; j< num-1;j++)
-                {
-                    fprintf(stderr, "%d,", cpuMapping[j]);
-                }
-                fprintf(stderr, "%d\n", cpuMapping[num-1]);
-                ERROR_PRINT(Index %d out of range.,threads[i]);
-            }
-        }
-        bstrListDestroy(tokens);
-    }
-    else
-    {
-        CPUSET_ERROR;
-        numThreads = bstr_to_cpuset_physical(threads,q);
-    }
-
-    bdestroy(domainStr);
-    return (int) numThreads;
-}
-
-
-void
-bstr_to_eventset(StrUtilEventSet* set, const_bstring q)
-{
-    int i;
-    struct bstrList* tokens;
-    struct bstrList* subtokens;
-
-    tokens = bsplit(q,',');
-    set->numberOfEvents = tokens->qty;
-    set->events = (StrUtilEvent*)
-    malloc(set->numberOfEvents * sizeof(StrUtilEvent));
-
-    for (i=0;i<tokens->qty;i++)
-    {
-        subtokens = bsplit(tokens->entry[i],':');
-
-        if ( subtokens->qty != 2 )
-        {
-          
-            fprintf(stderr, "Cannot parse event string %s, probably missing counter name\n"
-                          ,bdata(tokens->entry[i]));
-            fprintf(stderr, "Format: <eventName>:<counter>,...\n");
-            msr_finalize();
-            pci_finalize();
-            exit(EXIT_FAILURE);
-
-        }
-        else
-        {
-            set->events[i].eventName = bstrcpy(subtokens->entry[0]);
-            set->events[i].counterName = bstrcpy(subtokens->entry[1]);
-        }
-
-        bstrListDestroy(subtokens);
-    }
-
-    bstrListDestroy(tokens);
-}
-
-FILE*
-bstr_to_outstream(const_bstring argString, bstring filter)
-{
-    int i;
-    char* cstr;
-    FILE* STREAM;
-    struct bstrList* tokens;
-    bstring base;
-    bstring suffix = bfromcstr(".");
-    bstring filename;
-
-    /* configure filter */
-    tokens = bsplit(argString,'.');
-
-    if (tokens->qty < 2)
-    {
-        fprintf(stderr, "Outputfile has no filetype suffix!\n");
-        fprintf(stderr, "Add suffix .txt for raw output or any supported filter suffix.\n");
-        exit(EXIT_FAILURE);
-    }
-
-    base = bstrcpy(tokens->entry[0]);
-
-    if (biseqcstr(tokens->entry[1],"txt"))
-    {
-        bassigncstr(filter, "NO");
-    }
-    else
-    {
-        bassigncstr(filter, TOSTRING(LIKWIDFILTERPATH));
-        bconchar(filter,'/');
-        bconcat(filter,tokens->entry[1]);
-    }
-
-    bconcat(suffix,tokens->entry[1]);
-    bstrListDestroy(tokens);
-
-    tokens = bsplit(base,'_');
-
-    if (tokens->qty < 1)
-    {
-        ERROR_PLAIN_PRINT(Error in parsing file string);
-    }
-
-    filename = bstrcpy(tokens->entry[0]);
-
-    for (i=1; i<tokens->qty; i++)
-    {
-        if (biseqcstr(tokens->entry[i],"%j"))
-        {
-            cstr = getenv("PBS_JOBID");
-            if (cstr != NULL) 
-            {
-                bcatcstr(filename, "_");
-                bcatcstr(filename, cstr);
-            }
-        }
-        else if (biseqcstr(tokens->entry[i],"%r"))
-        {
-            cstr = getenv("PMI_RANK");
-            if (cstr == NULL) 
-            {
-                cstr = getenv("OMPI_COMM_WORLD_RANK");
-            }
-            if (cstr != NULL) 
-            {
-                bcatcstr(filename, "_");
-                bcatcstr(filename, cstr);
-            }
-        }
-        else if (biseqcstr(tokens->entry[i],"%h"))
-        {
-            cstr = (char*) malloc(HOST_NAME_MAX * sizeof(char));
-            gethostname(cstr,HOST_NAME_MAX);
-            bcatcstr(filename, "_");
-            bcatcstr(filename, cstr);
-            free(cstr);
-        }
-        else if (biseqcstr(tokens->entry[i],"%p"))
-        {
-            bstring pid = bformat("_%d",getpid());
-            bconcat(filename, pid);
-            bdestroy(pid);
-        }
-        else 
-        {
-            ERROR_PLAIN_PRINT(Unsupported placeholder in filename!);
-        }
-    }
-
-    if (biseqcstr(filter,"NO"))
-    {
-        bconcat(filename, suffix);
-    }
-    else
-    {
-        bcatcstr(filter, " ");
-        bcatcstr(filename, ".tmp");
-        bconcat(filter, filename);
-    }
-
-    bstrListDestroy(tokens);
-    STREAM = fopen(bdata(filename),"w");
-    bdestroy(filename);
-    bdestroy(suffix);
-    bdestroy(base);
-
-    return STREAM;
-}
-
-
-uint64_t
-bstr_to_doubleSize(const_bstring str, DataType type)
-{
-    bstring unit = bmidstr(str, blength(str)-2, 2);
-    bstring sizeStr = bmidstr(str, 0, blength(str)-2);
-    uint64_t sizeU = str2int(bdata(sizeStr));
-    uint64_t junk = 0;
-    uint64_t bytesize = 0;
-
-    switch (type)
-    {
-        case SINGLE:
-        case SINGLE_RAND:
-            bytesize = sizeof(float);
-            break;
-
-        case DOUBLE:
-        case DOUBLE_RAND:
-            bytesize = sizeof(double);
-            break;
-    }
-
-    if (biseqcstr(unit, "kB")) {
-        junk = (sizeU *1024)/bytesize;
-    } else if (biseqcstr(unit, "MB")) {
-        junk = (sizeU *1024*1024)/bytesize;
-    } else if (biseqcstr(unit, "GB")) {
-        junk = (sizeU *1024*1024*1024)/bytesize;
-    }
-
-    return junk;
-}
-
-void
-bstr_to_interval(const_bstring str, struct timespec* interval)
-{
-    int size;
-    int pos;
-    bstring ms = bformat("ms");
-
-    if ((pos = bstrrchr (str, 's')) != BSTR_ERR)
-    {
-        if (pos != (blength(str)-1))
-        {
-            fprintf(stderr, "You need to specify a time unit s or ms like 200ms\n");
-            msr_finalize();
-            exit(EXIT_FAILURE);
-        }
-
-        /* unit is ms */
-        if (binstrr (str, blength(str), ms) != BSTR_ERR)
-        {
-            bstring sizeStr = bmidstr(str, 0, blength(str)-2);
-            size = str2int(bdata(sizeStr));
-            if (size >= 1000)
-            {
-                interval->tv_sec = size/1000;
-                interval->tv_nsec = (size%1000) * 1.E06;
-            }
-            else
-            {
-                interval->tv_sec = 0L;
-                interval->tv_nsec = size * 1.E06;
-            }
-        }
-        /* unit is s */
-        else 
-        {
-            bstring sizeStr = bmidstr(str, 0, blength(str)-1);
-            size = str2int(bdata(sizeStr));
-            interval->tv_sec = size;
-            interval->tv_nsec = 0L;
-        }
-    }
-    else
-    {
-        fprintf(stderr, "You need to specify a time unit s or ms like 200ms\n");
-        msr_finalize();
-        exit(EXIT_FAILURE);
-    }
-}
-
-
-void
-bstr_to_workgroup(Workgroup* group,
-    const_bstring str,
-    DataType type,
-    int numberOfStreams)
-{
-    uint32_t i;
-    int parseStreams = 0;
-    bstring threadInfo;
-    bstring streams= bformat("0");
-    struct bstrList* tokens;
-    struct bstrList* subtokens;
-    const AffinityDomain* domain;
-
-    /* split the workgroup into the thread and the streams part */
-    tokens = bsplit(str,'-');
-
-    if (tokens->qty == 2)
-    {
-        threadInfo = bstrcpy(tokens->entry[0]);
-        streams = bstrcpy(tokens->entry[1]);
-        parseStreams = 1;
-    }
-    else if (tokens->qty == 1)
-    {
-        threadInfo = bstrcpy(tokens->entry[0]);
-    }
-    else
-    {
-        ERROR_PLAIN_PRINT(Error in parsing workgroup string);
-    }
-
-    bstrListDestroy (tokens);
-    tokens = bsplit(threadInfo,':');
-
-    if (tokens->qty == 5)
-    {
-        uint32_t maxNumThreads;
-        int chunksize;
-        int stride;
-        int counter;
-        int currentId = 0;
-        int startId = 0;
-
-        domain = affinity_getDomain(tokens->entry[0]);
-
-        if (domain == NULL)
-        {
-          fprintf(stderr, "Error: Domain %s not available on current machine.\nTry likwid-bench -p for supported domains.",
-              bdata(tokens->entry[0]));
-          exit(EXIT_FAILURE);
-        }
-
-        group->size = bstr_to_doubleSize(tokens->entry[1], type);
-        group->numberOfThreads = str2int(bdata(tokens->entry[2]));
-        chunksize = str2int(bdata(tokens->entry[3]));
-        stride = str2int(bdata(tokens->entry[4]));
-        maxNumThreads = (domain->numberOfProcessors / stride) * chunksize;
-
-        if (group->numberOfThreads > maxNumThreads)
-        {
-          fprintf(stderr, "Error: Domain %s supports only up to %d threads with used expression.\n",
-                        bdata(tokens->entry[0]), maxNumThreads);
-          exit(EXIT_FAILURE);
-        }
-
-        group->processorIds = (int*) malloc(group->numberOfThreads * sizeof(int));
-
-        counter = chunksize;
-
-        for (i=0; i<group->numberOfThreads; i++)
-        {
-            if (counter)
-            {
-                group->processorIds[i] = domain->processorList[currentId++];
-            }
-            else
-            {
-                startId += stride;
-                currentId = startId;
-                group->processorIds[i] = domain->processorList[currentId++];
-                counter = chunksize;
-            }
-            counter--;
-        }
-    }
-    else if (tokens->qty == 3)
-    {
-        domain = affinity_getDomain(tokens->entry[0]);
-
-        if (domain == NULL)
-        {
-            fprintf(stderr, "Error: Domain %s not available on current machine.\n", bdata(tokens->entry[0]));
-            fprintf(stderr, "Try likwid-bench -p for supported domains.\n");
-            exit(EXIT_FAILURE);
-        }
-
-        group->size = bstr_to_doubleSize(tokens->entry[1], type);
-        group->numberOfThreads = str2int(bdata(tokens->entry[2]));
-
-        if (group->numberOfThreads > domain->numberOfProcessors)
-        {
-            fprintf(stderr, "Error: Domain %s supports only up to %d threads.\n",
-                            bdata(tokens->entry[0]),domain->numberOfProcessors);
-            exit(EXIT_FAILURE);
-        }
-
-        group->processorIds = (int*) malloc(group->numberOfThreads * sizeof(int));
-
-        for (i=0; i<group->numberOfThreads; i++)
-        {
-            group->processorIds[i] = domain->processorList[i];
-        }
-    }
-    else if (tokens->qty == 2)
-    {
-        domain = affinity_getDomain(tokens->entry[0]);
-
-        if (domain == NULL)
-        {
-            fprintf(stderr, "Error: Domain %s not available on current machine.\n",
-                            bdata(tokens->entry[0]));
-            fprintf(stderr, "Try likwid-bench -p for supported domains.\n");
-            exit(EXIT_FAILURE);
-        }
-
-        group->size = bstr_to_doubleSize(tokens->entry[1], type);
-        group->numberOfThreads = domain->numberOfProcessors;
-        group->processorIds = (int*) malloc(group->numberOfThreads * sizeof(int));
-
-        for (i=0; i<group->numberOfThreads; i++)
-        {
-            group->processorIds[i] = domain->processorList[i];
-        }
-    }
-    else
-    {
-    ERROR_PLAIN_PRINT(Error in parsing workgroup string);
-    }
-
-    bstrListDestroy(tokens);
-
-    /* parse stream list */
-    if (parseStreams)
-    {
-        tokens = bsplit(streams,',');
-
-        if (tokens->qty < numberOfStreams)
-        {
-            ERROR_PRINT(Testcase requires at least %d streams, numberOfStreams);
-        }
-
-        group->streams = (Stream*) malloc(numberOfStreams * sizeof(Stream));
-
-        for (i=0;i<(uint32_t) tokens->qty;i++)
-        {
-            subtokens = bsplit(tokens->entry[i],':');
-
-            if ( subtokens->qty == 3 )
-            {
-                int index = str2int(bdata(subtokens->entry[0]));
-                if (index >= numberOfStreams)
-                {
-                    ERROR_PRINT(Stream Index %d out of range,index);
-                }
-                group->streams[index].domain = bstrcpy(subtokens->entry[1]);
-                group->streams[index].offset = str2int(bdata(subtokens->entry[2]));
-            }
-            else if ( subtokens->qty == 2 )
-            {
-                int index = str2int(bdata(subtokens->entry[0]));
-                if (index >= numberOfStreams)
-                {
-                    ERROR_PRINT(Stream Index %d out of range,index);
-                }
-                group->streams[index].domain = bstrcpy(subtokens->entry[1]);
-                group->streams[index].offset = 0;
-            }
-            else
-            {
-                ERROR_PLAIN_PRINT(Error in parsing event string);
-            }
-
-            bstrListDestroy(subtokens);
-        }
-
-        bstrListDestroy(tokens);
-    }
-    else
-    {
-        group->streams = (Stream*) malloc(numberOfStreams * sizeof(Stream));
-
-        for (i=0; i< (uint32_t)numberOfStreams; i++)
-        {
-            group->streams[i].domain = domain->tag;
-            group->streams[i].offset = 0;
-        }
-    }
-
-    group->size /= numberOfStreams;
-}
-
-
-#define INIT_SECURE_INPUT_LENGTH 256
-
-bstring
-bSecureInput (int maxlen, char* vgcCtx) {
-    int i, m, c = 1;
-    bstring b, t;
-    int termchar = 0;
-
-    if (!vgcCtx) return NULL;
-
-    b = bfromcstralloc (INIT_SECURE_INPUT_LENGTH, "");
-
-    for (i=0; ; i++)
-    {
-        if (termchar == c)
-        {
-            break;
-        }
-        else if ((maxlen > 0) && (i >= maxlen))
-        {
-            b = NULL;
-            return b;
-        }
-        else
-        {
-            c = *(vgcCtx++);
-        }
-
-        if (EOF == c)
-        {
-            break;
-        }
-
-        if (i+1 >= b->mlen) {
-
-            /* Double size, but deal with unusual case of numeric
-             overflows */
-
-            if ((m = b->mlen << 1)   <= b->mlen &&
-                (m = b->mlen + 1024) <= b->mlen &&
-                (m = b->mlen + 16)   <= b->mlen &&
-                (m = b->mlen + 1)    <= b->mlen)
-            {
-                t = NULL;
-            }
-            else
-            {
-                t = bfromcstralloc (m, "");
-            }
-
-            if (t)
-            {
-                memcpy (t->data, b->data, i);
-            }
-
-            bdestroy (b); /* Clean previous buffer */
-            b = t;
-            if (!b)
-            {
-                return b;
-            }
-        }
-
-        b->data[i] = (unsigned char) c;
-    }
-
-    i--;
-    b->slen = i;
-    b->data[i] = (unsigned char) '\0';
-    return b;
-}
-
-
-int
-bJustifyCenter (bstring b, int width) 
-{
-    unsigned char space  = ' ';
-    int alignSpace = (width - b->slen) / 2;
-    int restSpace = (width - b->slen) % 2;
-    if (width <= 0) return -__LINE__;
-
-    if (b->slen <= width)
-    {
-        binsertch (b, 0, alignSpace, space);
-    }
-
-    binsertch (b, b->slen , alignSpace+restSpace, space);
-
-    return BSTR_OK;
-}
-
-
diff --git a/src/thermal.c b/src/thermal.c
index 0812086..e5cf7a9 100644
--- a/src/thermal.c
+++ b/src/thermal.c
@@ -5,13 +5,13 @@
  *
  *      Description:  Module implementing Intel TM/TM2 interface
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -34,7 +34,7 @@
 
 #include <types.h>
 #include <thermal.h>
-#include <cpuid.h>
+#include <topology.h>
 
 /* #####   EXPORTED VARIABLES   ########################################### */
 
@@ -49,10 +49,15 @@ ThermalInfo thermal_info;
 void thermal_init(int cpuId)
 {
     uint64_t flags=0ULL;
+    HPMinit();
+    HPMaddThread(cpuId);
 
     if ( cpuid_hasFeature(TM2) )
     {
-        flags = msr_read(cpuId, IA32_THERM_STATUS);
+        if (HPMread(cpuId, MSR_DEV, IA32_THERM_STATUS, &flags))
+        {
+            return;
+        }
 
         if ( flags & 0x1 )
         {
@@ -66,7 +71,10 @@ void thermal_init(int cpuId)
         thermal_info.resolution =  extractBitField(flags,4,27);
 
         flags = 0ULL;
-        flags = msr_read(cpuId, MSR_TEMPERATURE_TARGET);
+        if (HPMread(cpuId, MSR_DEV, MSR_TEMPERATURE_TARGET, &flags))
+        {
+            return;
+        }
         thermal_info.activationT =  extractBitField(flags,8,16);
         thermal_info.offset = extractBitField(flags,6,24);
     }
diff --git a/src/threads.c b/src/threads.c
deleted file mode 100644
index 87fa2b2..0000000
--- a/src/threads.c
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  threads.c
- *
- *      Description:  High level interface to pthreads
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-/* #####   HEADER FILE INCLUDES   ######################################### */
-
-#include <stdlib.h>
-#include <stdio.h>
-
-#include <error.h>
-#include <types.h>
-#include <threads.h>
-
-
-/* #####   EXPORTED VARIABLES   ########################################### */
-
-pthread_barrier_t threads_barrier;
-ThreadData* threads_data;
-ThreadGroup* threads_groups;
-
-/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
-
-static pthread_t* threads = NULL;
-static pthread_attr_t attr;
-static int numThreads = 0;
-
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-void
-threads_init(FILE* OUTSTREAM, int numberOfThreads)
-{
-    int i;
-    numThreads = numberOfThreads;
-
-    threads = (pthread_t*) malloc(numThreads * sizeof(pthread_t));
-    threads_data = (ThreadData*) malloc(numThreads * sizeof(ThreadData));
-
-    for(i = 0; i < numThreads; i++)
-    {
-        threads_data[i].numberOfThreads = numThreads;
-        threads_data[i].globalNumberOfThreads = numThreads;
-        threads_data[i].globalThreadId = i;
-        threads_data[i].threadId = i;
-        threads_data[i].output = OUTSTREAM;
-    }
-
-    pthread_barrier_init(&threads_barrier, NULL, numThreads);
-    pthread_attr_init(&attr);
-    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-}
-
-
-void 
-threads_create(void *(*startRoutine)(void*))
-{
-    int i;
-
-    for(i = 0; i < numThreads; i++)
-    {
-        pthread_create(&threads[i],
-                &attr,
-                startRoutine,
-                (void*) &threads_data[i]); 
-    }
-}
-
-void 
-threads_createGroups(int numberOfGroups)
-{
-    int i;
-    int j;
-    int numThreadsPerGroup;
-    int globalId = 0;
-
-    if (numThreads % numberOfGroups)
-    {
-        ERROR_PRINT(Not enough threads %d to create %d groups,numThreads,numberOfGroups);
-    }
-    else
-    {
-        numThreadsPerGroup = numThreads / numberOfGroups;
-    }
-
-    threads_groups = (ThreadGroup*) malloc(numberOfGroups *
-            sizeof(ThreadGroup));
-
-    for (i = 0; i < numberOfGroups; i++)
-    {
-        threads_groups[i].numberOfThreads = numThreadsPerGroup;
-        threads_groups[i].threadIds = (int*) malloc(numThreadsPerGroup *
-                sizeof(int));
-
-        for (j = 0; j < numThreadsPerGroup; j++)
-        {
-            threads_data[globalId].threadId = j;
-            threads_data[globalId].groupId = i;
-            threads_data[globalId].numberOfGroups = numberOfGroups;
-            threads_data[globalId].numberOfThreads = numThreadsPerGroup;
-            threads_groups[i].threadIds[j] = globalId++;
-        }
-    }
-}
-
-
-void 
-threads_registerDataAll(ThreadUserData* data, threads_copyDataFunc func)
-{
-    int i;
-
-    if (func == NULL)
-    {
-        for(i = 0; i < numThreads; i++)
-        {
-            threads_data[i].data = (*data);
-        }
-    }
-    else
-    {
-        for(i = 0; i < numThreads; i++)
-        {
-            func( data, &threads_data[i].data);
-        }
-    }
-}
-
-void
-threads_registerDataThread(int threadId,
-        ThreadUserData* data,
-        threads_copyDataFunc func)
-{
-    if (func == NULL)
-    {
-        threads_data[threadId].data = (*data);
-    }
-    else
-    {
-        func( data, &threads_data[threadId].data);
-    }
-}
-
-void
-threads_registerDataGroup(int groupId,
-        ThreadUserData* data,
-        threads_copyDataFunc func)
-{
-    int i;
-
-    if (func == NULL)
-    {
-        for (i = 0; i < threads_groups[groupId].numberOfThreads; i++)
-        {
-            threads_data[threads_groups[groupId].threadIds[i]].data = (*data);
-        }
-    }
-    else
-    {
-        for (i = 0; i < threads_groups[groupId].numberOfThreads; i++)
-        {
-            func( data,
-                    &threads_data[threads_groups[groupId].threadIds[i]].data);
-        }
-    }
-}
-
-void
-threads_join(void)
-{
-    int i;
-
-    for(i=0; i < numThreads; i++)
-    {
-        pthread_join(threads[i], NULL);
-    }
-
-    pthread_attr_destroy(&attr);
-    pthread_barrier_destroy(&threads_barrier);
-}
-
-void
-threads_destroy(int numberOfGroups)
-{
-    int i;
-    free(threads_data);
-    for(i=0;i<numberOfGroups;i++)
-    {
-        free(threads_groups[i].threadIds);
-    }
-    free(threads_groups);
-    free(threads);
-}
diff --git a/src/timer.c b/src/timer.c
index 337c13d..ce43bba 100644
--- a/src/timer.c
+++ b/src/timer.c
@@ -5,13 +5,13 @@
  *
  *      Description:  Implementation of timer module
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -27,23 +27,182 @@
  *
  * =======================================================================================
  */
-
+/* #####   HEADER FILE INCLUDES   ######################################### */
 #include <stdlib.h>
 #include <stdio.h>
-#include <time.h>
+#include <unistd.h>
 #include <sys/time.h>
+#include <time.h>
 
 #include <types.h>
-#include <timer.h>
+#include <error.h>
+#include <likwid.h>
+#include <cpuid.h>
 
+/* #####   EXPORTED VARIABLES   ########################################### */
+/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
 static uint64_t baseline = 0ULL;
 static uint64_t cpuClock = 0ULL;
+static uint64_t cyclesClock = 0ULL;
+static uint64_t sleepbase = 0ULL;
+static int timer_initialized = 0;
+
+void (*TSTART)(TscCounter*) = NULL;
+void (*TSTOP)(TscCounter*) = NULL;
+
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+#if defined(__x86_64)
+static void fRDTSC(TscCounter* cpu_c)   /* TSC reader that timer_init() installs as TSTART */
+{   /* order: clear eax, cpuid, rdtsc; then eax -> cpu_c->int32.lo and edx -> cpu_c->int32.hi */
+    __asm__ volatile("xor %%eax,%%eax\n\t"           \
+    "cpuid\n\t"           \
+    "rdtsc\n\t"           \
+    "movl %%eax, %0\n\t"  \
+    "movl %%edx, %1\n\t"  \
+    : "=r" ((cpu_c)->int32.lo), "=r" ((cpu_c)->int32.hi) \
+    : : "%eax","%ebx","%ecx","%edx");   /* all four registers are declared clobbered (cpuid writes them) */
+}
+
+static void fRDTSC_CR(TscCounter* cpu_c)   /* bare rdtsc without cpuid; timer_init() uses it as TSTOP when CPUID 0x80000001 edx bit 27 is clear or on __MIC__ */
+{   /* eax -> cpu_c->int32.lo, edx -> cpu_c->int32.hi */
+    __asm__ volatile(   \
+    "rdtsc\n\t"           \
+    "movl %%eax, %0\n\t"  \
+    "movl %%edx, %1\n\t"  \
+    : "=r" ((cpu_c)->int32.lo), "=r" ((cpu_c)->int32.hi) \
+    : : "%eax","%ebx","%ecx","%edx");   /* ebx/ecx are listed as clobbered too, although only eax/edx are read here */
+}
+#ifndef __MIC__
+static void fRDTSCP(TscCounter* cpu_c)   /* TSTOP reader chosen by timer_init() when CPUID 0x80000001 edx bit 27 is set */
+{   /* order: rdtscp, copy eax/edx into cpu_c->int32.lo/.hi, and only then cpuid */
+    __asm__ volatile(     \
+    "rdtscp\n\t"          \
+    "movl %%eax, %0\n\t"  \
+    "movl %%edx, %1\n\t"  \
+    "cpuid\n\t"           \
+    : "=r" ((cpu_c)->int32.lo), "=r" ((cpu_c)->int32.hi) \
+    : : "%eax","%ebx","%ecx","%edx");   /* the trailing cpuid overwrites all four registers */
+}
+#endif
+#endif
+
+#if defined(__i386__)
+static void fRDTSC(TscCounter* cpu_c)   /* i386 build of the TSTART reader: cpuid, then rdtsc */
+{   /* eax -> cpu_c->int32.lo, edx -> cpu_c->int32.hi */
+    uint64_t tmp;   /* scratch slot: the xchgl pair parks ebx here before cpuid and swaps it back afterwards */
+    __asm__ volatile( \
+    "xchgl %%ebx, %2\n\t"  \
+    "xor %%eax,%%eax\n\t" \
+    "cpuid\n\t"           \
+    "rdtsc\n\t"           \
+    "movl %%eax, %0\n\t"  \
+    "movl %%edx, %1\n\t"  \
+    "xchgl %2, %%ebx\n\t"  \
+    : "=r" ((cpu_c)->int32.lo), "=r" ((cpu_c)->int32.hi), "=m" (tmp) \
+    : : "%eax","%ecx","%edx");   /* NOTE(review): ebx is saved by hand and not listed here; nothing visible stops the compiler from picking ebx for %0/%1 - confirm */
+}
 
+static void fRDTSC_CR(TscCounter* cpu_c)   /* i386 build of the bare rdtsc reader (no cpuid, so ebx needs no saving) */
+{   /* eax -> cpu_c->int32.lo, edx -> cpu_c->int32.hi */
+    __asm__ volatile(     \
+    "rdtsc\n\t"           \
+    "movl %%eax, %0\n\t"  \
+    "movl %%edx, %1\n\t"  \
+    : "=r" ((cpu_c)->int32.lo), "=r" ((cpu_c)->int32.hi) \
+    : : "%eax","%edx");   /* only the two registers read above are declared clobbered */
+}
+#ifndef __MIC__
+static void fRDTSCP(TscCounter* cpu_c)   /* i386 build of the rdtscp-based TSTOP reader */
+{   /* order: rdtscp, copy eax/edx into cpu_c->int32.lo/.hi, then cpuid with ebx parked in tmp */
+    uint64_t tmp;   /* scratch slot that holds ebx while cpuid runs */
+    __asm__ volatile(     \
+    "rdtscp\n\t"          \
+    "movl %%eax, %0\n\t"  \
+    "movl %%edx, %1\n\t"  \
+    "xchgl %%ebx, %2\n\t"  \
+    "cpuid\n\t"           \
+    "xchgl %2, %%ebx\n\t"  \
+    : "=r" ((cpu_c)->int32.lo), "=r" ((cpu_c)->int32.hi), "=m" (tmp) \
+    : : "%eax","%ecx","%edx");   /* NOTE(review): ebx is restored by hand and not listed; confirm it cannot be chosen for %0/%1 */
+}
+#endif
+#endif
+static void _timer_start( TimerData* time )
+{
+#if defined(__x86_64) || defined(__i386__)
+    if (TSTART)
+        TSTART(&(time->start));
+#endif
+#ifdef _ARCH_PPC
+    uint32_t tbl, tbu0, tbu1;
+
+    do {
+        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
+        __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
+        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
+    } while (tbu0 != tbu1);
 
-static uint64_t
+    time->start.int64 = (((uint64_t)tbu0) << 32) | tbl;
+#endif
+}
+
+static void _timer_stop( TimerData* time )   /* unchecked worker behind timer_stop(): writes the stop timestamp into time->stop */
+{
+#if defined(__x86_64) || defined(__i386__)
+    if (TSTOP)   /* NULL until timer_init() selects a reader; in that case nothing is recorded */
+        TSTOP(&(time->stop));
+#endif
+#ifdef _ARCH_PPC
+    uint32_t tbl, tbu0, tbu1;
+    do {   /* read upper half, lower half, upper half again; retry when the upper half changed in between */
+        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
+        __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
+        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
+    } while (tbu0 != tbu1);
+    /* join the two consistent 32-bit halves into one 64-bit value */
+    time->stop.int64 = (((uint64_t)tbu0) << 32) | tbl;
+#endif
+}
+
+static uint64_t _timer_printCycles( TimerData* time )
+{
+    /* Cycles between start and stop minus the measured call overhead (baseline);
+     * 0 if both stamps are equal or stop - baseline lies before start. */
+    const uint64_t begin = time->start.int64;
+    const uint64_t end = time->stop.int64;
+
+    if ((begin == end) || ((end - baseline) < begin))
+    {
+        return 0ULL;
+    }
+    return (end - begin - baseline);
+}
+
+/* Return time duration in seconds (0.0 while the cycle clock is unknown) */
+static double _timer_print( TimerData* time )
+{
+    /* Same clamped cycle count as _timer_printCycles(): 0 if both stamps are
+     * equal or stop - baseline lies before start. */
+    const uint64_t cycles = _timer_printCycles(time);
+
+    /* cyclesClock is 0 until getCpuSpeed() has set it. Dividing by it would
+     * yield inf/NaN, which init_sleep() converts to an integer. */
+    if (cyclesClock == 0ULL)
+    {
+        return 0.0;
+    }
+
+    return  ((double) cycles / (double) cyclesClock);
+}
+
+static void
 getCpuSpeed(void)
 {
-#ifdef __x86_64
+#if defined(__x86_64) || defined(__i386__)
+    int i;
     TimerData data;
     TscCounter start;
     TscCounter stop;
@@ -51,36 +210,40 @@ getCpuSpeed(void)
     struct timeval tv1;
     struct timeval tv2;
     struct timezone tzp;
-    struct timespec delay = { 0, 800000000 }; /* calibration time: 800 ms */
+    struct timespec delay = { 0, 500000000 }; /* calibration time: 500 ms */
 
-    for (int i=0; i< 10; i++)
+    for (i=0; i< 10; i++)
     {
-        timer_start(&data);
-        timer_stop(&data);
-        result = MIN(result,timer_printCycles(&data));
+        _timer_start(&data);
+        _timer_stop(&data);
+        result = MIN(result,_timer_printCycles(&data));
     }
 
     baseline = result;
     result = 0xFFFFFFFFFFFFFFFFULL;
+    data.stop.int64 = 0;
+    data.start.int64 = 0;
 
-    for (int i=0; i< 2; i++)
+    for (i=0; i< 2; i++)
     {
-        RDTSC(start);
+        _timer_start(&data);
         gettimeofday( &tv1, &tzp);
         nanosleep( &delay, NULL);
-        RDTSC_STOP(stop);
+        _timer_stop(&data);
         gettimeofday( &tv2, &tzp);
 
-        result = MIN(result,(stop.int64 - start.int64));
+        result = MIN(result,(data.stop.int64 - data.start.int64));
     }
 
-    return (result) * 1000000 /
+    cpuClock = (result) * 1000000 /
         (((uint64_t)tv2.tv_sec * 1000000 + tv2.tv_usec) -
          ((uint64_t)tv1.tv_sec * 1000000 + tv1.tv_usec));
+    cyclesClock = cpuClock;
 #endif
 #ifdef _ARCH_PPC
     FILE *fpipe;
     char *command="grep timebase /proc/cpuinfo | awk '{ print $3 }'";
+    char *command2="grep clock /proc/cpuinfo | head -n 1 | awk '{ print $3 }'";
     char buff[256];
 
     if ( !(fpipe = (FILE*)popen(command,"r")) )
@@ -91,55 +254,223 @@ getCpuSpeed(void)
 
     fgets(buff, 256, fpipe);
 
-    return (uint64_t)   atoi(buff);
+    cyclesClock = (uint64_t)   atoi(buff);
+    if ( !(fpipe = (FILE*)popen(command2,"r")) )
+    {  // If fpipe is NULL
+        perror("Problems with pipe");
+        exit(1);
+    }
+
+    fgets(buff, 256, fpipe);
+
+    cpuClock = (uint64_t)   atoi(buff);
+    cpuClock *= 1E6;
 #endif
 }
 
 
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+
+void init_sleep()
+{
+    /* Calibrate sleepbase: time a minimal 1 ns clock_nanosleep() ten times and
+     * raise sleepbase to measurement + 2 (microseconds) whenever one exceeds it. */
+    TimerData timer;
+    struct timespec req = {0,1};
+    struct timespec rem = {0,0};
+    timer.start.int64 = timer.stop.int64 = 0;   /* defined even if TSTART/TSTOP are still NULL */
+    for (int i=0; i<10; ++i)
+    {
+        _timer_start(&timer);
+        (void) clock_nanosleep(CLOCK_REALTIME,0,&req, &rem);   /* only its duration matters */
+        _timer_stop(&timer);
+        const double usecs = _timer_print(&timer)*1E6;   /* converted once, used twice */
+        if (usecs > sleepbase) { sleepbase = usecs + 2; }
+    }
+}
+
+
 void timer_init( void )
 {
-    cpuClock = getCpuSpeed();
+    uint32_t eax = 0x0,ebx = 0x0,ecx = 0x0,edx = 0x0;
+    if (timer_initialized == 1)
+    {
+        return;
+    }
+    if ((!TSTART) && (!TSTOP))
+    {
+        TSTART = fRDTSC;
+        eax = 0x80000001;
+        CPUID (eax, ebx, ecx, edx);
+#ifndef __MIC__
+        if (edx & (1<<27))
+        {
+            TSTOP = fRDTSCP;
+        }
+        else
+        {
+            TSTOP = fRDTSC_CR;
+        }
+#else
+        TSTOP = fRDTSC_CR;
+#endif
+    }
+    if (cpuClock == 0ULL)
+    {
+        getCpuSpeed();
+    }
+    timer_initialized = 1;
 }
 
 uint64_t timer_printCycles( TimerData* time )
 {
-    /* clamp to zero if something goes wrong */
-    if ((time->stop.int64-baseline) < time->start.int64)
+    if (timer_initialized != 1)
     {
+        ERROR_PLAIN_PRINT(Timer module not properly initialized);
         return 0ULL;
     }
-    else
-    {
-        return (time->stop.int64 - time->start.int64 - baseline);
-    }
+    return _timer_printCycles(time);
 }
 
 /* Return time duration in seconds */
 double timer_print( TimerData* time )
 {
     uint64_t cycles;
+    if (timer_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Timer module not properly initialized);
+        return 0ULL;
+    }
+    return _timer_print(time);
+}
 
-    /* clamp to zero if something goes wrong */
-    if ((time->stop.int64-baseline) < time->start.int64)
+uint64_t timer_getCpuClock( void )
+{
+    if (timer_initialized != 1)
     {
-        cycles = 0ULL;
+        ERROR_PLAIN_PRINT(Timer module not properly initialized);
+        return 0ULL;
     }
-    else
+    return cpuClock;
+}
+
+uint64_t timer_getCpuClockCurrent( int cpu_id )
+{
+    int err;
+    uint64_t clock = 0x0ULL;
+    FILE *fpipe;
+    char cmd[256];
+    char buff[256];
+    char* eptr, *rptr;
+
+    sprintf(buff, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_cur_freq", cpu_id);
+    if (access(buff, R_OK))
     {
-        cycles = time->stop.int64 - time->start.int64 - baseline;
+        ERROR_PRINT(File %s not readable, buff);
+        return clock;
+    }
+    sprintf(cmd, "cat %s", buff);
+    if ( !(fpipe = (FILE*)popen(cmd,"r")) )
+    {  // If fpipe is NULL
+        ERROR_PRINT(Problems reading cpu frequency of CPU %d, cpu_id);
+        return clock;
     }
 
-    return  ((double) cycles / (double) cpuClock);
+    rptr = fgets(buff, 256, fpipe);
+    if (rptr != NULL)
+    {
+        clock = strtoull(buff, &eptr, 10);
+    }
+    return clock *1E3;
 }
 
-uint64_t timer_getCpuClock( void )
+uint64_t timer_getCycleClock( void )
 {
-    return cpuClock;
+    if (timer_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Timer module not properly initialized);
+        return 0ULL;
+    }
+    return cyclesClock;
 }
 
 uint64_t timer_getBaseline( void )
 {
+    if (timer_initialized != 1)
+    {
+        ERROR_PLAIN_PRINT(Timer module not properly initialized);
+        return 0ULL;
+    }
     return baseline;
 }
 
+void timer_start( TimerData* time )
+{
+    /* Take the start timestamp; before timer_init() only an error is printed. */
+    if (timer_initialized == 1)
+    {
+        _timer_start(time);
+    }
+    else { ERROR_PLAIN_PRINT(Timer module not properly initialized); }
+}
+
+
+void timer_stop( TimerData* time )
+{
+    /* Take the stop timestamp; before timer_init() only an error is printed. */
+    if (timer_initialized == 1)
+    {
+        _timer_stop(time);
+    }
+    else { ERROR_PLAIN_PRINT(Timer module not properly initialized); }
+}
+
+
 
+int timer_sleep(unsigned long usec)
+{
+    /* Sleep for roughly usec microseconds. Returns 0 on success; otherwise the
+     * status of sleep()/clock_nanosleep(), except that an interrupted
+     * sub-second sleep returns the remaining time in microseconds.
+     * Requests of one second or more are truncated to whole seconds. */
+    int status = -1;
+    struct timespec req = {0,0};
+    struct timespec rem = {0,0};
+    if (sleepbase == 0x0ULL) { init_sleep(); }
+    if (usec >= 1000000)
+    {
+        status = sleep(usec / 1000000);
+    }
+    else
+    {
+        /* Deduct the calibrated call overhead but clamp at zero: the operands
+         * are unsigned, so usec < sleepbase would otherwise wrap around. */
+        req.tv_nsec = (usec > sleepbase) ? (long)((usec - sleepbase) * 1000) : 0;
+        /* clock_nanosleep() returns the error number itself, not -1 plus errno. */
+        status = clock_nanosleep(CLOCK_REALTIME,0,&req, &rem);
+        if (status == EINTR) { status = (rem.tv_sec * 1E6) + (rem.tv_nsec * 1E-3); }
+    }
+    return status;
+}
+
+
+void timer_finalize(void)
+{
+    /* Return the module to its uninitialized state: clear baseline, cpuClock and
+     * the TSC readers. cyclesClock and sleepbase are left untouched. */
+    if (timer_initialized == 1)
+    {
+        timer_initialized = 0;
+        TSTART = TSTOP = NULL;
+        baseline = cpuClock = 0ULL;
+    }
+    else { ERROR_PLAIN_PRINT(Timer module not properly initialized); }
+
+}
+
+void timer_reset( TimerData* time )
+{
+    /* Zero both timestamps of the given timer. */
+    time->start.int64 = time->stop.int64 = 0;
+}
diff --git a/src/topology.c b/src/topology.c
new file mode 100644
index 0000000..602abf2
--- /dev/null
+++ b/src/topology.c
@@ -0,0 +1,1041 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  topology.c
+ *
+ *      Description:  Interface to the topology backends
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Authors:  Jan Treibig (jt), jan.treibig at gmail.com,
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sched.h>
+
+#include <likwid.h>
+
+#include <topology.h>
+#include <error.h>
+#include <tree.h>
+#include <bitUtil.h>
+//#include <strUtil.h>
+#include <configuration.h>
+
+
+static int topology_initialized = 0;
+CpuInfo cpuid_info;
+CpuTopology cpuid_topology;
+
+int affinity_thread2tile_lookup[MAX_NUM_THREADS];
+
+static char* pentium_m_b_str = "Intel Pentium M Banias processor";
+static char* pentium_m_d_str = "Intel Pentium M Dothan processor";
+static char* core_duo_str = "Intel Core Duo processor";
+static char* core_2a_str = "Intel Core 2 65nm processor";
+static char* core_2b_str = "Intel Core 2 45nm processor";
+static char* atom_45_str = "Intel Atom 45nm processor";
+static char* atom_32_str = "Intel Atom 32nm processor";
+static char* atom_22_str = "Intel Atom 22nm processor";
+static char* atom_silvermont_str = "Intel Atom (Silvermont) processor";
+static char* atom_airmont_str = "Intel Atom (Airmont) processor";
+static char* atom_goldmont_str = "Intel Atom (Goldmont) processor";
+static char* nehalem_bloom_str = "Intel Core Bloomfield processor";
+static char* nehalem_lynn_str = "Intel Core Lynnfield processor";
+static char* nehalem_west_str = "Intel Core Westmere processor";
+static char* sandybridge_str = "Intel Core SandyBridge processor";
+static char* ivybridge_str = "Intel Core IvyBridge processor";
+static char* ivybridge_ep_str = "Intel Xeon IvyBridge EN/EP/EX processor";
+static char* sandybridge_ep_str = "Intel Xeon SandyBridge EN/EP processor";
+static char* haswell_str = "Intel Core Haswell processor";
+static char* haswell_ep_str = "Intel Xeon Haswell EN/EP/EX processor";
+static char* broadwell_str = "Intel Core Broadwell processor";
+static char* broadwell_d_str = "Intel Xeon D Broadwell processor";
+static char* broadwell_ep_str = "Intel Xeon Broadwell EN/EP/EX processor";
+static char* skylake_str = "Intel Skylake processor";
+static char* nehalem_ex_str = "Intel Nehalem EX processor";
+static char* westmere_ex_str = "Intel Westmere EX processor";
+static char* xeon_mp_string = "Intel Xeon MP processor";
+static char* xeon_phi_string = "Intel Xeon Phi (Knights Corner) Coprocessor";
+static char* xeon_phi2_string = "Intel Xeon Phi (Knights Landing) Coprocessor";
+static char* barcelona_str = "AMD Barcelona processor";
+static char* shanghai_str = "AMD Shanghai processor";
+static char* istanbul_str = "AMD Istanbul processor";
+static char* magnycours_str = "AMD Magny Cours processor";
+static char* interlagos_str = "AMD Interlagos processor";
+static char* kabini_str = "AMD Family 16 model - Kabini processor";
+static char* opteron_sc_str = "AMD Opteron single core 130nm processor";
+static char* opteron_dc_e_str = "AMD Opteron Dual Core Rev E 90nm processor";
+static char* opteron_dc_f_str = "AMD Opteron Dual Core Rev F 90nm processor";
+static char* athlon64_str = "AMD Athlon64 X2 (AM2) Rev F 90nm processor";
+static char* athlon64_f_str = "AMD Athlon64 (AM2) Rev F 90nm processor";
+static char* athlon64_X2_g_str = "AMD Athlon64 X2 (AM2) Rev G 65nm processor";
+static char* athlon64_g_str = "AMD Athlon64 (AM2) Rev G 65nm processor";
+static char* amd_k8_str = "AMD K8 architecture";
+static char* unknown_intel_str = "Unknown Intel Processor";
+static char* unknown_amd_str = "Unknown AMD Processor";
+
+static char* short_core2 = "core2";
+static char* short_atom = "atom";
+static char* short_pm = "pentiumm";
+static char* short_silvermont = "silvermont";
+static char* short_goldmont = "goldmont";
+static char* short_nehalem = "nehalem";
+static char* short_nehalemEX = "nehalemEX";
+static char* short_westmere = "westmere";
+static char* short_westmereEX = "westmereEX";
+static char* short_haswell = "haswell";
+static char* short_haswell_ep = "haswellEP";
+static char* short_broadwell = "broadwell";
+static char* short_broadwell_d = "broadwellD";
+static char* short_broadwell_ep = "broadwellEP";
+static char* short_ivybridge = "ivybridge";
+static char* short_ivybridge_ep = "ivybridgeEP";
+static char* short_sandybridge = "sandybridge";
+static char* short_sandybridge_ep = "sandybridgeEP";
+static char* short_skylake = "skylake";
+static char* short_phi = "phi";
+static char* short_phi2 = "phi2";
+static char* short_k8 = "k8";
+static char* short_k10 = "k10";
+static char* short_k15 = "interlagos";
+static char* short_k16 = "kabini";
+static char* short_unknown = "unknown";
+
+
+
+int cpu_count(cpu_set_t* set)
+{
+    /*
+     * Count the CPUs contained in an affinity mask.
+     *
+     * set:     mask to inspect (every __cpu_mask word of it is read)
+     * returns: number of bits that are set
+     */
+    const size_t numWords = sizeof(cpu_set_t) / sizeof (__cpu_mask);
+    int total = 0;
+
+    for (size_t word = 0; word < numWords; word++)
+    {
+        __cpu_mask bits = set->__bits[word];
+
+        /* Each pass clears the lowest set bit, so the loop body runs once
+         * per CPU in this word and not at all for an empty word. */
+        while (bits != 0)
+        {
+            bits &= (bits - 1);
+            total++;
+        }
+    }
+
+    return total;
+}
+
+
+static void initTopologyFile(FILE* file)
+{
+    /* Load a binary topology image (one CpuTopology record, then the HWThread and
+     * CacheLevel arrays it announces) and rebuild the package/core/thread tree.
+     * A short read or failed allocation clears cpuid_topology; apicIds that do
+     * not fit affinity_thread2tile_lookup are left out of that table. */
+    HWThread* hwThreadPool = NULL;
+    CacheLevel* cacheLevels = NULL;
+    TreeNode* currentNode;
+    if (fread((void*) &cpuid_topology, sizeof(CpuTopology), 1, file) != 1) { goto fail; }
+    hwThreadPool = (HWThread*) malloc(cpuid_topology.numHWThreads * sizeof(HWThread));
+    cacheLevels = (CacheLevel*) malloc(cpuid_topology.numCacheLevels * sizeof(CacheLevel));
+    if ((hwThreadPool == NULL) || (cacheLevels == NULL)) { goto fail; }
+    if (fread((void*) hwThreadPool, sizeof(HWThread), cpuid_topology.numHWThreads, file)
+            != cpuid_topology.numHWThreads) { goto fail; }
+    if (fread((void*) cacheLevels, sizeof(CacheLevel), cpuid_topology.numCacheLevels, file)
+            != cpuid_topology.numCacheLevels) { goto fail; }
+    cpuid_topology.threadPool = hwThreadPool;
+    cpuid_topology.cacheLevels = cacheLevels;
+    cpuid_topology.topologyTree = NULL;
+    tree_init(&cpuid_topology.topologyTree, 0);
+    for (uint32_t i=0; i<  cpuid_topology.numHWThreads; i++)
+    {
+        if (!tree_nodeExists(cpuid_topology.topologyTree, hwThreadPool[i].packageId))
+        {
+            tree_insertNode(cpuid_topology.topologyTree, hwThreadPool[i].packageId);
+        }
+        currentNode = tree_getNode(cpuid_topology.topologyTree, hwThreadPool[i].packageId);
+        if (!tree_nodeExists(currentNode, hwThreadPool[i].coreId))
+        {
+            tree_insertNode(currentNode, hwThreadPool[i].coreId);
+        }
+        currentNode = tree_getNode(currentNode, hwThreadPool[i].coreId);
+        if (tree_nodeExists(currentNode, i)) { continue; }
+        tree_insertNode(currentNode, i);
+        if ((uint64_t) hwThreadPool[i].apicId < (uint64_t) MAX_NUM_THREADS)
+            affinity_thread2tile_lookup[hwThreadPool[i].apicId] = hwThreadPool[i].coreId;
+    }
+    return;
+fail:
+    ERROR_PLAIN_PRINT(Cannot load topology from binary file);
+    free(hwThreadPool);
+    free(cacheLevels);
+    memset(&cpuid_topology, 0, sizeof(CpuTopology));
+}
+
+
+static int readTopologyFile(const char* filename)
+{
+    /* Parse a text topology dump made of "<structure> <field> [...] = <value>"
+     * lines into cpuid_topology, cpuid_info and numa_info. Returns 0 on success,
+     * -1 if the file cannot be opened or lacks one of the three element counts. */
+    FILE* fp;
+    char structure[256] = "";
+    char field[256] = "";
+    char value[256] = "";
+    char line[512];
+    const char* rhs;
+    int numHWThreads = -1;
+    int numCacheLevels = -1;
+    int numberOfNodes = -1;
+    int* tmpNumberOfProcessors;
+    int counter;
+    int i;
+    uint32_t tmp = 0, tmp1 = 0;
+
+    fp = fopen(filename, "r");
+    if (fp == NULL)
+    {
+        ERROR_PRINT(Cannot open topology file %s, filename);
+        return -1;
+    }
+    while (fgets(line, 512, fp) != NULL) {
+        if (sscanf(line,"%s %s", structure, field) != 2) { continue; }
+        if ((strcmp(structure, "cpuid_topology") == 0) && (strcmp(field, "numHWThreads") == 0))
+        {
+            sscanf(line,"%s %s = %d", structure, field, &numHWThreads);
+        }
+        else if ((strcmp(structure, "cpuid_topology") == 0) && (strcmp(field, "numCacheLevels") == 0))
+        {
+            sscanf(line,"%s %s = %d", structure, field, &numCacheLevels);
+        }
+        else if ((strcmp(structure, "numa_info") == 0) && (strcmp(field, "numberOfNodes") == 0))
+        {
+            sscanf(line,"%s %s = %d", structure, field, &numberOfNodes);
+        }
+        if ((numHWThreads >= 0) && (numCacheLevels >= 0) && (numberOfNodes >= 0))
+        {
+            break;
+        }
+    }
+    if (numHWThreads < 0 || numCacheLevels < 0 || numberOfNodes < 0)
+    {
+        ERROR_PRINT(Cannot read topology information from file %s, filename);
+        fclose(fp);
+        return -1;
+    }
+    /* Second pass: processors per NUMA node (node ids are 1-based), to size the arrays */
+    tmpNumberOfProcessors = (int*) calloc(numberOfNodes, sizeof(int));
+    fseek(fp, 0, SEEK_SET);
+    counter = 0;
+    while (fgets(line, 512, fp) != NULL) {
+        const int matched = sscanf(line,"%s %s %d %s = %d", structure, field, &tmp, value, &tmp1);
+        if ((matched == 5) && (strcmp(structure, "numa_info") == 0) && (strcmp(value, "numberOfProcessors") == 0))
+        {
+            if ((tmp >= 1) && (tmp <= (uint32_t) numberOfNodes)) { tmpNumberOfProcessors[tmp-1] = tmp1; }
+            counter++;
+        }
+        if (counter == numberOfNodes)
+        {
+            break;
+        }
+    }
+
+    cpuid_topology.threadPool = (HWThread*)calloc(numHWThreads, sizeof(HWThread));
+    cpuid_topology.cacheLevels = (CacheLevel*)calloc(numCacheLevels, sizeof(CacheLevel));
+    cpuid_topology.numHWThreads = numHWThreads;
+    cpuid_topology.numCacheLevels = numCacheLevels;
+
+    numa_info.nodes = (NumaNode*) calloc(numberOfNodes, sizeof(NumaNode));
+    numa_info.numberOfNodes = numberOfNodes;
+
+    for(i=0;i<numberOfNodes;i++)
+    {
+        numa_info.nodes[i].processors = (uint32_t*) calloc (tmpNumberOfProcessors[i], sizeof(int));
+        numa_info.nodes[i].distances = (uint32_t*) calloc (numberOfNodes, sizeof(int));
+    }
+    /* tmpNumberOfProcessors is kept until the end: it bounds the processors[] writes below */
+
+    fseek(fp, 0, SEEK_SET);
+
+    while (fgets(line, 512, fp) != NULL) {
+        if (sscanf(line,"%s %s", structure, field) != 2) { continue; }
+        if (strcmp(structure, "cpuid_topology") == 0)
+        {
+            if (strcmp(field, "numSockets") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_topology.numSockets = tmp;
+            }
+            else if (strcmp(field, "numCoresPerSocket") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_topology.numCoresPerSocket = tmp;
+            }
+            else if (strcmp(field, "numThreadsPerCore") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_topology.numThreadsPerCore = tmp;
+            }
+            else if (strcmp(field, "threadPool") == 0)
+            {
+                int thread;
+
+                /* ignore malformed lines and indices outside the allocated pool */
+                if ((sscanf(line, "%s %s %d %s = %d", structure, field, &thread, value, &tmp) != 5) ||
+                    (thread < 0) || (thread >= numHWThreads)) { continue; }
+                if (strcmp(value, "threadId") == 0)
+                {
+                    cpuid_topology.threadPool[thread].threadId = tmp;
+                }
+                else if (strcmp(value, "coreId") == 0)
+                {
+                    cpuid_topology.threadPool[thread].coreId = tmp;
+                }
+                else if (strcmp(value, "packageId") == 0)
+                {
+                    cpuid_topology.threadPool[thread].packageId = tmp;
+                }
+                else if (strcmp(value, "apicId") == 0)
+                {
+                    cpuid_topology.threadPool[thread].apicId = tmp;
+                }
+            }
+            else if (strcmp(field, "cacheLevels") == 0)
+            {
+                int level;
+                char type[128] = "";
+                /* cache levels are 1-based in the file; skip anything outside the table */
+                if ((sscanf(line, "%s %s %d %s", structure, field, &level, value) != 4) ||
+                    (level < 1) || (level > numCacheLevels)) { continue; }
+                cpuid_topology.cacheLevels[level-1].level = level-1;
+                if (strcmp(value, "type") == 0)
+                {
+                    sscanf(line, "%s %s %d %s = %127s", structure, field, &level, value, type);
+                    if (strcmp(type, "UNIFIEDCACHE") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].type = UNIFIEDCACHE;
+                    }
+                    else if (strcmp(type, "DATACACHE") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].type = DATACACHE;
+                    }
+                    else if (strcmp(type, "INSTRUCTIONCACHE") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].type = INSTRUCTIONCACHE;
+                    }
+                    else if (strcmp(type, "ITLB") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].type = ITLB;
+                    }
+                    else if (strcmp(type, "DTLB") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].type = DTLB;
+                    }
+                    else if (strcmp(type, "NOCACHE") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].type = NOCACHE;
+                    }
+                }
+                else
+                {
+                    sscanf(line, "%s %s %d %s = %d", structure, field, &level, value, &tmp);
+                    if (strcmp(value, "associativity") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].associativity = tmp;
+                    }
+                    else if (strcmp(value, "sets") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].sets = tmp;
+                    }
+                    else if (strcmp(value, "lineSize") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].lineSize = tmp;
+                    }
+                    else if (strcmp(value, "size") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].size = tmp;
+                    }
+                    else if (strcmp(value, "threads") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].threads = tmp;
+                    }
+                    else if (strcmp(value, "inclusive") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].inclusive = tmp;
+                    }
+                }
+            }
+        }
+        else if (strcmp(structure, "cpuid_info") == 0)
+        {
+            if (strcmp(field, "family") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.family = tmp;
+            }
+            else if (strcmp(field, "model") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.model = tmp;
+            }
+            else if (strcmp(field, "osname") == 0)
+            {
+                rhs = strchr(line, '=');   /* the value is the text after "= ", without the newline */
+                rhs = ((rhs != NULL) && (rhs[1] != '\0')) ? rhs + 2 : "";
+                cpuid_info.osname = (char*) calloc(strcspn(rhs, "\n") + 1, sizeof(char));
+                if (cpuid_info.osname != NULL) { memcpy(cpuid_info.osname, rhs, strcspn(rhs, "\n")); }
+            }
+            else if (strcmp(field, "stepping") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.stepping = tmp;
+            }
+            else if (strcmp(field, "clock") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.clock = tmp;
+            }
+            else if (strcmp(field, "turbo") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.turbo = tmp;
+            }
+            else if (strcmp(field, "isIntel") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.isIntel = tmp;
+            }
+            else if (strcmp(field, "featureFlags") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.featureFlags = tmp;
+            }
+            else if (strcmp(field, "perf_version") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.perf_version = tmp;
+            }
+            else if (strcmp(field, "perf_num_ctr") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.perf_num_ctr = tmp;
+            }
+            else if (strcmp(field, "perf_width_ctr") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.perf_width_ctr = tmp;
+            }
+            else if (strcmp(field, "perf_num_fixed_ctr") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.perf_num_fixed_ctr = tmp;
+            }
+            else if (strcmp(field, "features") == 0)
+            {
+                rhs = strchr(line, '=');   /* the value is the text after "= ", without the newline */
+                rhs = ((rhs != NULL) && (rhs[1] != '\0')) ? rhs + 2 : "";
+                cpuid_info.features = (char*) calloc(strcspn(rhs, "\n") + 1, sizeof(char));
+                if (cpuid_info.features != NULL) { memcpy(cpuid_info.features, rhs, strcspn(rhs, "\n")); }
+            }
+        }
+        else if (strcmp(structure, "numa_info") == 0)
+        {
+            if (strcmp(field, "nodes") == 0)
+            {
+                int id;
+                if ((sscanf(line, "%s %s %d %s", structure, field, &id, value) != 4) ||
+                    (id < 1) || (id > numberOfNodes)) { continue; }   /* node ids are 1-based */
+                if (strcmp(value,"numberOfProcessors") == 0)
+                {
+                    sscanf(line, "%s %s %d %s = %d", structure, field, &id, value, &tmp);
+                    numa_info.nodes[id-1].numberOfProcessors = tmp;
+                }
+                else if (strcmp(value, "freeMemory") == 0)
+                {
+                    sscanf(line, "%s %s %d %s = %d", structure, field, &id, value, &tmp);
+                    numa_info.nodes[id-1].freeMemory = tmp;
+                }
+                else if (strcmp(value, "id") == 0)
+                {
+                    sscanf(line, "%s %s %d %s = %d", structure, field, &id, value, &tmp);
+                    numa_info.nodes[id-1].id = tmp;
+                }
+                else if (strcmp(value, "totalMemory") == 0)
+                {
+                    sscanf(line, "%s %s %d %s = %d", structure, field, &id, value, &tmp);
+                    numa_info.nodes[id-1].totalMemory = tmp;
+                }
+                else if (strcmp(value, "numberOfDistances") == 0)
+                {
+                    sscanf(line, "%s %s %d %s = %d", structure, field, &id, value, &tmp);
+                    numa_info.nodes[id-1].numberOfDistances = tmp;
+                }
+                if (strcmp(value, "processors") == 0)
+                {
+                    const int got = sscanf(line, "%s %s %d %s %d = %d", structure, field, &id, value, &tmp, &tmp1);
+                    if ((got == 6) && (tmp >= 1) && ((int64_t) tmp <= (int64_t) tmpNumberOfProcessors[id-1]))
+                        numa_info.nodes[id-1].processors[tmp-1] = tmp1;   /* slots are 1-based */
+                }
+                else if (strcmp(value,"distances") == 0)
+                {
+                    const int got = sscanf(line, "%s %s %d %s %d = %d", structure, field, &id, value, &tmp, &tmp1);
+                    if ((got == 6) && (tmp < (uint32_t) numberOfNodes))   /* distances[] has numberOfNodes slots */
+                        numa_info.nodes[id-1].distances[tmp] = tmp1;
+                }
+            }
+        }
+    }
+    fclose(fp);
+    free(tmpNumberOfProcessors);
+    return 0;
+}
+
+/*
+ * Translate cpuid_info.family / cpuid_info.model into a human readable
+ * processor name (cpuid_info.name) and a short architecture tag
+ * (cpuid_info.short_name). For the server parts handled below it also sets
+ * cpuid_info.supportUncore.
+ *
+ * Returns EXIT_SUCCESS for every known family and EXIT_FAILURE if the family
+ * is not handled at all. Unknown models inside a known family fall back to a
+ * generic name, so name/short_name are always set on success.
+ */
+int topology_setName(void)
+{
+    switch ( cpuid_info.family )
+    {
+        case P6_FAMILY:
+            switch ( cpuid_info.model )
+            {
+                case PENTIUM_M_BANIAS:
+                    cpuid_info.name = pentium_m_b_str;
+                    cpuid_info.short_name = short_pm;
+                    break;
+
+                case PENTIUM_M_DOTHAN:
+                    cpuid_info.name = pentium_m_d_str;
+                    cpuid_info.short_name = short_pm;
+                    break;
+
+                case CORE_DUO:
+                    cpuid_info.name = core_duo_str;
+                    cpuid_info.short_name = short_core2;
+                    break;
+
+                case CORE2_65:
+                    cpuid_info.name = core_2a_str;
+                    cpuid_info.short_name = short_core2;
+                    break;
+
+                case CORE2_45:
+                    cpuid_info.name = core_2b_str;
+                    cpuid_info.short_name = short_core2;
+                    break;
+
+                case NEHALEM_BLOOMFIELD:
+                    cpuid_info.name = nehalem_bloom_str;
+                    cpuid_info.short_name = short_nehalem;
+                    break;
+
+                case NEHALEM_LYNNFIELD:
+                    cpuid_info.name = nehalem_lynn_str;
+                    cpuid_info.short_name = short_nehalem;
+                    break;
+
+                /* Both Westmere variants share one name */
+                case NEHALEM_WESTMERE_M:
+                case NEHALEM_WESTMERE:
+                    cpuid_info.name = nehalem_west_str;
+                    cpuid_info.short_name = short_westmere;
+                    break;
+
+                case SANDYBRIDGE:
+                    cpuid_info.name = sandybridge_str;
+                    cpuid_info.short_name = short_sandybridge;
+                    break;
+
+                case SANDYBRIDGE_EP:
+                    cpuid_info.supportUncore = 1;
+                    cpuid_info.name = sandybridge_ep_str;
+                    cpuid_info.short_name = short_sandybridge_ep;
+                    break;
+
+                case IVYBRIDGE:
+                    cpuid_info.name = ivybridge_str;
+                    cpuid_info.short_name = short_ivybridge;
+                    break;
+
+                case IVYBRIDGE_EP:
+                    cpuid_info.supportUncore = 1;
+                    cpuid_info.name = ivybridge_ep_str;
+                    cpuid_info.short_name = short_ivybridge_ep;
+                    break;
+
+                case HASWELL_EP:
+                    cpuid_info.supportUncore = 1;
+                    cpuid_info.name = haswell_ep_str;
+                    cpuid_info.short_name = short_haswell_ep;
+                    break;
+
+                case HASWELL:
+                case HASWELL_M1:
+                case HASWELL_M2:
+                    cpuid_info.name = haswell_str;
+                    cpuid_info.short_name = short_haswell;
+                    break;
+
+                case BROADWELL:
+                    cpuid_info.name = broadwell_str;
+                    cpuid_info.short_name = short_broadwell;
+                    break;
+
+                case BROADWELL_D:
+                    cpuid_info.name = broadwell_d_str;
+                    cpuid_info.short_name = short_broadwell_d;
+                    break;
+
+                case BROADWELL_E:
+                    cpuid_info.name = broadwell_ep_str;
+                    cpuid_info.short_name = short_broadwell_ep;
+                    break;
+
+                case SKYLAKE1:
+                case SKYLAKE2:
+                    cpuid_info.name = skylake_str;
+                    cpuid_info.short_name = short_skylake;
+                    break;
+
+                case XEON_PHI2:
+                    cpuid_info.name = xeon_phi2_string;
+                    cpuid_info.short_name = short_phi2;
+                    break;
+
+                case NEHALEM_EX:
+                    cpuid_info.name = nehalem_ex_str;
+                    cpuid_info.short_name = short_nehalemEX;
+                    break;
+
+                case WESTMERE_EX:
+                    cpuid_info.name = westmere_ex_str;
+                    cpuid_info.short_name = short_westmereEX;
+                    break;
+
+                case XEON_MP:
+                    cpuid_info.name = xeon_mp_string;
+                    cpuid_info.short_name = short_core2;
+                    break;
+
+                case ATOM_45:
+                case ATOM:
+                    cpuid_info.name = atom_45_str;
+                    cpuid_info.short_name = short_atom;
+                    break;
+
+                case ATOM_32:
+                    cpuid_info.name = atom_32_str;
+                    cpuid_info.short_name = short_atom;
+                    break;
+
+                case ATOM_22:
+                    cpuid_info.name = atom_22_str;
+                    cpuid_info.short_name = short_atom;
+                    break;
+
+                case ATOM_SILVERMONT_E:
+                case ATOM_SILVERMONT_C:
+                case ATOM_SILVERMONT_Z1:
+                case ATOM_SILVERMONT_Z2:
+                case ATOM_SILVERMONT_F:
+                    cpuid_info.name = atom_silvermont_str;
+                    cpuid_info.short_name = short_silvermont;
+                    break;
+
+                case ATOM_SILVERMONT_AIR:
+                    cpuid_info.name = atom_airmont_str;
+                    cpuid_info.short_name = short_silvermont;
+                    break;
+
+                case ATOM_SILVERMONT_GOLD:
+                    cpuid_info.name = atom_goldmont_str;
+                    cpuid_info.short_name = short_goldmont;
+                    break;
+
+                default:
+                    cpuid_info.name = unknown_intel_str;
+                    cpuid_info.short_name = short_unknown;
+                    break;
+            }
+            break;
+
+        case MIC_FAMILY:
+            switch ( cpuid_info.model )
+            {
+                case XEON_PHI:
+                    cpuid_info.name = xeon_phi_string;
+                    cpuid_info.short_name = short_phi;
+                    break;
+
+                /* Same fallback as the P6 family so that name and
+                 * short_name are never left unset for an unknown model. */
+                default:
+                    cpuid_info.name = unknown_intel_str;
+                    cpuid_info.short_name = short_unknown;
+                    break;
+            }
+            break;
+
+        case K8_FAMILY:
+
+            /* An Intel CPU reporting this family value is reported as
+             * unsupported Netburst; the error is only printed, naming then
+             * proceeds with the K8 table below. */
+            if (cpuid_info.isIntel)
+            {
+                ERROR_PLAIN_PRINT(Netburst architecture is not supported);
+            }
+
+            switch ( cpuid_info.model )
+            {
+                case OPTERON_DC_E:
+                    cpuid_info.name = opteron_dc_e_str;
+                    break;
+
+                case OPTERON_DC_F:
+                    cpuid_info.name = opteron_dc_f_str;
+                    break;
+
+                case ATHLON64_X2:
+                case ATHLON64_X2_F:
+                    cpuid_info.name = athlon64_str;
+                    break;
+
+                case ATHLON64_F1:
+                case ATHLON64_F2:
+                    cpuid_info.name = athlon64_f_str;
+                    break;
+
+                case ATHLON64_X2_G:
+                    cpuid_info.name = athlon64_X2_g_str;
+                    break;
+
+                case ATHLON64_G1:
+                case ATHLON64_G2:
+                    cpuid_info.name = athlon64_g_str;
+                    break;
+
+                case OPTERON_SC_1MB:
+                    cpuid_info.name = opteron_sc_str;
+                    break;
+
+                default:
+                    cpuid_info.name = amd_k8_str;
+                    break;
+            }
+            /* All K8 models share one short name */
+            cpuid_info.short_name = short_k8;
+            break;
+
+        case K10_FAMILY:
+            switch ( cpuid_info.model )
+            {
+                case BARCELONA:
+                    cpuid_info.name = barcelona_str;
+                    break;
+
+                case SHANGHAI:
+                    cpuid_info.name = shanghai_str;
+                    break;
+
+                case ISTANBUL:
+                    cpuid_info.name = istanbul_str;
+                    break;
+
+                case MAGNYCOURS:
+                    cpuid_info.name = magnycours_str;
+                    break;
+
+                default:
+                    cpuid_info.name = unknown_amd_str;
+                    break;
+            }
+            /* All K10 models share one short name */
+            cpuid_info.short_name = short_k10;
+            break;
+
+        case K15_FAMILY:
+            cpuid_info.name = interlagos_str;
+            cpuid_info.short_name = short_k15;
+            break;
+
+        case K16_FAMILY:
+            cpuid_info.name = kabini_str;
+            cpuid_info.short_name = short_k16;
+            break;
+
+        default:
+            /* Family not handled: name and short_name stay untouched */
+            return EXIT_FAILURE;
+    }
+    return EXIT_SUCCESS;
+}
+
+/*
+ * Default backend dispatch table. With LIKWID_USE_HWLOC the hwloc based
+ * routines are used (CPU features still come from the proc backend),
+ * otherwise the cpuid based ones. The topology file reader is the same for
+ * both configurations.
+ */
+const struct topology_functions topology_funcs = {
+#ifdef LIKWID_USE_HWLOC
+    .init_cpuInfo = hwloc_init_cpuInfo,
+    .init_cpuFeatures = proc_init_cpuFeatures,
+    .init_nodeTopology = hwloc_init_nodeTopology,
+    .init_cacheTopology = hwloc_init_cacheTopology,
+    .close_topology = hwloc_close,
+#else
+    .init_cpuInfo = cpuid_init_cpuInfo,
+    .init_cpuFeatures = cpuid_init_cpuFeatures,
+    .init_nodeTopology = cpuid_init_nodeTopology,
+    .init_cacheTopology = cpuid_init_cacheTopology,
+    .close_topology = NULL,
+#endif
+    .init_fileTopology = initTopologyFile,
+};
+
+
+/*
+ * Build the socket -> core -> hardware thread tree from the thread pool and
+ * derive the socket, cores-per-socket and threads-per-core counts from it.
+ * The two per-socket/per-core counts are taken from the first child at the
+ * respective level. Also fills affinity_thread2tile_lookup (apicId -> coreId)
+ * for every thread that is newly inserted.
+ */
+void topology_setupTree(void)
+{
+    uint32_t idx;
+    TreeNode* socketNode;
+    TreeNode* coreNode;
+    TreeNode* firstChild;
+    HWThread* threads = cpuid_topology.threadPool;
+
+    tree_init(&cpuid_topology.topologyTree, 0);
+    for (idx = 0; idx < cpuid_topology.numHWThreads; idx++)
+    {
+        HWThread* hw = &threads[idx];
+
+        /* Socket level: create the package node on first sight */
+        if (!tree_nodeExists(cpuid_topology.topologyTree, hw->packageId))
+        {
+            tree_insertNode(cpuid_topology.topologyTree, hw->packageId);
+        }
+        socketNode = tree_getNode(cpuid_topology.topologyTree, hw->packageId);
+
+        /* Core level below the socket */
+        if (!tree_nodeExists(socketNode, hw->coreId))
+        {
+            tree_insertNode(socketNode, hw->coreId);
+        }
+        coreNode = tree_getNode(socketNode, hw->coreId);
+
+        /* Thread level below the core; an already known apicId is ignored */
+        if (!tree_nodeExists(coreNode, hw->apicId))
+        {
+            tree_insertNode(coreNode, hw->apicId);
+            affinity_thread2tile_lookup[hw->apicId] = hw->coreId;
+        }
+    }
+
+    cpuid_topology.numSockets = tree_countChildren(cpuid_topology.topologyTree);
+    firstChild = tree_getChildNode(cpuid_topology.topologyTree);
+    cpuid_topology.numCoresPerSocket = tree_countChildren(firstChild);
+    firstChild = tree_getChildNode(firstChild);
+    cpuid_topology.numThreadsPerCore = tree_countChildren(firstChild);
+    return;
+}
+
+/*
+ * Initialize the topology module once.
+ *
+ * If a readable topology file is configured, the topology is read from it
+ * and only the active hardware threads are derived from the current affinity
+ * mask. If no file is configured, it is not readable, or reading it fails,
+ * the topology is detected through the backend functions; the proc backend
+ * is used when the process is restricted to a subset of the CPUs.
+ *
+ * Returns EXIT_SUCCESS when initialized (or already initialized) and
+ * EXIT_FAILURE if the configuration module cannot be initialized.
+ */
+int topology_init(void)
+{
+    int ret = 0;
+    int fromFile = 0;
+    cpu_set_t cpuSet;
+    struct topology_functions funcs = topology_funcs;
+
+    if (topology_initialized)
+    {
+        return EXIT_SUCCESS;
+    }
+
+    if (init_configuration())
+    {
+        ERROR_PLAIN_PRINT(Cannot initialize configuration module to check for topology file name);
+        return EXIT_FAILURE;
+    }
+
+    /* Try the topology file first, if one is configured and readable */
+    if ((config.topologyCfgFileName != NULL) && (access(config.topologyCfgFileName, R_OK) == 0))
+    {
+        CPU_ZERO(&cpuSet);
+        sched_getaffinity(0,sizeof(cpu_set_t), &cpuSet);
+        DEBUG_PRINT(DEBUGLEV_INFO, Reading topology information from %s, config.topologyCfgFileName);
+        ret = readTopologyFile(config.topologyCfgFileName);
+        fromFile = (ret >= 0);
+    }
+
+    if (fromFile)
+    {
+        uint32_t i;
+
+        /* Count and mark the threads contained in the affinity mask */
+        cpuid_topology.activeHWThreads = 0;
+        for (i = 0; i < cpuid_topology.numHWThreads; i++)
+        {
+            if (CPU_ISSET(cpuid_topology.threadPool[i].apicId, &cpuSet))
+            {
+                cpuid_topology.activeHWThreads++;
+                cpuid_topology.threadPool[i].inCpuSet = 1;
+            }
+        }
+        topology_setName();
+        topology_setupTree();
+    }
+    else
+    {
+        /* Standard detection through the backend functions */
+        CPU_ZERO(&cpuSet);
+        sched_getaffinity(0,sizeof(cpu_set_t), &cpuSet);
+        if (cpu_count(&cpuSet) < sysconf(_SC_NPROCESSORS_CONF))
+        {
+            /* Restricted cpuset: switch to the proc based backend */
+            funcs.init_cpuInfo = proc_init_cpuInfo;
+            funcs.init_cpuFeatures = proc_init_cpuFeatures;
+            funcs.init_nodeTopology = proc_init_nodeTopology;
+            funcs.init_cacheTopology = proc_init_cacheTopology;
+            cpuid_topology.activeHWThreads = cpu_count(&cpuSet);
+        }
+        else
+        {
+            cpuid_topology.activeHWThreads = sysconf(_SC_NPROCESSORS_CONF);
+        }
+        funcs.init_cpuInfo(cpuSet);
+        topology_setName();
+        funcs.init_cpuFeatures();
+        funcs.init_nodeTopology(cpuSet);
+        topology_setupTree();
+        funcs.init_cacheTopology();
+        /* Re-apply the affinity mask that was read at the beginning */
+        sched_setaffinity(0, sizeof(cpu_set_t), &cpuSet);
+    }
+
+    topology_initialized = 1;
+    return EXIT_SUCCESS;
+}
+
+
+/*
+ * Release everything topology_init() set up and reset cpuid_info and
+ * cpuid_topology to their zero state, so that the module can be initialized
+ * again. Does nothing if the module is not initialized.
+ */
+void topology_finalize(void)
+{
+    if (!topology_initialized)
+    {
+        return;
+    }
+
+    /* free(NULL) is a no-op, so no guards are needed here */
+    free(cpuid_info.features);
+    cpuid_info.features = NULL;
+    free(cpuid_info.osname);
+    cpuid_info.osname = NULL;
+    free(cpuid_topology.cacheLevels);
+    cpuid_topology.cacheLevels = NULL;
+    free(cpuid_topology.threadPool);
+    cpuid_topology.threadPool = NULL;
+
+    if (cpuid_topology.topologyTree != NULL)
+    {
+        tree_destroy(cpuid_topology.topologyTree);
+        cpuid_topology.topologyTree = NULL;
+    }
+    /* Only set for backends that need an explicit shutdown */
+    if (topology_funcs.close_topology != NULL)
+    {
+        topology_funcs.close_topology();
+    }
+
+    cpuid_info.family = 0;
+    cpuid_info.model = 0;
+    cpuid_info.stepping = 0;
+    cpuid_info.clock = 0;
+    cpuid_info.turbo = 0;
+    cpuid_info.name = NULL;
+    cpuid_info.short_name = NULL;
+    cpuid_info.isIntel = 0;
+    cpuid_info.supportUncore = 0;
+    cpuid_info.featureFlags = 0;
+    cpuid_info.perf_version = 0;
+    cpuid_info.perf_num_ctr = 0;
+    cpuid_info.perf_width_ctr = 0;
+    cpuid_info.perf_num_fixed_ctr = 0;
+
+    cpuid_topology.numHWThreads = 0;
+    cpuid_topology.activeHWThreads = 0;
+    cpuid_topology.numSockets = 0;
+    cpuid_topology.numCoresPerSocket = 0;
+    cpuid_topology.numThreadsPerCore = 0;
+    cpuid_topology.numCacheLevels = 0;
+
+    topology_initialized = 0;
+}
+
+
+
+
+
+/*
+ * Print the names of the supported Intel and AMD processors to stdout, one
+ * tab-indented name per line, each vendor list followed by an empty line.
+ */
+void print_supportedCPUs (void)
+{
+    const char* intelNames[] = {
+        core_2a_str, core_2b_str, xeon_mp_string,
+        atom_45_str, atom_32_str, atom_22_str,
+        nehalem_bloom_str, nehalem_lynn_str, nehalem_west_str,
+        nehalem_ex_str, westmere_ex_str,
+        sandybridge_str, sandybridge_ep_str,
+        ivybridge_str, ivybridge_ep_str,
+        haswell_str, haswell_ep_str,
+        atom_silvermont_str, atom_airmont_str,
+        xeon_phi_string,
+        broadwell_str, broadwell_d_str, broadwell_ep_str,
+        skylake_str
+    };
+    const char* amdNames[] = {
+        opteron_sc_str, opteron_dc_e_str, opteron_dc_f_str,
+        barcelona_str, shanghai_str, istanbul_str, magnycours_str,
+        interlagos_str, kabini_str
+    };
+    unsigned int k;
+
+    printf("Supported Intel processors:\n");
+    for (k = 0; k < sizeof(intelNames)/sizeof(intelNames[0]); k++)
+    {
+        printf("\t%s\n",intelNames[k]);
+    }
+    printf("\n");
+
+    printf("Supported AMD processors:\n");
+    for (k = 0; k < sizeof(amdNames)/sizeof(amdNames[0]); k++)
+    {
+        printf("\t%s\n",amdNames[k]);
+    }
+    printf("\n");
+}
+
+
+
+/* Accessor for the module-wide CPU topology structure. */
+CpuTopology_t get_cpuTopology(void)
+{
+    CpuTopology_t topology = &cpuid_topology;
+    return topology;
+}
+
+/* Accessor for the module-wide CPU information structure. */
+CpuInfo_t get_cpuInfo(void)
+{
+    CpuInfo_t info = &cpuid_info;
+    return info;
+}
+/* Accessor for the module-wide NUMA topology structure. */
+NumaTopology_t get_numaTopology(void)
+{
+    NumaTopology_t numa = &numa_info;
+    return numa;
+}
+
diff --git a/src/topology_cpuid.c b/src/topology_cpuid.c
new file mode 100644
index 0000000..504714d
--- /dev/null
+++ b/src/topology_cpuid.c
@@ -0,0 +1,939 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  topology_cpuid.c
+ *
+ *      Description:  Interface to the cpuid based topology backend
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Authors:  Jan Treibig (jt), jan.treibig at gmail.com,
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sched.h>
+#include <unistd.h>
+
+#include <error.h>
+
+#include <tree.h>
+#include <bitUtil.h>
+#include <tlb-info.h>
+#include <topology.h>
+#include <cpuid.h>
+
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+#define MAX_CACHE_LEVELS 4
+
+/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
+/* Value of EAX returned by CPUID leaf 0 (set in cpuid_init_cpuInfo) */
+static int largest_function = 0;        
+/* Shared register variables passed to the CPUID macro as input and output */
+static uint32_t eax, ebx, ecx, edx;
+
+/* Dirty hack to avoid nonnull warnings: a function pointer with the
+ * signature of strcpy that can be called instead of strcpy itself */
+char* (*ownstrcpy)(char *__restrict __dest, const char *__restrict __src);
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+/*
+ * Enumerate the caches through CPUID leaf 0x04.
+ *
+ * Allocates *cachePool with one CacheLevel entry per reported cache and
+ * fills level, type, associativity, sets, line size, size, sharing threads
+ * and the inclusive flag. Returns the number of entries.
+ */
+static int intelCpuidFunc_4(CacheLevel** cachePool)
+{
+    int idx;
+    int numLevels = 0;
+    CacheLevel* entries;
+
+    /* Pass 1: count the sub-leaves until the cache type field reads 0 */
+    for (;;)
+    {
+        eax = 0x04;
+        ecx = numLevels;
+        CPUID(eax, ebx, ecx, edx);
+        if (extractBitField(eax,5,0) == 0)
+        {
+            break;
+        }
+        numLevels++;
+    }
+
+    *cachePool = (CacheLevel*) malloc(numLevels * sizeof(CacheLevel));
+    entries = *cachePool;
+
+    /* Pass 2: decode every sub-leaf */
+    for (idx = 0; idx < numLevels; idx++)
+    {
+        eax = 0x04;
+        ecx = idx;
+        CPUID(eax, ebx, ecx, edx);
+
+        entries[idx].type = (CacheType) extractBitField(eax,5,0);
+        entries[idx].level = extractBitField(eax,3,5);
+        entries[idx].threads = extractBitField(eax,10,14)+1;
+        entries[idx].lineSize = extractBitField(ebx,12,0)+1;
+        entries[idx].associativity = extractBitField(ebx,8,22)+1;
+        entries[idx].sets = ecx+1;
+        entries[idx].size = entries[idx].sets *
+            entries[idx].associativity *
+            entries[idx].lineSize;
+
+        /* WORKAROUND cpuid reports wrong number of threads on SMT processor
+         * with SMT turned off (applied to the first three entries only) */
+        if ((idx < 3) && (cpuid_topology.numThreadsPerCore == 1))
+        {
+            switch ( cpuid_info.model )
+            {
+                case NEHALEM_BLOOMFIELD:
+                case NEHALEM_LYNNFIELD:
+                case NEHALEM_WESTMERE:
+                case NEHALEM_WESTMERE_M:
+                case SANDYBRIDGE:
+                case SANDYBRIDGE_EP:
+                case IVYBRIDGE:
+                case IVYBRIDGE_EP:
+                case HASWELL:
+                case HASWELL_EP:
+                case HASWELL_M1:
+                case HASWELL_M2:
+                case WESTMERE_EX:
+                case NEHALEM_EX:
+                    entries[idx].threads = 1;
+                    break;
+
+                default:
+                    break;
+            }
+        }
+
+        /* WORKAROUND (jt, 08/13/2009): for L3 caches the reported value is
+         * sometimes too large. Clamp it to the number of hardware threads
+         * of one socket. */
+        if (entries[idx].threads > (int)
+                (cpuid_topology.numCoresPerSocket*
+                 cpuid_topology.numThreadsPerCore))
+        {
+            entries[idx].threads = cpuid_topology.numCoresPerSocket*
+                cpuid_topology.numThreadsPerCore;
+        }
+        entries[idx].inclusive = edx&0x2;
+    }
+
+    return numLevels;
+}
+
+/*
+ * Decode the 4-bit associativity field used by the AMD extended CPUID
+ * leaves into a number of ways. Encodings without an entry (including 0x0
+ * and 0xF) and values above 0xF yield 0.
+ */
+static uint32_t amdGetAssociativity(uint32_t flag)
+{
+    /* Indexed by the encoded value 0x0 .. 0xF */
+    static const uint32_t waysByCode[16] = {
+        0,   1,  2,  0,
+        4,   0,  8,  0,
+        16,  0, 32, 48,
+        64, 96, 128, 0
+    };
+
+    if (flag > 0xF)
+    {
+        return 0;
+    }
+    return waysByCode[flag];
+}
+
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+
+/*
+ * Print the TLB descriptors contained in one register returned by CPUID
+ * leaf 0x02. Descriptor bytes are read starting at bit position firstBit.
+ * A register with bit 31 set carries no valid descriptors and is skipped.
+ */
+static void cpuid_printIntelTlbDescriptors(uint32_t reg, int firstBit)
+{
+    int shift;
+
+    if (reg & (1U<<31))
+    {
+        return;
+    }
+    for (shift = firstBit; shift < 32; shift += 8)
+    {
+        uint32_t descriptor = extractBitField(reg,8,shift);
+
+        if ((descriptor != 0x0) && intel_tlb_info[descriptor])
+        {
+            printf("%s\n",intel_tlb_info[descriptor]);
+        }
+    }
+}
+
+/*
+ * Print the TLB information of the executing CPU to stdout.
+ *
+ * Intel: decodes the descriptor bytes of CPUID leaf 0x02 through the
+ * intel_tlb_info table. Otherwise: prints the raw L1 and L2 TLB fields of
+ * the extended leaves 0x80000005 and 0x80000006 (EAX describes 2M/4M pages,
+ * EBX describes 4K pages).
+ */
+void cpuid_printTlbTopology()
+{
+    uint32_t i;
+    uint32_t loop = 1;
+
+    if (cpuid_info.isIntel)
+    {
+        eax = 0x02;
+        CPUID(eax, ebx, ecx, edx);
+
+        /* The low byte of EAX tells how often leaf 0x02 has to be queried */
+        loop = extractBitField(eax,8,0);
+        for (i=1; i<loop; i++)
+        {
+            eax = 0x02;
+            CPUID(eax, ebx, ecx, edx);
+        }
+
+        /* The low byte of EAX is the iteration count, not a descriptor */
+        cpuid_printIntelTlbDescriptors(eax, 8);
+        cpuid_printIntelTlbDescriptors(ebx, 0);
+        cpuid_printIntelTlbDescriptors(ecx, 0);
+        cpuid_printIntelTlbDescriptors(edx, 0);
+    }
+    else
+    {
+        /* L1 TLBs: one query returns both registers */
+        eax = 0x80000005;
+        CPUID(eax, ebx, ecx, edx);
+        printf("L1DTlb2and4MAssoc: 0x%x\n",extractBitField(eax,8,24));
+        printf("L1DTlb2and4MSize: %d entries for 2MB pages\n",(uint32_t)extractBitField(eax,8,16));
+        printf("L1ITlb2and4MAssoc: 0x%x\n",extractBitField(eax,8,8));
+        printf("L1ITlb2and4MSize: %d entries for 2MB pages\n",(uint32_t)extractBitField(eax,8,0));
+        printf("L1DTlb4KAssoc: 0x%x\n",extractBitField(ebx,8,24));
+        printf("L1DTlb4KSize: 0x%x\n",extractBitField(ebx,8,16));
+        printf("L1ITlb4KAssoc: 0x%x\n",extractBitField(ebx,8,8));
+        printf("L1ITlb4KSize: 0x%x\n",extractBitField(ebx,8,0));
+
+        /* L2 TLBs: 2M/4M pages in EAX, 4K pages in EBX */
+        eax = 0x80000006;
+        CPUID(eax, ebx, ecx, edx);
+        printf("L2DTlb2and4MAssoc: 0x%x\n",extractBitField(eax,4,24));
+        printf("L2DTlb2and4MAssoc_c: %d\n",amdGetAssociativity(extractBitField(eax,4,24)));
+        printf("L2DTlb2and4MSize: 0x%x\n",extractBitField(eax,12,16));
+        printf("L2ITlb2and4MAssoc: 0x%x\n",extractBitField(eax,4,12));
+        printf("L2ITlb2and4MAssoc_c: %d\n",amdGetAssociativity(extractBitField(eax,4,12)));
+        printf("L2ITlb2and4MSize: 0x%x\n",extractBitField(eax,12,0));
+        printf("L2DTlb4KAssoc: 0x%x\n",extractBitField(ebx,4,24));
+        printf("L2DTlb4KAssoc_c: %d\n",amdGetAssociativity(extractBitField(ebx,4,24)));
+        printf("L2DTlb4KSize: 0x%x\n",extractBitField(ebx,12,16));
+        printf("L2ITlb4KAssoc: 0x%x\n",extractBitField(ebx,4,12));
+        printf("L2ITlb4KAssoc_c: %d\n",amdGetAssociativity(extractBitField(ebx,4,12)));
+        printf("L2ITlb4KSize: 0x%x\n",extractBitField(ebx,12,0));
+    }
+    return;
+}
+
+/*
+ * Store the "model name" entry of /proc/cpuinfo in cpuid_info.osname.
+ *
+ * The buffer has MAX_MODEL_STRING_LENGTH bytes and is zero filled first, so
+ * it stays an empty string if no matching line is found. Longer names are
+ * truncated. If several lines match, the last one wins. Failing to allocate
+ * the buffer or to open /proc/cpuinfo is reported through ERROR.
+ */
+static void
+cpuid_set_osname(void)
+{
+    FILE *fp = NULL;
+    bstring src = NULL;
+    struct bstrList* tokens = NULL;
+    bstring nameString = bformat("model name");
+    int i;
+
+    cpuid_info.osname = malloc(MAX_MODEL_STRING_LENGTH * sizeof(char));
+    if (cpuid_info.osname == NULL)
+    {
+        bdestroy(nameString);
+        ERROR;
+        return;
+    }
+    memset(cpuid_info.osname, '\0', MAX_MODEL_STRING_LENGTH);
+
+    fp = fopen ("/proc/cpuinfo", "r");
+    if (fp == NULL)
+    {
+        bdestroy(nameString);
+        ERROR;
+        /* Never fall through to the cleanup below with a NULL stream */
+        return;
+    }
+
+    /* The whole file is read at once, the stream is not needed afterwards */
+    src = bread ((bNread) fread, fp);
+    fclose(fp);
+
+    tokens = bsplit(src,(char) '\n');
+    if (tokens != NULL)
+    {
+        for (i=0;i<tokens->qty;i++)
+        {
+            if (binstr(tokens->entry[i],0,nameString) != BSTR_ERR)
+            {
+                struct bstrList* subtokens = bsplit(tokens->entry[i],(char) ':');
+
+                /* Only lines of the form "key : value" carry a name */
+                if ((subtokens != NULL) && (subtokens->qty >= 2))
+                {
+                    const char* model;
+
+                    bltrimws(subtokens->entry[1]);
+                    model = bdata(subtokens->entry[1]);
+                    if (model != NULL)
+                    {
+                        /* Bounded copy, always NUL terminated */
+                        snprintf(cpuid_info.osname, MAX_MODEL_STRING_LENGTH, "%s", model);
+                    }
+                }
+                bstrListDestroy(subtokens);
+            }
+        }
+        bstrListDestroy(tokens);
+    }
+    bdestroy(src);
+    bdestroy(nameString);
+}
+
+
+/*
+ * Fill the basic CPU identification from CPUID leaves 0 and 1: highest
+ * supported basic leaf, vendor flag, family, model and stepping. Also reads
+ * the OS model name and sets cpuid_topology.numHWThreads to the smaller of
+ * the configured processor count and the number of CPUs in cpuSet.
+ */
+void cpuid_init_cpuInfo(cpu_set_t cpuSet)
+{
+    int availableCpus = 0;
+
+    /* Leaf 0: highest basic leaf in EAX, first part of the vendor in EBX */
+    eax = 0x00;
+    CPUID(eax, ebx, ecx, edx);
+    largest_function = eax;
+    /* 0x68747541 is "Auth" (AuthenticAMD); everything else counts as Intel */
+    cpuid_info.isIntel = (ebx == 0x68747541U) ? 0 : 1;
+
+    /* Leaf 1: family, model and stepping, including the extended fields */
+    eax = 0x01;
+    CPUID(eax, ebx, ecx, edx);
+    cpuid_info.stepping =  (eax&0xFU);
+    cpuid_info.model = (((eax>>16)&0xFU)<<4) + ((eax>>4)&0xFU);
+    cpuid_info.family = ((eax>>8)&0xFU) + ((eax>>20)&0xFFU);
+
+    cpuid_set_osname();
+
+    availableCpus = cpu_count(&cpuSet);
+    cpuid_topology.numHWThreads = sysconf(_SC_NPROCESSORS_CONF);
+    if (availableCpus < cpuid_topology.numHWThreads)
+    {
+        cpuid_topology.numHWThreads = availableCpus;
+    }
+    DEBUG_PRINT(DEBUGLEV_DEVELOP, CPU-ID CpuInfo Family %d Model %d Stepping %d isIntel %d numHWThreads %d activeHWThreads %d,
+                            cpuid_info.family,
+                            cpuid_info.model,
+                            cpuid_info.stepping,
+                            cpuid_info.isIntel,
+                            cpuid_topology.numHWThreads,
+                            cpuid_topology.activeHWThreads)
+    return;
+}
+
+/* One CPUID feature bit: register mask, printed label, featureFlags bit */
+typedef struct {
+    uint32_t    mask;
+    const char* label;
+    int         flag;
+} CpuidFeatureEntry;
+
+/*
+ * For every table entry whose mask is set in reg, append the label to
+ * cpuid_info.features (if the string was allocated) and set the matching
+ * bit in cpuid_info.featureFlags.
+ */
+static void cpuid_collectFeatures(uint32_t reg, const CpuidFeatureEntry* entries, int count)
+{
+    int k;
+
+    for (k = 0; k < count; k++)
+    {
+        if (reg & entries[k].mask)
+        {
+            if (cpuid_info.features != NULL)
+            {
+                strcat(cpuid_info.features, entries[k].label);
+            }
+            cpuid_info.featureFlags |= (1<<entries[k].flag);
+        }
+    }
+}
+
+/*
+ * Detect the CPU features through CPUID and store them as a space separated
+ * string in cpuid_info.features and as a bit mask in cpuid_info.featureFlags.
+ * For P6 family CPUs that support leaf 0x0A, the performance monitoring
+ * parameters and the turbo flag are read as well.
+ *
+ * Leaf 0x07 is only evaluated if the CPU reports it as supported: querying
+ * a leaf above the maximum returns the data of another leaf, which would be
+ * misread as AVX2/RTM/HLE/RDSEED support.
+ */
+void cpuid_init_cpuFeatures(void)
+{
+    /* Leaf 0x01, ECX */
+    static const CpuidFeatureEntry leaf1Ecx[] = {
+        { 1U<<0,  "SSE3 ",    SSE3 },
+        { 1U<<3,  "MONITOR ", MONITOR },
+        { 1U<<5,  "VMX ",     VMX },
+        { 1U<<7,  "EIST ",    EIST },
+        { 1U<<8,  "TM2 ",     TM2 },
+        { 1U<<9,  "SSSE3 ",   SSSE3 },
+        { 1U<<12, "FMA ",     FMA },
+        { 1U<<19, "SSE4.1 ",  SSE41 },
+        { 1U<<20, "SSE4.2 ",  SSE42 },
+        { 1U<<25, "AES ",     AES },
+        { 1U<<28, "AVX ",     AVX },
+        { 1U<<30, "RDRAND ",  RDRAND }
+    };
+    /* Leaf 0x01, EDX */
+    static const CpuidFeatureEntry leaf1Edx[] = {
+        { 1U<<22, "ACPI ", ACPI },
+        { 1U<<23, "MMX ",  MMX },
+        { 1U<<25, "SSE ",  SSE },
+        { 1U<<26, "SSE2 ", SSE2 },
+        { 1U<<28, "HTT ",  HTT },
+        { 1U<<29, "TM ",   TM }
+    };
+    /* Leaf 0x07 sub-leaf 0, EBX */
+    static const CpuidFeatureEntry leaf7Ebx[] = {
+        { 1U<<5,  "AVX2 ",   AVX2 },
+        { 1U<<11, "RTM ",    RTM },
+        { 1U<<4,  "HLE ",    HLE },
+        { 1U<<18, "RDSEED ", RDSEED }
+    };
+    /* Leaf 0x80000001, EDX */
+    static const CpuidFeatureEntry extLeaf1Edx[] = {
+        { 1U<<27, "RDTSCP ", RDTSCP }
+    };
+
+    eax = 0x01;
+    CPUID(eax, ebx, ecx, edx);
+
+    cpuid_info.featureFlags = 0;
+    cpuid_info.features = (char*) malloc(MAX_FEATURE_STRING_LENGTH*sizeof(char));
+    if (cpuid_info.features != NULL)
+    {
+        cpuid_info.features[0] = '\0';
+    }
+    else
+    {
+        /* Keep going: the flag mask is still filled without the string */
+        ERROR_PLAIN_PRINT(Cannot allocate memory for CPU feature string);
+    }
+
+    cpuid_collectFeatures(ecx, leaf1Ecx, (int)(sizeof(leaf1Ecx)/sizeof(leaf1Ecx[0])));
+    cpuid_collectFeatures(edx, leaf1Edx, (int)(sizeof(leaf1Edx)/sizeof(leaf1Edx[0])));
+
+    if (largest_function >= 0x07)
+    {
+        eax = 0x7;
+        ecx = 0x0;
+        CPUID(eax, ebx, ecx, edx);
+        cpuid_collectFeatures(ebx, leaf7Ebx, (int)(sizeof(leaf7Ebx)/sizeof(leaf7Ebx[0])));
+    }
+
+    eax = 0x80000001;
+    CPUID(eax, ebx, ecx, edx);
+    cpuid_collectFeatures(edx, extLeaf1Edx, (int)(sizeof(extLeaf1Edx)/sizeof(extLeaf1Edx[0])));
+
+    cpuid_info.perf_version   =  0;
+    if( cpuid_info.family == P6_FAMILY && 0x0A <= largest_function)
+    {
+        /* Leaf 0x0A: version, number and width of the counters */
+        eax = 0x0A;
+        CPUID(eax, ebx, ecx, edx);
+        cpuid_info.perf_version   =  (eax&0xFFU);
+        cpuid_info.perf_num_ctr   =   ((eax>>8)&0xFFU);
+        cpuid_info.perf_width_ctr =  ((eax>>16)&0xFFU);
+        cpuid_info.perf_num_fixed_ctr =  (edx&0xFU);
+
+        /* Leaf 0x06, EAX bit 1 is stored as the turbo flag */
+        eax = 0x06;
+        CPUID(eax, ebx, ecx, edx);
+        cpuid_info.turbo = (eax & (1<<1)) ? 1 : 0;
+    }
+
+    return;
+}
+
+/*
+ * Build the hardware thread pool (cpuid_topology.threadPool) from CPUID data.
+ *
+ * For every hardware thread the calling thread is pinned to it with
+ * sched_setaffinity() before CPUID is executed (presumably so that the
+ * instruction reports the values of that hardware thread). The affinity is
+ * not restored by this function.
+ *
+ * Two strategies are used:
+ *  - If CPUID leaf 0x0B is available and returns a non-zero EBX, the thread,
+ *    core and package IDs are cut out of the APIC ID returned in EDX, using
+ *    the bit offsets reported for sub-leafs 0..2.
+ *  - Otherwise a family specific legacy scheme based on CPUID leafs 0x01,
+ *    0x04 and 0x80000008 is applied.
+ *
+ * cpuSet is evaluated on the leaf 0x0B path only, where it determines the
+ * inCpuSet flag of each pool entry.
+ *
+ * On allocation failure an error is printed and cpuid_topology.threadPool is
+ * set to NULL.
+ */
+void cpuid_init_nodeTopology(cpu_set_t cpuSet)
+{
+    uint32_t apicId;
+    uint32_t bitField;
+    int level;
+    int prevOffset = 0;
+    int currOffset = 0;
+    cpu_set_t set;
+    HWThread* hwThreadPool;
+    int hasBLeaf = 0;
+    int maxNumLogicalProcs;
+    int maxNumLogicalProcsPerCore;
+    int maxNumCores;
+    int width;
+
+    hwThreadPool = (HWThread*) malloc(cpuid_topology.numHWThreads * sizeof(HWThread));
+    if (hwThreadPool == NULL)
+    {
+        ERROR_PLAIN_PRINT(Cannot allocate memory for hardware thread pool);
+        cpuid_topology.threadPool = NULL;
+        return;
+    }
+
+    /* check if 0x0B cpuid leaf is supported */
+    if (largest_function >= 0x0B)
+    {
+        eax = 0x0B;
+        ecx = 0;
+        CPUID(eax, ebx, ecx, edx);
+
+        if (ebx)
+        {
+            hasBLeaf = 1;
+        }
+    }
+
+    if (hasBLeaf)
+    {
+        for (uint32_t i=0; i < cpuid_topology.numHWThreads; i++)
+        {
+            int id;
+            CPU_ZERO(&set);
+            CPU_SET(i,&set);
+            sched_setaffinity(0, sizeof(cpu_set_t), &set);
+            eax = 0x0B;
+            ecx = 0;
+            CPUID(eax, ebx, ecx, edx);
+            /* The IDs below are extracted from the APIC ID in EDX, while the
+             * pool entry itself is indexed and labelled with the loop index */
+            apicId = edx;
+            id = i;
+            hwThreadPool[id].apicId = i;
+            hwThreadPool[id].inCpuSet = 0;
+            if (CPU_ISSET(id, &cpuSet))
+            {
+                hwThreadPool[id].inCpuSet = 1;
+            }
+
+            for (level=0; level < 3; level++)
+            {
+                eax = 0x0B;
+                ecx = level;
+                CPUID(eax, ebx, ecx, edx);
+                currOffset = eax&0xFU;
+
+                switch ( level ) {
+                    case 0:  /* SMT thread */
+                        bitField = extractBitField(apicId,
+                                currOffset,
+                                0);
+                        hwThreadPool[id].threadId = bitField;
+                        break;
+
+                    case 1:  /* Core */
+                        bitField = extractBitField(apicId,
+                                currOffset-prevOffset,
+                                prevOffset);
+                        hwThreadPool[id].coreId = bitField;
+                        break;
+
+                    case 2:  /* Package */
+                        bitField = extractBitField(apicId,
+                                32-prevOffset,
+                                prevOffset);
+                        hwThreadPool[id].packageId = bitField;
+                        break;
+
+                }
+                /* The next level starts where the current one ends */
+                prevOffset = currOffset;
+            }
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, I[%d] ID[%d] APIC[%d] T[%d] C[%d] P [%d], i, id,
+                                    hwThreadPool[id].apicId, hwThreadPool[id].threadId,
+                                    hwThreadPool[id].coreId, hwThreadPool[id].packageId);
+        }
+    }
+    else
+    {
+        switch ( cpuid_info.family )
+        {
+
+            case MIC_FAMILY:
+
+            case P6_FAMILY:
+                eax = 0x01;
+                CPUID(eax, ebx, ecx, edx);
+                maxNumLogicalProcs = extractBitField(ebx,8,16);
+
+                /* Check number of cores per package */
+                eax = 0x04;
+                ecx = 0;
+                CPUID(eax, ebx, ecx, edx);
+                maxNumCores = extractBitField(eax,6,26)+1;
+
+                maxNumLogicalProcsPerCore = maxNumLogicalProcs/maxNumCores;
+
+                for (uint32_t i=0; i<  cpuid_topology.numHWThreads; i++)
+                {
+                    int id;
+                    CPU_ZERO(&set);
+                    CPU_SET(i,&set);
+                    sched_setaffinity(0, sizeof(cpu_set_t), &set);
+
+                    eax = 0x01;
+                    CPUID(eax, ebx, ecx, edx);
+                    id = i;
+                    hwThreadPool[id].apicId = i;//extractBitField(ebx,8,24);
+
+                    /* ThreadId is extracted from the apicId using the bit width
+                     * of the number of logical processors
+                     * */
+                    hwThreadPool[id].threadId =
+                        extractBitField(hwThreadPool[id].apicId,
+                                getBitFieldWidth(maxNumLogicalProcsPerCore),0);
+
+                    /* CoreId is extracted from the apicId using the bit width
+                     * of the number of logical processors as offset and the
+                     * bit width of the number of cores as width
+                     * */
+                    hwThreadPool[id].coreId =
+                        extractBitField(hwThreadPool[id].apicId,
+                                getBitFieldWidth(maxNumCores),
+                                getBitFieldWidth(maxNumLogicalProcsPerCore));
+
+                    hwThreadPool[id].packageId =
+                        extractBitField(hwThreadPool[id].apicId,
+                                8-getBitFieldWidth(maxNumLogicalProcs),
+                                getBitFieldWidth(maxNumLogicalProcs));
+                    DEBUG_PRINT(DEBUGLEV_DEVELOP, I[%d] ID[%d] APIC[%d] T[%d] C[%d] P [%d], i, id,
+                                    hwThreadPool[id].apicId, hwThreadPool[id].threadId,
+                                    hwThreadPool[id].coreId, hwThreadPool[id].packageId);
+                }
+                break;
+
+            case K8_FAMILY:
+                /* AMD Bios manual Rev. 2.28 section 3.1
+                 * Legacy method */
+                /*FIXME: This is a bit of a hack */
+
+                maxNumLogicalProcsPerCore = 1;
+                maxNumLogicalProcs = 1;
+
+                eax = 0x80000008;
+                CPUID(eax, ebx, ecx, edx);
+
+                maxNumCores =  extractBitField(ecx,8,0)+1;
+
+                for (uint32_t i=0; i<  cpuid_topology.numHWThreads; i++)
+                {
+                    int id;
+                    CPU_ZERO(&set);
+                    CPU_SET(i,&set);
+                    sched_setaffinity(0, sizeof(cpu_set_t), &set);
+
+                    eax = 0x01;
+                    CPUID(eax, ebx, ecx, edx);
+                    /* The pool is indexed by the APIC ID on this path */
+                    id = extractBitField(ebx,8,24);
+                    if ((uint32_t) id >= cpuid_topology.numHWThreads)
+                    {
+                        /* The APIC ID does not fit into the pool, writing the
+                         * entry would overflow the allocation */
+                        continue;
+                    }
+                    hwThreadPool[id].apicId = extractBitField(ebx,8,24);
+
+                    /* ThreadId is extracted from the apicId using the bit width
+                     * of the number of logical processors.
+                     * The entry just written (index id, not i) must be read. */
+                    hwThreadPool[id].threadId =
+                        extractBitField(hwThreadPool[id].apicId,
+                                getBitFieldWidth(maxNumLogicalProcsPerCore),0);
+
+                    /* CoreId is extracted from the low bits of the apicId
+                     * using the bit width of the number of cores as width
+                     * */
+                    hwThreadPool[id].coreId =
+                        extractBitField(hwThreadPool[id].apicId,
+                                getBitFieldWidth(maxNumCores),
+                                0);
+
+                    hwThreadPool[id].packageId =
+                        extractBitField(hwThreadPool[id].apicId,
+                                8-getBitFieldWidth(maxNumCores),
+                                getBitFieldWidth(maxNumCores));
+                    DEBUG_PRINT(DEBUGLEV_DEVELOP, I[%d] ID[%d] APIC[%d] T[%d] C[%d] P [%d], i, id,
+                                    hwThreadPool[id].apicId, hwThreadPool[id].threadId,
+                                    hwThreadPool[id].coreId, hwThreadPool[id].packageId);
+                }
+                break;
+
+            case K16_FAMILY:
+
+            case K15_FAMILY:
+
+            case K10_FAMILY:
+                /* AMD Bios manual Rev. 2.28 section 3.2
+                 * Extended method */
+                eax = 0x80000008;
+                CPUID(eax, ebx, ecx, edx);
+
+                /* Bit width of the core ID inside the APIC ID */
+                width =  extractBitField(ecx,4,12);
+
+                if (width == 0)
+                {
+                    width =  extractBitField(ecx,8,0)+1;
+                }
+
+                eax = 0x01;
+                CPUID(eax, ebx, ecx, edx);
+                maxNumLogicalProcs =  extractBitField(ebx,8,16);
+                maxNumCores = extractBitField(ecx,8,0)+1;
+
+
+                for (uint32_t i=0; i<  cpuid_topology.numHWThreads; i++)
+                {
+                    int id;
+                    CPU_ZERO(&set);
+                    CPU_SET(i,&set);
+                    sched_setaffinity(0, sizeof(cpu_set_t), &set);
+
+                    eax = 0x01;
+                    CPUID(eax, ebx, ecx, edx);
+                    /* The pool is indexed by the APIC ID on this path */
+                    id = extractBitField(ebx,8,24);
+                    if ((uint32_t) id >= cpuid_topology.numHWThreads)
+                    {
+                        /* The APIC ID does not fit into the pool, writing the
+                         * entry would overflow the allocation */
+                        continue;
+                    }
+                    hwThreadPool[id].apicId = extractBitField(ebx,8,24);
+                    /* AMD only knows cores */
+                    hwThreadPool[id].threadId = 0;
+
+                    /* Read the entry just written (index id, not i) */
+                    hwThreadPool[id].coreId =
+                        extractBitField(hwThreadPool[id].apicId,
+                                width, 0);
+                    hwThreadPool[id].packageId =
+                        extractBitField(hwThreadPool[id].apicId,
+                                (8-width), width);
+                    DEBUG_PRINT(DEBUGLEV_DEVELOP, I[%d] ID[%d] APIC[%d] T[%d] C[%d] P [%d], i, id,
+                                    hwThreadPool[id].apicId, hwThreadPool[id].threadId,
+                                    hwThreadPool[id].coreId, hwThreadPool[id].packageId);
+                }
+
+                break;
+        }
+    }
+    cpuid_topology.threadPool = hwThreadPool;
+
+    return;
+}
+
+
+/*
+ * Fill cpuid_topology.cacheLevels and cpuid_topology.numCacheLevels from
+ * CPUID data, depending on cpuid_info.family:
+ *  - MIC/P6:   delegated to intelCpuidFunc_4() when CPUID leaf 4 is available,
+ *              otherwise no cache level is reported.
+ *  - K8:       L1 data and L2 from leafs 0x80000005 and 0x80000006.
+ *  - K10:      L1 data, L2 and L3 from leafs 0x80000005 and 0x80000006.
+ *  - K15/K16:  data and unified caches enumerated through leaf 0x8000001D.
+ *
+ * For unsupported families, or when the allocation fails, an error is printed
+ * and zero cache levels are published.
+ */
+void cpuid_init_cacheTopology(void)
+{
+    /* Number of entries allocated for the leaf 0x8000001D enumeration */
+    const int maxEnumLevels = 3;
+    int maxNumLevels=0;
+    int id=0;
+    CacheLevel* cachePool = NULL;
+    CacheType type = DATACACHE;
+
+    switch ( cpuid_info.family )
+    {
+        case MIC_FAMILY:
+
+        case P6_FAMILY:
+
+            if (largest_function >= 4)
+            {
+                maxNumLevels = intelCpuidFunc_4(&cachePool);
+            }
+            else
+            {
+                //                intelCpuidFunc_2(&cachePool);
+            }
+
+            break;
+
+        case K8_FAMILY:
+            maxNumLevels = 2;
+            cachePool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel));
+            if (cachePool == NULL)
+            {
+                ERROR_PLAIN_PRINT(Cannot allocate memory for cache levels);
+                maxNumLevels = 0;
+                break;
+            }
+
+            eax = 0x80000005;
+            CPUID(eax, ebx, ecx, edx);
+            cachePool[0].level = 1;
+            cachePool[0].type = DATACACHE;
+            cachePool[0].associativity = extractBitField(ecx,8,16);
+            cachePool[0].lineSize = extractBitField(ecx,8,0);
+            cachePool[0].size =  extractBitField(ecx,8,24) * 1024;
+            /* Keep sets defined even if it cannot be computed */
+            cachePool[0].sets = 0;
+            if ((cachePool[0].associativity * cachePool[0].lineSize) != 0)
+            {
+                cachePool[0].sets = cachePool[0].size/
+                    (cachePool[0].associativity * cachePool[0].lineSize);
+            }
+            cachePool[0].threads = 1;
+            cachePool[0].inclusive = 1;
+
+            eax = 0x80000006;
+            CPUID(eax, ebx, ecx, edx);
+            cachePool[1].level = 2;
+            cachePool[1].type = UNIFIEDCACHE;
+            cachePool[1].associativity =
+                amdGetAssociativity(extractBitField(ecx,4,12));
+            cachePool[1].lineSize = extractBitField(ecx,8,0);
+            cachePool[1].size =  extractBitField(ecx,16,16) * 1024;
+            cachePool[1].sets = 0;
+            /* The guard has to test the divisor that is actually used (L2) */
+            if ((cachePool[1].associativity * cachePool[1].lineSize) != 0)
+            {
+                cachePool[1].sets = cachePool[1].size/
+                    (cachePool[1].associativity * cachePool[1].lineSize);
+            }
+            cachePool[1].threads = 1;
+            cachePool[1].inclusive = 1;
+
+            break;
+
+
+        case K10_FAMILY:
+            /* FIXME: Adds one level for the instruction cache on Intel
+             * This fixes the level for the cores
+             */
+            maxNumLevels = 3;
+            cachePool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel));
+            if (cachePool == NULL)
+            {
+                ERROR_PLAIN_PRINT(Cannot allocate memory for cache levels);
+                maxNumLevels = 0;
+                break;
+            }
+
+            eax = 0x80000005;
+            CPUID(eax, ebx, ecx, edx);
+            cachePool[0].level = 1;
+            cachePool[0].type = DATACACHE;
+            cachePool[0].associativity = extractBitField(ecx,8,16);
+            cachePool[0].lineSize = extractBitField(ecx,8,0);
+            cachePool[0].size =  extractBitField(ecx,8,24) * 1024;
+            cachePool[0].sets = 0;
+            if ((cachePool[0].associativity * cachePool[0].lineSize) != 0)
+            {
+                cachePool[0].sets = cachePool[0].size/
+                    (cachePool[0].associativity * cachePool[0].lineSize);
+            }
+            cachePool[0].threads = 1;
+            cachePool[0].inclusive = 1;
+
+            /* Leaf 0x80000006 is read once: ECX is used for L2, EDX for L3 */
+            eax = 0x80000006;
+            CPUID(eax, ebx, ecx, edx);
+            cachePool[1].level = 2;
+            cachePool[1].type = UNIFIEDCACHE;
+            cachePool[1].associativity =
+                amdGetAssociativity(extractBitField(ecx,4,12));
+            cachePool[1].lineSize = extractBitField(ecx,8,0);
+            cachePool[1].size =  extractBitField(ecx,16,16) * 1024;
+            cachePool[1].sets = 0;
+            if ((cachePool[1].associativity * cachePool[1].lineSize) != 0)
+            {
+                cachePool[1].sets = cachePool[1].size/
+                    (cachePool[1].associativity * cachePool[1].lineSize);
+            }
+            cachePool[1].threads = 1;
+            cachePool[1].inclusive = 1;
+
+            cachePool[2].level = 3;
+            cachePool[2].type = UNIFIEDCACHE;
+            cachePool[2].associativity =
+                amdGetAssociativity(extractBitField(edx,4,12));
+            cachePool[2].lineSize = extractBitField(edx,8,0);
+            cachePool[2].size =  (extractBitField(edx,14,18)+1) * 524288;
+            cachePool[2].sets = 0;
+            /* The L3 set count is derived from the L3 values, not from L2 */
+            if ((cachePool[2].associativity * cachePool[2].lineSize) != 0)
+            {
+                cachePool[2].sets = cachePool[2].size/
+                    (cachePool[2].associativity * cachePool[2].lineSize);
+            }
+
+            if (cpuid_info.model != MAGNYCOURS)
+            {
+                cachePool[2].threads = cpuid_topology.numCoresPerSocket;
+            }
+            else
+            {
+                /* On MAGNYCOURS half of the cores and half of the reported
+                 * size are assigned to one L3 */
+                cachePool[2].threads = cpuid_topology.numCoresPerSocket/2;
+                cachePool[2].size /= 2 ;
+            }
+
+            cachePool[2].inclusive = 1;
+
+            break;
+
+        case K16_FAMILY:
+
+        case K15_FAMILY:
+
+            maxNumLevels = 0;
+            cachePool = (CacheLevel*) malloc(maxEnumLevels * sizeof(CacheLevel));
+            if (cachePool == NULL)
+            {
+                ERROR_PLAIN_PRINT(Cannot allocate memory for cache levels);
+                break;
+            }
+
+            /* Walk the sub-leafs until a cache type of zero is returned.
+             * The loop also stops once all allocated entries are filled, so
+             * that it cannot write past the end of cachePool. */
+            while (type && maxNumLevels < maxEnumLevels)
+            {
+                ecx = id;
+                eax = 0x8000001D;
+                CPUID(eax, ebx, ecx, edx);
+                type = (CacheType) extractBitField(eax,4,0);
+
+                /* Other cache types are skipped */
+                if ((type == DATACACHE) || (type == UNIFIEDCACHE))
+                {
+                    cachePool[maxNumLevels].level =   extractBitField(eax,3,5);
+                    cachePool[maxNumLevels].type = type;
+                    cachePool[maxNumLevels].associativity = extractBitField(ebx,10,22)+1;
+                    cachePool[maxNumLevels].lineSize = extractBitField(ebx,12,0)+1;
+                    cachePool[maxNumLevels].sets =  extractBitField(ecx,32,0)+1;
+                    cachePool[maxNumLevels].size = cachePool[maxNumLevels].associativity *
+                        cachePool[maxNumLevels].lineSize * cachePool[maxNumLevels].sets;
+                    cachePool[maxNumLevels].threads =  extractBitField(eax,12,14)+1;
+                    cachePool[maxNumLevels].inclusive =  (edx & (0x1<<1));
+                    maxNumLevels++;
+                }
+                id++;
+            }
+            break;
+
+        default:
+            ERROR_PLAIN_PRINT(Processor is not supported);
+            break;
+    }
+
+
+    cpuid_topology.numCacheLevels = maxNumLevels;
+    cpuid_topology.cacheLevels = cachePool;
+
+    return;
+}
diff --git a/src/topology_hwloc.c b/src/topology_hwloc.c
new file mode 100644
index 0000000..04c2417
--- /dev/null
+++ b/src/topology_hwloc.c
@@ -0,0 +1,327 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  topology_hwloc.c
+ *
+ *      Description:  Interface to the hwloc based topology backend
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Authors:  Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+ 
+#include <stdlib.h>
+#include <stdio.h>
+#include <error.h>
+
+#include <topology.h>
+#ifdef LIKWID_USE_HWLOC
+#include <hwloc.h>
+#include <topology_hwloc.h>
+#endif
+
+/* Handle of the hwloc topology; initialised in hwloc_init_cpuInfo() and
+ * destroyed in hwloc_close(). NULL as long as no topology was created. */
+hwloc_topology_t hwloc_topology = NULL;
+
+
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+
+/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+#ifdef LIKWID_USE_HWLOC
+/*
+ * Recursively count all objects of the given type in the subtree below obj
+ * (obj itself is not examined).
+ *
+ * If list, *list and index are all non-NULL, the os_index of every match is
+ * additionally stored at (*list)[*index] and *index is advanced by one.
+ * The topology handle t is only passed on to the recursive calls.
+ *
+ * Returns the number of matching objects, 0 for a NULL or childless obj.
+ */
+int likwid_hwloc_record_objs_of_type_below_obj(hwloc_topology_t t, hwloc_obj_t obj, hwloc_obj_type_t type, int* index, uint32_t **list)
+{
+    int child;
+    int matches = 0;
+
+    if (obj == NULL || obj->arity == 0)
+    {
+        return 0;
+    }
+
+    for (child = 0; child < obj->arity; child++)
+    {
+        hwloc_obj_t current = obj->children[child];
+
+        if (current->type == type)
+        {
+            /* Record the OS index only when an output list was supplied */
+            if (list != NULL && *list != NULL && index != NULL)
+            {
+                (*list)[*index] = current->os_index;
+                *index += 1;
+            }
+            matches += 1;
+        }
+        /* Descend in any case, matches may be nested deeper in the tree */
+        matches += likwid_hwloc_record_objs_of_type_below_obj(t, current, type, index, list);
+    }
+    return matches;
+}
+
+/*
+ * Create and load the global hwloc topology and fill cpuid_info (model,
+ * family, vendor flag, stepping, model name) from the info strings of the
+ * first socket object. Also sets cpuid_topology.numHWThreads to the number
+ * of PU objects.
+ *
+ * If no socket object is found, the fields keep their zero/empty defaults
+ * and numHWThreads is not updated.
+ *
+ * cpuSet is not used by this function.
+ */
+void hwloc_init_cpuInfo(cpu_set_t cpuSet)
+{
+    hwloc_obj_t obj;
+    const char * info;
+
+    /* Silence hwloc error messages unless a higher verbosity is requested */
+    if (perfmon_verbosity <= 1)
+    {
+        setenv("HWLOC_HIDE_ERRORS", "1", 1);
+    }
+    likwid_hwloc_topology_init(&hwloc_topology);
+    likwid_hwloc_topology_set_flags(hwloc_topology, HWLOC_TOPOLOGY_FLAG_WHOLE_IO );
+    likwid_hwloc_topology_load(hwloc_topology);
+    obj = likwid_hwloc_get_obj_by_type(hwloc_topology, HWLOC_OBJ_SOCKET, 0);
+
+    cpuid_info.model = 0;
+    cpuid_info.family = 0;
+    cpuid_info.isIntel = 0;
+    cpuid_info.stepping = 0;
+    cpuid_info.osname = malloc(MAX_MODEL_STRING_LENGTH * sizeof(char));
+    if (cpuid_info.osname != NULL)
+    {
+        cpuid_info.osname[0] = '\0';
+    }
+    else
+    {
+        /* Continue without a model name instead of dereferencing NULL */
+        ERROR_PLAIN_PRINT(Cannot allocate memory for the CPU model name);
+    }
+    if (!obj)
+    {
+        return;
+    }
+
+    if ((info = likwid_hwloc_obj_get_info_by_name(obj, "CPUModelNumber")))
+        cpuid_info.model = atoi(info);
+    if ((info = likwid_hwloc_obj_get_info_by_name(obj, "CPUFamilyNumber")))
+       cpuid_info.family = atoi(info);
+    if ((info = likwid_hwloc_obj_get_info_by_name(obj, "CPUVendor")))
+        cpuid_info.isIntel = strcmp(info, "GenuineIntel") == 0;
+    /* Bounded copy: the model string is truncated to the buffer size */
+    if (cpuid_info.osname != NULL &&
+        (info = likwid_hwloc_obj_get_info_by_name(obj, "CPUModel")))
+        snprintf(cpuid_info.osname, MAX_MODEL_STRING_LENGTH, "%s", info);
+    if ((info = likwid_hwloc_obj_get_info_by_name(obj, "CPUStepping")))
+        cpuid_info.stepping = atoi(info);
+
+    cpuid_topology.numHWThreads = likwid_hwloc_get_nbobjs_by_type(hwloc_topology, HWLOC_OBJ_PU);
+    DEBUG_PRINT(DEBUGLEV_DEVELOP, HWLOC CpuInfo Family %d Model %d Stepping %d isIntel %d numHWThreads %d activeHWThreads %d,
+                            cpuid_info.family,
+                            cpuid_info.model,
+                            cpuid_info.stepping,
+                            cpuid_info.isIntel,
+                            cpuid_topology.numHWThreads,
+                            cpuid_topology.activeHWThreads);
+    return;
+}
+
+/*
+ * Build the hardware thread pool (cpuid_topology.threadPool) from the hwloc
+ * topology. The pool is indexed by the OS index of each PU object.
+ *
+ * All entries start with -1 IDs and inCpuSet = 0. For every PU that hwloc
+ * reports, the thread ID is the sibling rank of the PU, the core ID the OS
+ * index of the enclosing core object and the package ID the OS index of the
+ * enclosing socket object (or NODE object, if hwloc knows no sockets).
+ * If no enclosing core or socket is found, the missing IDs are set to 0.
+ *
+ * cpuSet is not consulted: every PU reported by hwloc is marked as inCpuSet.
+ *
+ * On allocation failure an error is printed and cpuid_topology.threadPool is
+ * set to NULL.
+ */
+void hwloc_init_nodeTopology(cpu_set_t cpuSet)
+{
+    HWThread* hwThreadPool;
+    hwloc_obj_t obj;
+    int id = 0;
+    hwloc_obj_type_t socket_type = HWLOC_OBJ_SOCKET;
+
+    hwThreadPool = (HWThread*) malloc(cpuid_topology.numHWThreads * sizeof(HWThread));
+    if (hwThreadPool == NULL)
+    {
+        ERROR_PLAIN_PRINT(Cannot allocate memory for hardware thread pool);
+        cpuid_topology.threadPool = NULL;
+        return;
+    }
+    for (uint32_t i=0;i<cpuid_topology.numHWThreads;i++)
+    {
+        hwThreadPool[i].apicId = -1;
+        hwThreadPool[i].threadId = -1;
+        hwThreadPool[i].coreId = -1;
+        hwThreadPool[i].packageId = -1;
+        hwThreadPool[i].inCpuSet = 0;
+    }
+
+    /* Fall back to NODE objects if the topology contains no sockets */
+    if (likwid_hwloc_get_nbobjs_by_type(hwloc_topology, socket_type) == 0)
+    {
+        socket_type = HWLOC_OBJ_NODE;
+    }
+
+    for (uint32_t i=0; i< cpuid_topology.numHWThreads; i++)
+    {
+        int skip = 0;
+        obj = likwid_hwloc_get_obj_by_type(hwloc_topology, HWLOC_OBJ_PU, i);
+        if (!obj)
+        {
+            continue;
+        }
+        id = obj->os_index;
+        if (id < 0 || (uint32_t) id >= cpuid_topology.numHWThreads)
+        {
+            /* The OS index does not fit into the pool, writing the entry
+             * would overflow the allocation */
+            continue;
+        }
+        hwThreadPool[id].inCpuSet = 1;
+        hwThreadPool[id].apicId = obj->os_index;
+        hwThreadPool[id].threadId = obj->sibling_rank;
+        /* Walk up the tree to the enclosing core */
+        while (obj->type != HWLOC_OBJ_CORE) {
+            obj = obj->parent;
+            if (!obj)
+            {
+                skip = 1;
+                break;
+            }
+        }
+        if (skip)
+        {
+            hwThreadPool[id].coreId = 0;
+            hwThreadPool[id].packageId = 0;
+            continue;
+        }
+        hwThreadPool[id].coreId = obj->os_index;
+        /* Continue upwards to the enclosing socket */
+        while (obj->type != socket_type) {
+            obj = obj->parent;
+            if (!obj)
+            {
+                skip = 1;
+                break;
+            }
+        }
+        if (skip)
+        {
+            hwThreadPool[id].packageId = 0;
+            continue;
+        }
+        hwThreadPool[id].packageId = obj->os_index;
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, I[%d] ID[%d] APIC[%d] T[%d] C[%d] P [%d], i, id,
+                                    hwThreadPool[id].apicId, hwThreadPool[id].threadId,
+                                    hwThreadPool[id].coreId, hwThreadPool[id].packageId);
+    }
+
+    cpuid_topology.threadPool = hwThreadPool;
+
+    return;
+}
+
+
+/*
+ * Fill cpuid_topology.cacheLevels and cpuid_topology.numCacheLevels from the
+ * hwloc topology.
+ *
+ * Every topology depth of type HWLOC_OBJ_CACHE yields one entry, described by
+ * the first cache object at that depth. The depths are visited from the
+ * bottom of the tree upwards.
+ *
+ * If the inclusiveness of a cache cannot be determined, an error is printed
+ * and the scan stops. Only completely filled entries are counted in
+ * numCacheLevels.
+ */
+void hwloc_init_cacheTopology(void)
+{
+    int maxNumLevels=0;
+    int id=0;
+    CacheLevel* cachePool = NULL;
+    hwloc_obj_t obj;
+    int depth;
+    int d;
+    const char* info;
+
+    /* Sum up all depths with caches */
+    depth = likwid_hwloc_topology_get_depth(hwloc_topology);
+    for (d = 0; d < depth; d++)
+    {
+        if (likwid_hwloc_get_depth_type(hwloc_topology, d) == HWLOC_OBJ_CACHE)
+            maxNumLevels++;
+    }
+    cachePool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel));
+    if (cachePool == NULL && maxNumLevels > 0)
+    {
+        ERROR_PLAIN_PRINT(Cannot allocate memory for cache levels);
+        cpuid_topology.numCacheLevels = 0;
+        cpuid_topology.cacheLevels = NULL;
+        return;
+    }
+
+    /* Start at the bottom of the tree to get all cache levels in order */
+    for(d=depth-1;d >= 0; d--)
+    {
+        /* We only need caches, so skip other levels */
+        if (likwid_hwloc_get_depth_type(hwloc_topology, d) != HWLOC_OBJ_CACHE)
+        {
+            continue;
+        }
+        /* Get the cache object */
+        obj = likwid_hwloc_get_obj_by_depth(hwloc_topology, d, 0);
+        if (!obj)
+        {
+            continue;
+        }
+        /* All caches have this attribute, so safe to access */
+        switch (obj->attr->cache.type)
+        {
+            case HWLOC_OBJ_CACHE_DATA:
+                cachePool[id].type = DATACACHE;
+                break;
+            case HWLOC_OBJ_CACHE_INSTRUCTION:
+                cachePool[id].type = INSTRUCTIONCACHE;
+                break;
+            case HWLOC_OBJ_CACHE_UNIFIED:
+                cachePool[id].type = UNIFIEDCACHE;
+                break;
+            default:
+                cachePool[id].type = NOCACHE;
+                break;
+        }
+
+        cachePool[id].associativity = obj->attr->cache.associativity;
+        cachePool[id].level = obj->attr->cache.depth;
+        cachePool[id].lineSize = obj->attr->cache.linesize;
+        cachePool[id].size = obj->attr->cache.size;
+        cachePool[id].sets = 0;
+        if ((cachePool[id].associativity * cachePool[id].lineSize) != 0)
+        {
+            cachePool[id].sets = cachePool[id].size /
+                (cachePool[id].associativity * cachePool[id].lineSize);
+        }
+
+        /* Count all HWThreads below the current cache */
+        cachePool[id].threads = likwid_hwloc_record_objs_of_type_below_obj(
+                        hwloc_topology, obj, HWLOC_OBJ_PU, NULL, NULL);
+
+        while (!(info = likwid_hwloc_obj_get_info_by_name(obj, "inclusiveness")) && obj->next_cousin)
+        {
+            obj = obj->next_cousin; // If some PU/core are not bindable because of cgroup, hwloc may not know the inclusiveness of some of their cache.
+        }
+        if(info)
+        {
+            /* Any value starting with 't' is treated as inclusive */
+            cachePool[id].inclusive = info[0]=='t';
+        }
+        else
+        {
+            ERROR_PLAIN_PRINT(Processor is not supported);
+            break;
+        }
+        id++;
+    }
+
+    /* Publish only the completely initialised entries. After an early exit
+     * from the loop the remaining entries are uninitialised. */
+    cpuid_topology.numCacheLevels = id;
+    cpuid_topology.cacheLevels = cachePool;
+    return;
+}
+
+/*
+ * Destroy the global hwloc topology, if one exists.
+ *
+ * The handle is reset to NULL afterwards, so that a repeated call does not
+ * destroy the same topology twice.
+ */
+void hwloc_close(void)
+{
+    if (hwloc_topology)
+    {
+        hwloc_topology_destroy(hwloc_topology);
+        hwloc_topology = NULL;
+    }
+}
+
+#else
+
+/* Stub for builds without LIKWID_USE_HWLOC: intentionally does nothing. */
+void hwloc_init_cpuInfo(void)
+{
+}
+
+/* Stub for builds without LIKWID_USE_HWLOC: intentionally does nothing. */
+void hwloc_init_cpuFeatures(void)
+{
+}
+
+/* Stub for builds without LIKWID_USE_HWLOC: intentionally does nothing. */
+void hwloc_init_nodeTopology(void)
+{
+}
+
+/* Stub for builds without LIKWID_USE_HWLOC: intentionally does nothing. */
+void hwloc_init_cacheTopology(void)
+{
+}
+#endif
diff --git a/src/topology_proc.c b/src/topology_proc.c
new file mode 100644
index 0000000..1d3d0e0
--- /dev/null
+++ b/src/topology_proc.c
@@ -0,0 +1,626 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  topology_proc.c
+ *
+ *      Description:  Interface to the procfs/sysfs based topology backend
+ *
+ *      Version:   4.1
+ *      Released:  19.5.2016
+ *
+ *      Authors:  Jan Treibig (jt), jan.treibig at gmail.com,
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <topology_proc.h>
+#include <cpuid.h>
+
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+/*
+ * Query CPUID for performance-monitoring data and store it in cpuid_info.
+ *
+ * Only acts when cpuid_info.family is P6_FAMILY and the highest CPUID leaf
+ * reported by leaf 0x00 is at least 0x0A. In that case leaf 0x0A fills
+ * perf_version, perf_num_ctr, perf_width_ctr and perf_num_fixed_ctr, and
+ * bit 1 of EAX from leaf 0x06 decides the turbo flag.
+ *
+ * Always returns 0.
+ */
+static int get_cpu_perf_data(void)
+{
+    uint32_t eax = 0x00;
+    uint32_t ebx = 0x0U;
+    uint32_t ecx = 0x0U;
+    uint32_t edx = 0x0U;
+    int maxLeaf = 0;
+
+    /* Leaf 0x00 returns the largest supported standard leaf in EAX */
+    CPUID(eax, ebx, ecx, edx);
+    maxLeaf = eax;
+
+    if (cpuid_info.family != P6_FAMILY || maxLeaf < 0x0A)
+    {
+        return 0;
+    }
+
+    eax = 0x0A;
+    CPUID(eax, ebx, ecx, edx);
+    cpuid_info.perf_version = (eax & 0xFFU);
+    cpuid_info.perf_num_ctr = ((eax >> 8) & 0xFFU);
+    cpuid_info.perf_width_ctr = ((eax >> 16) & 0xFFU);
+    cpuid_info.perf_num_fixed_ctr = (edx & 0xFU);
+
+    eax = 0x06;
+    CPUID(eax, ebx, ecx, edx);
+    cpuid_info.turbo = (eax & (1<<1)) ? 1 : 0;
+
+    return 0;
+}
+
+/*
+ * Return the zero-based position of ownid inside a comma separated ID list.
+ *
+ * Entries may be single IDs ("0,4") or inclusive ranges ("0-3,8"); a range
+ * counts as all the IDs it covers, so the result is the position in the
+ * expanded list. For lists without ranges this equals the index of the
+ * matching entry.
+ *
+ * Returns -1 if ownid is not in the list or the list cannot be split.
+ * All temporary bstrings and lists are released before returning.
+ */
+int get_listPosition(int ownid, bstring list)
+{
+    int position = -1;
+    int offset = 0;
+    int (*ownatoi)(const char*);
+    bstring ownStr = bformat("%d",ownid);
+    struct bstrList* tokens = bsplit(list,(char) ',');
+    ownatoi = &atoi;
+
+    if (ownStr != NULL && tokens != NULL)
+    {
+        for (int i=0; i<tokens->qty && position < 0; i++)
+        {
+            btrimws(tokens->entry[i]);
+            if (bstrchrp(tokens->entry[i],'-',0) == BSTR_ERR)
+            {
+                /* Plain ID entry */
+                if (bstrcmp(ownStr, tokens->entry[i]) == BSTR_OK)
+                {
+                    position = offset;
+                }
+                offset++;
+            }
+            else
+            {
+                /* Range entry "first-last": covers last-first+1 positions */
+                struct bstrList* range = bsplit(tokens->entry[i],'-');
+                if (range != NULL && range->qty == 2)
+                {
+                    int first = ownatoi(bdata(range->entry[0]));
+                    int last = ownatoi(bdata(range->entry[1]));
+                    if (ownid >= first && ownid <= last)
+                    {
+                        position = offset + (ownid - first);
+                    }
+                    else if (last >= first)
+                    {
+                        offset += last - first + 1;
+                    }
+                }
+                /* bstrListDestroy tolerates a NULL list */
+                bstrListDestroy(range);
+            }
+        }
+    }
+
+    /* Both destroy calls tolerate NULL, so no extra guards are needed */
+    bstrListDestroy(tokens);
+    bdestroy(ownStr);
+    return position;
+}
+
+/*
+ * Expand a comma separated list of IDs and inclusive "a-b" ranges.
+ *
+ * Every expanded ID is written to outList starting at index outOffset,
+ * unless outList is NULL, in which case the IDs are only counted.
+ * Entries containing '-' that do not split into exactly two parts are
+ * ignored.
+ *
+ * Returns the number of expanded IDs.
+ */
+int fillList(int* outList, int outOffset, bstring list)
+{
+    int (*toInt)(const char*) = &atoi;
+    int count = 0;
+    struct bstrList* items = bsplit(list,',');
+
+    for (int idx = 0; idx < items->qty; idx++)
+    {
+        bstring item = items->entry[idx];
+        btrimws(item);
+
+        if (bstrchrp(item,'-',0) != BSTR_ERR)
+        {
+            /* Range entry: emit every value from the first to the last bound */
+            struct bstrList* bounds = bsplit(item,'-');
+            if (bounds->qty == 2)
+            {
+                int last = toInt(bdata(bounds->entry[1]));
+                for (int value = toInt(bdata(bounds->entry[0])); value <= last; value++)
+                {
+                    if (outList)
+                    {
+                        outList[outOffset+count] = value;
+                    }
+                    count++;
+                }
+            }
+            bstrListDestroy(bounds);
+            continue;
+        }
+
+        /* Single ID entry */
+        if (outList)
+        {
+            outList[outOffset+count] = toInt(bdata(item));
+        }
+        count++;
+    }
+
+    bstrListDestroy(items);
+    return count;
+}
+
+/*
+ * Execute CPUID leaf 0x04 with ECX set to level and return bit 1 of EDX
+ * (the result is 0 or 2, not normalised to 0/1).
+ * NOTE(review): level is passed unchanged as the ECX sub-leaf index; confirm
+ * that this matches the cache enumeration order of the target CPUs.
+ */
+static int readCacheInclusiveIntel(int level)
+{
+    uint32_t eax = 0x04;
+    uint32_t ebx = 0x0U;
+    uint32_t ecx = level;
+    uint32_t edx = 0x0U;
+
+    CPUID(eax, ebx, ecx, edx);
+    return edx & (0x1<<1);
+}
+
+/*
+ * Execute CPUID leaf 0x8000001D with ECX set to level and return bit 1 of
+ * EDX (the result is 0 or 2, not normalised to 0/1).
+ * NOTE(review): level is passed unchanged as the ECX sub-leaf index; confirm
+ * that this matches the cache enumeration order of the target CPUs.
+ */
+static int readCacheInclusiveAMD(int level)
+{
+    uint32_t eax = 0x8000001D;
+    uint32_t ebx = 0x0U;
+    uint32_t ecx = level;
+    uint32_t edx = 0x0U;
+
+    CPUID(eax, ebx, ecx, edx);
+    return edx & 0x2;
+}
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+/*
+ * Extract the text between the first and second ':' of a /proc/cpuinfo line
+ * with leading whitespace removed. Returns a newly allocated bstring that the
+ * caller must bdestroy(), or NULL if the line has no ':' separated value.
+ */
+static bstring proc_cpuinfo_value(bstring line)
+{
+    bstring value = NULL;
+    struct bstrList* parts = bsplit(line,(char) ':');
+    if (parts != NULL)
+    {
+        if (parts->qty >= 2)
+        {
+            value = bstrcpy(parts->entry[1]);
+            if (value != NULL)
+            {
+                bltrimws(value);
+            }
+        }
+        bstrListDestroy(parts);
+    }
+    return value;
+}
+
+/*
+ * Fill cpuid_info (isIntel, model, family, stepping, osname) and
+ * cpuid_topology.numHWThreads from /proc/cpuinfo.
+ *
+ * numHWThreads is the number of "processor" lines. model and family are
+ * taken from the first line reporting a non-zero value; stepping, the model
+ * name and the vendor are updated by every matching line. The cpuSet
+ * argument is not used by this function.
+ *
+ * osname is allocated with MAX_MODEL_STRING_LENGTH bytes and always
+ * NUL-terminated; the model name is truncated if it does not fit. If
+ * /proc/cpuinfo cannot be opened, the fields keep their reset values.
+ */
+void proc_init_cpuInfo(cpu_set_t cpuSet)
+{
+    int i = 0;
+    int HWthreads = 0;
+    FILE *fp = NULL;
+
+    int (*ownatoi)(const char*);
+    char* (*ownstrncpy)(char*,const char*,size_t);
+    ownatoi = &atoi;
+    ownstrncpy = &strncpy;
+
+    bstring countString = bformat("processor\t:");
+    bstring modelString = bformat("model\t\t:");
+    bstring familyString = bformat("cpu family\t:");
+    bstring steppingString = bformat("stepping\t:");
+    bstring vendorString = bformat("vendor_id\t:");
+    bstring vendorIntelString = bformat("GenuineIntel");
+    bstring nameString = bformat("model name\t:");
+
+    cpuid_info.isIntel = 0;
+    cpuid_info.model = 0;
+    cpuid_info.family = 0;
+    cpuid_info.stepping = 0;
+    cpuid_topology.numHWThreads = 0;
+    cpuid_info.osname = malloc(MAX_MODEL_STRING_LENGTH * sizeof(char));
+    if (cpuid_info.osname != NULL)
+    {
+        /* Keep the name a valid string even if no model name line exists */
+        cpuid_info.osname[0] = '\0';
+    }
+
+    if (NULL != (fp = fopen ("/proc/cpuinfo", "r"))) 
+    {
+        bstring src = bread ((bNread) fread, fp);
+        struct bstrList* tokens = bsplit(src,(char) '\n');
+        bdestroy(src);
+        fclose(fp);
+        for (i=0; tokens != NULL && i<tokens->qty; i++)
+        {
+            bstring line = tokens->entry[i];
+            bstring value = NULL;
+
+            if (binstr(line,0,countString) != BSTR_ERR)
+            {
+                HWthreads++;
+                continue;
+            }
+
+            /* All remaining keys need the value after the ':' separator */
+            value = proc_cpuinfo_value(line);
+            if (value == NULL)
+            {
+                continue;
+            }
+
+            if ((cpuid_info.model == 0) && (binstr(line,0,modelString) != BSTR_ERR))
+            {
+                cpuid_info.model = ownatoi(bdata(value));
+            }
+            else if ((cpuid_info.family == 0) && (binstr(line,0,familyString) != BSTR_ERR))
+            {
+                cpuid_info.family = ownatoi(bdata(value));
+            }
+            else if (binstr(line,0,steppingString) != BSTR_ERR)
+            {
+                cpuid_info.stepping = ownatoi(bdata(value));
+            }
+            else if (binstr(line,0,nameString) != BSTR_ERR)
+            {
+                if (cpuid_info.osname != NULL)
+                {
+                    /* Bounded copy: never write past the allocated buffer */
+                    ownstrncpy(cpuid_info.osname, bdata(value), MAX_MODEL_STRING_LENGTH-1);
+                    cpuid_info.osname[MAX_MODEL_STRING_LENGTH-1] = '\0';
+                }
+            }
+            else if (binstr(line,0,vendorString) != BSTR_ERR)
+            {
+                if (bstrcmp(value, vendorIntelString) == BSTR_OK)
+                {
+                    cpuid_info.isIntel = 1;
+                }
+            }
+            bdestroy(value);
+        }
+        bstrListDestroy(tokens);
+        cpuid_topology.numHWThreads = HWthreads;
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, PROC CpuInfo Family %d Model %d Stepping %d isIntel %d numHWThreads %d,
+                            cpuid_info.family,
+                            cpuid_info.model,
+                            cpuid_info.stepping,
+                            cpuid_info.isIntel,
+                            cpuid_topology.numHWThreads)
+    }
+
+    /* Release the search patterns (bdestroy tolerates NULL) */
+    bdestroy(countString);
+    bdestroy(modelString);
+    bdestroy(familyString);
+    bdestroy(steppingString);
+    bdestroy(vendorString);
+    bdestroy(vendorIntelString);
+    bdestroy(nameString);
+    return;
+}
+
+/*
+ * Parse the first "flags" line of /proc/cpuinfo and record the recognised
+ * CPU features in cpuid_info.
+ *
+ * cpuid_info.featureFlags gets one bit per recognised flag and
+ * cpuid_info.features the matching space separated upper-case labels, in
+ * the order the flags appear in /proc/cpuinfo. SSE3 is added implicitly when
+ * SSSE3 is present but SSE3 was not listed. Afterwards the
+ * performance-monitoring data is read with get_cpu_perf_data().
+ *
+ * If /proc/cpuinfo cannot be opened, contains no flags line, or the feature
+ * string cannot be allocated, the function returns without calling
+ * get_cpu_perf_data().
+ */
+void proc_init_cpuFeatures(void)
+{
+    /* /proc/cpuinfo flag name -> feature bit and label appended to features */
+    static const struct {
+        const char* flag;
+        int bit;
+        const char* label;
+    } knownFeatures[] = {
+        {"ssse3",   SSSE3,   "SSSE3 "},
+        {"sse3",    SSE3,    "SSE3 "},
+        {"monitor", MONITOR, "MONITOR "},
+        {"mmx",     MMX,     "MMX "},
+        {"sse",     SSE,     "SSE "},
+        {"sse2",    SSE2,    "SSE2 "},
+        {"acpi",    ACPI,    "ACPI "},
+        {"rdtscp",  RDTSCP,  "RDTSCP "},
+        {"vmx",     VMX,     "VMX "},
+        {"est",     EIST,    "EIST "},
+        {"tm",      TM,      "TM "},
+        {"tm2",     TM2,     "TM2 "},
+        {"aes",     AES,     "AES "},
+        {"rdrand",  RDRAND,  "RDRAND "},
+        {"sse4_1",  SSE41,   "SSE4.1 "},
+        {"sse4_2",  SSE42,   "SSE4.2 "},
+        {"avx",     AVX,     "AVX "},
+        {"fma",     FMA,     "FMA "},
+        {"avx2",    AVX2,    "AVX2 "},
+        {"rtm",     RTM,     "RTM "},
+        {"hle",     HLE,     "HLE "},
+        {"rdseed",  RDSEED,  "RDSEED "},
+        {"ht",      HTT,     "HTT "}
+    };
+    const int numKnownFeatures = (int) (sizeof(knownFeatures)/sizeof(knownFeatures[0]));
+    int found = 0;
+    FILE* file;
+    /* Flags lines of current CPUs exceed 1 KiB, so use a generous buffer */
+    char buf[8192];
+    char ident[30];
+    char delimiter[] = " ";
+    char* cptr;
+
+    if ( (file = fopen( "/proc/cpuinfo", "r")) == NULL )
+    {
+        fprintf(stderr, "Cannot open /proc/cpuinfo\n");
+        return;
+    }
+    while( fgets(buf, sizeof(buf), file) )
+    {
+        /* Width-limited conversion so long keys cannot overflow ident */
+        if (sscanf(buf, "%29s", ident) == 1 && strcmp(ident,"flags") == 0)
+        {
+            found = 1;
+            break;
+        }
+    }
+    fclose(file);
+    if (!found)
+    {
+        return;
+    }
+
+    cpuid_info.featureFlags = 0;
+    cpuid_info.features = (char*) malloc(MAX_FEATURE_STRING_LENGTH*sizeof(char));
+    if (cpuid_info.features == NULL)
+    {
+        fprintf(stderr, "Cannot allocate memory for CPU feature string\n");
+        return;
+    }
+    cpuid_info.features[0] = '\0';
+    buf[strcspn(buf, "\n")] = '\0';
+
+    /* Tokenise the part after the ':' separator; without a separator start
+     * right behind the "flags" key, which the sscanf match guarantees. */
+    cptr = strchr(buf, ':');
+    cptr = (cptr != NULL) ? cptr + 1 : strstr(buf, "flags") + 5;
+    cptr = strtok(cptr, delimiter);
+
+    while (cptr != NULL)
+    {
+        for (int i = 0; i < numKnownFeatures; i++)
+        {
+            if (strcmp(cptr, knownFeatures[i].flag) == 0)
+            {
+                cpuid_info.featureFlags |= (1<<knownFeatures[i].bit);
+                /* Append the label only while it still fits into the buffer */
+                if (strlen(cpuid_info.features) + strlen(knownFeatures[i].label) <
+                        (size_t) MAX_FEATURE_STRING_LENGTH)
+                {
+                    strcat(cpuid_info.features, knownFeatures[i].label);
+                }
+                break;
+            }
+        }
+        cptr = strtok(NULL, delimiter);
+    }
+
+    /* SSSE3 without a listed SSE3 flag: mark SSE3 as well */
+    if ((cpuid_info.featureFlags & (1<<SSSE3)) && !((cpuid_info.featureFlags) & (1<<SSE3)))
+    {
+        cpuid_info.featureFlags |= (1<<SSE3);
+        if (strlen(cpuid_info.features) + strlen("SSE3 ") < (size_t) MAX_FEATURE_STRING_LENGTH)
+        {
+            strcat(cpuid_info.features, "SSE3 ");
+        }
+    }
+
+    get_cpu_perf_data();
+    return;
+}
+
+
+
+/*
+ * Build the hardware thread pool from the sysfs CPU topology files.
+ *
+ * For each of the cpuid_topology.numHWThreads hardware threads an entry is
+ * created with apicId set to the thread index and inCpuSet reflecting
+ * membership in cpuSet. coreId, packageId and threadId are read from
+ * core_id, physical_package_id and thread_siblings_list below
+ * /sys/devices/system/cpu/cpu<N>/topology and stay -1 when the file cannot
+ * be read. The pool is stored in cpuid_topology.threadPool.
+ *
+ * If the pool cannot be allocated, an error is printed and
+ * cpuid_topology.threadPool is left untouched.
+ */
+void proc_init_nodeTopology(cpu_set_t cpuSet)
+{
+    HWThread* hwThreadPool;
+    FILE *fp;
+    bstring cpudir;
+    bstring file;
+    int (*ownatoi)(const char*);
+    ownatoi = &atoi;
+
+    hwThreadPool = (HWThread*) malloc(cpuid_topology.numHWThreads * sizeof(HWThread));
+    if (hwThreadPool == NULL)
+    {
+        ERROR_PLAIN_PRINT(Cannot allocate memory for hardware thread pool);
+        return;
+    }
+    for (uint32_t i=0;i<cpuid_topology.numHWThreads;i++)
+    {
+        hwThreadPool[i].apicId = i;
+        hwThreadPool[i].threadId = -1;
+        hwThreadPool[i].coreId = -1;
+        hwThreadPool[i].packageId = -1;
+        hwThreadPool[i].inCpuSet = 1;
+        if (!CPU_ISSET(i, &cpuSet))
+        {
+            hwThreadPool[i].inCpuSet = 0;
+        }
+        cpudir = bformat("/sys/devices/system/cpu/cpu%d/topology",i);
+        file = bformat("%s/core_id", bdata(cpudir));
+        if (NULL != (fp = fopen (bdata(file), "r")))
+        {
+            bstring src = bread ((bNread) fread, fp);
+            if (src != NULL)
+            {
+                hwThreadPool[i].coreId = ownatoi(bdata(src));
+                /* The file content is only needed for the conversion */
+                bdestroy(src);
+            }
+            fclose(fp);
+        }
+        bdestroy(file);
+        file = bformat("%s/physical_package_id", bdata(cpudir));
+        if (NULL != (fp = fopen (bdata(file), "r")))
+        {
+            bstring src = bread ((bNread) fread, fp);
+            if (src != NULL)
+            {
+                hwThreadPool[i].packageId = ownatoi(bdata(src));
+                bdestroy(src);
+            }
+            fclose(fp);
+        }
+        bdestroy(file);
+        file = bformat("%s/thread_siblings_list", bdata(cpudir));
+        if (NULL != (fp = fopen (bdata(file), "r")))
+        {
+            bstring src = bread ((bNread) fread, fp);
+            if (src != NULL)
+            {
+                /* Position of this thread among its siblings */
+                hwThreadPool[i].threadId = get_listPosition(i, src);
+                bdestroy(src);
+            }
+            fclose(fp);
+        }
+        bdestroy(file);
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, PROC Thread Pool PU %d Thread %d Core %d Socket %d,
+                            hwThreadPool[i].apicId,
+                            hwThreadPool[i].threadId,
+                            hwThreadPool[i].coreId,
+                            hwThreadPool[i].packageId)
+        bdestroy(cpudir);
+    }
+    cpuid_topology.threadPool = hwThreadPool;
+    return;
+}
+
+/*
+ * Read the attribute file <cpudir>/index<index>/<attr> and return its
+ * content with surrounding whitespace removed. Returns NULL if the file
+ * cannot be opened or read. The caller must bdestroy() the result.
+ */
+static bstring proc_read_cache_attr(bstring cpudir, int index, const char* attr)
+{
+    bstring content = NULL;
+    FILE* fp = NULL;
+    bstring path = bformat("%s/index%d/%s", bdata(cpudir), index, attr);
+    if (path == NULL)
+    {
+        return NULL;
+    }
+    fp = fopen(bdata(path), "r");
+    if (fp != NULL)
+    {
+        content = bread((bNread) fread, fp);
+        fclose(fp);
+        if (content != NULL)
+        {
+            btrimws(content);
+        }
+    }
+    bdestroy(path);
+    return content;
+}
+
+/*
+ * Build the cache description from /sys/devices/system/cpu/cpu0/cache.
+ *
+ * The consecutive index<N> directories (N = 0..9) that provide a level file
+ * are counted and one CacheLevel entry is filled per directory from the
+ * files level, type, size, ways_of_associativity, coherency_line_size,
+ * number_of_sets and shared_cpu_list. Fields whose file is missing stay 0
+ * (type: NOCACHE); sets falls back to size / (associativity * lineSize)
+ * when that product is non-zero. The inclusive flag is determined per CPU
+ * family; for unknown families an error is printed and the flag stays 0.
+ *
+ * The result is stored in cpuid_topology.cacheLevels, with the number of
+ * entries in cpuid_topology.numCacheLevels. If the pool cannot be allocated,
+ * an error is printed and an empty cache list (0 / NULL) is stored.
+ */
+void proc_init_cacheTopology(void)
+{
+    CacheLevel* cachePool = NULL;
+    int nrCaches = 0;
+    bstring cpudir = bformat("/sys/devices/system/cpu/cpu0/cache");
+    bstring src = NULL;
+    int (*ownatoi)(const char*);
+    ownatoi = &atoi;
+
+    /* Count the cache index directories; stop at the first missing one */
+    for (int i=0;i<10;i++)
+    {
+        src = proc_read_cache_attr(cpudir, i, "level");
+        if (src == NULL)
+        {
+            break;
+        }
+        nrCaches++;
+        bdestroy(src);
+    }
+
+    /* Zero-initialised so that unreadable attributes have defined values */
+    cachePool = (CacheLevel*) calloc(nrCaches, sizeof(CacheLevel));
+    if (cachePool == NULL && nrCaches > 0)
+    {
+        ERROR_PLAIN_PRINT(Cannot allocate memory for cache pool);
+        bdestroy(cpudir);
+        cpuid_topology.numCacheLevels = 0;
+        cpuid_topology.cacheLevels = NULL;
+        return;
+    }
+
+    for (int i=0;i<nrCaches;i++)
+    {
+        src = proc_read_cache_attr(cpudir, i, "level");
+        if (src != NULL)
+        {
+            cachePool[i].level = ownatoi(bdata(src));
+            bdestroy(src);
+        }
+
+        cachePool[i].type = NOCACHE;
+        src = proc_read_cache_attr(cpudir, i, "type");
+        if (src != NULL)
+        {
+            bstring unifiedStr = bformat("Unified");
+            bstring dataStr = bformat("Data");
+            bstring intrStr = bformat("Instruction");
+            if (bstrcmp(dataStr, src) == BSTR_OK)
+            {
+                cachePool[i].type = DATACACHE;
+            }
+            else if (bstrcmp(intrStr, src) == BSTR_OK)
+            {
+                cachePool[i].type = INSTRUCTIONCACHE;
+            }
+            else if (bstrcmp(unifiedStr, src) == BSTR_OK)
+            {
+                cachePool[i].type = UNIFIEDCACHE;
+            }
+            bdestroy(unifiedStr);
+            bdestroy(dataStr);
+            bdestroy(intrStr);
+            bdestroy(src);
+        }
+
+        src = proc_read_cache_attr(cpudir, i, "size");
+        if (src != NULL)
+        {
+            /* Drop the trailing unit character, then scale by 1024 */
+            bdelete(src, blength(src)-1, 1);
+            cachePool[i].size = ownatoi(bdata(src)) * 1024;
+            bdestroy(src);
+        }
+
+        src = proc_read_cache_attr(cpudir, i, "ways_of_associativity");
+        if (src != NULL)
+        {
+            cachePool[i].associativity = ownatoi(bdata(src));
+            bdestroy(src);
+        }
+
+        src = proc_read_cache_attr(cpudir, i, "coherency_line_size");
+        if (src != NULL)
+        {
+            cachePool[i].lineSize = ownatoi(bdata(src));
+            bdestroy(src);
+        }
+
+        src = proc_read_cache_attr(cpudir, i, "number_of_sets");
+        if (src != NULL)
+        {
+            cachePool[i].sets = ownatoi(bdata(src));
+            bdestroy(src);
+        }
+        else if ((cachePool[i].associativity * cachePool[i].lineSize) != 0)
+        {
+            /* Derive the set count from the other cache parameters */
+            cachePool[i].sets = cachePool[i].size /
+                (cachePool[i].associativity * cachePool[i].lineSize);
+        }
+
+        src = proc_read_cache_attr(cpudir, i, "shared_cpu_list");
+        if (src != NULL)
+        {
+            /* Only count the hardware threads sharing this cache */
+            cachePool[i].threads = fillList(NULL, 0, src);
+            bdestroy(src);
+        }
+
+        switch ( cpuid_info.family )
+        {
+            case MIC_FAMILY:
+            case P6_FAMILY:
+                cachePool[i].inclusive = readCacheInclusiveIntel(cachePool[i].level);
+                break;
+            case K16_FAMILY:
+            case K15_FAMILY:
+                cachePool[i].inclusive = readCacheInclusiveAMD(cachePool[i].level);
+                break;
+            /* For K8 and K10 it is known that they are inclusive */
+            case K8_FAMILY:
+            case K10_FAMILY:
+                cachePool[i].inclusive = 1;
+                break;
+            default:
+                ERROR_PLAIN_PRINT(Processor is not supported);
+                break;
+        }
+    }
+    bdestroy(cpudir);
+    cpuid_topology.numCacheLevels = nrCaches;
+    cpuid_topology.cacheLevels = cachePool;
+    return;
+}
+
diff --git a/src/tree.c b/src/tree.c
index 795dd17..2ac8ab8 100644
--- a/src/tree.c
+++ b/src/tree.c
@@ -5,13 +5,13 @@
  *
  *      Description:  Module implementing a tree data structure
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.1
+ *      Released:  19.5.2016
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -34,12 +34,35 @@
 #include <error.h>
 #include <tree.h>
 
+/* #####   FUNCTION DEFINITIONS  -  INTERNAL FUNCTIONS   ################## */
+/*
+ * Recursively free everything reachable through the rlink and llink
+ * pointers of nodePtr (rlink side first). The node itself is not freed
+ * and its link pointers are not reset. A NULL node is ignored.
+ */
+void _tree_destroy(TreeNode* nodePtr)
+{
+    if (!nodePtr)
+    {
+        return;
+    }
+
+    /* The recursive call and free() are both no-ops for NULL links */
+    _tree_destroy(nodePtr->rlink);
+    free(nodePtr->rlink);
+
+    _tree_destroy(nodePtr->llink);
+    free(nodePtr->llink);
+}
+
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
 void
 tree_init(TreeNode** root, int id)
 {
     *root = (TreeNode*) malloc(sizeof(TreeNode));
+    if (!(*root))
+    {
+        *root = NULL;
+        return;
+    }
     (*root)->id = id;
     (*root)->llink = NULL;
     (*root)->rlink = NULL;
@@ -48,52 +71,71 @@ tree_init(TreeNode** root, int id)
 void
 tree_print(TreeNode* nodePtr)
 {
-    int level = 0;
+  int level = 0;
 
-    if (nodePtr != NULL)
+  if (nodePtr != NULL)
+  {
+
+    TreeNode* digger = NULL;
+    TreeNode* walker = NULL;
+
+    digger = nodePtr->llink;
+
+    while (digger != NULL)
     {
+      printf("\n Level %d:\n", level++);
+      printf("%d ", digger->id);
+      walker = digger->rlink;
 
-        TreeNode* digger;
-        TreeNode* walker;
+      while (walker != NULL)
+      {
+        printf("%d ", walker->id);
+        walker = walker->rlink;
+      }
 
-        digger = nodePtr->llink;
+      digger = digger->llink;
+    }
 
-        while (digger != NULL)
-        {
-            printf("\n Level %d:\n", level++);
-            printf("%d ", digger->id);
-            walker = digger->rlink;
+    printf("\n ");
+  }
+}
 
-            while (walker != NULL)
-            {
-            printf("%d ", walker->id);
-            walker = walker->rlink;
-            }
 
-            digger = digger->llink;
-        }
+void
+tree_destroy(TreeNode* nodePtr)
+{
 
-        printf("\n ");
+    if (nodePtr != NULL)
+    {
+        _tree_destroy(nodePtr);
+        free(nodePtr);
     }
 }
 
 void
 tree_insertNode(TreeNode* nodePtr, int id)
 {
-    TreeNode* currentNode;
-    TreeNode* tmpNode;
+    TreeNode* currentNode = NULL;
+    TreeNode* tmpNode = NULL;
+    TreeNode* newNode = NULL;
 
     if (nodePtr == NULL)
     {
         ERROR_PLAIN_PRINT(Node invalid);
     }
 
+    newNode = (TreeNode*) malloc(sizeof(TreeNode));
+    if (!newNode)
+    {
+        return;
+    }
+    newNode->id = id;
+    newNode->llink = NULL;
+    newNode->rlink = NULL;
+
     if (nodePtr->llink == NULL)
     {
-        nodePtr->llink = (TreeNode*) malloc(sizeof(TreeNode));
-        nodePtr->llink->id = id;
-        nodePtr->llink->llink = NULL;
-        nodePtr->llink->rlink = NULL;
+        nodePtr->llink = newNode;
     }
     else
     {
@@ -104,29 +146,21 @@ tree_insertNode(TreeNode* nodePtr, int id)
             if (id < currentNode->rlink->id)
             {
                 tmpNode = currentNode->rlink;
-                currentNode->rlink = (TreeNode*) malloc(sizeof(TreeNode));
-                currentNode->rlink->id = id;
-                currentNode->rlink->llink = NULL;
+                currentNode->rlink = newNode;
                 currentNode->rlink->rlink = tmpNode;
                 return;
             }
             currentNode = currentNode->rlink;
         }
 
-
         if (id > currentNode->id)
         {
-            currentNode->rlink = (TreeNode*) malloc(sizeof(TreeNode));
-            currentNode->rlink->id = id;
-            currentNode->rlink->llink = NULL;
-            currentNode->rlink->rlink = NULL;
+            currentNode->rlink = newNode;
         }
         else
         {
             tmpNode = currentNode;
-            nodePtr->llink = (TreeNode*) malloc(sizeof(TreeNode));
-            nodePtr->llink->id = id;
-            nodePtr->llink->llink = NULL;
+            nodePtr->llink = newNode;
             nodePtr->llink->rlink = tmpNode;
         }
     }
@@ -140,6 +174,7 @@ tree_nodeExists(TreeNode* nodePtr, int id)
     if (nodePtr == NULL)
     {
         ERROR_PLAIN_PRINT(Node invalid);
+        return 0;
     }
 
     walker = nodePtr->llink;
@@ -168,6 +203,7 @@ tree_countChildren(TreeNode* nodePtr)
     if (nodePtr == NULL)
     {
         ERROR_PLAIN_PRINT(Node invalid);
+        return 0;
     }
     if (nodePtr->llink == NULL)
     {
@@ -193,6 +229,7 @@ tree_getNode(TreeNode* nodePtr, int id)
     if (nodePtr == NULL)
     {
         ERROR_PLAIN_PRINT(Node invalid);
+        return NULL;
     }
     if (nodePtr->llink == NULL)
     {
@@ -222,6 +259,7 @@ tree_getChildNode(TreeNode* nodePtr)
     if (nodePtr == NULL)
     {
         ERROR_PLAIN_PRINT(Node invalid);
+        return NULL;
     }
     if (nodePtr->llink == NULL)
     {
diff --git a/test/MPI_pin_test.c b/test/MPI_pin_test.c
index 5624a95..f0e1271 100644
--- a/test/MPI_pin_test.c
+++ b/test/MPI_pin_test.c
@@ -1,15 +1,46 @@
 #include <stdio.h>
+#include <stdlib.h>
 #include <unistd.h>
 #include <mpi.h>
+#include <sys/types.h>
+#include <string.h>
+#include <sys/syscall.h>
+
 #ifdef _OPENMP
 extern int omp_get_num_threads();
 extern int omp_get_thread_num();
 #endif
 
-#include <affinity.h>
+#include <sched.h>
+
+/*
+ * Return the CPU number reported in /proc/self/stat, i.e. the field with
+ * index 38 (zero-based counting).
+ *
+ * Parsing starts after the last ')' because the second field (the command
+ * name in parentheses) may itself contain spaces. Returns -1 if the file
+ * cannot be opened or does not contain enough fields.
+ */
+int get_cpu_id()
+{
+    int i;
+    char buffer[8192];
+    char* field = NULL;
+    size_t nread = 0;
+    /* Get the current process' stat file from the proc filesystem */
+    FILE* procfile = fopen("/proc/self/stat", "r");
+    if (procfile == NULL)
+    {
+        return -1;
+    }
+    nread = fread(buffer, sizeof(char), sizeof(buffer) - 1, procfile);
+    fclose(procfile);
+    /* fread does not terminate the data, but strtok needs a C string */
+    buffer[nread] = '\0';
+
+    field = strrchr(buffer, ')');
+    if (field == NULL)
+    {
+        return -1;
+    }
+
+    /* The first token after the command name is field 2; advance to field 38 */
+    field = strtok(field + 1, " ");
+    for (i = 2; i < 38 && field != NULL; i++)
+    {
+        field = strtok(NULL, " ");
+    }
+    if (field == NULL)
+    {
+        return -1;
+    }
+    return atoi(field);
+}
 
+#define HOST_NAME_MAX 1024
 #define MASTER(msg) \
     if (rank == 0)  printf(#msg "\n")
+#define gettid() (int)syscall(SYS_gettid)
 
 main(int argc, char **argv)
 {
@@ -19,27 +50,31 @@ main(int argc, char **argv)
     MPI_Init(&argc,&argv);
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
     host = (char*) malloc(HOST_NAME_MAX * sizeof(char));
-    gethostname(host,HOST_NAME_MAX);
+    gethostname(host, HOST_NAME_MAX);
 
     MASTER(MPI started);
     MPI_Barrier(MPI_COMM_WORLD);
-    printf("Process with rank %d running on Node %s Core %d\n",rank ,host, likwid_getProcessorId());
-    fflush(stdout);
+    printf("Process with rank %d running on Node %s Core %d/%d\n",rank ,host, sched_getcpu(),get_cpu_id());
     MPI_Barrier(MPI_COMM_WORLD);
 
     MASTER(Enter OpenMP parallel region);
     MPI_Barrier(MPI_COMM_WORLD);
 #pragma omp parallel
     {
-        int coreId = likwid_getProcessorId();
+#pragma omp master
+        {
+            pid_t pid = getppid();
+            char cmd[1024];
+            sprintf(cmd, "pstree -p -H %d %d",pid, pid);
+            system(cmd);
+        }
 #pragma omp critical
         {
-            printf ("Rank %d Thread %d running on core %d \n",rank,omp_get_thread_num(), coreId);
-            fflush(stdout);
+            printf ("Rank %d Thread %d running on core %d/%d with pid %d and tid %d\n",rank,omp_get_thread_num(), sched_getcpu(),get_cpu_id(), getpid(),gettid());
         }
-    }
 
-    sleep(2);
+    }
 
+    free(host);
     MPI_Finalize();
 }
diff --git a/test/Makefile b/test/Makefile
index 56fece1..1209136 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -1,29 +1,66 @@
-LIKWID_LIB = -L../ -llikwid 
-INCLUDES = -I../src/includes -I../ -I../MIC
+include ../config.mk
 
-all:  testmarker testmarkerF90 stream streamM
+LIKWID_LIB ?= -L$(PREFIX)/lib -llikwid
+LIKWID_INCLUDE ?= -I$(PREFIX)/include
+# Several rules in this Makefile reference $(LIKWID_INCLUDES); alias it so they get the include path too
+LIKWID_INCLUDES ?= $(LIKWID_INCLUDE)
+LIKWID_DEFINES ?= -DLIKWID_PERFMON
 
-testmarkerF90: chaos.F90
-	ifort $(INCLUDES)  -O3  -o $@ chaos.F90 $(LIKWID_LIB) -lpthread
+all:  streamGCC
+
+GCC_C11_SUPPORT_MAJOR=$(shell /bin/bash -c "g++ -v 2>&1 | grep -o -E '([0-9])\.' | head -n 1 | tr -d '[:punct:]'")
+GCC_C11_SUPPORT_MINOR=$(shell /bin/bash -c "g++ -v 2>&1 | grep -o -E '\.([0-9])\.' | head -n 1 | tr -d '[:punct:]'")
+ICC_AVAILABLE=$(shell /bin/bash -c "which icc | wc -l")
+ICPC_AVAILABLE=$(shell /bin/bash -c "which icpc | wc -l")
+TBB_AVAILABLE=$(shell /bin/bash -c "ldconfig -v 2>/dev/null | grep libtbb.so | wc -l")
+
+streamGCC: stream.c
+	gcc -O3 -std=c99 $(LIKWID_INCLUDE) $(LIKWID_DEFINES) -fopenmp  -o $@  stream.c $(LIKWID_LIB) -lm
+
+streamAPIGCC: stream-API.c
+	gcc -O3 -std=c99 $(LIKWID_INCLUDE) -fopenmp -ftree-vectorize -ffast-math -o $@  stream-API.c $(LIKWID_LIB) -lm
+
+serial: serial.c
+	gcc -O3 -std=c99 $(LIKWID_INCLUDE) $(LIKWID_DEFINES) -o $@  serial.c $(LIKWID_LIB) -lm
+
+test-likwidAPI: test-likwidAPI.c
+	gcc -O3 -std=c99 $(LIKWID_INCLUDE) $(LIKWID_DEFINES) -o $@  test-likwidAPI.c $(LIKWID_LIB) -lm
+
+test-msr-access: test-msr-access.c
+	gcc -o $@  test-msr-access.c
 
-stream: stream.c
-	icc -O3 $(INCLUDES) -mmic  -openmp  -o $@  -DLIKWID_PERFMON stream.c $(LIKWID_LIB) -lm
+streamICC: stream.c
+	if [ $(ICC_AVAILABLE) -ne 0 ]; then icc -O3 -xHost -std=c99 $(LIKWID_INCLUDES) -openmp  -o $@  $(LIKWID_DEFINES) stream.c $(LIKWID_LIB) -lm; fi
+
+streamGCC_C11: stream.cc
+	@if [ $(GCC_C11_SUPPORT_MAJOR) -eq 4  -a  $(GCC_C11_SUPPORT_MINOR) -gt 8 ]; then g++ -O3 -std=c++11 -pthread -o $@ $(LIKWID_DEFINES) stream.cc $(LIKWID_LIB) -lm; fi
+	@if [ $(GCC_C11_SUPPORT_MAJOR) -gt 4 ]; then g++ -O3 -std=c++11 -pthread -o $@ $(LIKWID_DEFINES) stream.cc $(LIKWID_LIB) -lm; fi
+
+streamICC_C11: stream.cc
+	@if [ $(ICPC_AVAILABLE) -ne 0 ]; then icpc -restrict -O3 -std=c++11 -pthread -o $@ $(LIKWID_DEFINES) stream.cc $(LIKWID_LIB) -lm; fi
+
+testmarker-cnt: testmarker-cnt.c
+	gcc -O3 -std=c99  $(LIKWID_INCLUDES) -fopenmp $(LIKWID_DEFINES) -o $@ testmarker-cnt.c $(LIKWID_LIB) -lm
+
+testmarker-omp: testmarker-omp.c
+	gcc -O3 -std=c99  $(LIKWID_INCLUDES) -fopenmp $(LIKWID_DEFINES) -o $@ testmarker-omp.c $(LIKWID_LIB)
+
+testmarkerF90: chaos.F90
+	ifort $(LIKWID_INCLUDES) $(LIKWID_DEFINES) -O3  -o $@ chaos.F90 $(LIKWID_LIB) -lpthread
 
-streamM: stream.c
-	gcc -O3 $(INCLUDES) -fopenmp  -o $@  -DLIKWID_PERFMON stream.c $(LIKWID_LIB) -lm
+# Build the MPI/OpenMP pinning test with the mpicc compiler wrapper
+test-mpi: MPI_pin_test.c
+	mpicc -O2 -fopenmp -D_GNU_SOURCE  -o $@ MPI_pin_test.c
 
-testmarker:
-	gcc -O3 -std=c99  $(INCLUDES) -fopenmp -DLIKWID_PERFMON  -o $@ testmarker-cnt.c $(LIKWID_LIB) -lm
+stream_cilk: stream_cilk.c
+	@if [ $(ICC_AVAILABLE) -ne 0 ]; then icc -O3 $(LIKWID_DEFINES) $(LIKWID_INCLUDES) -o $@ stream_cilk.c $(LIKWID_LIB); fi
 
-testmarker-omp:
-	gcc -O3 -std=c99  $(INCLUDES) -openmp -DLIKWID_PERFMON  -o $@ testmarker-omp.c $(LIKWID_LIB)
+testTBBGCC:
+	@if [ $(TBB_AVAILABLE) -ne 0 ]; then g++ -O3 $(LIKWID_DEFINES) $(LIKWID_INCLUDES) -o $@ testTBB.cc -ltbb $(LIKWID_LIB); fi
 
-test-mpi:
-	mpicc -DMAX_NUM_THREADS=128 -O2 -openmp -I../src/includes  -I../GCC -D_GNU_SOURCE  -o $@ MPI_pin_test.c $(LIKWID_LIB)
+testTBBICC:
+	@if [ $(TBB_AVAILABLE) -ne 0 -a $(ICPC_AVAILABLE) -ne 0 ]; then icpc -O3 $(LIKWID_DEFINES) $(LIKWID_INCLUDES) -o $@ testTBB.cc -ltbb $(LIKWID_LIB); else echo "Either TBB or ICPC missing"; fi
 
-.PHONY: clean
+.PHONY: clean streamGCC streamICC streamGCC_C11 streamICC_C11 testmarker-cnt testmarker-omp testmarkerF90 test-mpi stream_cilk serial test-likwidAPI streamAPIGCC test-msr-access testTBBGCC testTBBICC
 
-clean: 
-	rm -f stream streamM  testmarker testmarkerF90
+clean:
+	rm -f streamGCC streamICC streamGCC_C11 streamICC_C11 stream_cilk testmarker-cnt testmarkerF90 test-mpi testmarker-omp serial test-likwidAPI streamAPIGCC test-msr-access testTBBGCC testTBBICC
 
 
diff --git a/test/accuracy/Makefile b/test/accuracy/Makefile
index f84b1cd..0740b0c 100644
--- a/test/accuracy/Makefile
+++ b/test/accuracy/Makefile
@@ -1,25 +1,30 @@
 LIKWID_PATH=../..
+LIKWID_BENCH_PATH=../../bench
 LIKWID_APP=likwid-bench
 HOST=$(shell hostname -s)
 
 
-all: plain marker
-
-plain:
-	sed -i -e s/'INSTRUMENT_BENCH = .*#'/'INSTRUMENT_BENCH = false#'/g $(LIKWID_PATH)/config.mk
-	sed -i -e s/'CPPFLAGS := -DPAPI '/'CPPFLAGS := '/g $(LIKWID_PATH)/Makefile
-	cd $(LIKWID_PATH) && make distclean && make
-	cp $(LIKWID_PATH)/$(LIKWID_APP) $(LIKWID_APP)-plain
+all: clean marker localize_likwid
 
 marker:
-	sed -i -e s/'INSTRUMENT_BENCH = .*#'/'INSTRUMENT_BENCH = true#'/g $(LIKWID_PATH)/config.mk
-	sed -i -e s/'CPPFLAGS := -DPAPI '/'CPPFLAGS := '/g $(LIKWID_PATH)/Makefile
-	cd $(LIKWID_PATH) && make distclean && make
-	cp $(LIKWID_PATH)/$(LIKWID_APP) $(LIKWID_APP)-marker
+	@echo "===> Building instrumented likwid-bench"
+	@sed -i -e s/'INSTRUMENT_BENCH = .*#'/'INSTRUMENT_BENCH = true#'/g $(LIKWID_PATH)/config.mk
+	@sed -i -e s/'CPPFLAGS := -DPAPI '/'CPPFLAGS := '/g $(LIKWID_PATH)/Makefile
+	@cd $(LIKWID_PATH) && make distclean >/dev/null && make >/dev/null 2>/dev/null
+	@cp $(LIKWID_BENCH_PATH)/$(LIKWID_APP) $(LIKWID_APP)-marker
+
 papi:
-	sed -i -e s/'INSTRUMENT_BENCH = .*#'/'INSTRUMENT_BENCH = false#'/g $(LIKWID_PATH)/config.mk
-	cp $(LIKWID_PATH)/Makefile $(LIKWID_PATH)/Makefile.orig
-	sed -i -e s/'CPPFLAGS := '/'CPPFLAGS := -DPAPI '/g $(LIKWID_PATH)/Makefile
-	cd $(LIKWID_PATH) && make distclean && make
-	cp $(LIKWID_PATH)/$(LIKWID_APP) $(LIKWID_APP)-papi
-	mv $(LIKWID_PATH)/Makefile.orig $(LIKWID_PATH)/Makefile
+	@echo "===> Building instrumented likwid-bench using PAPI API"
+	@sed -i -e s/'INSTRUMENT_BENCH = .*#'/'INSTRUMENT_BENCH = false#'/g $(LIKWID_PATH)/config.mk
+	@cp $(LIKWID_PATH)/Makefile $(LIKWID_PATH)/Makefile.orig
+	@sed -i -e s/'CPPFLAGS := '/'CPPFLAGS := -DPAPI '/g $(LIKWID_PATH)/Makefile
+	@cd $(LIKWID_PATH) && make distclean >/dev/null && make >/dev/null 2>/dev/null
+	@cp $(LIKWID_BENCH_PATH)/$(LIKWID_APP) $(LIKWID_APP)-papi
+	@mv $(LIKWID_PATH)/Makefile.orig $(LIKWID_PATH)/Makefile
+
+localize_likwid:
+	@cd $(LIKWID_PATH) && make local >/dev/null && cd - >/dev/null
+
+clean:
+	@echo "===> Cleaning old likwid-bench executables"
+	@rm -f $(LIKWID_APP)-plain $(LIKWID_APP)-marker $(LIKWID_APP)-papi
diff --git a/test/accuracy/README b/test/accuracy/README
index 9dd8a78..6baaa01 100644
--- a/test/accuracy/README
+++ b/test/accuracy/README
@@ -1,6 +1,6 @@
 LIKWID accuracy tester
 
-likwid-tester and likwid-tester-plot are test applications written in Perl. The likwid-accuracy.py application does the same but is written in Python.
+The likwid-accuracy.py application tests the accuracy of LIKWID's measurements. The tool is written in Python. The measurements are compared to an instrumented version of likwid-bench. By scaling the calculated likwid-bench results, it also takes write-allocates into account.
 
 Usage:
 make #build non-instrumentated and LIKWID-instrumentated versions of
@@ -9,10 +9,11 @@ Adjust test files in TESTS.
 Adjust test set file SET.txt or use the -s/--sets switch on commandline.
 likwid-accuracy.py #Runs the tests of all sets and saves results in folder RESULTS/<hostname>
 
+You should use some plotting option on the commandline.
+
 Options for likwid-accuracy.py:
 --pgf: Create a TeX file containing the definition of a PGF plot with suffix .tex -> .pdf
 --grace: Create grace batch file for further manipulation with XMgrace or create plot with gracebat .agr/.bat -> .png
 --gnuplot: Create GNUplot script .plot -> .jpg
---script: Create a Bash script containing all commands to create all plots using pdflatex, gracebat and gnuplot.
+--script: Create a Bash script containing all commands to create all plots using pdflatex, gracebat and/or gnuplot.
 --scriptname: Set name for Bash script, default is $CWD/create_plots.sh
---wiki/--only_wiki: Create a Wiki page for the Google Code Wiki including the .png pics found in Google Code Wiki picture path (http://<project>.googlecode.com/svn/wiki/images). 
diff --git a/test/accuracy/TESTS/BRANCH.txt b/test/accuracy/TESTS/BRANCH.txt
new file mode 100644
index 0000000..11efe50
--- /dev/null
+++ b/test/accuracy/TESTS/BRANCH.txt
@@ -0,0 +1,42 @@
+REGEX_BENCH NOTHING
+REGEX_PERF \|\s+Instructions per branch\s+\|\s+([0-9\.e\+\-]+)
+
+TEST load
+RUNS 5
+WA_FACTOR 11.0
+VARIANT 12kB 20000
+VARIANT 1MB 10000
+VARIANT  4MB 7500
+VARIANT  1GB 50
+
+TEST triad
+RUNS 5
+WA_FACTOR 19.0
+VARIANT 12kB 20000
+VARIANT 1MB 10000
+VARIANT  4MB 7500
+VARIANT  1GB 50
+
+TEST copy
+RUNS 5
+WA_FACTOR 11.0
+VARIANT 12kB 20000
+VARIANT 1MB 10000
+VARIANT  4MB 7500
+VARIANT  1GB 50
+
+TEST stream
+RUNS 5
+WA_FACTOR 19.0
+VARIANT 12kB 20000
+VARIANT 1MB 10000
+VARIANT  4MB 7500
+VARIANT  1GB 50
+
+TEST store
+RUNS 5
+WA_FACTOR 7.0
+VARIANT 12kB 20000
+VARIANT 1MB 10000
+VARIANT  4MB 7500
+VARIANT  1GB 50
diff --git a/test/accuracy/TESTS/CLOCK.txt b/test/accuracy/TESTS/CLOCK.txt
new file mode 100644
index 0000000..3ee855c
--- /dev/null
+++ b/test/accuracy/TESTS/CLOCK.txt
@@ -0,0 +1,53 @@
+REGEX_BENCH Instructions:\s+([0-9]+)
+REGEX_PERF \|\s+INSTR_RETIRED_ANY\s+\|\s+FIXC0\s+\|\s+([0-9\.e\+\-]+)
+
+
+TEST daxpy
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
+
+
+TEST ddot
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
+
+TEST copy
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
+
+TEST load
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
+
+TEST store
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
+
+TEST stream
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
+
+TEST triad
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
diff --git a/test/accuracy/TESTS/DATA.txt b/test/accuracy/TESTS/DATA.txt
new file mode 100644
index 0000000..454d10b
--- /dev/null
+++ b/test/accuracy/TESTS/DATA.txt
@@ -0,0 +1,34 @@
+REGEX_BENCH NOTHING
+REGEX_PERF \|\s+Load to store ratio\s+\|\s+([0-9\.e\+\-]+)
+
+TEST store
+RUNS 5
+WA_FACTOR 0.0
+VARIANT 12kB 20000
+VARIANT 1MB 10000
+VARIANT  4MB 7500
+VARIANT  1GB 50
+
+TEST copy
+RUNS 5
+WA_FACTOR 1.0
+VARIANT 12kB 20000
+VARIANT 1MB 10000
+VARIANT  4MB 7500
+VARIANT  1GB 50
+
+TEST stream
+RUNS 5
+WA_FACTOR 2.0
+VARIANT 12kB 20000
+VARIANT 1MB 10000
+VARIANT  4MB 7500
+VARIANT  1GB 50
+
+TEST triad
+RUNS 5
+WA_FACTOR 3.0
+VARIANT 12kB 20000
+VARIANT 1MB 10000
+VARIANT  4MB 7500
+VARIANT  1GB 50
diff --git a/test/accuracy/TESTS/FLOPS_AVX.txt b/test/accuracy/TESTS/FLOPS_AVX.txt
index f5ce80e..7c2ea39 100644
--- a/test/accuracy/TESTS/FLOPS_AVX.txt
+++ b/test/accuracy/TESTS/FLOPS_AVX.txt
@@ -1,5 +1,13 @@
 REGEX_BENCH MFlops\/s:\s+([0-9]+)
-REGEX_PERF \|\s+DP MFlops\/s\s+\|\s+([0-9]+)
+REGEX_PERF \|\s+Packed DP MFLOP\/s\s+\|\s+([0-9\.e\+\-]+)
+
+
+TEST triad_avx
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
 
 TEST stream_avx
 RUNS 10
@@ -8,10 +16,23 @@ VARIANT 128kB 10000
 VARIANT  2MB 5000
 VARIANT  1GB 50
 
-TEST triad_avx
+TEST daxpy_avx
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST ddot_avx
 RUNS 10
 VARIANT 24kB 20000
 VARIANT 128kB 10000
 VARIANT  2MB 5000
 VARIANT  1GB 50
 
+TEST sum_avx
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
diff --git a/test/accuracy/TESTS/FLOPS_DP.txt b/test/accuracy/TESTS/FLOPS_DP.txt
index da6f8be..810308b 100644
--- a/test/accuracy/TESTS/FLOPS_DP.txt
+++ b/test/accuracy/TESTS/FLOPS_DP.txt
@@ -1,5 +1,5 @@
 REGEX_BENCH MFlops\/s:\s+([0-9]+)
-REGEX_PERF \|\s+MFlops\/s\s+\|\s+([0-9]+)
+REGEX_PERF \|\s+MFLOP\/s\s+\|\s+([0-9\.e\+\-]+)
 
 TEST stream
 RUNS 10
@@ -9,6 +9,22 @@ VARIANT 128kB 10000
 VARIANT  2MB 5000
 VARIANT  1GB 50
 
+TEST stream_sse
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST stream_avx
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
 TEST triad
 RUNS 10
 VARIANT 12kB 20000
@@ -17,3 +33,90 @@ VARIANT 128kB 10000
 VARIANT  2MB 5000
 VARIANT  1GB 50
 
+TEST triad_sse
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST triad_avx
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST daxpy
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST daxpy_sse
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST daxpy_avx
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST ddot
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST ddot_sse
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST ddot_avx
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST sum
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST sum_sse
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST sum_avx
+RUNS 10
+VARIANT 12kB 20000
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
diff --git a/test/accuracy/TESTS/FLOPS_SP.txt b/test/accuracy/TESTS/FLOPS_SP.txt
index 3bad7d7..72f2a62 100644
--- a/test/accuracy/TESTS/FLOPS_SP.txt
+++ b/test/accuracy/TESTS/FLOPS_SP.txt
@@ -1,5 +1,26 @@
 REGEX_BENCH MFlops\/s:\s+([0-9]+)
-REGEX_PERF \|\s+MFlops\/s\s+\|\s+([0-9]+)
+REGEX_PERF \|\s+MFLOP\/s\s+\|\s+([0-9\.e\+\-]+)
+
+TEST sum_sp
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST sum_sp_sse
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST sum_sp_avx
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
 
 TEST stream_sp
 RUNS 10
@@ -8,6 +29,20 @@ VARIANT 128kB 10000
 VARIANT  2MB 5000
 VARIANT  1GB 50
 
+TEST stream_sp_sse
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST stream_sp_avx
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
 TEST triad_sp
 RUNS 10
 VARIANT 24kB 20000
@@ -15,3 +50,58 @@ VARIANT 128kB 10000
 VARIANT  2MB 5000
 VARIANT  1GB 50
 
+TEST triad_sp_sse
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST triad_sp_avx
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST ddot_sp
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST ddot_sp_sse
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST ddot_sp_avx
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST daxpy_sp
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST daxpy_sp_sse
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
+
+TEST daxpy_sp_avx
+RUNS 10
+VARIANT 24kB 20000
+VARIANT 128kB 10000
+VARIANT  2MB 5000
+VARIANT  1GB 50
diff --git a/test/accuracy/TESTS/HA.txt b/test/accuracy/TESTS/HA.txt
new file mode 100644
index 0000000..037c980
--- /dev/null
+++ b/test/accuracy/TESTS/HA.txt
@@ -0,0 +1,58 @@
+REGEX_BENCH MByte\/s:\s+([0-9]+)
+REGEX_PERF \|\s+Memory bandwidth \[MBytes\/s\]\s+\|\s+([0-9\.e\+\-]+)
+
+TEST load
+RUNS 10
+WA_FACTOR 1.0
+VARIANT 228863kB 1000
+VARIANT 435096kB 1000
+VARIANT 641329kB 1000
+VARIANT 847562kB 1000
+
+TEST store
+RUNS 10
+WA_FACTOR 2.0
+VARIANT 228863kB 1000
+VARIANT 435096kB 1000
+VARIANT 641329kB 1000
+VARIANT 847562kB 1000
+
+TEST copy
+RUNS 10
+WA_FACTOR 1.5
+VARIANT 228863kB 1000
+VARIANT 435096kB 1000
+VARIANT 641329kB 1000
+VARIANT 847562kB 1000
+
+TEST stream
+RUNS 10
+WA_FACTOR 1.3333
+VARIANT 228863kB 1000
+VARIANT 435096kB 1000
+VARIANT 641329kB 1000
+VARIANT 847562kB 1000
+
+TEST triad
+RUNS 10
+WA_FACTOR 1.25
+VARIANT 228863kB 1000
+VARIANT 435096kB 1000
+VARIANT 641329kB 1000
+VARIANT 847562kB 1000
+
+TEST daxpy
+RUNS 10
+WA_FACTOR 1.0
+VARIANT 228863kB 1000
+VARIANT 435096kB 1000
+VARIANT 641329kB 1000
+VARIANT 847562kB 1000
+
+TEST ddot
+RUNS 10
+WA_FACTOR 1.0
+VARIANT 228863kB 1000
+VARIANT 435096kB 1000
+VARIANT 641329kB 1000
+VARIANT 847562kB 1000
diff --git a/test/accuracy/TESTS/L2.txt b/test/accuracy/TESTS/L2.txt
index 35b2bea..6924c89 100644
--- a/test/accuracy/TESTS/L2.txt
+++ b/test/accuracy/TESTS/L2.txt
@@ -1,38 +1,58 @@
 REGEX_BENCH MByte\/s:\s+([0-9]+)
-REGEX_PERF \|\s+L2 bandwidth \[MBytes\/s\]\s+\|\s+([0-9]+)
+REGEX_PERF \|\s+L2 bandwidth \[MBytes\/s\]\s+\|\s+([0-9\.e\+\-]+)
 
 TEST load
 RUNS 5
-VARIANT 12kB 20000
-VARIANT 1MB 10000
-VARIANT  4MB 7500
-VARIANT  1GB 50
+WA_FACTOR 1.0
+VARIANT 85kB 1000
+VARIANT 129kB 1000
+VARIANT 173kB 1000
+VARIANT 217kB 1000
 
 TEST store
 RUNS 5
-VARIANT 12kB 20000
-VARIANT 1MB 10000
-VARIANT  4MB 7500
-VARIANT  1GB 50
+WA_FACTOR 2.0
+VARIANT 85kB 1000
+VARIANT 129kB 1000
+VARIANT 173kB 1000
+VARIANT 217kB 1000
 
 TEST copy
 RUNS 5
-VARIANT 12kB 20000
-VARIANT 1MB 10000
-VARIANT  4MB 7500
-VARIANT  1GB 50
+WA_FACTOR 1.5
+VARIANT 85kB 1000
+VARIANT 129kB 1000
+VARIANT 173kB 1000
+VARIANT 217kB 1000
 
 TEST stream
 RUNS 5
-VARIANT 12kB 20000
-VARIANT 1MB 10000
-VARIANT  4MB 7500
-VARIANT  1GB 50
+WA_FACTOR 1.3333
+VARIANT 85kB 1000
+VARIANT 129kB 1000
+VARIANT 173kB 1000
+VARIANT 217kB 1000
 
 TEST triad
 RUNS 5
-VARIANT 12kB 20000
-VARIANT 1MB 10000
-VARIANT  4MB 7500
-VARIANT  1GB 50
+WA_FACTOR 1.25
+VARIANT 85kB 1000
+VARIANT 129kB 1000
+VARIANT 173kB 1000
+VARIANT 217kB 1000
 
+TEST daxpy
+RUNS 5
+WA_FACTOR 1.0
+VARIANT 85kB 1000
+VARIANT 129kB 1000
+VARIANT 173kB 1000
+VARIANT 217kB 1000
+
+TEST ddot
+RUNS 5
+WA_FACTOR 1.0
+VARIANT 85kB 1000
+VARIANT 129kB 1000
+VARIANT 173kB 1000
+VARIANT 217kB 1000
diff --git a/test/accuracy/TESTS/L3.txt b/test/accuracy/TESTS/L3.txt
index 8ff6c62..a124cdb 100644
--- a/test/accuracy/TESTS/L3.txt
+++ b/test/accuracy/TESTS/L3.txt
@@ -1,38 +1,58 @@
 REGEX_BENCH MByte\/s:\s+([0-9]+)
-REGEX_PERF \|\s+L3 bandwidth \[MBytes\/s\]\s+\|\s+([0-9]+)
+REGEX_PERF \|\s+L3 bandwidth \[MBytes\/s\]\s+\|\s+([0-9\.e\+\-]+)
 
 TEST load
 RUNS 5
-VARIANT 12kB 20000
-VARIANT 1MB 10000
-VARIANT  4MB 7500
-VARIANT  1GB 50
+WA_FACTOR 1.0
+VARIANT 5810kB 1000
+VARIANT 11288kB 1000
+VARIANT 16766kB 1000
+VARIANT 22244kB 1000
 
 TEST store
 RUNS 5
-VARIANT 12kB 20000
-VARIANT 1MB 10000
-VARIANT  4MB 2000
-VARIANT  1GB 50
+WA_FACTOR 2.0
+VARIANT 5810kB 1000
+VARIANT 11288kB 1000
+VARIANT 16766kB 1000
+VARIANT 22244kB 1000
 
 TEST copy
 RUNS 5
-VARIANT 12kB 20000
-VARIANT 1MB 10000
-VARIANT  4MB 2000
-VARIANT  1GB 50
+WA_FACTOR 1.5
+VARIANT 5810kB 1000
+VARIANT 11288kB 1000
+VARIANT 16766kB 1000
+VARIANT 22244kB 1000
 
 TEST stream
 RUNS 5
-VARIANT 12kB 20000
-VARIANT 1MB 10000
-VARIANT  4MB 2000
-VARIANT  1GB 50
+WA_FACTOR 1.333
+VARIANT 5810kB 1000
+VARIANT 11288kB 1000
+VARIANT 16766kB 1000
+VARIANT 22244kB 1000
 
 TEST triad
 RUNS 5
-VARIANT 12kB 20000
-VARIANT 1MB 10000
-VARIANT  4MB 2000
-VARIANT  1GB 50
+WA_FACTOR 1.333
+VARIANT 5810kB 1000
+VARIANT 11288kB 1000
+VARIANT 16766kB 1000
+VARIANT 22244kB 1000
 
+TEST daxpy
+RUNS 5
+WA_FACTOR 1.0
+VARIANT 5810kB 1000
+VARIANT 11288kB 1000
+VARIANT 16766kB 1000
+VARIANT 22244kB 1000
+
+TEST ddot
+RUNS 5
+WA_FACTOR 1.0
+VARIANT 5810kB 1000
+VARIANT 11288kB 1000
+VARIANT 16766kB 1000
+VARIANT 22244kB 1000
diff --git a/test/accuracy/TESTS/MEM.txt b/test/accuracy/TESTS/MEM.txt
index 09993f6..71288a4 100644
--- a/test/accuracy/TESTS/MEM.txt
+++ b/test/accuracy/TESTS/MEM.txt
@@ -1,38 +1,58 @@
 REGEX_BENCH MByte\/s:\s+([0-9]+)
-REGEX_PERF \|\s+Memory bandwidth \[MBytes\/s\]\s+\|\s+([0-9]+)
+REGEX_PERF \|\s+Memory bandwidth \[MBytes\/s\]\s+\|\s+([0-9\.e\+\-]+)
 
 TEST load
 RUNS 10
-VARIANT 24kB 20000
-VARIANT 128kB 10000
-VARIANT  2MB 7500
-VARIANT  1GB 50
+WA_FACTOR 1.0
+VARIANT 218725kB 1000
+VARIANT 426801kB 1000
+VARIANT 634877kB 1000
+VARIANT 842953kB 1000
 
 TEST store
 RUNS 10
-VARIANT 24kB 20000
-VARIANT 128kB 10000
-VARIANT  2MB 7500
-VARIANT  1GB 50
+WA_FACTOR 2.0
+VARIANT 218725kB 1000
+VARIANT 426801kB 1000
+VARIANT 634877kB 1000
+VARIANT 842953kB 1000
 
 TEST copy
 RUNS 10
-VARIANT 24kB 20000
-VARIANT 128kB 10000
-VARIANT  2MB 7500
-VARIANT  1GB 50
+WA_FACTOR 1.5
+VARIANT 218725kB 1000
+VARIANT 426801kB 1000
+VARIANT 634877kB 1000
+VARIANT 842953kB 1000
 
 TEST stream
 RUNS 10
-VARIANT 24kB 20000
-VARIANT 128kB 10000
-VARIANT  2MB 7500
-VARIANT  1GB 50
+WA_FACTOR 1.3333
+VARIANT 218725kB 1000
+VARIANT 426801kB 1000
+VARIANT 634877kB 1000
+VARIANT 842953kB 1000
 
 TEST triad
 RUNS 10
-VARIANT 24kB 20000
-VARIANT 128kB 10000
-VARIANT  2MB 7500
-VARIANT  1GB 50
+WA_FACTOR 1.25
+VARIANT 218725kB 1000
+VARIANT 426801kB 1000
+VARIANT 634877kB 1000
+VARIANT 842953kB 1000
 
+TEST daxpy
+RUNS 10
+WA_FACTOR 1.0
+VARIANT 218725kB 1000
+VARIANT 426801kB 1000
+VARIANT 634877kB 1000
+VARIANT 842953kB 1000
+
+TEST ddot
+RUNS 10
+WA_FACTOR 1.0
+VARIANT 218725kB 1000
+VARIANT 426801kB 1000
+VARIANT 634877kB 1000
+VARIANT 842953kB 1000
diff --git a/test/accuracy/TESTS/UOPS.txt b/test/accuracy/TESTS/UOPS.txt
new file mode 100644
index 0000000..1ebb4fe
--- /dev/null
+++ b/test/accuracy/TESTS/UOPS.txt
@@ -0,0 +1,30 @@
+REGEX_BENCH UOPs:\s+([0-9]+)
+REGEX_PERF \|\s+Retired UOPs\s+\|\s+([0-9\.e\+\-]+)
+
+TEST ddot
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
+
+TEST stream
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
+
+TEST daxpy
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
+
+TEST triad
+RUNS 5
+VARIANT 223231kB 1000
+VARIANT 430488kB 1000
+VARIANT 637745kB 1000
+VARIANT 845002kB 1000
diff --git a/test/accuracy/likwid-accuracy.py b/test/accuracy/likwid-accuracy.py
index 3d2d63c..916ed38 100755
--- a/test/accuracy/likwid-accuracy.py
+++ b/test/accuracy/likwid-accuracy.py
@@ -16,6 +16,8 @@ bench_marker = "./likwid-bench-marker"
 bench_papi = "./likwid-bench-papi"
 perfctr = "../../likwid-perfctr"
 topology = "../../likwid-topology"
+topology_name = re.compile("^CPU name:\s+(.*)")
+topology_stepping = re.compile("^CPU stepping:\s+(\d*)")
 topology_type = re.compile("^CPU type:\s+(.*)")
 topology_sockets = re.compile("^Sockets:\s+(\d+)")
 topology_corespersocket = re.compile("^Cores per socket:\s+(\d+)")
@@ -24,10 +26,18 @@ testlist = "SET.txt"
 testfolder = "TESTS"
 resultfolder = "RESULTS"
 hostname = socket.gethostname()
-picture_base = "http://likwid.googlecode.com/svn/wiki/images"
+picture_base = ".."
+topology_outputfile = "topology.dat"
+nrThreads = 1
 
-gnu_colors = ["red","blue","green"]#,"black","brown", "gray","violet", "cyan", "magenta","orange","#4B0082","#800000","turquoise","#006400","yellow"]
-gnu_marks = [5,13,9]#,2,3,4,6,7,8,9,10,11,12,14,15]
+gnu_colors = ["red","blue","green","black"]#,"brown", "gray","violet", "cyan", "magenta","orange","#4B0082","#800000","turquoise","#006400","yellow"]
+gnu_marks = [5,13,9,2]#,3,4,6,7,8,9,10,11,12,14,15]
+
+units = { "L2" : "MByte/s", "L3" : "MByte/s", "MEM" : "MByte/s", "HA" : "MByte/s",
+          "FLOPS_SP" : "MFLOP/s", "FLOPS_DP" : "MFLOP/s", "FLOPS_AVX" : "MFLOP/s",
+          "DATA": "Load/Store ratio", "BRANCH" : "Instructions per branch",
+          "CLOCK" : "Instructions", "UOPS" : "UOPs"}
+translate_group = {"CLOCK" : "INST_RETIRED_ANY", "UOPS" : "UOPS_RETIRED_ANY"}
 
 wiki = False
 papi = False
@@ -38,14 +48,28 @@ out_gnuplot = False
 out_grace = False
 scriptfilename = "create_plots.sh"
 out_script = False
+test_set = {}
+plain_set = {}
+corrected_set = {}
+marker_set = {}
+papi_set = {}
+
+if not os.path.exists(bench_marker):
+    print "Please run make before using likwid-accuracy.py"
+    sys.exit(1)
+if not os.path.exists(perfctr):
+    print "Cannot find likwid-perfctr"
+    sys.exit(1)
+
 
 def usage():
     print "Execute and evaluate accuracy tests for LIKWID with likwid-bench and likwid-perfctr"
     print
     print "-h/--help:\tPrint this help text"
     print "-s/--sets:\tSpecifiy testgroups (comma separated). Can also be set in SET.txt"
-    print "--wiki:\t\tBesides testing write out results in Google code wiki syntax"
-    print "--only_wiki:\tDo not run benchmarks, read results from file and write out results in Google code wiki syntax"
+#    print "--wiki:\t\tBesides testing write out results in Google code wiki syntax"
+#    print "--only_wiki:\tDo not run benchmarks, read results from file and write out results in Google code wiki syntax"
+    print "-c <nrThreads>:\tSet number of threads. The accuracy tool uses the E notation of likwid like E:N:<nrThreads>:1:2. Default is 1 thread."
     print "Picture options:"
     print "--pgf:\t\tCreate TeX document for each test with PGFPlot"
     print "--gnuplot:\tCreate GNUPlot script for each test"
@@ -53,30 +77,6 @@ def usage():
     print "--script:\tActivate recording of commands in a bash script"
     print "--scriptname:\tRecord commands to create pictures in file (default: %s)" % (os.path.join(os.path.join(resultfolder,hostname),scriptfilename))
 
-def get_system_info():
-    name = None
-    sockets = 0
-    corespersocket = 0
-    threadspercore = 0
-    
-    p = subprocess.Popen(topology, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-    p.wait()
-    if p.returncode != 0:
-        name = "Unknown system"
-        return
-    for line in p.stdout.read().split("\n"):
-        if not line.strip() or line.startswith("*") or line.startswith("-"): continue
-        if line.startswith("CPU type"):
-            name = topology_type.match(line).group(1).strip()
-        if line.startswith("Sockets"):
-            sockets = int(topology_sockets.match(line).group(1))
-        if line.startswith("Cores per socket"):
-            corespersocket = int(topology_corespersocket.match(line).group(1))
-        if line.startswith("Threads per core"):
-            threadspercore = int(topology_threadspercore.match(line).group(1))
-        if name and sockets > 0 and corespersocket > 0 and threadspercore > 0:
-            break
-    return name, sockets, corespersocket, threadspercore
 
 def get_groups():
     groups = {}
@@ -87,8 +87,10 @@ def get_groups():
     for line in p.stdout.read().split("\n"):
         if line.startswith("-") or not line.strip(): continue
         if line.startswith("Available"): continue
-        name, description = line.split(":")
-        groups[name.strip()] = description.strip()
+        linelist = re.split("\s+", line.strip())
+        name = linelist[0]
+        description = " ".join(linelist[1:])
+        groups[name] = description
     return groups
 
 def get_test_groups(groupdict):
@@ -99,52 +101,103 @@ def get_test_groups(groupdict):
         setfp = open("SET.txt",'r')
         setlist = setfp.read().strip().split("\n")
         setfp.close()
-    
+
     filelist = glob.glob(testfolder+"/*.txt")
     for name in setlist:
-        tests = []
-        file = os.path.join(testfolder, name) + ".txt"
-        if not os.path.exists(file): continue
-        fp = open(file,'r')
-        finput = fp.read().strip().split("\n")
-        fp.close()    
-        for line in finput:
-            if line.startswith("TEST"):
-                tests.append(line.split(" ")[1])
-        groups[name] = tests
-                
-            
+        if name in get_groups():
+            tests = []
+            file = os.path.join(testfolder, name) + ".txt"
+            if not os.path.exists(file): continue
+            fp = open(file,'r')
+            finput = fp.read().strip().split("\n")
+            fp.close()
+            for line in finput:
+                if line.startswith("TEST"):
+                    tests.append(line.split(" ")[1])
+            groups[name] = tests
+
+
     return groups
-    
-def get_values_from_file(file, lineoffset, linecount):
-    results = []
-    fp = open(file,'r')
-    finput = fp.read().strip().split("\n")
-    fp.close()
+
+def write_topology(path):
+    """Run the likwid-topology command and store its output in topology_outputfile below path."""
+    outfile = os.path.join(path, topology_outputfile)
+    try:
+        f = open(outfile,"w")
+    except:
+        print "Cannot write topology file %s" % (outfile,)
+        return
+    p = subprocess.Popen(topology, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    output = p.communicate()[0]  # drain the pipe while waiting; wait()+read() can block on a full pipe
+    if p.returncode == 0: f.write(output)  # on failure the file is left empty, as before
+    f.close()  # closed on both paths (the early return used to leak the handle)
+
+def approx(in1, in2):
+    """Return 1 if in1 lies strictly within +/-5% of in2, else 0."""
+    # Both bounds must hold; with "or" the test passed for any positive pair.
+    return int(0.95*in2 < in1 < 1.05*in2)
+
+def legend(file1, file2):
+    input1 = []
+    input2 = []
+    numbers1 = []
+    numbers2 = []
+    try:
+        f=open(file1,"r")
+        input1 = f.read().strip().split("\n")
+        f.close()
+    except:
+        print "Cannot open file "+file1
     try:
-        for line in finput[lineoffset:lineoffset+linecount]:
-            results.append(float(line.split(" ")[1]))
+        f=open(file2,"r")
+        input2 = f.read().strip().split("\n")
+        f.close()
     except:
-        print "Cannot read file %s from %d to %d" % (file, lineoffset,lineoffset+linecount, )
-        for line in finput[lineoffset:lineoffset+linecount]:
-            print line
-    return results
+        print "Cannot open file "+file2
+    if len(input1) == 0 and len(input2) == 0:
+        return "no"
+    for line in input1:
+        numbers1.append(line.split(" ")[1])
+    for line in input2:
+        numbers2.append(line.split(" ")[1])
+    if float(numbers1[0]) > float(numbers1[-1]) and float(numbers2[0]) > float(numbers2[-1]):
+        return "no"
+    elif float(numbers1[0]) < float(numbers1[-1]) and float(numbers2[0]) < float(numbers2[-1]):
+        return "so"
+    elif approx(float(numbers1[0]), float(numbers1[-1])) and approx(float(numbers2[0]), float(numbers2[-1])):
+        return "so"
+    return "no"
+
 
-def write_pgf(group, test, plain_file, marker_file, papi_file=None, execute=False, script=None):
-    filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".tex")
+def write_pgf(group, test, plain_file, marker_file, scale=0.0,papi_file=None, execute=False, script=None):
+    printgrp = group
+    if translate_group.has_key(group):
+        printgrp = translate_group[group]
+    filename = os.path.join(os.path.join(resultfolder,hostname),printgrp+"_"+test+".tex")
+    sizelist = []
+    sizeindex = []
+    lentry = "north east"
+    if legend(plain_file, marker_file) == "so":
+        lentry = "south east"
+    for i,variant in enumerate(test_set[group][test]["variants"]):
+        sizelist.append(variant)
+        sizeindex.append(str((i+0.5)*test_set[group][test]["RUNS"]))
     fp = open(filename,'w')
     fp.write("\documentclass{article}\n")
     fp.write("\usepackage{pgfplots}\n")
     fp.write("\\begin{document}\n")
     fp.write("% cut from here\n")
     fp.write("\\begin{tikzpicture}\n")
-    fp.write("\\begin{axis}[xlabel={Run}, ylabel={MFlops/s / MBytes/s},title={%s\_%s},legend pos=south east,xtick=data,width=.75\\textwidth]\n" % (group.replace("_","\_"),test.replace("_","\_"),))
+    fp.write("\\begin{axis}[xmin=0,xmax=%d,xlabel={Size - %d runs each}, ylabel={%s},title={Group %s - Test %s},legend pos=%s,xtick=data,width=.75\\textwidth,xticklabels={%s},xtick={%s}]\n" % (test_set[group][test]["RUNS"]*len(test_set[group][test]["variants"]),test_set[group][test]["RUNS"],units[group],printgrp.replace("_","\_"),test.replace("_","\_"),lentry,",".join(sizelist),",".join(sizeindex)))
     fp.write("\\addplot+[red,mark=square*,mark options={draw=red, fill=red}] table {%s};\n" % (os.path.basename(plain_file),))
-    fp.write("\\addlegendentry{plain};\n")
-    fp.write("\\addplot+[blue,mark=diamond*,mark options={draw=blue, fill=blue}] table {%s};\n" % (os.path.basename(marker_file),))
-    fp.write("\\addlegendentry{marker};\n")
+    fp.write("\\addlegendentry{bench};\n")
+    if scale > 0.0:
+        fp.write("\\addplot+[blue,mark=*,mark options={draw=blue, fill=blue}] table[x index=0, y expr=\\thisrowno{1}*%f] {%s};\n" % (scale, os.path.basename(plain_file),))
+        fp.write("\\addlegendentry{scaled bench};\n")
+    fp.write("\\addplot+[green,mark=diamond*,mark options={draw=green, fill=green}] table {%s};\n" % (os.path.basename(marker_file),))
+    fp.write("\\addlegendentry{perfctr};\n")
     if papi and papi_file:
-        fp.write("\\addplot+[green,mark=triangle*,mark options={draw=green, fill=green}] table {%s};\n" % (os.path.basename(papi_file),))
+        fp.write("\\addplot+[black,mark=triangle*,mark options={draw=black, fill=black}] table {%s};\n" % (os.path.basename(papi_file),))
         fp.write("\\addlegendentry{papi};\n")
     fp.write("\\end{axis}\n")
     fp.write("\\end{tikzpicture}\n")
@@ -161,21 +214,36 @@ def write_pgf(group, test, plain_file, marker_file, papi_file=None, execute=Fals
     if script:
         script.write("pdflatex %s\n" % (os.path.basename(filename),))
     return filename
-    
-def write_gnuplot(group, test, plain_file, marker_file, papi_file, execute=False, script=None):
-    filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".plot")
+
+def write_gnuplot(group, test, plain_file, marker_file, scale = 1.0, papi_file=None, execute=False, script=None):
+    printgrp = group
+    if translate_group.has_key(group):
+        printgrp = translate_group[group]
+    filename = os.path.join(os.path.join(resultfolder,hostname),printgrp+"_"+test+".plot")
     fp = open(filename,'w')
     for i,color in enumerate(gnu_colors):
         fp.write("set style line %d linetype 1 linecolor rgb '%s' lw 2 pt %s\n" % (i+1, color,gnu_marks[i]))
     fp.write("set terminal jpeg\n")
-    fp.write("set title '%s_%s'\n" % (group, test,))
-    fp.write("set output '%s'\n" % (os.path.basename(os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".jpg")),))
-    fp.write("set xlabel 'Run'\n")
-    fp.write("set ylabel 'MFlops/s / MBytes/s'\n")
-    #fp.write("set xtics 1\n")
-    plot_string = "plot '%s' using 1:2 title 'plain' with linespoints ls 1, \\\n '%s' using 1:2 title 'marker' with linespoints ls 2" % (os.path.basename(plain_file), os.path.basename(marker_file),)
+    fp.write("set encoding utf8\n")
+    fp.write("set title 'Group %s - Test %s'\n" % (printgrp, test,))
+    if legend(plain_file, marker_file) == "no":
+        fp.write("set key top right\n")
+    else:
+        fp.write("set key bottom right\n")
+    fp.write("set output '%s'\n" % (os.path.basename(os.path.join(os.path.join(resultfolder,hostname),printgrp+"_"+test+".jpg")),))
+    fp.write("set xlabel 'Size - %d runs each'\n" % (test_set[group][test]["RUNS"],))
+    fp.write("set ylabel '%s'\n" % (units[group],))
+    fp.write("set yrange  [0:]\n")
+    #fp.write("set xtics 0,%d,%d\n" % (test_set[group][test]["RUNS"], test_set[group][test]["RUNS"]*len(test_set[group][test]["variants"]),))
+    fp.write("set xtics %d\n" % (test_set[group][test]["RUNS"]*len(test_set[group][test]["variants"]),))
+    for i,variant in enumerate(test_set[group][test]["variants"]):
+        fp.write("set xtics add (\"%s\" %f)\n" % (variant, (i*test_set[group][test]["RUNS"])+(0.5*test_set[group][test]["RUNS"]),))
+    plot_string = "plot '%s' using 1:2 title 'bench' with linespoints ls 1, \\\n"  % (os.path.basename(plain_file),)
+    if scale > 0.0:
+        plot_string = plot_string+" '%s' using 1:($2*%f) title 'scaled bench' with linespoints ls 2, \\\n" % (os.path.basename(plain_file), scale,)
+    plot_string = plot_string+" '%s' using 1:2 title 'perfctr' with linespoints ls 3" % (os.path.basename(marker_file),)
     if papi and papi_file:
-        plot_string += ", \\\n '%s' using 1:2 title 'papi' with linespoints ls 3\n" % (os.path.basename(papi_file),)
+        plot_string += ", \\\n '%s' using 1:2 title 'papi' with linespoints ls 4\n" % (os.path.basename(papi_file),)
     fp.write(plot_string+"\n")
     fp.close()
     if execute:
@@ -190,31 +258,38 @@ def write_gnuplot(group, test, plain_file, marker_file, papi_file, execute=False
         script.write("gnuplot %s\n" % (os.path.basename(filename),))
     return filename
 
-def write_grace(group, test, plain_file, marker_file, papi_file=None, execute=False, script=None):
-    filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".bat")
-    agrname = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".agr")
-    pngname = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".png")
+def write_grace(group, test, plain_file, correct_file, marker_file, papi_file=None, execute=False, script=None):
+    printgrp = group
+    if translate_group.has_key(group):
+        printgrp = translate_group[group]
+    filename = os.path.join(os.path.join(resultfolder,hostname),printgrp+"_"+test+".bat")
+    agrname = os.path.join(os.path.join(resultfolder,hostname),printgrp+"_"+test+".agr")
+    pngname = os.path.join(os.path.join(resultfolder,hostname),printgrp+"_"+test+".png")
     if execute or script:
         plain_file = os.path.basename(plain_file)
         marker_file = os.path.basename(marker_file)
+        correct_file = os.path.basename(correct_file)
         if papi_file: papi_file = os.path.basename(papi_file)
         pngname = os.path.basename(pngname)
         agrname = os.path.basename(agrname)
-    cmd_options = "-autoscale xy -nxy %s -nxy %s "% (plain_file,marker_file,)
+    cmd_options = "-autoscale xy -nxy %s -nxy %s -nxy %s " % (plain_file, correct_file, marker_file,)
     if papi and papi_file:
         cmd_options += "-nxy %s " % (papi_file,)
     out_options = "-hdevice PNG -printfile %s " % (pngname,)
     out_options += "-saveall %s" % (agrname,)
     fp = open(filename,'w')
-    fp.write("title \"%s_%s\"\n" % (group, test,))
+    fp.write("title \"Group %s - Test %s\"\n" % (printgrp, test,))
     fp.write("xaxis label \"Run\"\n")
     fp.write("xaxis label char size 1.2\n")
-    fp.write("xaxis ticklabel char size 1.2\n")
-    fp.write("yaxis label \"MFlops/s / MBytes/s\"\n")
+    fp.write("xaxis ticklabel char size 1.2\n")
+    fp.write("yaxis label \"%s\"\n" % (units[group],))  # units[group] fills the %s of the y-axis label
     fp.write("yaxis label char size 1.2\n")
     fp.write("yaxis ticklabel char size 1.2\n")
-    fp.write("legend 0.8,0.7\n")
-    fp.write("s0 legend \"plain\"\n")
+    if legend(plain_file, marker_file) == "no":
+        fp.write("legend 0.8,0.7\n")
+    else:
+        fp.write("legend 0.2,0.7\n")
+    fp.write("s0 legend \"bench\"\n")
     fp.write("s0 symbol 2\n")
     fp.write("s0 symbol size 1\n")
     fp.write("s0 symbol color 2\n")
@@ -228,7 +303,7 @@ def write_grace(group, test, plain_file, marker_file, papi_file=None, execute=Fa
     fp.write("s0 line linestyle 1\n")
     fp.write("s0 line linewidth 2\n")
     fp.write("s0 line pattern 1\n")
-    fp.write("s1 legend \"marker\"\n")
+    fp.write("s1 legend \"scaled bench\"\n")
     fp.write("s1 symbol 3\n")
     fp.write("s1 symbol size 1\n")
     fp.write("s1 symbol color 4\n")
@@ -242,21 +317,35 @@ def write_grace(group, test, plain_file, marker_file, papi_file=None, execute=Fa
     fp.write("s1 line linestyle 1\n")
     fp.write("s1 line linewidth 2\n")
     fp.write("s1 line pattern 1\n")
+    fp.write("s2 legend \"perfctr\"\n")
+    fp.write("s2 symbol 4\n")
+    fp.write("s2 symbol size 1\n")
+    fp.write("s2 symbol color 3\n")
+    fp.write("s2 symbol pattern 1\n")
+    fp.write("s2 symbol fill color 3\n")
+    fp.write("s2 symbol fill pattern 1\n")
+    fp.write("s2 symbol linewidth 2\n")
+    fp.write("s2 symbol linestyle 1\n")
+    fp.write("s2 line type 1\n")
+    fp.write("s2 line color 3\n")
+    fp.write("s2 line linestyle 1\n")
+    fp.write("s2 line linewidth 2\n")
+    fp.write("s2 line pattern 1\n")
     if papi and papi_file:
-        fp.write("s2 legend \"papi\"\n")
-        fp.write("s2 symbol 4\n")
-        fp.write("s2 symbol size 1\n")
-        fp.write("s2 symbol color 3\n")
-        fp.write("s2 symbol pattern 1\n")
-        fp.write("s2 symbol fill color 3\n")
-        fp.write("s2 symbol fill pattern 1\n")
-        fp.write("s2 symbol linewidth 2\n")
-        fp.write("s2 symbol linestyle 1\n")
-        fp.write("s2 line type 1\n")
-        fp.write("s2 line color 3\n")
-        fp.write("s2 line linestyle 1\n")
-        fp.write("s2 line linewidth 2\n")
-        fp.write("s2 line pattern 1\n")
+        fp.write("s3 legend \"papi\"\n")
+        fp.write("s3 symbol 5\n")
+        fp.write("s3 symbol size 1\n")
+        fp.write("s3 symbol color \"black\"\n")
+        fp.write("s3 symbol pattern 1\n")
+        fp.write("s3 symbol fill color \"black\"\n")
+        fp.write("s3 symbol fill pattern 1\n")
+        fp.write("s3 symbol linewidth 2\n")
+        fp.write("s3 symbol linestyle 1\n")
+        fp.write("s3 line type 1\n")
+        fp.write("s3 line color \"black\"\n")
+        fp.write("s3 line linestyle 1\n")
+        fp.write("s3 line linewidth 2\n")
+        fp.write("s3 line pattern 1\n")
     fp.close()
     if execute:
         cmd = "cd %s && gracebat %s -param %s %s && cd -" % (os.path.dirname(filename), cmd_options, os.path.basename(filename),out_options,)
@@ -269,8 +358,10 @@ def write_grace(group, test, plain_file, marker_file, papi_file=None, execute=Fa
         script.write("gracebat %s -param %s %s\n" % (cmd_options, os.path.basename(filename),out_options,))
     return filename
 
+
+
 try:
-    opts, args = getopt.getopt(sys.argv[1:], "hs:", ["help", "sets=","script","scriptname=","wiki","only_wiki","pgf","gnuplot","grace","papi"])
+    opts, args = getopt.getopt(sys.argv[1:], "hs:c:", ["help", "sets=","script","scriptname=","wiki","only_wiki=","pgf","gnuplot","grace","papi"])
 except getopt.GetoptError as err:
     print str(err)
     usage()
@@ -288,8 +379,15 @@ for o, a in opts:
         wiki = True
     if o == "--only_wiki":
         only_wiki = True
+        hostname = a
     if o == "--papi":
         papi = True
+    if o == "-c":
+        try:
+            nrThreads = int(a)
+        except:
+            print "Argument to -c not valid. Must be a number"
+            sys.exit(1)
     if o == "--pgf":
         out_pgf = True
     if o == "--gnuplot":
@@ -303,55 +401,74 @@ for o, a in opts:
     if o == "--scriptname":
         scriptfilename = a
 
-if not os.path.exists(testlist):
+if len(sets) == 0 and not os.path.exists(testlist):
     print "Cannot find file %s containing list of testgroups" % (testlist,)
     sys.exit(1)
 if not os.path.exists(testfolder):
     print "Cannot find folder %s containing the testgroups" % (testfolder,)
     sys.exit(1)
 
-test_set = {}
-plain_set = {}
-marker_set = {}
-papi_set = {}
-fp = open(testlist,'r')
-for line in fp.read().split("\n"):
+
+if len(sets) == 0:
+    fp = open(testlist,'r')
+    tmp = fp.read().split("\n")
+    for item in tmp:
+        if not item.strip() or item.startswith("#"): continue
+        sets.append(item)
+    fp.close()
+for line in sets:
     if not line.strip() or line.startswith("#"): continue
-    if os.path.exists("%s/%s.txt" % (testfolder,line.strip(),)):
-        test_set[line.strip()] = {}
-        plain_set[line.strip()] = {}
-        marker_set[line.strip()] = {}
-        papi_set[line.strip()] = {}
-        testfp = open("%s/%s.txt" % (testfolder,line.strip(),),'r')
+    filename = "%s/%s.txt" % (testfolder,line.strip(),)
+    if os.path.exists(filename):
+        groupname = line.strip()
+        testfp = open(filename,'r')
+        for grpline in testfp.read().split("\n"):
+            # A 'GROUP <name>' line overrides the group name derived from the file name.
+            match = re.match(r"^GROUP\s+(.+)", grpline)
+            if match:
+                groupname = match.group(1).strip()
+                break
+        testfp.close()
+        test_set[groupname] = {}
+        plain_set[groupname] = {}
+        corrected_set[groupname] = {}
+        marker_set[groupname] = {}
+        papi_set[groupname] = {}
+        testfp = open(filename,'r')
         test = None
         for i,testline in enumerate(testfp.read().split("\n")):
             if test and not testline.strip(): test = None
             if testline.startswith("REGEX_BENCH"):
-                test_set[line.strip()]["REGEX_BENCH"] = re.compile(" ".join(testline.split(" ")[1:]))
+                test_set[groupname]["REGEX_BENCH"] = re.compile(" ".join(testline.split(" ")[1:]))
             if testline.startswith("REGEX_PERF"):
-                test_set[line.strip()]["REGEX_PERF"] = re.compile(" ".join(testline.split(" ")[1:]))
+                test_set[groupname]["REGEX_PERF"] = re.compile(" ".join(testline.split(" ")[1:]))
             if testline.startswith("REGEX_PAPI"):
-                test_set[line.strip()]["REGEX_PAPI"] = re.compile(" ".join(testline.split(" ")[1:]))
+                test_set[groupname]["REGEX_PAPI"] = re.compile(" ".join(testline.split(" ")[1:]))
             if testline.startswith("TEST"):
                 test = testline.split(" ")[1]
-                test_set[line.strip()][test] = {}
-                plain_set[line.strip()][test] = {}
-                marker_set[line.strip()][test] = {}
-                papi_set[line.strip()][test] = {}
+                test_set[groupname][test] = {}
+                test_set[groupname][test]["WA_FACTOR"] = 0.0
+                plain_set[groupname][test] = {}
+                corrected_set[groupname][test] = {}
+                marker_set[groupname][test] = {}
+                papi_set[groupname][test] = {}
             if testline.startswith("RUNS") and test:
-                test_set[line.strip()][test]["RUNS"] = int(testline.split(" ")[1])
+                test_set[groupname][test]["RUNS"] = int(testline.split(" ")[1])
+            if testline.startswith("WA_FACTOR") and test:
+                test_set[groupname][test]["WA_FACTOR"] = float(testline.split(" ")[1])
             if testline.startswith("VARIANT") and test:
                 linelist = re.split("\s+",testline);
                 variant = linelist[1]
-                if not test_set[line.strip()][test].has_key("variants"):
-                    test_set[line.strip()][test]["variants"] = []
-                test_set[line.strip()][test][variant] = linelist[2]
-                test_set[line.strip()][test]["variants"].append(linelist[1])
-                plain_set[line.strip()][test][variant] = []
-                marker_set[line.strip()][test][variant] = []
-                papi_set[line.strip()][test][variant] = []
+                if not test_set[groupname][test].has_key("variants"):
+                    test_set[groupname][test]["variants"] = []
+                test_set[groupname][test][variant] = linelist[2]
+                test_set[groupname][test]["variants"].append(linelist[1])
+                plain_set[groupname][test][variant] = []
+                corrected_set[groupname][test][variant] = []
+                marker_set[groupname][test][variant] = []
+                papi_set[groupname][test][variant] = []
         testfp.close()
-fp.close()
+
 
 
 if len(test_set.keys()) == 0:
@@ -362,22 +479,24 @@ if not os.path.exists(resultfolder):
     os.mkdir(resultfolder)
 if not os.path.exists(os.path.join(resultfolder,hostname)):
     os.mkdir(os.path.join(resultfolder,hostname))
-
+write_topology(os.path.join(resultfolder,hostname))
 if not only_wiki:
     scriptfile = os.path.join(os.path.join(resultfolder,hostname),scriptfilename)
     script = open(scriptfile,'w')
     script.write("#!/bin/bash\n")
 
     for group in test_set.keys():
-        perfctr_string = "%s -c S0:0 -g %s -m " % (perfctr,group,)
+        perfctr_string = "%s -C E:N:%d:1:2 -g %s -m " % (perfctr,nrThreads, group,)
+        no_scale = False
         for test in test_set[group].keys():
             if test.startswith("REGEX"): continue
             file_plain = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_plain.dat")
             raw_plain = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_plain.raw")
+            file_correct = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_correct.dat")
             file_marker = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_marker.dat")
             raw_marker = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_marker.raw")
             outfp_plain = open(file_plain,'w')
-            rawfp_plain = open(raw_plain,'w')
+            outfp_correct = open(file_correct,'w')
             outfp_marker = open(file_marker,'w')
             rawfp_marker = open(raw_marker,'w')
             if papi:
@@ -389,44 +508,17 @@ if not only_wiki:
                 file_papi = None
                 raw_papi = None
             counter = 1
+            print "Group %s Test %s" % (group, test,)
             for size in test_set[group][test]["variants"]:
                 if size.startswith("RUNS"): continue
-                bench_options = "-t %s -i %s -g 1 -w N:%s:1" % (test, test_set[group][test][size], size,)
+                print "Size "+size+": ",
+                bench_options = "-t %s -w N:%s:%d" % (test, size, nrThreads)
                 for i in range(0,test_set[group][test]["RUNS"]):
-                    # Run with plain likwid-bench
-                    p = subprocess.Popen(bench_plain+" "+bench_options, shell=True, stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
-                    try:
-                        p.wait()
-                        stdout = p.stdout.read()
-                        p.stdout.close()
-                    except:
-                        sys.exit(1)
-                    for line in stdout.split("\n"):
-                        if p.returncode != 0: print line
-                        match = test_set[group]["REGEX_BENCH"].match(line)
-                        if match:
-                            plain_set[group][test][size].append(match.group(1))
-                            outfp_plain.write(str(counter)+" "+match.group(1)+"\n")
-                        rawfp_plain.write(line+"\n")
-                    # Run with papi instrumented likwid-bench
-                    if papi:
-                        os.environ["PAPI_BENCH"] = str(group)
-                        p = subprocess.Popen(bench_papi+" "+bench_options, shell=True, stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
-                        try:
-                            p.wait()
-                            stdout = p.stdout.read()
-                            p.stdout.close()
-                        except:
-                            sys.exit(1)
-                        for line in stdout.split("\n"):
-                            if p.returncode != 0: print line
-                            match = test_set[group]["REGEX_PAPI"].match(line)
-                            if match:
-                                papi_set[group][test][size].append(match.group(1))
-                                outfp_papi.write(str(counter)+" "+match.group(1)+"\n")
-                            rawfp_papi.write(line+"\n")
+                    print "*",
+                    sys.stdout.flush()
                     # Run with LIKWID instrumented likwid-bench and likwid-perfctr
-                    p = subprocess.Popen(perfctr_string+" "+bench_marker+" "+bench_options, shell=True, stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
+                    rawfp_marker.write(perfctr_string+" "+bench_marker+" "+bench_options+"\n")
+                    p = subprocess.Popen(perfctr_string+" "+bench_marker+" "+bench_options, shell=True, stdout=subprocess.PIPE,stderr=subprocess.STDOUT,executable="/bin/bash")
                     stdout = ""
                     try:
                         p.wait()
@@ -434,100 +526,50 @@ if not only_wiki:
                         p.stdout.close()
                     except:
                         sys.exit(1)
+                    found_bench = False
+                    found_perfctr = False
                     for line in stdout.split("\n"):
-                        if p.returncode != 0: print line
-                        match = test_set[group]["REGEX_PERF"].match(line)
-                        if match:
-                            marker_set[group][test][size].append(float(match.group(1)))
-                            outfp_marker.write(str(counter)+" "+str(float(match.group(1)))+"\n")
+                        if p.returncode == 0:
+                            match = test_set[group]["REGEX_PERF"].match(line)
+                            if match:
+                                marker_set[group][test][size].append(float(match.group(1)))
+                                outfp_marker.write(str(counter)+" "+str(float(match.group(1)))+"\n")
+                                found_perfctr = True
+                            match = test_set[group]["REGEX_BENCH"].match(line)
+                            if match:
+                                found_bench = True
+                                value = float(match.group(1)) * test_set[group][test]["WA_FACTOR"]
+                                plain_set[group][test][size].append(match.group(1))
+                                corrected_set[group][test][size].append(str(value))
+                                outfp_plain.write(str(counter)+" "+match.group(1)+"\n")
+                                outfp_correct.write(str(counter)+" "+str(value)+"\n")
                         rawfp_marker.write(line+"\n")
+                    if not found_bench:
+                        value = str(test_set[group][test]["WA_FACTOR"])
+                        plain_set[group][test][size].append(value)
+                        corrected_set[group][test][size].append(value)
+                        outfp_plain.write(str(counter)+" "+value+"\n")
+                        outfp_correct.write(str(counter)+" "+value+"\n")
+                        no_scale = True
+                    if not found_perfctr:
+                        marker_set[group][test][size].append(0)
+                        outfp_marker.write(str(counter)+" "+str(0)+"\n")
                     counter += 1
+                print("")
             outfp_plain.close()
-            rawfp_plain.close()
+            outfp_correct.close()
             outfp_marker.close()
             rawfp_marker.close()
             if papi:
                 outfp_papi.close()
                 rawfp_papi.close()
-            if out_pgf: pgf_file = write_pgf(group, test, file_plain, file_marker, file_papi, script=script)
-            if out_gnuplot: plot_file = write_gnuplot(group, test, file_plain, file_marker, file_papi, script=script)
-            if out_grace: grace_file = write_grace(group, test, file_plain, file_marker, file_papi, script=script)
+            if no_scale:
+                test_set[group][test]["WA_FACTOR"] = 0.0
+            if out_pgf:
+                pgf_file = write_pgf(group, test, file_plain, file_marker, test_set[group][test]["WA_FACTOR"],file_papi, script=script)
+            if out_gnuplot: plot_file = write_gnuplot(group, test, file_plain,file_marker, test_set[group][test]["WA_FACTOR"], file_papi, script=script)
+            if out_grace: grace_file = write_grace(group, test, file_plain, file_correct, file_marker, file_papi, script=script)
 
 
     script.close()
     os.chmod(scriptfile, stat.S_IRWXU)
-#if only_wiki:
-#    for group in test_set.keys():
-#        for test in test_set[group].keys():
-#            if test.startswith("REGEX"): continue
-#            filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_plain.dat")
-#            for i,size in enumerate(test_set[group][test]["variants"]):
-#                start = i*test_set[group][test]["RUNS"]
-#                end = (i+1)*test_set[group][test]["RUNS"]
-#                runs = test_set[group][test]["RUNS"]
-#                print "Read file %s for size %s from %d to %d" % (filename,size, start, end,)
-#                plain_set[group][test][size] = get_values_from_file(filename, start, runs)
-#                if len(plain_set[group][test][size]) == 0: plain_set[group][test][size].append(0)
-#            filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_marker.dat")
-#            for i,size in enumerate(test_set[group][test]["variants"]):
-#                start = i*test_set[group][test]["RUNS"]
-#                end = (i+1)*test_set[group][test]["RUNS"]
-#                runs = test_set[group][test]["RUNS"]
-#                print "Read file %s for size %s from %d to %d" % (filename,size, start, end,)
-#                marker_set[group][test][size] = get_values_from_file(filename, start, runs)
-#                if len(marker_set[group][test][size]) == 0: marker_set[group][test][size].append(0)
-
-
-if wiki or only_wiki:
-    name, sockets, corespersocket, threadspercore = get_system_info();
-    groups = get_groups()
-    testable_groups = get_test_groups(groups)
-    #print groups
-    #print testable_groups
-    #if testable_groups.has_key("FLOPS_DP"): del testable_groups["FLOPS_DP"]
-
-    print "#summary Accuracy Tests for %s\n" % (name,)
-    print "= Hardware description ="
-    print "Sockets: %d<br>" % (sockets,)
-    print "Cores per socket: %d<br>" % (corespersocket,)
-    print "Threads per core: %d<br>" % (threadspercore,)
-    print "Total number of processing units: %d<br>" % (sockets * corespersocket * threadspercore)
-    print
-    print "= Available groups ="
-    print "Each architecture defines a different set of groups. Here all the groups available for the %s are listed:<br>" % (name,)
-    for grp in groups.keys():
-        print "%s: %s<br>" % (grp, groups[grp],)
-    print
-    print "= Available verification tests ="
-    print "Not all groups can be tested for accuracy. Here only the groups are listed that can be verified. Each group is followed by the low-level benchmarks that are performed for comparison.<br>"
-    #print testable_groups
-    for grp in testable_groups.keys():
-        print "%s: %s<br>" % (grp, ", ".join (testable_groups[grp]))
-    print
-    print "= Accuracy comparison ="
-    print "For each varification group, the tests are performed twice. Once in a plain manner without measuring but calculating the resulting values and once through an instumented code with LIKWID.<br>"
-    
-    
-    for grp in testable_groups.keys():
-        print "== Verification of Group %s ==" % (grp,)
-        for test in testable_groups[grp]:
-            #print grp, test, test_set[grp][test]
-            print "=== Verification of Group %s with Test %s ===" % (grp, test,)
-            print "|| *Stream size* || *Iterations* ||"
-            for variant in test_set[grp][test]["variants"]:
-                print "|| %s || %s ||" % (variant, test_set[grp][test][variant], )
-            print 
-            print "Each data size is tested %d times, hence the first %d entries on the x-axis correspond to the %d runs for the first data size of %s and so on.<br>" % (test_set[grp][test]["RUNS"],test_set[grp][test]["RUNS"],test_set[grp][test]["RUNS"],test_set[grp][test]["variants"][0],)
-            print "%s/accuracy/%s/%s_%s.png" % (picture_base,hostname, grp, test,)
-            print
-            file_plain = os.path.join(os.path.join(resultfolder,hostname),grp+"_"+test+"_plain.dat")
-            file_marker = os.path.join(os.path.join(resultfolder,hostname),grp+"_"+test+"_marker.dat")
-            print "|| Variant || Plain (Min) || LIKWID (Min) || Plain (Max) || LIKWID (Max) || Plain (Avg) || LIKWID (Avg) ||"
-            for i, variant in enumerate(test_set[grp][test]["variants"]):
-                results_plain = get_values_from_file(file_plain, i*test_set[grp][test]["RUNS"], test_set[grp][test]["RUNS"])
-                results_marker = get_values_from_file(file_marker, i*test_set[grp][test]["RUNS"], test_set[grp][test]["RUNS"])
-                 if results_plain == []: results_plain.append(0)
-                 if results_marker == []: results_marker.append(0)
-                 print "|| %s || %d || %d || %d || %d || %d || %d ||" % (variant, min(results_plain), min(results_marker), max(results_plain), max(results_marker), int(statistics.mean(results_plain)), int(statistics.mean(results_marker)),)
-            print
-            print
diff --git a/test/accuracy/likwid-adjust-test-sizes.py b/test/accuracy/likwid-adjust-test-sizes.py
new file mode 100755
index 0000000..0deb5dd
--- /dev/null
+++ b/test/accuracy/likwid-adjust-test-sizes.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+
+
+import os, sys, os.path, re, subprocess
+
+topology_exec = "../../likwid-topology"  # run through the shell by get_caches()
+topology_re_size = re.compile(r"^Size:\s+(.*)")  # captures the text after 'Size:'
+re_size_unit = re.compile(r"(\d+)\s(\w+)")  # '<number> <unit>', e.g. '32 kB'
+# Sizes in bytes: one entry per cache level, then main memory (see get_caches).
+cachesizes = []
+
+def get_caches():
+    """Append the size in bytes of every cache level printed by likwid-topology
+    to the global cachesizes list, followed by MemTotal from /proc/meminfo
+    (clamped to 1 GiB). Returns the number of cache levels found; returns 0
+    without touching cachesizes if likwid-topology exits with an error."""
+    factors = {"kB": 1024, "MB": 1024*1024, "GB": 1024*1024*1024}
+    level = 0
+    p = subprocess.Popen(topology_exec, shell=True, stdout=subprocess.PIPE,
+                         stderr=subprocess.STDOUT, universal_newlines=True)
+    # communicate() drains the pipe while waiting; wait() followed by read()
+    # can deadlock once the child has filled the pipe buffer.
+    output = p.communicate()[0]
+    if p.returncode != 0:
+        return level
+    for line in output.split("\n"):
+        match = topology_re_size.match(line)
+        match = match and re_size_unit.match(match.group(1).strip())
+        if not match:
+            continue
+        # Always store an int; a unit missing from the table counts as bytes.
+        cachesizes.append(int(match.group(1))*factors.get(match.group(2), 1))
+        level += 1
+    with open("/proc/meminfo") as fp:
+        for line in fp.read().strip().split("\n"):
+            if line.startswith("MemTotal:"):
+                linelist = re.split(r"\s+", line)
+                unit = linelist[2] if len(linelist) > 2 else ""
+                size = int(linelist[1])*factors.get(unit, 1)
+                # Sizes above 1 GiB are clamped to 1 GiB.
+                cachesizes.append(min(size, 1024*1024*1024))
+    return level
+
+def get_important_tests():
+    """Split the names listed in SET.txt into the pair (adjust, regular):
+    names containing one of the keywords below go to adjust, all others to
+    regular. Blank lines and lines starting with '#' are ignored."""
+    important = ["L2", "L3", "MEM", "CLOCK", "UOPS"]
+    adjust = []
+    regular = []
+    with open("SET.txt") as fp:
+        names = [line.strip() for line in fp.read().split("\n")]
+    for name in names:
+        if not name or name.startswith("#"):
+            continue
+        # any() files a name once even if it contains several keywords.
+        target = adjust if any(imp in name for imp in important) else regular
+        target.append(name)
+    return adjust, regular
+    
+def adjust_tests(testgroup):
+    """Rewrite TESTS/<testgroup>.txt in place, replacing every run of
+    consecutive VARIANT lines by four 'VARIANT <size>kB 1000' lines.
+    The cache level is taken from a leading 'L<n>' in testgroup, otherwise the
+    last entry of the global cachesizes is used. The sizes are 1.3 times the
+    next lower entry plus one to four steps, a step being one fifth of the gap
+    between both entries. Raises ValueError if cachesizes holds no such pair."""
+    path = "TESTS/"+testgroup+".txt"
+    with open(path, "r") as fp:
+        f = fp.read().strip().split("\n")
+    match = re.match(r"L(\d+)", testgroup)
+    level = int(match.group(1))-1 if match else len(cachesizes)-1
+    # Index level-1 must not wrap around to the end of the list via -1.
+    if level < 1 or level >= len(cachesizes):
+        raise ValueError("No cache sizes for level %d (%s)" % (level+1, testgroup))
+    lower, upper = cachesizes[level-1], cachesizes[level]
+    min_size = int((lower + (0.3*lower))/1024)
+    # Floor division keeps the Python 2 integer results under Python 3.
+    step = ((upper - lower)//1024)//5
+    newdata = []
+    i = 0
+    while i < len(f):
+        if not f[i].startswith("VARIANT"):
+            newdata.append(f[i]+"\n")
+            i += 1
+            continue
+        # Consume the whole run; the bound check stops at the end of the file
+        # instead of indexing past it.
+        while i < len(f) and f[i].startswith("VARIANT"):
+            i += 1
+        for k in range(1, 5):
+            newdata.append("VARIANT %dkB 1000\n" % (int(min_size+(k*step)),))
+    with open(path, "w") as fp:
+        fp.writelines(newdata)
+
+if get_caches() == 0:  # adjust_tests() cannot work without cache sizes
+    sys.exit("Cannot determine cache sizes with "+topology_exec)
+adjust, regular = get_important_tests()
+for testgroup in adjust:
+    print("Adjusting "+testgroup)
+    adjust_tests(testgroup)
+if len(regular) > 0:
+    print("Not adjusting:\n"+str(regular))
+
+
diff --git a/test/accuracy/likwid-tester b/test/accuracy/likwid-tester
deleted file mode 100755
index ea264ae..0000000
--- a/test/accuracy/likwid-tester
+++ /dev/null
@@ -1,220 +0,0 @@
-#!/usr/bin/perl
-use lib '../../perl';
-use strict;
-use warnings;
-use xmgrace;
-use Cwd 'abs_path';
-use Data::Dumper;
-use File::Copy;
-
-my $domain = 'S0';
-
-my $hostname = `hostname`;
-chomp $hostname;
-my %GROUPS;
-my $TEST_ROOT = abs_path('./');
-my $RESULT_DIR   =  "$TEST_ROOT/RESULTS/$hostname";
-my $LIKWID_ROOT  =  "$TEST_ROOT/../..";
-my $PERFCTR      =  "$LIKWID_ROOT/likwid-perfctr";
-my $BENCH_PLAIN  =  "$LIKWID_ROOT/likwid-bench-plain";
-my $BENCH_MARKER =  "$LIKWID_ROOT/likwid-bench-marker";
-
-sub extract_result 
-{
-	my $type = shift;
-    my $REGEX;
-	my $REGEX_PLAIN = shift;
-	my $REGEX_MARKER = shift;
-
-	if ( $type eq 'plain' ) {
-		$REGEX = $REGEX_PLAIN;
-	}
-	elsif ($type eq 'marker') {
-		$REGEX = $REGEX_MARKER;
-	}
-
-	open (INPUT,"<out-$hostname.txt");
-	while (<INPUT>) {
-		if (/$REGEX/) {
-			return $1;
-		}
-	}
-	close INPUT;
-
-	return 0;
-}
-
-# determine capabilities of platform
-open (INPUT, "$PERFCTR -a |");
-
-while (<INPUT>) {
-    if (/(.+):/) {
-        $GROUPS{$1}='true';
-    }
-}
-
-close INPUT;
-
-mkdir $RESULT_DIR if (not -d $RESULT_DIR);
-
-
-# collect tests
-chdir ("$TEST_ROOT/TESTS") or die "Cannot change in $TEST_ROOT/TESTS $!\n";
-opendir (DIR, './') or die "Cannot open current directory: $!\n";
-my $TESTS = {};
-my $test_ptr;
-
-while (defined(my $file = readdir(DIR))) {
-    if ($file !~ /^\./) {
-        print "SCANNING $file\n";
-        open (TESTCASE, "<$file");
-        $file =~ s/.txt//;
-        $TESTS->{$file}->{benchmarks} = [];
-
-        while ( <TESTCASE> ) {
-
-            if (/REGEX_BENCH[ ](.+)/) {
-                $TESTS->{$file}->{REGEX_BENCH} = $1;
-            } elsif (/REGEX_PERF[ ](.+)/) {
-                $TESTS->{$file}->{REGEX_PERF} = $1;
-            } elsif (/TEST\s+(.+)/) {
-                push (@{ $TESTS->{$file}->{benchmarks} },
-                    {name => $1,
-                     runs => 0,
-                     variants => []});
-
-                $test_ptr = $TESTS->{$file}->{benchmarks}[-1];
-
-            } elsif (/RUNS\s+(.+)/) {
-                $test_ptr->{runs} = $1;
-            } elsif (/VARIANT\s+(.+B)\s+([0-9]+)/) {
-                push (@{ $test_ptr->{variants} },{size => $1, iter => $2});
-            }
-        }
-        close TESTCASE;
-    }
-}
-
-closedir DIR;
-chdir "$TEST_ROOT";
-
-# Read in Test set
-my %FILTER;
-open FILE,"<SET.txt";
-while ( <FILE> ) {
-    if ( not /^#/ ) {
-        chomp;
-        $FILTER{$_} = 'true';
-    }
-}
-close FILE;
-
-#run tests
-foreach my $test ( keys %$TESTS ) {
-
-    if ((exists $GROUPS{$test}) and (exists $FILTER{$test})) {
-        print "RUNNING $test : ";
-
-        foreach my $bench ( @{ $TESTS->{$test}->{benchmarks} } ) {
-            my $benchmark = $bench->{name};
-            my $runs      = $bench->{runs};
-            open (DATAFILE1, ">out-$hostname-1.dat");
-            open (DATAFILE2, ">out-$hostname-2.dat");
-            my $globalrun = 0;
-            print "$bench->{name} ";
-
-            foreach my $variant ( @{ $bench->{variants} } ) {
-                foreach ( 0 ... $runs ) {
-                    print DATAFILE1 "$globalrun ";
-                    print DATAFILE2 "$globalrun ";
-                    #print "$BENCH_PLAIN -g 1 -t $benchmark -i $variant->{iter} -w $domain:$variant->{size}:1 > out-$hostname.txt";
-                    system ("$BENCH_PLAIN -g 1 -t $benchmark -i $variant->{iter} -w $domain:$variant->{size}:1 > out-$hostname.txt");
-                    my $result = extract_result('plain',$TESTS->{$test}->{REGEX_BENCH},$TESTS->{$test}->{REGEX_PERF});
-                    print DATAFILE1 "$result\n";
-                    #print "$PERFCTR  -C E:". $domain .":0 -m -g $test $BENCH_MARKER -g 1 -t $benchmark -i $variant->{iter} -w $domain:$variant->{size}:1 > out-$hostname.txt";
-                    system ("$PERFCTR  -C E:". $domain .":1 -m -g $test $BENCH_MARKER -g 1 -t $benchmark -i $variant->{iter} -w $domain:$variant->{size}:1 > out-$hostname.txt");
-                    $result = extract_result('marker',$TESTS->{$test}->{REGEX_BENCH},$TESTS->{$test}->{REGEX_PERF});
-                    print DATAFILE2 "$result\n";
-                    $globalrun++;
-                }
-            }
-
-            close DATAFILE1;
-            close DATAFILE2;
-
-#output results
-            if (system('gracebat >/dev/null 2>&1') ) {
-                mkdir "$RESULT_DIR/tmp" if (not -d "$RESULT_DIR/tmp");
-
-                copy ("$LIKWID_ROOT/test/accuracy/out-$hostname-1.dat", "$RESULT_DIR/tmp/$test-$bench->{name}-plain.dat");
-                copy ("$LIKWID_ROOT/test/accuracy/out-$hostname-2.dat", "$RESULT_DIR/tmp/$test-$bench->{name}-marker.dat");
-
-            } else {
-
-                my $series = [];
-
-                push @{$series}, 
-                { "title"     =>  "plain",
-                    "data file" =>  "$LIKWID_ROOT/test/accuracy/out-$hostname-1.dat",
-                    "line" => {
-                        "type"      => "1",
-                        "color"     => "2",
-                        "linewidth" => "2",
-                        "linestyle" => "1",
-                        "pattern"   => "1",
-                    },
-                    "symbol" => {
-                        "type"      => "2",
-                        "color"     => "2",
-                        "pattern"   => "1",
-                        "linewidth" => "2",
-                        "linestyle" => "1",
-                        "size"      => "1",
-                        "fill pattern" => "1",
-                        "fill color" => "2",
-                    }
-                };
-
-                push @{$series}, 
-                { "title"     =>  "marker",
-                    "data file" =>  "$LIKWID_ROOT/test/accuracy/out-$hostname-2.dat",
-                    "line" => {
-                        "type"      => "1",
-                        "color"     => "4",
-                        "linewidth" => "2",
-                        "linestyle" => "1",
-                        "pattern"   => "1",
-                    },
-                    "symbol" => {
-                        "type"      => "3",
-                        "color"     => "4",
-                        "pattern"   => "1",
-                        "linewidth" => "2",
-                        "linestyle" => "1",
-                        "size"      => "1",
-                        "fill pattern" => "1",
-                        "fill color" => "4",
-                    }
-                };
-
-                xmgrace ({"title"           => "$test",
-                        "subtitle"          => "$bench->{name}",
-                        "legend"            => "0.8,0.7",
-                        "device"            => 'PNG',
-                        "output file"       => "$RESULT_DIR/$test\_".$bench->{name}.".png",
-                        "grace output file" => "$RESULT_DIR/$test\_".$bench->{name}.".agr",
-                        "xaxis label"       => "run",
-                        "yaxis label"       => "MFlops/s / MBytes/s"
-                    },
-                    $series);
-            }
-        }
-        print "\n";
-    }
-}
-
-unlink 'out-$hostname.txt';
-unlink 'out-$hostname-1.dat';
-unlink 'out-$hostname-2.dat';
-
-
diff --git a/test/accuracy/likwid-tester-plot b/test/accuracy/likwid-tester-plot
deleted file mode 100755
index ec6af41..0000000
--- a/test/accuracy/likwid-tester-plot
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/usr/bin/perl
-use lib '../../perl';
-use strict;
-use warnings;
-use xmgrace;
-use Cwd 'abs_path';
-
-my $TEST_ROOT = abs_path('./');
-my $machine = $ARGV[0];
-my $RESULT_DIR = "$TEST_ROOT/RESULTS/$machine";
-
-chdir "$TEST_ROOT/RESULTS/$machine/tmp/";
-opendir (DIR, './') or die "Cannot open current directory: $!\n";
-
-while (defined(my $file = readdir(DIR))) {
-
-    if ($file =~ /([A-Z0-9_]+)-(.*)-marker\.dat/) {
-                my $series = [];
-                my $test = $1;
-                my $name = $2;
-
-                push @{$series}, 
-                { "title"     =>  "plain",
-                    "data file" =>  "$TEST_ROOT/RESULTS/$machine/tmp/$test-$name-plain.dat",
-                    "line" => {
-                        "type"      => "1",
-                        "color"     => "2",
-                        "linewidth" => "2",
-                        "linestyle" => "1",
-                        "pattern"   => "1",
-                    },
-                    "symbol" => {
-                        "type"      => "2",
-                        "color"     => "2",
-                        "pattern"   => "1",
-                        "linewidth" => "2",
-                        "linestyle" => "1",
-                        "size"      => "1",
-                        "fill pattern" => "1",
-                        "fill color" => "2",
-                    }
-                };
-
-                push @{$series}, 
-                { "title"     =>  "marker",
-                    "data file" =>  "$TEST_ROOT/RESULTS/$machine/tmp/$file",
-                    "line" => {
-                        "type"      => "1",
-                        "color"     => "4",
-                        "linewidth" => "2",
-                        "linestyle" => "1",
-                        "pattern"   => "1",
-                    },
-                    "symbol" => {
-                        "type"      => "3",
-                        "color"     => "4",
-                        "pattern"   => "1",
-                        "linewidth" => "2",
-                        "linestyle" => "1",
-                        "size"      => "1",
-                        "fill pattern" => "1",
-                        "fill color" => "4",
-                    }
-                };
-
-                xmgrace ({"title"           => "$test",
-                        "subtitle"          => "$name",
-                        "legend"            => "0.8,0.7",
-                        "device"            => 'PNG',
-                        "output file"       => "$RESULT_DIR/$test\_".$name.".png",
-                        "grace output file" => "$RESULT_DIR/$test\_".$name.".agr",
-                        "xaxis label"       => "run",
-                        "yaxis label"       => "MFlops/s / MBytes/s"
-                    },
-                    $series);
-    }
-}
-
diff --git a/test/executable_tests/Makefile b/test/executable_tests/Makefile
index 08acc2a..11b12b7 100644
--- a/test/executable_tests/Makefile
+++ b/test/executable_tests/Makefile
@@ -1,6 +1,6 @@
 
 
-all: topology pin perfctr memsweeper powermeter features bench genCfg setFreq
+all: topology pin markerAPI perfctr memsweeper powermeter bench genTopoCfg setFrequencies
 
 topology:
 	./tester.sh likwid-topology
@@ -12,11 +12,11 @@ memsweeper:
 	./tester.sh likwid-memsweeper
 powermeter:
 	./tester.sh likwid-powermeter
-features:
-	./tester.sh likwid-features
 bench:
 	./tester.sh likwid-bench
-genCfg:
-	./tester.sh likwid-genCfg
-setFreq:
-	./tester.sh likwid-setFreq
+genTopoCfg:
+	./tester.sh likwid-genTopoCfg
+setFrequencies:
+	./tester.sh likwid-setFrequencies
+markerAPI:
+	make -s -C .. streamGCC
diff --git a/test/executable_tests/README b/test/executable_tests/README
index 99ab560..45fbe9a 100644
--- a/test/executable_tests/README
+++ b/test/executable_tests/README
@@ -6,3 +6,6 @@ For batch testing all executables simply type make
 
 All lines in the <executable>.txt file are executed and the output evaluated.
 Only simple checks are made using grep.
+
+For testing likwid-mpirun, Intel MPI must be present on your system. The test only
+checks the command line options and runs only on the local host.
diff --git a/test/executable_tests/likwid-bench.txt b/test/executable_tests/likwid-bench.txt
index 474b160..72670d9 100644
--- a/test/executable_tests/likwid-bench.txt
+++ b/test/executable_tests/likwid-bench.txt
@@ -3,27 +3,22 @@
 -v | EXIT 0 | GREP likwid-bench
 -p | EXIT 0 | GREP Domain
 -a | EXIT 0 | GREP sum
--i | EXIT 1 | GREP requires an argument
+-i | EXIT 1 | GREP option requires an argument
 -i 0 | EXIT 1 | GREP Iterations must be greater than 0
--i 100 | EXIT 1 | GREP Number of Workgroups must be 1 or greater
--l | EXIT 1 | GREP requires an argument
+-i 100 | EXIT 1 | GREP At least one workgroup (-w) must be set on commandline
+-l | EXIT 1 | GREP option requires an argument
 -l sum | EXIT 0 | GREP Name: sum
--l XXX | EXIT 0 | GREP Unknown test case XXX
--t | EXIT 1 | GREP requires an argument
--t sum | EXIT 1 | GREP Number of Workgroups must be 1 or greater
--t XXX | EXIT 1 | GREP Number of Workgroups must be 1 or greater
--g | EXIT 1 | GREP requires an argument
--g 0 | EXIT 1 | GREP Number of Workgroups must be 1 or greater
--g 1 | EXIT 1 | GREP workgroups requested but only 0 given on commandline
--g X | EXIT 1 | GREP Number of Workgroups must be 1 or greater
--w | EXIT 1 | GREP requires an argument
--g 1 -w X | EXIT 1 | GREP You need to specify a test case first
--t sum -g 1 -w X | EXIT 1 | GREP Error in parsing workgroup string
--t sum -g 1 -w N:1 | EXIT 1 | GREP Cannot parse string
--t XXX -g 1 -w N:1MB:1 | EXIT 1 | GREP You need to specify a test case first
--g 1 -w N:100kB:1 | EXIT 1 | GREP You need to specify a test case first
--i 100 -t sum -g 1 -w N:100kB:1 | EXIT 0 | GREP Number of Flops
--i 100 -t sum -g 2 -w N:100kB:1 | EXIT 1 | GREP workgroups requested but only 1 given on commandline
--i 100 -t sum -g 2 -w N:100kB:1 -w N:100kB:1 | EXIT 0 | GREP Number of Flops
--i 100 -t sum -g 1 -w N:100kB:2:1 | EXIT 1 | GREP Error in parsing workgroup string
--i 100 -t sum -g 1 -w N:100kB:2:1:2 | EXIT 0 | GREP Number of Flops
+-l XXX | EXIT 1 | GREP Unknown test case XXX
+-t | EXIT 1 | GREP option requires an argument
+-t sum | EXIT 1 | GREP At least one workgroup (-w) must be set on commandline
+-t XXX | EXIT 1 | GREP Unknown test case XXX
+-w | EXIT 1 | GREP option requires an argument
+-w X | EXIT 1 | GREP Unknown test case. Please check likwid-bench -a for available tests
+-t sum -w X | EXIT 1 | GREP Misformated workgroup string
+-t sum -w N:1 | EXIT 1 | GREP Stream size cannot be read
+-t XXX -w N:1MB:1 | EXIT 1 | GREP Unknown test case XXX
+-w N:100kB:1 | EXIT 1 | GREP Unknown test case. Please check likwid-bench -a for available tests
+-t sum -w N:100kB:1 | EXIT 0 | GREP Number of Flops
+-t sum -w N:100kB:1 -w N:100kB:1 | EXIT 0 | GREP Number of Flops
+-t sum -w N:100kB:2:1 | EXIT 1 | GREP Misformated workgroup string
+-t sum -w N:100kB:2:1:2 | EXIT 0 | GREP Number of Flops
diff --git a/test/executable_tests/likwid-features.txt b/test/executable_tests/likwid-features.txt
deleted file mode 100644
index ce95592..0000000
--- a/test/executable_tests/likwid-features.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-| EXIT 0 | GREP Performance monitoring | GREP CPU core id
--h | EXIT 0 | GREP Help message
--v | EXIT 0 | GREP likwid-features
--c | EXIT 1 | GREP option requires an argument
--s | EXIT 1 | GREP option requires an argument
--u | EXIT 1 | GREP option requires an argument
--c 0 | EXIT 0 | GREP Performance monitoring | GREP CPU core id
--s HW_PREFETCHER | EXIT 0 | GREP Performance monitoring | GREP CPU core id
--u HW_PREFETCHER | EXIT 0 | GREP Performance monitoring | GREP CPU core id
diff --git a/test/executable_tests/likwid-genCfg.txt b/test/executable_tests/likwid-genCfg.txt
deleted file mode 100644
index 6369b70..0000000
--- a/test/executable_tests/likwid-genCfg.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-| EXIT 1 | GREP Permission denied
--h | EXIT 0 | GREP Help message
--v | EXIT 0 | GREP likwid-genCfg
--o | EXIT 1 | GREP option requires an argument
--o /tmp/topo.txt | EXIT 0 | GREP CPU name
diff --git a/test/executable_tests/likwid-genTopoCfg.txt b/test/executable_tests/likwid-genTopoCfg.txt
new file mode 100644
index 0000000..1323e7b
--- /dev/null
+++ b/test/executable_tests/likwid-genTopoCfg.txt
@@ -0,0 +1,5 @@
+| EXIT 1 | GREP Cannot open file
+-h | EXIT 0 | GREP Help message
+-v | EXIT 0 | GREP likwid-genTopoCfg
+-o | EXIT 1 | GREP Option requires an argument
+-o /tmp/topo.txt | EXIT 0
diff --git a/test/executable_tests/likwid-memsweeper.txt b/test/executable_tests/likwid-memsweeper.txt
index 6c4cd0e..b90fbe8 100644
--- a/test/executable_tests/likwid-memsweeper.txt
+++ b/test/executable_tests/likwid-memsweeper.txt
@@ -1,8 +1,8 @@
 | EXIT 0 | GREP Sweeping domain
 -h | EXIT 0 | GREP Help message
 -v | EXIT 0 | GREP likwid-memsweeper
--c | EXIT 1 | GREP option requires an argument
--c - | EXIT 1 | GREP Cannot parse string
--c -1 | EXIT 0 | GREP Sweeping domain
+-c | EXIT 1 | GREP Option requires an argument
+-c - | EXIT 1 | GREP Cannot parse node string
+-c -1 | EXIT 1 | GREP Cannot parse node string
 -c 0 | EXIT 0 | GREP Sweeping domain
--c 10 | EXIT 1 | GREP ERROR | GREP numa
+-c 10 | EXIT 1 | GREP Cannot parse node string
diff --git a/test/executable_tests/likwid-mpirun.txt b/test/executable_tests/likwid-mpirun.txt
new file mode 100644
index 0000000..6287100
--- /dev/null
+++ b/test/executable_tests/likwid-mpirun.txt
@@ -0,0 +1,39 @@
+-h | EXIT 0 | GREP Help message
+-v | EXIT 0 | GREP likwid-mpirun
+-d | EXIT 1 | GREP No option
+-np | EXIT 1 | GREP Option requires an argument
+-nperdomain | EXIT 1 | GREP Option requires an argument
+-pin | EXIT 1 | GREP Option requires an argument
+-s | EXIT 1 | GREP Option requires an argument
+-mpi | EXIT 1 | GREP Option requires an argument
+-omp | EXIT 1 | GREP Option requires an argument
+-hostfile | EXIT 1 | GREP Option requires an argument
+-g | EXIT 1 | GREP Option requires an argument
+-m | EXIT 1 | GREP No option
+-O | EXIT 1 | GREP No option
+-f | EXIT 1 | GREP No option
+-np 1 | EXIT 1 | GREP No executable given on commandline
+-nperdomain N:1 | EXIT 1 | GREP No executable given on commandline
+-pin N:1 | EXIT 1 | GREP No executable given on commandline
+-mpi asd | EXIT 1 | GREP No option
+-omp asd | EXIT 1 | GREP No option
+-hostfile asd | EXIT 1 | GREP No option
+-g asd | EXIT 1 | GREP No option
+-np 1 cat /proc/version | EXIT 0 | GREP Linux
+-nperdomain N:1 cat /proc/version | EXIT 0 | GREP Linux
+-pin N:1 cat /proc/version | EXIT 0 | GREP Linux
+-np 1 -d cat /proc/version | EXIT 0 | GREP Linux
+-nperdomain N:1 -d cat /proc/version | EXIT 0 | GREP Linux
+-pin N:1 -d cat /proc/version | EXIT 0 | GREP Linux
+-np 1 -mpi intelmpi cat /proc/version | EXIT 0 | GREP Linux
+-np 1 -mpi openmpi cat /proc/version | EXIT 1 | GREP Cannot find executable
+-np 1 -mpi mvapich2 cat /proc/version | EXIT 1 | GREP Cannot find executable
+-np 1 -g ASD -f cat /proc/version | EXIT 1 | GREP Empty event list
+-np 1 -g CLOCK -f cat /proc/version | EXIT 0 | GREP Linux | GREP CPI
+-np 1 -nperdomain N:1 cat /proc/version | EXIT 0 | GREP Linux
+-np 1 -pin N:1 cat /proc/version | EXIT 0 | GREP Linux
+-np 1 -pin N:1 -s 0x1 cat /proc/version | EXIT 0 | GREP Linux
+-np 1 -hostfile 123456789 cat /proc/version | EXIT 1 | GREP Cannot open hostfile 123456789
+-np 1 -g CLOCK -f -O cat /proc/version | EXIT 0 | GREP Linux | GREP CPI,
+-np 1 -g CLOCK -f -O -m ../streamGCC | EXIT 0 | GREP Region: triad | GREP CPI,
+-np 1 -f -O -m ../streamGCC | EXIT 1 | GREP You selected the MarkerAPI feature
diff --git a/test/executable_tests/likwid-perfctr.txt b/test/executable_tests/likwid-perfctr.txt
index 80ac60d..e3a7fa9 100644
--- a/test/executable_tests/likwid-perfctr.txt
+++ b/test/executable_tests/likwid-perfctr.txt
@@ -2,37 +2,44 @@
 -h | EXIT 0 | GREP Help message
 -v | EXIT 0 | GREP likwid-perfctr
 -i | EXIT 0 | GREP CPU family
--V -c 0 hostname | EXIT 0 | GREP NOTICE
--V | EXIT 1 | GREP You must specify at least one processor
--g | EXIT 1 | GREP option requires an argument
+-V 1 -c 0 hostname | EXIT 0 | GREP Option(s) -g <string> must be given on commandline
+-V 1 | EXIT 1 | GREP Option -c <list> or -C <list> must be given on commandline
+-V | EXIT 1 | GREP Option requires an argument
+-g | EXIT 1 | GREP Option requires an argument
 -g BRANCH -H | EXIT 0 | GREP Group BRANCH:
--a | EXIT 0 | GREP Available groups
--V -e | EXIT 0 | GREP This architecture
--t 200ms | EXIT 1 | GREP You must specify at least one processor
--c | EXIT 1 | GREP option requires an argument
--c 0 | EXIT 1 | GREP You have to specify a program to measure as argument
--t 200ms -c 0 | EXIT 1 | GREP Executable must be given on commandline
--S | EXIT 1 | GREP option requires an argument
--o | EXIT 1 | GREP option requires an argument
--o /tmp/test | EXIT 1 | GREP Outputfile has no filetype suffix
--o /tmp/test.txt | EXIT 1 | GREP You must specify at least one processor
--S 1 | EXIT 1 | GREP You must specify at least one processor
--S 1 -c 0 | EXIT 1 | GREP You have to specify a group or event set to measure using the -g option.
--S 1 -C 0 | EXIT 1 | GREP You have to specify a group or event set to measure using the -g option.
--S 1 -c 0 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP Branch
--S 1 -C 0 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP Branch
--S 1 -c 0,1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
--S 1 -c 0-1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
--S 1 -c 0,1-1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
--S 1 -C 0,1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
--S 1 -C 0-1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
--S 1 -C 0,1-1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
--S 1 -c E:N:2 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--S 1 -c E:N:2:1:2 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--S 1 -c M:scatter -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--S 1 -C E:N:2 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--S 1 -C E:N:2:1:2 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--S 1 -C M:scatter -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--c 0 -g BRANCH hostname | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--C 0 -g BRANCH hostname | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--C 0 -g BRANCH -m hostname | EXIT 1 | GREP  The marker result file could not be found
+-a | EXIT 0 | GREP Group Name | GREP Description
+-e | EXIT 0 | GREP This architecture
+-t 200ms | EXIT 1 | GREP Option -c <list> or -C <list> must be given on commandline
+-c | EXIT 1 | GREP Option requires an argument
+-c 0 | EXIT 1 | GREP Option(s) -g <string> must be given on commandline
+-t 200ms -c 0 | EXIT 1 | GREP Option(s) -g <string> must be given on commandline
+-S | EXIT 1 | GREP Option requires an argument
+-o | EXIT 1 | GREP Option requires an argument
+-o /tmp/test | EXIT 1 | GREP Option -c <list> or -C <list> must be given on commandline
+-c 0 -o /tmp/test | EXIT 1 | GREP Option(s) -g <string> must be given on commandline
+-C 0 -g BRANCH -o /tmp/test | EXIT 1 | GREP No Executable can be found on commandline
+-C 0 -g BRANCH -o /tmp/test hostname | EXIT 0
+-o /tmp/test.txt | EXIT 1 | GREP Option -c <list> or -C <list> must be given on commandline
+-C 0 -g BRANCH -o /tmp/test.txt hostname | EXIT 0 | NGREP Cannot find filter script, save output in CSV format
+-S 1s | EXIT 1 | GREP Option -c <list> or -C <list> must be given on commandline
+-S 1s -c 0 | EXIT 1 | GREP Option(s) -g <string> must be given on commandline
+-S 1s -C 0 | EXIT 1 | GREP Option(s) -g <string> must be given on commandline
+-S 1s -c 0 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP Branch
+-S 1s -C 0 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP Branch
+-S 1s -c 0,1 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP core 1 | GREP Branch
+-S 1s -c 0-1 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP core 1 | GREP Branch
+-S 1s -c 0,1-1 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP core 1 | GREP Branch
+-S 1s -C 0,1 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP core 1 | GREP Branch
+-S 1s -C 0-1 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP core 1 | GREP Branch
+-S 1s -C 0,1-1 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP core 1 | GREP Branch
+-S 1s -c E:N:2 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP Branch
+-S 1s -c E:N:2:1:2 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP Branch
+-S 1s -c M:scatter -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP Branch
+-S 1s -C E:N:2 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP Branch
+-S 1s -C E:N:2:1:2 -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP Branch
+-S 1s -C M:scatter -g BRANCH -f | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP Branch
+-c 0 -g BRANCH -f hostname | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP Branch
+-C 0 -g BRANCH -f hostname | EXIT 0 | GREP Group 1: BRANCH | GREP core 0 | GREP Branch
+-C 0 -g BRANCH -f -m hostname | EXIT 1 | GREP No regions
+-C 0 -g BRANCH -f -t 200ms hostname | EXIT 0 | GREP CORES: 0
+-C 0 -g BRANCH -f -m ../streamGCC | EXIT 0 | GREP Region triad | GREP Region copy
diff --git a/test/executable_tests/likwid-pin.txt b/test/executable_tests/likwid-pin.txt
index 801f79c..64d2d96 100644
--- a/test/executable_tests/likwid-pin.txt
+++ b/test/executable_tests/likwid-pin.txt
@@ -4,23 +4,23 @@
 -i hostname | EXIT 0 | GREP Set mem_policy to interleaved
 -S | EXIT 1 |GREP Executable must be given on commandline
 -S hostname | EXIT 0 | GREP Sweeping memory
--c | EXIT 1 |GREP option requires an argument
--p | EXIT 0 | GREP Domain | GREP Tag
+-c | EXIT 1 |GREP Option requires an argument
+-p | EXIT 0 | GREP Domain
 -c 0 | EXIT 1 | GREP Executable must be given on commandline
 -c 0 -p | EXIT 0 | GREP 0
 -c N:0 -p | EXIT 0 | GREP 0
 -c S0:0-1 -p | EXIT 0 | GREP 0,1
 -c N:0 at N:1 -p | EXIT 0 | GREP 0,1
 -c N:0 at N:1 at N:2 -p | EXIT 0 | GREP 0,1,2
--c C0:1-0 -p | EXIT 1 | GREP Range End
+-c C0:1-0 -p | EXIT 0 | GREP 1,0
 -c E:N:1 -p | EXIT 0 | GREP 0
 -c E:N:2 -p | EXIT 0 | LISTLEN , 2
 -c E:N:2:1:2 -p | EXIT 0 | LISTLEN , 2
 -c E:N:2:1:2 -d . -p | EXIT 0 | LISTLEN . 2
 -c M:scatter -p | EXIT 0
--s | EXIT 1 | GREP option requires an argument
+-s | EXIT 1 | GREP Option requires an argument
 -s 0x1 | EXIT 1 | GREP Executable must be given on commandline
--s 0x1 hostname | EXIT 0 | GREP Main PID
+-s 0x1 hostname | EXIT 0
 -q | EXIT 1 | GREP Executable must be given on commandline
--q hostname | EXIT 1 | NGREP Main PID
+-q hostname | EXIT 0
 
diff --git a/test/executable_tests/likwid-powermeter.txt b/test/executable_tests/likwid-powermeter.txt
index f733b06..b09412e 100644
--- a/test/executable_tests/likwid-powermeter.txt
+++ b/test/executable_tests/likwid-powermeter.txt
@@ -1,14 +1,20 @@
-| EXIT 0 | GREP Help message
+| EXIT 0 | GREP Runtime: 2
 -h | EXIT 0 | GREP Help message
 -v | EXIT 0 | GREP likwid-powermeter
 -i | EXIT 0 | GREP Base clock | GREP Power
--c | EXIT 1 | GREP option requires an argument | GREP Help message
--s | EXIT 1 | GREP option requires an argument | GREP Help message
--M | EXIT 1 | GREP option requires an argument | GREP Help message
--s 1 | EXIT 0 | GREP consumed
--c 0 | EXIT 1 | GREP Commandline option -c requires an executable if not used in combination with -s
--p | EXIT 1 | GREP Commandline option -p requires an executable
--c 0 -s 1 | EXIT 0 | GREP consumed | GREP Socket 0
--p hostname | EXIT 0 | Measuring group CLOCK
--c 0 hostname | EXIT 0 | GREP consumed | GREP Socket 0
--M 1 | EXIT 1 | GREP Either -s <seconds> or executable must be given on commandline
+-c | EXIT 1 | GREP Option requires an argument
+-s | EXIT 1 | GREP Option requires an argument
+-M | EXIT 1 | GREP Option requires an argument
+-V | EXIT 1 | GREP Option requires an argument
+-V 1 | EXIT 0 | GREP Base clock | GREP Runtime: 2
+-s 1 | EXIT 1 | GREP Cannot parse time
+-s 1s | EXIT 0 | GREP Runtime: 1
+-c 0 | EXIT 0 | GREP Runtime: 2
+-p | EXIT 0 | GREP Group 1: CLOCK
+-c 0 -s 1 | EXIT 0 | GREP Cannot parse time
+-p hostname | EXIT 0 | GREP Group 1: CLOCK
+-c 0 hostname | EXIT 0 | GREP consumed | GREP socket 0
+-M 1 | EXIT 0 | GREP Runtime: 2
+-M 0 | EXIT 1 | GREP Operation not permitted
+-t | EXIT 0 | GREP Current core temperatures: | GREP Socket 0 Core 0:
+-f | EXIT 0 | GREP Current core temperatures: | GREP Socket 0 Core 0:
diff --git a/test/executable_tests/likwid-setFreq.txt b/test/executable_tests/likwid-setFreq.txt
deleted file mode 100644
index 56c495b..0000000
--- a/test/executable_tests/likwid-setFreq.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-| EXIT 1 | GREP Usage
-0 | EXIT 1 | GREP Usage
-0 0 | EXIT 1 | GREP Frequency must be greater than 0
-0 -1 | EXIT 1 | GREP Frequency must be greater than 0
--1 -1 | EXIT 1 | GREP not a valid CPU ID. Range from 0 to
-100 0 | EXIT 1 | GREP not a valid CPU ID. Range from 0 to
diff --git a/test/executable_tests/likwid-setFrequencies.txt b/test/executable_tests/likwid-setFrequencies.txt
new file mode 100644
index 0000000..821f925
--- /dev/null
+++ b/test/executable_tests/likwid-setFrequencies.txt
@@ -0,0 +1,14 @@
+| EXIT 0 | GREP Help message
+-h | EXIT 0 | GREP Help message
+-v | EXIT 0 | GREP likwid-setFrequencies
+-p | EXIT 0 | GREP Current frequencies: | GREP CPU | GREP GHz
+-l | EXIT 0 | GREP Available frequencies:
+-m | EXIT 0 | GREP Available governors:
+-c | EXIT 1 | GREP Option requires an argument
+-c 0 | EXIT 1 | GREP You need to set either a frequency or governor for the selected CPUs on commandline
+-g | EXIT 1 | GREP Option requires an argument
+-f | EXIT 1 | GREP Option requires an argument
+-g performance | EXIT 0
+-f FREQ | EXIT 0
+-c 0 -g conservative | EXIT 0
+-c 0 -f FREQ | EXIT 0
diff --git a/test/executable_tests/likwid-topology.txt b/test/executable_tests/likwid-topology.txt
index 810b1e9..3e6eed5 100644
--- a/test/executable_tests/likwid-topology.txt
+++ b/test/executable_tests/likwid-topology.txt
@@ -1,11 +1,14 @@
--h | EXIT 0 | Help message
+-h | EXIT 0 | GREP Options
 -v | EXIT 0 | GREP likwid-topology
 -c | EXIT 0 | GREP Cache line size
 -C | EXIT 0 | GREP CPU clock
 -g | EXIT 0 | GREP +--------
+-V | EXIT 1 | GREP Option requires an argument
+-V 1 | EXIT 0 | GREP Hardware Thread Topology
 -g -v | EXIT 0 | GREP likwid-topology
 -c -g | EXIT 0 | GREP +-------- | GREP Cache line size
 -c -g -C | EXIT 0 | GREP +-------- | GREP Cache line size | GREP CPU clock
--o | EXIT 1
--o /tmp/out | EXIT 1 | GREP filter suffix
+-O | EXIT 0 | GREP STRUCT,Info
+-o | EXIT 1 | GREP Option requires an argument
+-o /tmp/out | EXIT 0
 -o /tmp/out.txt | EXIT 0
diff --git a/test/executable_tests/tester.sh b/test/executable_tests/tester.sh
index 71342df..119613f 100755
--- a/test/executable_tests/tester.sh
+++ b/test/executable_tests/tester.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
-
 if [ $# -ne 1 ]; then
     echo "You need to give application to test on commandline"
     exit 1
 fi
 
-EXECPATH=../..
+EXECPATH=/usr/local/bin
 EXEC=$1
 TMPFILE=/tmp/testout
+FREQ="2.3"
 
 f_grep() {
     ARG="$1"
@@ -41,15 +41,25 @@ if [ ! -e ${EXEC}.txt ]; then
     echo "Cannot find testfile ${EXEC}.txt"
     exit 1
 fi
+if [ "${EXEC}" == "likwid-setFrequencies" ]; then
+    FREQ=$(likwid-setFrequencies -l | grep -v frequencies | awk '{print $2}')
+    CURFREQ=$(likwid-setFrequencies -p | head -n2 | tail -n 1 | rev | awk '{print $2}' | rev)
+fi
+if [ "${EXEC}" == "likwid-mpirun" ]; then
+    if [ -z "$(which mpiexec)" ] && [ -z "$(which mpiexec.hydra)" ] && [ -z "$(which mpirun)" ]; then
+        echo "Cannot find MPI implementation, neither mpiexec, mpiexec.hydra nor mpirun can be found in any directory in PATH"
+        exit 1
+    fi
+fi
 
 while read -r LINE || [[ -n $LINE ]]; do
     if [ -z "${LINE}" ]; then continue; fi
     if [[ "${LINE}" =~ \#.* ]]; then continue; fi
-
     OPTIONS=$(echo "${LINE}" | cut -d '|' -f 1)
+    OPTIONS=${OPTIONS//'FREQ'/"${FREQ}"}
     RESULTS=$(echo "${LINE}" | cut -d '|' -f 2-)
     NUM_RESULTS="${RESULTS//[^|]}"
-    EXITCODE=$(${EXECPATH}/${EXEC} ${OPTIONS} 1>${TMPFILE} 2>&1  ; echo $?)
+    EXITCODE=$(${EXECPATH}/${EXEC} ${OPTIONS} 1>${TMPFILE} 2>&1 </dev/null; echo $?)
     STATE=0
     for ((i=1;i<=${#NUM_RESULTS}+1;i++)); do
         RESULT=$(echo ${RESULTS} | cut -d '|' -f ${i})
@@ -78,3 +88,9 @@ while read -r LINE || [[ -n $LINE ]]; do
 done < ${EXEC}.txt
 
 
+if [ "${EXEC}" == "likwid-setFrequencies" ]; then
+    ${EXEC} -f "${CURFREQ}"
+fi
+
+rm -f /tmp/topo.txt /tmp/test /tmp/test.txt /tmp/out.txt /tmp/out
+
diff --git a/test/serial.c b/test/serial.c
new file mode 100644
index 0000000..3debf10
--- /dev/null
+++ b/test/serial.c
@@ -0,0 +1,43 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <likwid.h>
+/* Marker API test: fill a vector of argv[1] doubles, then square it 10 times. */
+int main(int argc, char* argv[])
+{
+    int i, j;
+    int size;
+    double* vector;
+    if (argc != 2)
+        return 1; /* exactly one argument, the element count, is expected */
+
+    size = atoi(argv[1]); /* atoi reports no errors, so the value is not validated */
+    vector = (double*) malloc(size * sizeof(double));
+    if (!vector)
+        return 2; /* allocation failed */
+
+    LIKWID_MARKER_INIT; /* macro not defined in this file; presumably from <likwid.h> */
+    /* First region, tagged "init": set every element to 2.0. */
+    LIKWID_MARKER_START("init");
+    for (i=0;i<size;i++)
+        vector[i] = 2.0;
+    LIKWID_MARKER_STOP("init");
+    /* Second region, tagged "pow": square every element ten times in place.
+     * NOTE(review): 2.0 squared ten times is 2^1024, presumably +inf as an IEEE double. */
+    LIKWID_MARKER_START("pow");
+    for (j=0;j<10;j++)
+    {
+        for (i=0;i<size;i++)
+            vector[i] = vector[i] * vector[i];
+    }
+    LIKWID_MARKER_STOP("pow");
+
+    LIKWID_MARKER_CLOSE;
+
+    free(vector);
+    return 0;
+
+
+
+}
diff --git a/test/stream.c b/test/stream.c
deleted file mode 100644
index 0214747..0000000
--- a/test/stream.c
+++ /dev/null
@@ -1,423 +0,0 @@
-/*-----------------------------------------------------------------------*/
-/* Program: Stream                                                       */
-/* Revision: $Id: stream.c,v 5.8 2007/02/19 23:57:39 mccalpin Exp mccalpin $ */
-/* Original code developed by John D. McCalpin                           */
-/* Programmers: John D. McCalpin                                         */
-/*              Joe R. Zagar                                             */
-/*                                                                       */
-/* This program measures memory transfer rates in MB/s for simple        */
-/* computational kernels coded in C.                                     */
-/*-----------------------------------------------------------------------*/
-/* Copyright 1991-2005: John D. McCalpin                                 */
-/*-----------------------------------------------------------------------*/
-/* License:                                                              */
-/*  1. You are free to use this program and/or to redistribute           */
-/*     this program.                                                     */
-/*  2. You are free to modify this program for your own use,             */
-/*     including commercial use, subject to the publication              */
-/*     restrictions in item 3.                                           */
-/*  3. You are free to publish results obtained from running this        */
-/*     program, or from works that you derive from this program,         */
-/*     with the following limitations:                                   */
-/*     3a. In order to be referred to as "STREAM benchmark results",     */
-/*         published results must be in conformance to the STREAM        */
-/*         Run Rules, (briefly reviewed below) published at              */
-/*         http://www.cs.virginia.edu/stream/ref.html                    */
-/*         and incorporated herein by reference.                         */
-/*         As the copyright holder, John McCalpin retains the            */
-/*         right to determine conformity with the Run Rules.             */
-/*     3b. Results based on modified source code or on runs not in       */
-/*         accordance with the STREAM Run Rules must be clearly          */
-/*         labelled whenever they are published.  Examples of            */
-/*         proper labelling include:                                     */
-/*         "tuned STREAM benchmark results"                              */
-/*         "based on a variant of the STREAM benchmark code"             */
-/*         Other comparable, clear and reasonable labelling is           */
-/*         acceptable.                                                   */
-/*     3c. Submission of results to the STREAM benchmark web site        */
-/*         is encouraged, but not required.                              */
-/*  4. Use of this program or creation of derived works based on this    */
-/*     program constitutes acceptance of these licensing restrictions.   */
-/*  5. Absolutely no warranty is expressed or implied.                   */
-/*-----------------------------------------------------------------------*/
-#define _GNU_SOURCE
-#include <stdlib.h>
-# include <stdio.h>
-# include <math.h>
-# include <float.h>
-# include <omp.h>
-# include <limits.h>
-# include <sys/time.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
-#include <unistd.h>
-#include <sched.h>
-#include <time.h>
-#include <pthread.h>
-
-/* INSTRUCTIONS:
- *
- *	1) Stream requires a good bit of memory to run.  Adjust the
- *          value of 'N' (below) to give a 'timing calibration' of 
- *          at least 20 clock-ticks.  This will provide rate estimates
- *          that should be good to about 5% precision.
- */
-
-# define N	60000000
-# define NTIMES	10
-# define OFFSET	0
-
-/*
- *	3) Compile the code with full optimization.  Many compilers
- *	   generate unreasonably bad code before the optimizer tightens
- *	   things up.  If the results are unreasonably good, on the
- *	   other hand, the optimizer might be too smart for me!
- *
- *         Try compiling with:
- *               cc -O stream_omp.c -o stream_omp
- *
- *         This is known to work on Cray, SGI, IBM, and Sun machines.
- *
- *
- *	4) Mail the results to mccalpin at cs.virginia.edu
- *	   Be sure to include:
- *		a) computer hardware model number and software revision
- *		b) the compiler flags
- *		c) all of the output from the test case.
- * Thanks!
- *
- */
-#define gettid() syscall(SYS_gettid)
-#include <likwid.h>
-
-# define HLINE "-------------------------------------------------------------\n"
-
-# ifndef MIN
-# define MIN(x,y) ((x)<(y)?(x):(y))
-# endif
-# ifndef MAX
-# define MAX(x,y) ((x)>(y)?(x):(y))
-# endif
-
-static double	a[N+OFFSET],
-		b[N+OFFSET],
-		c[N+OFFSET];
-
-static double	avgtime[4] = {0}, maxtime[4] = {0},
-		mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
-
-static char	*label[4] = {"Copy:      ", "Scale:     ",
-    "Add:       ", "Triad:     "};
-
-static double	bytes[4] = {
-    2 * sizeof(double) * N,
-    2 * sizeof(double) * N,
-    3 * sizeof(double) * N,
-    3 * sizeof(double) * N
-    };
-
-static int
-getProcessorID(cpu_set_t* cpu_set)
-{
-    int processorId;
-
-    for (processorId=0;processorId<128;processorId++)
-    {
-        if (CPU_ISSET(processorId,cpu_set))
-        {
-            break;
-        }
-    }
-    return processorId;
-}
-
-int  threadGetProcessorId()
-{
-    cpu_set_t  cpu_set;
-    CPU_ZERO(&cpu_set);
-    sched_getaffinity(gettid(),sizeof(cpu_set_t), &cpu_set);
-
-    return getProcessorID(&cpu_set);
-}
-
-extern double mysecond();
-extern void checkSTREAMresults();
-#ifdef _OPENMP
-extern int omp_get_num_threads();
-#endif
-int
-main()
-    {
-    int			quantum, checktick();
-    int			BytesPerWord;
-    register int	j, k;
-    double		scalar, t, times[4][NTIMES];
-
-    /* --- SETUP --- determine precision and check timing --- */
-
-    printf(HLINE);
-    printf("STREAM version $Revision: 5.8 $\n");
-    printf(HLINE);
-    BytesPerWord = sizeof(double);
-    printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
-	BytesPerWord);
-
-    printf(HLINE);
-    printf("Array size = %d, Offset = %d\n" , N, OFFSET);
-    printf("Total memory required = %.1f MB.\n",
-	(3.0 * BytesPerWord) * ( (double) N / 1048576.0));
-    printf("Each test is run %d times, but only\n", NTIMES);
-    printf("the *best* time for each is used.\n");
-
-#ifdef LIKWID_PERFMON
-    printf("Using likwid\n");
-#endif
-
-    LIKWID_MARKER_INIT;
-
-#ifdef _OPENMP
-    printf(HLINE);
-#pragma omp parallel
-    {
-	LIKWID_MARKER_THREADINIT;
-#pragma omp master
-	{
-	    k = omp_get_num_threads();
-	    printf ("Number of Threads requested = %i\n",k);
-    }
-
-    printf ("Thread %d running on processor %d ....\n",omp_get_thread_num(),threadGetProcessorId());
-    }
-#endif
-
-    LIKWID_MARKER_START("init");
-    /* Get initial value for system clock. */
-//#pragma omp parallel for
-    for (j=0; j<N; j++) {
-	a[j] = 1.0;
-	b[j] = 2.0;
-	c[j] = 0.0;
-	}
-    LIKWID_MARKER_STOP("init");
-
-    printf(HLINE);
-
-    if  ( (quantum = checktick()) >= 1) 
-	printf("Your clock granularity/precision appears to be "
-	    "%d microseconds.\n", quantum);
-    else {
-	printf("Your clock granularity appears to be "
-	    "less than one microsecond.\n");
-	quantum = 1;
-    }
-
-    t = mysecond();
-#pragma omp parallel for
-    for (j = 0; j < N; j++)
-	a[j] = 2.0E0 * a[j];
-    t = 1.0E6 * (mysecond() - t);
-
-    printf("Each test below will take on the order"
-	" of %d microseconds.\n", (int) t  );
-    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
-    printf("Increase the size of the arrays if this shows that\n");
-    printf("you are not getting at least 20 clock ticks per test.\n");
-
-    printf(HLINE);
-
-    printf("WARNING -- The above is only a rough guideline.\n");
-    printf("For best results, please be sure you know the\n");
-    printf("precision of your system timer.\n");
-    printf(HLINE);
-
-    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
-
-    scalar = 3.0;
-    for (k=0; k<NTIMES; k++)
-    {
-        times[0][k] = mysecond();
-#pragma omp parallel
-	{
-        LIKWID_MARKER_START("copy");
-#pragma omp for
-        for (j=0; j<N; j++)
-            c[j] = a[j];
-        LIKWID_MARKER_STOP("copy");
-	}
-        times[0][k] = mysecond() - times[0][k];
-
-        times[1][k] = mysecond();
-#pragma omp parallel
-	{
-        LIKWID_MARKER_START("scale");
-#pragma omp for
-        for (j=0; j<N; j++)
-            b[j] = scalar*c[j];
-        LIKWID_MARKER_STOP("scale");
-	}
-        times[1][k] = mysecond() - times[1][k];
-
-        times[2][k] = mysecond();
-#pragma omp parallel
-	{
-        LIKWID_MARKER_START("add");
-#pragma omp for
-        for (j=0; j<N; j++)
-            c[j] = a[j]+b[j];
-        LIKWID_MARKER_STOP("add");
-	}
-        times[2][k] = mysecond() - times[2][k];
-
-        times[3][k] = mysecond();
-#pragma omp parallel
-	{
-        LIKWID_MARKER_START("triad");
-#pragma omp for
-        for (j=0; j<N; j++)
-            a[j] = b[j]+scalar*c[j];
-        LIKWID_MARKER_STOP("triad");
-	}
-        times[3][k] = mysecond() - times[3][k];
-    }
-
-    /*	--- SUMMARY --- */
-
-    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
-	{
-	for (j=0; j<4; j++)
-	    {
-	    avgtime[j] = avgtime[j] + times[j][k];
-	    mintime[j] = MIN(mintime[j], times[j][k]);
-	    maxtime[j] = MAX(maxtime[j], times[j][k]);
-	    }
-	}
-
-    printf("Function      Rate (MB/s)   Avg time     Min time     Max time\n");
-    for (j=0; j<4; j++) {
-	avgtime[j] = avgtime[j]/(double)(NTIMES-1);
-
-	printf("%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
-	       1.0E-06 * bytes[j]/mintime[j],
-	       avgtime[j],
-	       mintime[j],
-	       maxtime[j]);
-    }
-    printf(HLINE);
-
-    /* --- Check Results --- */
-    checkSTREAMresults();
-    printf(HLINE);
-
-    LIKWID_MARKER_CLOSE;
-    return 0;
-}
-
-# define	M	20
-
-int
-checktick()
-    {
-    int		i, minDelta, Delta;
-    double	t1, t2, timesfound[M];
-
-/*  Collect a sequence of M unique time values from the system. */
-
-    for (i = 0; i < M; i++) {
-	t1 = mysecond();
-	while( ((t2=mysecond()) - t1) < 1.0E-6 )
-	    ;
-	timesfound[i] = t1 = t2;
-	}
-
-/*
- * Determine the minimum difference between these M values.
- * This result will be our estimate (in microseconds) for the
- * clock granularity.
- */
-
-    minDelta = 1000000;
-    for (i = 1; i < M; i++) {
-	Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
-	minDelta = MIN(minDelta, MAX(Delta,0));
-	}
-
-   return(minDelta);
-    }
-
-
-
-/* A gettimeofday routine to give access to the wall
-   clock timer on most UNIX-like systems.  */
-
-#include <sys/time.h>
-
-double mysecond()
-{
-        struct timeval tp;
-        struct timezone tzp;
-        int i;
-
-        i = gettimeofday(&tp,&tzp);
-        return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
-}
-
-void checkSTREAMresults ()
-{
-	double aj,bj,cj,scalar;
-	double asum,bsum,csum;
-	double epsilon;
-	int	j,k;
-
-    /* reproduce initialization */
-	aj = 1.0;
-	bj = 2.0;
-	cj = 0.0;
-    /* a[] is modified during timing check */
-	aj = 2.0E0 * aj;
-    /* now execute timing loop */
-	scalar = 3.0;
-	for (k=0; k<NTIMES; k++)
-        {
-            cj = aj;
-            bj = scalar*cj;
-            cj = aj+bj;
-            aj = bj+scalar*cj;
-        }
-	aj = aj * (double) (N);
-	bj = bj * (double) (N);
-	cj = cj * (double) (N);
-
-	asum = 0.0;
-	bsum = 0.0;
-	csum = 0.0;
-	for (j=0; j<N; j++) {
-		asum += a[j];
-		bsum += b[j];
-		csum += c[j];
-	}
-
-#ifndef abs
-#define abs(a) ((a) >= 0 ? (a) : -(a))
-#endif
-	epsilon = 1.e-8;
-
-	if (abs(aj-asum)/asum > epsilon) {
-		printf ("Failed Validation on array a[]\n");
-		printf ("        Expected  : %f \n",aj);
-		printf ("        Observed  : %f \n",asum);
-	}
-	else if (abs(bj-bsum)/bsum > epsilon) {
-		printf ("Failed Validation on array b[]\n");
-		printf ("        Expected  : %f \n",bj);
-		printf ("        Observed  : %f \n",bsum);
-	}
-	else if (abs(cj-csum)/csum > epsilon) {
-		printf ("Failed Validation on array c[]\n");
-		printf ("        Expected  : %f \n",cj);
-		printf ("        Observed  : %f \n",csum);
-	}
-	else {
-		printf ("Solution Validates\n");
-	}
-}
-
diff --git a/test/test-likwidAPI.c b/test/test-likwidAPI.c
new file mode 100644
index 0000000..7a2001f
--- /dev/null
+++ b/test/test-likwidAPI.c
@@ -0,0 +1,2099 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <math.h>
+
+#include <likwid.h>
+//#include <configuration.h>
+//#include <access.h>
+//#include <types.h>
+//#include <perfmon.h>
+
+typedef struct {
+    char* testname;
+    int(*testfunc)(void);
+    int result;
+} test;
+
+static int verbose = 0;
+
+static char eventset_ok[] = "INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1,CPU_CLK_UNHALTED_REF:FIXC2";
+static char event1_ok[] = "INSTR_RETIRED_ANY";
+static char event2_ok[] = "CPU_CLK_UNHALTED_CORE";
+static char event3_ok[] = "CPU_CLK_UNHALTED_REF";
+static char ctr1_ok[] = "FIXC0";
+static char ctr2_ok[] = "FIXC1";
+static char ctr3_ok[] = "FIXC2";
+static char eventset_option[] = "INSTR_RETIRED_ANY:FIXC0:ANYTHREAD,CPU_CLK_UNHALTED_CORE:FIXC1:ANYTHREAD,CPU_CLK_UNHALTED_REF:FIXC2:ANYTHREAD";
+static int isIntel = 0;
+static char perfgroup_ok[] = "BRANCH";
+static char perfgroup_fail[] = "BRAN";
+
+
+
+
+
+int test_initconfig()
+{
+    int ret;
+    ret = init_configuration();
+    if (ret != 0)
+        goto fail;
+    Configuration_t config = get_configuration();
+    if (config == NULL)
+        goto fail;
+    if ((config->daemonMode != ACCESSMODE_DIRECT) && (config->daemonMode != ACCESSMODE_DAEMON))
+        goto fail;
+    if ((config->daemonMode == ACCESSMODE_DAEMON) && (config->daemonPath == NULL))
+        goto fail;
+    destroy_configuration();
+    return 1;
+fail:
+    destroy_configuration();
+    return 0;
+}
+
+int enable_configuration()
+{
+    init_configuration();
+    return 1;
+}
+
+int disable_configuration()
+{
+    destroy_configuration();
+    return 1;
+}
+
+int test_hpmmode()
+{
+    Configuration_t config;
+    config = get_configuration();
+    int def = config->daemonMode;
+    HPMmode(ACCESSMODE_DIRECT);
+    if (config->daemonMode != ACCESSMODE_DIRECT)
+        goto fail;
+    HPMmode(ACCESSMODE_DAEMON);
+    if (config->daemonMode != ACCESSMODE_DAEMON)
+        goto fail;
+    HPMmode(def);
+    HPMmode(ACCESSMODE_DAEMON+1);
+    if (config->daemonMode != def)
+        goto fail;
+    return 1;
+fail:
+    return 0;
+}
+
+int test_hpminit()
+{
+    int ret = HPMinit();
+    if (ret != 0)
+        return 0;
+    HPMfinalize();
+    return 1;
+}
+
+/* Checks that HPMaddThread(0) returns 0 after HPMinit(); returns 1 on success, 0 on failure. */
+int test_hpmaddthread()
+{
+    HPMinit();
+    int ret = HPMaddThread(0);
+    /* Finalize before judging ret so the failure path no longer skips HPMfinalize(). */
+    HPMfinalize();
+    return (ret == 0) ? 1 : 0;
+}
+
+int enable_hpm()
+{
+    HPMinit();
+    HPMaddThread(0);
+    return 1;
+}
+
+int disable_hpm()
+{
+    HPMfinalize();
+    return 1;
+}
+
+int test_topologyinit()
+{
+    int i, j;
+    int ret = topology_init();
+    if (ret != 0)
+        goto fail;
+    CpuInfo_t cpuinfo = get_cpuInfo();
+    if (cpuinfo == NULL)
+        goto fail;
+    if (cpuinfo->family == 0)
+        goto fail;
+    if (cpuinfo->model == 0)
+        goto fail;
+    if (cpuinfo->osname == NULL)
+        goto fail;
+    if (cpuinfo->name == NULL)
+        goto fail;
+    if (cpuinfo->features == NULL)
+        goto fail;
+    CpuTopology_t cputopo = get_cpuTopology();
+    if (cputopo->threadPool == NULL)
+        goto fail;
+    if (cputopo->cacheLevels == NULL)
+        goto fail;
+    if (cputopo->numHWThreads == 0)
+        goto fail;
+    if (cputopo->activeHWThreads == 0)
+        goto fail;
+    if (cputopo->numSockets == 0)
+        goto fail;
+    if (cputopo->numCoresPerSocket < 1)
+        goto fail;
+    if (cputopo->numThreadsPerCore < 1)
+        goto fail;
+    if (cputopo->numHWThreads > 0)
+    {
+        for (i = 0; i < cputopo->numHWThreads; i++)
+        {
+            for (j=0;j< cputopo->numHWThreads; j++)
+            {
+                if ((i != j) && (cputopo->threadPool[i].apicId == cputopo->threadPool[j].apicId))
+                    goto fail;
+            }
+            if (cputopo->threadPool[i].threadId >= cputopo->numThreadsPerCore)
+            {
+                goto fail;
+            }
+            if (cputopo->threadPool[i].packageId >= cputopo->numSockets)
+            {
+                goto fail;
+            }
+        }
+    }
+    if (cputopo->numCacheLevels > 0)
+    {
+        for (i=0;i<cputopo->numCacheLevels;i++)
+        {
+            if (cputopo->cacheLevels[i].level > cputopo->numCacheLevels)
+            {
+                goto fail;
+            }
+
+        }
+    }
+    isIntel = cpuinfo->isIntel;
+    topology_finalize();
+    return 1;
+fail:
+    topology_finalize();
+    return 0;
+}
+
+int enable_topology()
+{
+    topology_init();
+    return 1;
+}
+
+int disable_topology()
+{
+    topology_finalize();
+    return 1;
+}
+
+int test_numainit()
+{
+    int i = 0;
+    topology_init();
+    numa_init();
+    NumaTopology_t numainfo = get_numaTopology();
+    if (numainfo == NULL)
+        goto fail;
+    if (numainfo->numberOfNodes <= 0)
+        goto fail;
+    if (likwid_getNumberOfNodes() <= 0)
+        goto fail;
+    for (i = 0; i < likwid_getNumberOfNodes(); i++)
+    {
+        if (numainfo->nodes[i].totalMemory == 0)
+            goto fail;
+        if (numainfo->nodes[i].freeMemory == 0)
+            goto fail;
+        if (numainfo->nodes[i].numberOfProcessors == 0)
+            goto fail;
+        if (numainfo->nodes[i].numberOfDistances == 0)
+            goto fail;
+        if (numainfo->nodes[i].numberOfDistances != likwid_getNumberOfNodes())
+            goto fail;
+    }
+    numa_finalize();
+    topology_finalize();
+    return 1;
+fail:
+    numa_finalize();
+    topology_finalize();
+    return 0;
+}
+
+/* Sanity-checks the affinity domain data against the CPU topology; returns 1 on pass, 0 on failure. */
+int test_affinityinit()
+{
+    int i = 0, passed = 0;
+    topology_init();
+    CpuTopology_t cputopo = get_cpuTopology();
+    numa_init();
+    affinity_init();
+    AffinityDomains_t doms = get_affinityDomains();
+    if (doms == NULL)
+        goto cleanup;
+    if (doms->numberOfSocketDomains != cputopo->numSockets)
+        goto cleanup;
+    if (doms->numberOfNumaDomains == 0)
+        goto cleanup;
+    if (doms->numberOfProcessorsPerSocket == 0)
+        goto cleanup;
+    if (doms->numberOfAffinityDomains == 0)
+        goto cleanup;
+    if (doms->numberOfCacheDomains == 0)
+        goto cleanup;
+    if (doms->numberOfCoresPerCache == 0)
+        goto cleanup;
+    if ((doms->numberOfProcessorsPerCache == 0) ||
+        (doms->numberOfProcessorsPerCache < doms->numberOfCoresPerCache))
+        goto cleanup;
+    if (doms->domains == NULL)
+        goto cleanup;
+    for (i = 0; i < doms->numberOfAffinityDomains; i++)
+    {
+        if (doms->domains[i].numberOfProcessors == 0)
+            goto cleanup;
+        if (doms->domains[i].numberOfCores == 0)
+            goto cleanup;
+        if (doms->domains[i].numberOfProcessors < doms->domains[i].numberOfCores)
+            goto cleanup;
+        if (doms->domains[i].processorList == NULL)
+            goto cleanup;
+    }
+    passed = 1;
+cleanup:
+    /* Shared exit path, reverse of the init order; numa_finalize() pairs with numa_init() above. */
+    affinity_finalize();
+    numa_finalize();
+    topology_finalize();
+    return passed;
+}
+
+int test_cpustring_logical()
+{
+    int test[5];
+    int len = 5;
+    int ret = cpustr_to_cpulist("S0:0-3", test, len);
+    if (ret < 0)
+    {
+        if (verbose) printf("Returned %d\n", ret);
+        return 0;
+    }
+    if (ret != 4)
+    {
+        if (verbose) printf("Returned with %d not enough CPUs\n", ret);
+        return 0;
+    }
+    return 1;
+}
+
+int test_cpustring_physical()
+{
+    int test[5];
+    int len = 5;
+    int ret = cpustr_to_cpulist("0,1,2,3", test, len);
+    if (ret < 0)
+    {
+        if (verbose) printf("Returned %d\n", ret);
+        return 0;
+    }
+    if (ret != 4)
+    {
+        if (verbose) printf("Returned with %d not enough CPUs\n", ret);
+        return 0;
+    }
+    return 1;
+}
+
+int test_cpustring_expression()
+{
+    int test[5];
+    int len = 5;
+    int ret = cpustr_to_cpulist("E:S0:4:1:2", test, len);
+    if (ret < 0)
+    {
+        if (verbose) printf("Returned %d\n", ret);
+        return 0;
+    }
+    if (ret != 4)
+    {
+        if (verbose) printf("Returned with %d not enough CPUs\n", ret);
+        return 0;
+    }
+    return 1;
+}
+
+int test_cpustring_scatter()
+{
+    int test[100];
+    int len = 100;
+    int ret = cpustr_to_cpulist("S:scatter", test, len);
+    if (ret < 0)
+    {
+        if (verbose) printf("Returned %d\n", ret);
+        return 0;
+    }
+    CpuTopology_t cputopo = get_cpuTopology();
+    if (ret != cputopo->numHWThreads)
+    {
+        if (verbose) printf("Returned with %d not enough CPUs (%d)\n", ret, cputopo->numHWThreads);
+        return 0;
+    }
+    return 1;
+}
+
+int test_cpustring_combined()
+{
+    int test[100];
+    int len = 100;
+    int ret = cpustr_to_cpulist("N:0-3 at S0:0-3", test, len);
+    if (ret < 0)
+    {
+        if (verbose) printf("Returned %d\n", ret);
+        return 0;
+    }
+    if (ret != 8)
+    {
+        if (verbose) printf("Returned with %d not enough CPUs\n", ret);
+        return 0;
+    }
+    return 1;
+}
+
+int test_perfmoninit_faulty()
+{
+    int cpu = 0;
+    int ret = perfmon_init(1, &cpu);
+    if (ret != 0)
+        goto fail;
+    perfmon_finalize();
+    return 0;
+fail:
+    perfmon_finalize();
+    return 1;
+}
+
+int test_perfmoninit_valid()
+{
+    int cpu = 0;
+    topology_init();
+    affinity_init();
+    int ret = perfmon_init(1, &cpu);
+    if (ret != 0)
+        goto fail;
+    if (perfmon_getNumberOfGroups() != 0)
+        goto fail;
+    if (perfmon_getNumberOfThreads() != 1)
+        goto fail;
+    perfmon_finalize();
+    affinity_finalize();
+    topology_finalize();
+    return 1;
+fail:
+    perfmon_finalize();
+    affinity_finalize();
+    topology_finalize();
+    return 0;
+}
+
+int test_perfmoninit()
+{
+    int cpu = 0;
+    int i;
+    topology_init();
+    affinity_init();
+    for(i=0;i<10;i++)
+    {
+        perfmon_init(1, &cpu);
+        perfmon_finalize();
+    }
+    affinity_finalize();
+    topology_finalize();
+    return 1;
+}
+
+int test_perfmonfinalize()
+{
+    perfmon_finalize();
+    return 1;
+}
+
+int test_perfmonaddeventset()
+{
+    char eventset_fail1[] = "INSTR_RETIRED.ANY:FIXC0";
+    char eventset_fail2[] = "INSTR_RETIRED-ANY:FIXC0";
+    CpuInfo_t cpuinfo;
+    int cpu = 0;
+    topology_init();
+    cpuinfo = get_cpuInfo();
+    if (cpuinfo->isIntel == 0)
+    {
+        topology_finalize();
+        return 1;
+    }
+    int ret = perfmon_init(1, &cpu);
+    if (ret != 0) {
+        if (verbose > 0) printf("Perfmon init failed\n");
+        goto fail;
+    }
+    if (perfmon_getNumberOfGroups() != 0) {
+        if (verbose > 0) printf("Perfmon number of groups != 0\n");
+        goto fail;
+    }
+    if (perfmon_getNumberOfThreads() != 1) {
+        if (verbose > 0) printf("Perfmon number of threads != 1\n");
+        goto fail;
+    }
+    if (perfmon_getIdOfActiveGroup() != -1) {
+        if (verbose > 0) printf("Perfmon id of active group != -1\n");
+        goto fail;
+    }
+    ret = perfmon_addEventSet(eventset_ok);
+    if (ret != 0) {
+        if (verbose > 0) printf("Perfmon addEventSet(ok) failed\n");
+        goto fail;
+    }
+    if (perfmon_getNumberOfGroups() != 1) {
+        if (verbose > 0) printf("Perfmon number of groups != 1\n");
+        goto fail;
+    }
+    if (perfmon_getNumberOfEvents(ret) != 3) {
+        if (verbose > 0) printf("Perfmon number of events != 3\n");
+        goto fail;
+    }
+    if (perfmon_getIdOfActiveGroup() != -1) {
+        if (verbose > 0) printf("Perfmon id of active group != -1\n");
+        goto fail;
+    }
+    ret = perfmon_addEventSet(eventset_option);
+    if (ret != 1) {
+        if (verbose > 0) printf("Perfmon addEventSet(options) failed\n");
+        goto fail;
+    }
+    if (perfmon_getNumberOfGroups() != 2) {
+        if (verbose > 0) printf("Perfmon number of groups != 2\n");
+        goto fail;
+    }
+    if (perfmon_getNumberOfEvents(ret) != 3) {
+        if (verbose > 0) printf("Perfmon number of events != 3\n");
+        goto fail;
+    }
+    if (perfmon_getIdOfActiveGroup() != -1) {
+        if (verbose > 0) printf("Perfmon id of active group != -1\n");
+        goto fail;
+    }
+    ret = perfmon_addEventSet(eventset_fail1);
+    if (ret >= 0) {
+        if (verbose > 0) printf("Perfmon addEventSet(fail1) failed\n");
+        goto fail;
+    }
+    if (perfmon_getNumberOfGroups() != 2) {
+        if (verbose > 0) printf("Perfmon number of groups != 2\n");
+        goto fail;
+    }
+    ret = perfmon_addEventSet(eventset_fail2);
+    if (ret >= 0) {
+        if (verbose > 0) printf("Perfmon addEventSet(fail2) failed\n");
+        goto fail;
+    }
+    if (perfmon_getNumberOfGroups() != 2) {
+        if (verbose > 0) printf("Perfmon number of groups != 2\n");
+        goto fail;
+    }
+    if (perfmon_getIdOfActiveGroup() != -1) {
+        if (verbose > 0) printf("Perfmon id of active group != -1\n");
+        goto fail;
+    }
+    perfmon_finalize();
+    affinity_finalize();
+    topology_finalize();
+    return 1;
+fail:
+    perfmon_finalize();
+    affinity_finalize();
+    topology_finalize();
+    return 0;
+}
+
+int test_perfmonaddeventset_noinit()
+{
+    int ret = perfmon_addEventSet(eventset_ok);
+    if (ret == 0)
+        goto fail;
+    return 1;
+fail:
+    return 0;
+}
+
+/* Adds eventset_ok and checks the event/counter names and the "Custom" group
+ * metadata reported for it. Returns 1 on pass or skip, 0 on failure. */
+int test_perfmoncustomgroup()
+{
+    CpuInfo_t cpuinfo;
+    int cpu = 0;
+    topology_init();
+    cpuinfo = get_cpuInfo();
+    /* test_perfmonaddeventset() and test_perfmonsetup() only exercise
+     * eventset_ok on Intel CPUs; skip (count as passed) elsewhere as they do. */
+    if (cpuinfo->isIntel == 0)
+    {
+        topology_finalize();
+        return 1;
+    }
+    int ret = perfmon_init(1, &cpu);
+    if (ret != 0) {
+        if (verbose > 0) printf("Perfmon init failed\n");
+        goto fail;
+    }
+    ret = perfmon_addEventSet(eventset_ok);
+    if (ret != 0) {
+        if (verbose > 0) printf("Perfmon addEventSet(ok) failed\n");
+        goto fail;
+    }
+    if (perfmon_getNumberOfEvents(ret) != 3) {
+        if (verbose > 0) printf("Perfmon number of events != 3\n");
+        goto fail;
+    }
+    if (perfmon_getNumberOfMetrics(ret) != 0) {
+        if (verbose > 0) printf("Perfmon number of metrics != 0\n");
+        goto fail;
+    }
+    /* Each slot must report the event name and counter name it was added with. */
+    if ((strcmp(perfmon_getEventName(ret, 0), event1_ok) != 0) ||
+        (strcmp(perfmon_getCounterName(ret, 0), ctr1_ok) != 0))
+    {
+        goto fail;
+    }
+    if ((strcmp(perfmon_getEventName(ret, 1), event2_ok) != 0) ||
+        (strcmp(perfmon_getCounterName(ret, 1), ctr2_ok) != 0))
+    {
+        goto fail;
+    }
+    if ((strcmp(perfmon_getEventName(ret, 2), event3_ok) != 0) ||
+        (strcmp(perfmon_getCounterName(ret, 2), ctr3_ok) != 0))
+    {
+        goto fail;
+    }
+
+    if (strcmp(perfmon_getGroupName(ret), "Custom") != 0)
+    {
+        goto fail;
+    }
+    if (strcmp(perfmon_getGroupInfoShort(ret), "Custom") != 0)
+    {
+        goto fail;
+    }
+    if (strcmp(perfmon_getGroupInfoLong(ret), "Custom") != 0)
+    {
+        goto fail;
+    }
+    if (perfmon_getLastTimeOfGroup(ret) != 0)
+    {
+        goto fail;
+    }
+    perfmon_finalize();
+    affinity_finalize();
+    topology_finalize();
+    return 1;
+fail:
+    perfmon_finalize();
+    affinity_finalize();
+    topology_finalize();
+    return 0;
+}
+
+int test_perfmongetgroups()
+{
+    int i;
+    topology_init();
+    char** glist = NULL;
+    char** slist = NULL;
+    char** llist = NULL;
+    int ret = perfmon_getGroups(&glist, &slist, &llist);
+
+    if (ret <= 0)
+    {
+        goto fail;
+    }
+    for (i=0; i< ret; i++)
+    {
+        if (strcmp(glist[i], "") == 0)
+        {
+            goto fail;
+        }
+        if (strcmp(slist[i], "") == 0)
+        {
+            goto fail;
+        }
+        if (strcmp(llist[i], "") == 0)
+        {
+            goto fail;
+        }
+    }
+    perfmon_returnGroups(ret, glist, slist, llist);
+    topology_finalize();
+    return 1;
+fail:
+    perfmon_returnGroups(ret, glist, slist, llist);
+    topology_finalize();
+    return 0;
+}
+
+/* Adds perfgroup as an event set, checks its metadata, counts across sleep(1) and checks the metrics; 1 = pass, 0 = fail. */
+int _test_perfmonperfgroup(char* perfgroup)
+{
+    CpuInfo_t cpuinfo;
+    int i;
+    int cpu = 0;
+    topology_init();
+    cpuinfo = get_cpuInfo();
+    int ret = perfmon_init(1, &cpu);
+    if (ret != 0) {
+        if (verbose > 0) printf("Perfmon init failed\n");
+        goto fail;
+    }
+    ret = perfmon_addEventSet(perfgroup);
+    if (ret != 0) {
+        if (verbose > 0) printf("Perfmon addEventSet(%s) failed\n", perfgroup);
+        goto fail;
+    }
+    if (perfmon_getNumberOfEvents(ret) == 0) {
+        if (verbose > 0) printf("Perfmon number of events == 0\n");
+        goto fail;
+    }
+    if (perfmon_getNumberOfMetrics(ret) == 0) {
+        if (verbose > 0) printf("Perfmon number of metrics == 0\n");
+        goto fail;
+    }
+    for (i=0; i<perfmon_getNumberOfEvents(ret); i++) {
+        if (strcmp(perfmon_getEventName(ret, i), "") == 0)
+        {
+            if (verbose > 0) printf("Perfmon event name zero\n");
+            goto fail;
+        }
+        if (strcmp(perfmon_getCounterName(ret, i), "") == 0)
+        {
+            if (verbose > 0) printf("Perfmon counter name zero\n");
+            goto fail;
+        }
+    }
+    if (strcmp(perfmon_getGroupName(ret), "Custom") == 0)
+    {
+        if (verbose > 0) printf("Perfmon groupName %s == %s\n", perfgroup, perfmon_getGroupName(ret));
+        goto fail;
+    }
+    if (strcmp(perfmon_getGroupInfoShort(ret), "Custom") == 0)
+    {
+        if (verbose > 0) printf("Perfmon shortInfo %s == %s\n", perfgroup, perfmon_getGroupInfoShort(ret));
+        goto fail;
+    }
+    if (strcmp(perfmon_getGroupInfoLong(ret), "Custom") == 0)
+    {
+        if (verbose > 0) printf("Perfmon longInfo %s == %s\n", perfgroup, perfmon_getGroupInfoLong(ret));
+        goto fail;
+    }
+    if (perfmon_getLastTimeOfGroup(ret) != 0)
+    {
+        if (verbose > 0) printf("Perfmon last time of %s: %f\n", perfgroup, perfmon_getLastTimeOfGroup(ret));
+        goto fail;
+    }
+    if (perfmon_getTimeOfGroup(ret) != 0)
+    {
+        if (verbose > 0) printf("Perfmon time of %s: %f\n", perfgroup, perfmon_getTimeOfGroup(ret));
+        goto fail;
+    }
+    perfmon_setupCounters(ret);
+    perfmon_startCounters();
+    sleep(1);
+    perfmon_stopCounters();
+    for (i=0; i<perfmon_getNumberOfMetrics(ret); i++) {
+        if (strcmp(perfmon_getMetricName(ret, i), "") == 0)
+        {
+            if (verbose > 0) printf("Perfmon metric name zero\n");
+            goto fail;
+        }
+        double res = perfmon_getMetric(ret, i, 0);
+        if (res < 0)
+        {
+            if (verbose > 0) printf("Perfmon metric %s result %f\n", perfmon_getMetricName(ret, i), res );
+            goto fail;
+        }
+        double lastres = perfmon_getLastMetric(ret, i, 0);
+        /* NOTE(review): ret is the group ID (0 after the addEventSet check), so (ret >= 0) is always true; confirm intent. */
+        if ((ret >= 0) && (lastres >= 0) && (res != lastres))
+        {
+            if (verbose > 0) printf("Perfmon metric %s result %f not equal to last %f\n", perfmon_getMetricName(ret, i), res, lastres);
+            goto fail;
+        }
+    }
+    perfmon_finalize();
+    affinity_finalize();
+    topology_finalize();
+    return 1;
+fail:
+    perfmon_finalize();
+    affinity_finalize();
+    topology_finalize();
+    return 0;
+}
+
+int test_perfmonperfgroup_ok()
+{
+    return _test_perfmonperfgroup(perfgroup_ok);
+}
+
+int test_perfmonperfgroup_fail()
+{
+    return !_test_perfmonperfgroup(perfgroup_fail);
+}
+
+/*
+ * Test setting up an event set.
+ * Adds eventset_ok and eventset_option one after the other and checks group
+ * ids, group/thread/event counts and the active group id after each step.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonsetup()
+{
+    int cpu = 0;
+    int passed = 0;
+    int first, second;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    do
+    {
+        if (perfmon_init(1, &cpu) != 0) break;
+        /* Freshly initialized: no groups yet, exactly one thread. */
+        if (perfmon_getNumberOfGroups() != 0) break;
+        if (perfmon_getNumberOfThreads() != 1) break;
+        first = perfmon_addEventSet(eventset_ok);
+        if (first != 0) break;
+        if (perfmon_getNumberOfGroups() != 1) break;
+        if (perfmon_getNumberOfEvents(first) != 3) break;
+        if (perfmon_setupCounters(first) != 0) break;
+        if (perfmon_getIdOfActiveGroup() != first) break;
+        second = perfmon_addEventSet(eventset_option);
+        if (second != 1) break;
+        /* Adding a second set must leave the first one active. */
+        if (perfmon_getIdOfActiveGroup() != first) break;
+        if (perfmon_getNumberOfGroups() != 2) break;
+        if (perfmon_getNumberOfEvents(first) != 3) break;
+        if (perfmon_getNumberOfEvents(second) != 3) break;
+        passed = 1;
+    } while (0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test switching between event sets.
+ * Adds eventset_ok and eventset_option, sets up the first and then calls
+ * perfmon_switchActiveGroup() for the second; only the resulting active
+ * group id is verified. Returns 1 (pass) or 0 (fail); returns 1 right away
+ * when cpuinfo->isIntel is 0.
+ */
+int test_perfmonswitch()
+{
+    int cpu = 0;
+    int passed = 0;
+    int first, second;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    do
+    {
+        if (perfmon_init(1, &cpu) != 0) break;
+        first = perfmon_addEventSet(eventset_ok);
+        if (first != 0) break;
+        second = perfmon_addEventSet(eventset_option);
+        if (second != 1) break;
+        if (perfmon_setupCounters(first) != 0) break;
+        if (perfmon_getIdOfActiveGroup() != first) break;
+        /* The switch call's own return value is not part of the verdict. */
+        (void)perfmon_switchActiveGroup(second);
+        if (perfmon_getIdOfActiveGroup() != second) break;
+        passed = 1;
+    } while (0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test starting an event set.
+ * Initializes perfmon with one entry (cpu = 0), adds eventset_ok, sets it up
+ * and expects perfmon_startCounters() to return 0.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonstart()
+{
+    CpuInfo_t cpuinfo;
+    int group1;
+    int cpu = 0;
+    topology_init();
+    cpuinfo = get_cpuInfo();
+    if (cpuinfo->isIntel == 0)
+    {
+        topology_finalize();
+        return 1;
+    }
+    int ret = perfmon_init(1, &cpu);
+    if (ret != 0)
+        goto fail;
+    ret = perfmon_addEventSet(eventset_ok);
+    if (ret != 0)
+        goto fail;
+    group1 = ret;
+    ret = perfmon_setupCounters(group1);
+    if (ret != 0)
+        goto fail;
+    if (perfmon_getIdOfActiveGroup() != group1)
+        goto fail;
+    ret = perfmon_startCounters();
+    if (ret != 0)
+        goto fail;
+    perfmon_finalize();
+    topology_finalize();
+    return 1;
+fail:
+    /* Same cleanup as the success path, different verdict. */
+    perfmon_finalize();
+    topology_finalize();
+    return 0;
+}
+
+/*
+ * Test setting up an event set without initialization: calls
+ * perfmon_setupCounters(0) with no prior perfmon_init() in this function
+ * and passes (1) only if the call returns non-zero.
+ */
+int test_perfmonsetup_noinit()
+{
+    return (perfmon_setupCounters(0) != 0) ? 1 : 0;
+}
+
+/*
+ * Test setting up an event set without adding one: after a successful
+ * perfmon_init(), perfmon_setupCounters(0) is expected to return non-zero.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonsetup_noadd()
+{
+    int cpu = 0;
+    int passed;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    /* Short-circuit: setup is only attempted when init succeeded. */
+    passed = (perfmon_init(1, &cpu) == 0)
+          && (perfmon_setupCounters(0) != 0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test starting an event set without initialization: calls
+ * perfmon_startCounters() with no prior perfmon_init() in this function
+ * and passes (1) only if the call returns non-zero.
+ */
+int test_perfmonstart_noinit()
+{
+    return (perfmon_startCounters() != 0) ? 1 : 0;
+}
+
+/*
+ * Test starting an event set without adding one: after a successful
+ * perfmon_init(), perfmon_startCounters() is expected to return non-zero.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonstart_noadd()
+{
+    int cpu = 0;
+    int passed;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    /* Short-circuit: start is only attempted when init succeeded. */
+    passed = (perfmon_init(1, &cpu) == 0)
+          && (perfmon_startCounters() != 0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test stopping an event set: init, add eventset_ok, setup, start and stop
+ * must each return 0 (the added set must get group id 0).
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonstop()
+{
+    int cpu = 0;
+    int group;
+    int passed;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    /* Each step runs only if all previous steps succeeded. */
+    passed = (perfmon_init(1, &cpu) == 0)
+          && ((group = perfmon_addEventSet(eventset_ok)) == 0)
+          && (perfmon_setupCounters(group) == 0)
+          && (perfmon_startCounters() == 0)
+          && (perfmon_stopCounters() == 0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test stopping an event set without initialization: calls
+ * perfmon_stopCounters() with no prior perfmon_init() in this function
+ * and passes (1) only if the call returns non-zero.
+ */
+int test_perfmonstop_noinit()
+{
+    return (perfmon_stopCounters() != 0) ? 1 : 0;
+}
+
+/*
+ * Test stopping an event set without adding one: after a successful
+ * perfmon_init(), perfmon_stopCounters() is expected to return non-zero.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonstop_noadd()
+{
+    CpuInfo_t cpuinfo;
+    int cpu = 0;
+    topology_init();
+    cpuinfo = get_cpuInfo();
+    if (cpuinfo->isIntel == 0)
+    {
+        topology_finalize();
+        return 1;
+    }
+    int ret = perfmon_init(1, &cpu);
+    if (ret != 0)
+        goto fail;
+    ret = perfmon_stopCounters();
+    /* A zero return here means stop was accepted, which is the failure case. */
+    if (ret == 0)
+        goto fail;
+    perfmon_finalize();
+    topology_finalize();
+    return 1;
+fail:
+    perfmon_finalize();
+    topology_finalize();
+    return 0;
+}
+
+/*
+ * Test stopping an event set without setting one up: after init and adding
+ * eventset_ok (expected group id 0), perfmon_stopCounters() is expected to
+ * return non-zero.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonstop_nosetup()
+{
+    CpuInfo_t cpuinfo;
+    int cpu = 0;
+    topology_init();
+    cpuinfo = get_cpuInfo();
+    if (cpuinfo->isIntel == 0)
+    {
+        topology_finalize();
+        return 1;
+    }
+    int ret = perfmon_init(1, &cpu);
+    if (ret != 0)
+        goto fail;
+    ret = perfmon_addEventSet(eventset_ok);
+    if (ret != 0)
+        goto fail;
+    ret = perfmon_stopCounters();
+    /* A zero return here means stop was accepted, which is the failure case. */
+    if (ret == 0)
+        goto fail;
+    perfmon_finalize();
+    topology_finalize();
+    return 1;
+fail:
+    perfmon_finalize();
+    topology_finalize();
+    return 0;
+}
+
+/*
+ * Test stopping an event set without starting one: after init, add and
+ * setup of eventset_ok, perfmon_stopCounters() is expected to return
+ * non-zero.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonstop_nostart()
+{
+    int cpu = 0;
+    int group;
+    int passed;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    /* Each step runs only if all previous steps met their expectation. */
+    passed = (perfmon_init(1, &cpu) == 0)
+          && ((group = perfmon_addEventSet(eventset_ok)) == 0)
+          && (perfmon_setupCounters(group) == 0)
+          && (perfmon_stopCounters() != 0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test perfmon result without initialization: perfmon_getResult(0,0,0) is
+ * expected to yield 0 when perfmon_init() was not called in this function.
+ * Returns 1 (pass) or 0 (fail).
+ */
+int test_perfmonresult_noinit()
+{
+    return (perfmon_getResult(0,0,0) != 0) ? 0 : 1;
+}
+
+/*
+ * Test perfmon result without adding an event set: after a successful
+ * perfmon_init(), perfmon_getResult(0,0,0) is expected to yield 0.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonresult_noadd()
+{
+    CpuInfo_t cpuinfo;
+    int cpu = 0;
+    topology_init();
+    cpuinfo = get_cpuInfo();
+    if (cpuinfo->isIntel == 0)
+    {
+        topology_finalize();
+        return 1;
+    }
+    int ret = perfmon_init(1, &cpu);
+    if (ret != 0)
+        goto fail;
+    double result = perfmon_getResult(0,0,0);
+    if (result != 0)
+        goto fail;
+    perfmon_finalize();
+    topology_finalize();
+    return 1;
+fail:
+    perfmon_finalize();
+    topology_finalize();
+    return 0;
+}
+
+/*
+ * Test perfmon result without setting up the event set: after init and
+ * adding eventset_ok, perfmon_getResult(group,0,0) is expected to yield 0.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonresult_nosetup()
+{
+    int cpu = 0;
+    int group;
+    int passed;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    /* Each step runs only if all previous steps met their expectation. */
+    passed = (perfmon_init(1, &cpu) == 0)
+          && ((group = perfmon_addEventSet(eventset_ok)) == 0)
+          && (perfmon_getResult(group,0,0) == 0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test perfmon result without starting: after init, add and setup of
+ * eventset_ok, perfmon_getResult(group,0,0) is expected to yield 0.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonresult_nostart()
+{
+    int cpu = 0;
+    int group;
+    int passed;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    /* Each step runs only if all previous steps met their expectation. */
+    passed = (perfmon_init(1, &cpu) == 0)
+          && ((group = perfmon_addEventSet(eventset_ok)) == 0)
+          && (perfmon_setupCounters(group) == 0)
+          && (perfmon_getResult(group,0,0) == 0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test perfmon result without stopping: after init, add, setup and start
+ * of eventset_ok, perfmon_getResult(group,0,0) is expected to yield 0.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonresult_nostop()
+{
+    int cpu = 0;
+    int group;
+    int passed;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    /* Each step runs only if all previous steps met their expectation. */
+    passed = (perfmon_init(1, &cpu) == 0)
+          && ((group = perfmon_addEventSet(eventset_ok)) == 0)
+          && (perfmon_setupCounters(group) == 0)
+          && (perfmon_startCounters() == 0)
+          && (perfmon_getResult(group,0,0) == 0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test perfmon result: measures eventset_ok across a sleep(1) and expects
+ * non-zero results for events 0 and 1 (thread index 0) and a non-zero
+ * group time.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonresult()
+{
+    int cpu = 0;
+    int group;
+    int passed = 0;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    do
+    {
+        if (perfmon_init(1, &cpu) != 0) break;
+        group = perfmon_addEventSet(eventset_ok);
+        if (group != 0) break;
+        if (perfmon_setupCounters(group) != 0) break;
+        if (perfmon_startCounters() != 0) break;
+        sleep(1);
+        if (perfmon_stopCounters() != 0) break;
+        if (perfmon_getResult(group,0,0) == 0) break;
+        if (perfmon_getResult(group,1,0) == 0) break;
+        if (perfmon_getTimeOfGroup(group) == 0) break;
+        passed = 1;
+    } while (0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test perfmon last result without initialization:
+ * perfmon_getLastResult(0,0,0) is expected to yield 0 when perfmon_init()
+ * was not called in this function. Returns 1 (pass) or 0 (fail).
+ */
+int test_perfmonlastresult_noinit()
+{
+    return (perfmon_getLastResult(0,0,0) != 0) ? 0 : 1;
+}
+
+/*
+ * Test perfmon last result without adding an event set: after a successful
+ * perfmon_init(), perfmon_getLastResult(0,0,0) is expected to yield 0.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonlastresult_noadd()
+{
+    CpuInfo_t cpuinfo;
+    int cpu = 0;
+    topology_init();
+    cpuinfo = get_cpuInfo();
+    if (cpuinfo->isIntel == 0)
+    {
+        topology_finalize();
+        return 1;
+    }
+    int ret = perfmon_init(1, &cpu);
+    if (ret != 0)
+        goto fail;
+    double result = perfmon_getLastResult(0,0,0);
+    if (result != 0)
+        goto fail;
+    perfmon_finalize();
+    topology_finalize();
+    return 1;
+fail:
+    perfmon_finalize();
+    topology_finalize();
+    return 0;
+}
+
+/*
+ * Test perfmon last result without setting up the event set: after init
+ * and adding eventset_ok, perfmon_getLastResult(group,0,0) is expected to
+ * yield 0.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonlastresult_nosetup()
+{
+    int cpu = 0;
+    int group;
+    int passed;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    /* Each step runs only if all previous steps met their expectation. */
+    passed = (perfmon_init(1, &cpu) == 0)
+          && ((group = perfmon_addEventSet(eventset_ok)) == 0)
+          && (perfmon_getLastResult(group,0,0) == 0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test perfmon last result without starting: after init, add and setup of
+ * eventset_ok, perfmon_getLastResult(group,0,0) is expected to yield 0.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonlastresult_nostart()
+{
+    int cpu = 0;
+    int group;
+    int passed;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    /* Each step runs only if all previous steps met their expectation. */
+    passed = (perfmon_init(1, &cpu) == 0)
+          && ((group = perfmon_addEventSet(eventset_ok)) == 0)
+          && (perfmon_setupCounters(group) == 0)
+          && (perfmon_getLastResult(group,0,0) == 0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test perfmon last result without stopping: after init, add, setup and
+ * start of eventset_ok, perfmon_getLastResult(group,0,0) is expected to
+ * yield 0.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonlastresult_nostop()
+{
+    int cpu = 0;
+    int group;
+    int passed;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    /* Each step runs only if all previous steps met their expectation. */
+    passed = (perfmon_init(1, &cpu) == 0)
+          && ((group = perfmon_addEventSet(eventset_ok)) == 0)
+          && (perfmon_setupCounters(group) == 0)
+          && (perfmon_startCounters() == 0)
+          && (perfmon_getLastResult(group,0,0) == 0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test perfmon last result: measures eventset_ok across a sleep(1) and
+ * expects non-zero last results for events 0 and 1, the last result of
+ * event 0 to equal its result, and a non-zero group time.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonlastresult()
+{
+    int cpu = 0;
+    int group;
+    int passed = 0;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    do
+    {
+        if (perfmon_init(1, &cpu) != 0) break;
+        group = perfmon_addEventSet(eventset_ok);
+        if (group != 0) break;
+        if (perfmon_setupCounters(group) != 0) break;
+        if (perfmon_startCounters() != 0) break;
+        sleep(1);
+        if (perfmon_stopCounters() != 0) break;
+        if (perfmon_getLastResult(group,0,0) == 0) break;
+        if (perfmon_getLastResult(group,1,0) == 0) break;
+        if (perfmon_getLastResult(group,0,0) != perfmon_getResult(group,0,0)) break;
+        if (perfmon_getTimeOfGroup(group) == 0) break;
+        passed = 1;
+    } while (0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test perfmon metric without initialization: perfmon_getMetric(0,0,0) is
+ * expected to yield 0 when perfmon_init() was not called in this function.
+ * Returns 1 (pass) or 0 (fail).
+ */
+int test_perfmonmetric_noinit()
+{
+    return (perfmon_getMetric(0,0,0) != 0) ? 0 : 1;
+}
+
+/*
+ * Test perfmon metric without adding an event set: after a successful
+ * perfmon_init(), perfmon_getMetric(0,0,0) is expected to yield 0.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonmetric_noadd()
+{
+    CpuInfo_t cpuinfo;
+    int cpu = 0;
+    topology_init();
+    cpuinfo = get_cpuInfo();
+    if (cpuinfo->isIntel == 0)
+    {
+        topology_finalize();
+        return 1;
+    }
+    int ret = perfmon_init(1, &cpu);
+    if (ret != 0)
+        goto fail;
+    double result = perfmon_getMetric(0,0,0);
+    if (result != 0)
+        goto fail;
+    perfmon_finalize();
+    topology_finalize();
+    return 1;
+fail:
+    perfmon_finalize();
+    topology_finalize();
+    return 0;
+}
+
+/*
+ * Test perfmon metric without setting up the event set: after init and
+ * adding eventset_ok, perfmon_getMetric(group,0,0) is expected to yield 0.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonmetric_nosetup()
+{
+    int cpu = 0;
+    int group;
+    int passed;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    /* Each step runs only if all previous steps met their expectation. */
+    passed = (perfmon_init(1, &cpu) == 0)
+          && ((group = perfmon_addEventSet(eventset_ok)) == 0)
+          && (perfmon_getMetric(group,0,0) == 0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test perfmon metric without starting: after init, add and setup of
+ * eventset_ok, perfmon_getMetric(group,0,0) is expected to yield 0.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonmetric_nostart()
+{
+    int cpu = 0;
+    int group;
+    int passed;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    /* Each step runs only if all previous steps met their expectation. */
+    passed = (perfmon_init(1, &cpu) == 0)
+          && ((group = perfmon_addEventSet(eventset_ok)) == 0)
+          && (perfmon_setupCounters(group) == 0)
+          && (perfmon_getMetric(group,0,0) == 0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test perfmon metric without stopping: after init, add, setup and start
+ * of eventset_ok, perfmon_getMetric(group,0,0) is expected to yield 0.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonmetric_nostop()
+{
+    int cpu = 0;
+    int group;
+    int passed;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    /* Each step runs only if all previous steps met their expectation. */
+    passed = (perfmon_init(1, &cpu) == 0)
+          && ((group = perfmon_addEventSet(eventset_ok)) == 0)
+          && (perfmon_setupCounters(group) == 0)
+          && (perfmon_startCounters() == 0)
+          && (perfmon_getMetric(group,0,0) == 0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test perfmon metric: measures perfgroup_ok across a sleep(1) and expects
+ * non-zero values for metrics 0 and 1 (thread index 0) and a non-zero
+ * group time.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonmetric_ok()
+{
+    int cpu = 0;
+    int group;
+    int passed = 0;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    do
+    {
+        if (perfmon_init(1, &cpu) != 0) break;
+        group = perfmon_addEventSet(perfgroup_ok);
+        if (group != 0) break;
+        if (perfmon_setupCounters(group) != 0) break;
+        if (perfmon_startCounters() != 0) break;
+        sleep(1);
+        if (perfmon_stopCounters() != 0) break;
+        if (perfmon_getMetric(group,0,0) == 0) break;
+        if (perfmon_getMetric(group,1,0) == 0) break;
+        if (perfmon_getTimeOfGroup(group) == 0) break;
+        passed = 1;
+    } while (0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test perfmon last metric without initialization:
+ * perfmon_getLastMetric(0,0,0) is expected to yield 0 when perfmon_init()
+ * was not called in this function. Returns 1 (pass) or 0 (fail).
+ */
+int test_perfmonlastmetric_noinit()
+{
+    return (perfmon_getLastMetric(0,0,0) != 0) ? 0 : 1;
+}
+
+/*
+ * Test perfmon last metric without adding an event set: after a successful
+ * perfmon_init(), perfmon_getLastMetric(0,0,0) is expected to yield 0.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonlastmetric_noadd()
+{
+    CpuInfo_t cpuinfo;
+    int cpu = 0;
+    topology_init();
+    cpuinfo = get_cpuInfo();
+    if (cpuinfo->isIntel == 0)
+    {
+        topology_finalize();
+        return 1;
+    }
+    int ret = perfmon_init(1, &cpu);
+    if (ret != 0)
+        goto fail;
+    double result = perfmon_getLastMetric(0,0,0);
+    if (result != 0)
+        goto fail;
+    perfmon_finalize();
+    topology_finalize();
+    return 1;
+fail:
+    perfmon_finalize();
+    topology_finalize();
+    return 0;
+}
+
+/*
+ * Test perfmon last metric without setting up the event set: after init
+ * and adding eventset_ok, perfmon_getLastMetric(group,0,0) is expected to
+ * yield 0.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonlastmetric_nosetup()
+{
+    int cpu = 0;
+    int group;
+    int passed;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    /* Each step runs only if all previous steps met their expectation. */
+    passed = (perfmon_init(1, &cpu) == 0)
+          && ((group = perfmon_addEventSet(eventset_ok)) == 0)
+          && (perfmon_getLastMetric(group,0,0) == 0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test perfmon last metric without starting: after init, add and setup of
+ * eventset_ok, perfmon_getLastMetric(group,0,0) is expected to yield 0.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonlastmetric_nostart()
+{
+    int cpu = 0;
+    int group;
+    int passed;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    /* Each step runs only if all previous steps met their expectation. */
+    passed = (perfmon_init(1, &cpu) == 0)
+          && ((group = perfmon_addEventSet(eventset_ok)) == 0)
+          && (perfmon_setupCounters(group) == 0)
+          && (perfmon_getLastMetric(group,0,0) == 0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test perfmon last metric without stopping: after init, add, setup and
+ * start of eventset_ok, perfmon_getLastMetric(group,0,0) is expected to
+ * yield 0.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonlastmetric_nostop()
+{
+    int cpu = 0;
+    int group;
+    int passed;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    /* Each step runs only if all previous steps met their expectation. */
+    passed = (perfmon_init(1, &cpu) == 0)
+          && ((group = perfmon_addEventSet(eventset_ok)) == 0)
+          && (perfmon_setupCounters(group) == 0)
+          && (perfmon_startCounters() == 0)
+          && (perfmon_getLastMetric(group,0,0) == 0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+/*
+ * Test perfmon last metric: measures perfgroup_ok across a sleep(1) and
+ * expects non-zero last metrics 0 and 1, each equal to the corresponding
+ * perfmon_getMetric() value, and a non-zero group time.
+ * Returns 1 (pass) or 0 (fail); returns 1 right away when
+ * cpuinfo->isIntel is 0.
+ */
+int test_perfmonlastmetric_ok()
+{
+    int cpu = 0;
+    int group;
+    int passed = 0;
+    CpuInfo_t info;
+    topology_init();
+    info = get_cpuInfo();
+    if (!info->isIntel)
+    {
+        topology_finalize();
+        return 1;
+    }
+    do
+    {
+        if (perfmon_init(1, &cpu) != 0) break;
+        group = perfmon_addEventSet(perfgroup_ok);
+        if (group != 0) break;
+        if (perfmon_setupCounters(group) != 0) break;
+        if (perfmon_startCounters() != 0) break;
+        sleep(1);
+        if (perfmon_stopCounters() != 0) break;
+        if (perfmon_getLastMetric(group,0,0) == 0) break;
+        if (perfmon_getLastMetric(group,1,0) == 0) break;
+        if (perfmon_getLastMetric(group,0,0) != perfmon_getMetric(group,0,0)) break;
+        if (perfmon_getLastMetric(group,1,0) != perfmon_getMetric(group,1,0)) break;
+        if (perfmon_getTimeOfGroup(group) == 0) break;
+        passed = 1;
+    } while (0);
+    perfmon_finalize();
+    topology_finalize();
+    return passed;
+}
+
+
+
+/*
+ * Test initialization of timer module: after timer_init(),
+ * timer_getCpuClock() is expected to be non-zero. The timer module is
+ * finalized before returning 1 (pass) or 0 (fail).
+ */
+int test_timerinit()
+{
+    uint64_t freq;
+    timer_init();
+    freq = timer_getCpuClock();
+    timer_finalize();
+    return (freq != 0) ? 1 : 0;
+}
+
+/*
+ * Test timer module finalization: calls timer_finalize() and always
+ * reports success (1); nothing about the call is checked.
+ */
+int test_timerfinalize()
+{
+    timer_finalize();
+    return 1;
+}
+
+/*
+ * Test printing time without initialization: for a freshly reset
+ * TimerData, timer_print() is expected to return 0 when timer_init() was
+ * not called in this function. Returns 1 (pass) or 0 (fail).
+ */
+int test_timerprint_noinit()
+{
+    TimerData t;
+    timer_reset(&t);
+    return (timer_print(&t) != 0) ? 0 : 1;
+}
+
+/*
+ * Test printing time: for a reset but never started TimerData, both
+ * timer_print() and timer_printCycles() are expected to return 0 after
+ * timer_init(). The timer module is finalized before returning 1 (pass)
+ * or 0 (fail).
+ */
+int test_timerprint()
+{
+    TimerData t;
+    int passed;
+    timer_reset(&t);
+    timer_init();
+    /* The cycle count is only queried when the time check passed. */
+    passed = (timer_print(&t) == 0) && (timer_printCycles(&t) == 0);
+    timer_finalize();
+    return passed;
+}
+
+/*
+ * Test printing time for started clock: after timer_start() without a
+ * matching timer_stop(), timer_print() and timer_printCycles() are both
+ * expected to be non-zero. The timer module is finalized before returning
+ * 1 (pass) or 0 (fail).
+ */
+int test_timerprint_start()
+{
+    TimerData t;
+    int passed;
+    timer_reset(&t);
+    timer_init();
+    timer_start(&t);
+    /* The cycle count is only queried when the time check passed. */
+    passed = (timer_print(&t) != 0) && (timer_printCycles(&t) != 0);
+    timer_finalize();
+    return passed;
+}
+
+/*
+ * Test printing time for started/stopped clock: for an immediate
+ * start/stop pair, timer_print() must be non-zero and not above 1, and
+ * timer_printCycles() must be non-zero and not above timer_getCpuClock().
+ * The timer module is finalized before returning 1 (pass) or 0 (fail).
+ */
+int test_timerprint_stop()
+{
+    TimerData t;
+    int passed = 0;
+    double elapsed;
+    uint64_t ticks;
+    timer_init();
+    timer_reset(&t);
+    timer_start(&t);
+    timer_stop(&t);
+    do
+    {
+        elapsed = timer_print(&t);
+        if (elapsed > 1) break;
+        if (elapsed == 0) break;
+        ticks = timer_printCycles(&t);
+        if (ticks == 0) break;
+        if (ticks > timer_getCpuClock()) break;
+        passed = 1;
+    } while (0);
+    timer_finalize();
+    return passed;
+}
+
+/*
+ * Test reading cpu clock without initialization: timer_getCpuClock() is
+ * expected to return 0 when timer_init() was not called in this function.
+ * Returns 1 (pass) or 0 (fail).
+ */
+int test_timercpuclock_noinit()
+{
+    return (timer_getCpuClock() != 0) ? 0 : 1;
+}
+
+/*
+ * Test reading cpu clock: after timer_init(), timer_getCpuClock() is
+ * expected to return a non-zero value.
+ * Returns 1 (pass) or 0 (fail); timer_finalize() is called on both paths.
+ */
+int test_timercpuclock()
+{
+    timer_init();
+    uint64_t cyc = timer_getCpuClock();
+    /* Leave through the cleanup label so the timer module is finalized
+     * on failure as well. */
+    if (cyc == 0)
+        goto fail;
+    timer_finalize();
+    return 1;
+fail:
+    timer_finalize();
+    return 0;
+}
+
+/*
+ * Test reading baseline without initialization: timer_getBaseline() is
+ * expected to return 0 when timer_init() was not called in this function.
+ * Returns 1 (pass) or 0 (fail).
+ */
+int test_timerbaseline_noinit()
+{
+    return (timer_getBaseline() != 0) ? 0 : 1;
+}
+
+/*
+ * Test reading baseline: after timer_init(), timer_getBaseline() is
+ * expected to return a non-zero value.
+ * Returns 1 (pass) or 0 (fail); timer_finalize() is called on both paths.
+ */
+int test_timerbaseline()
+{
+    timer_init();
+    uint64_t cyc = timer_getBaseline();
+    /* Leave through the cleanup label so the timer module is finalized
+     * on failure as well. */
+    if (cyc == 0)
+        goto fail;
+    timer_finalize();
+    return 1;
+fail:
+    timer_finalize();
+    return 0;
+}
+
+/*
+ * Test sleeping with timer module without initialization: calls
+ * timer_sleep(1E4) with no timer_init() in this function and always
+ * reports success (1); the test only covers that the call returns.
+ */
+int test_timersleep_noinit()
+{
+    timer_sleep(1E4);
+    return 1;
+}
+
+/*
+ * Test sleeping with timer module: brackets timer_sleep(1E6) with
+ * timer_start()/timer_stop() and expects timer_print() to lie within
+ * [0.9, 1.1] (the diagnostics refer to a target of 1 s).
+ * Returns 1 (pass) or 0 (fail); timer_finalize() is called on both paths.
+ */
+int test_timersleep()
+{
+    timer_init();
+    TimerData timer;
+    timer_start(&timer);
+    timer_sleep(1E6);
+    timer_stop(&timer);
+    if (timer_print(&timer) < 0.9E6*1E-6)
+    {
+        printf("Sleeping too short. timer is %f instead of 1 s\n", timer_print(&timer));
+        goto fail;
+    }
+    if (timer_print(&timer) > 1.1E6*1E-6)
+    {
+        /* One argument for the single %f: the measured time. */
+        printf("Sleeping too long. timer is %f instead of 1 s\n", timer_print(&timer));
+        goto fail;
+    }
+    timer_finalize();
+    return 1;
+fail:
+    timer_finalize();
+    return 0;
+}
+
+/*
+ * Table of all tests executed by main(), in execution order.
+ * Each entry is { description printed before the run, test function,
+ * return value the function must produce to count as passed }.
+ * The list is terminated by an entry whose function pointer is NULL.
+ */
+static test testlist[] = {
+    {"Test configuration initialization", test_initconfig, 1},
+    {"Enable configuration for following tests", enable_configuration, 1},
+    {"Test setting of access mode", test_hpmmode, 1},
+    {"Test access initialization", test_hpminit, 1},
+    {"Test adding CPU to access module", test_hpmaddthread, 1},
+    {"Disable configuration", disable_configuration, 1},
+    {"Test perfmon initialization without topology information", test_perfmoninit_faulty, 1},
+    {"Test topology module initialization", test_topologyinit, 1},
+    {"Test NUMA module initialization", test_numainit, 1},
+    {"Test affinity module initialization", test_affinityinit, 1},
+    {"Test perfmon initialization with topology information", test_perfmoninit_valid, 1},
+    {"Test adding event sets to perfmon module", test_perfmonaddeventset, 1},
+    {"Test adding event sets to perfmon module without initialization of perfmon", test_perfmonaddeventset_noinit, 1},
+    {"Test setting up an event set", test_perfmonsetup, 1},
+    {"Test switching between event sets", test_perfmonswitch, 1},
+    {"Test starting an event set", test_perfmonstart, 1},
+    {"Test setting up an event set without initialization", test_perfmonsetup_noinit, 1},
+    {"Test starting an event set without initialization", test_perfmonstart_noinit, 1},
+    {"Test setting up an event set without adding one", test_perfmonsetup_noadd, 1},
+    {"Test getting all performance groups", test_perfmongetgroups, 1},
+    {"Test setting up a custom event set and test group handling", test_perfmoncustomgroup, 1},
+    {"Test setting up a valid performance group and test group handling", test_perfmonperfgroup_ok, 1},
+    {"Test setting up a invalid performance group and test group handling", test_perfmonperfgroup_fail, 1},
+    {"Test starting an event set without adding one", test_perfmonstart_noadd, 1},
+    {"Test stopping an event set", test_perfmonstop, 1},
+    {"Test stopping an event set without initialization", test_perfmonstop_noinit, 1},
+    {"Test stopping an event set without adding one", test_perfmonstop_noadd, 1},
+    {"Test stopping an event set without setting one up", test_perfmonstop_nosetup, 1},
+    {"Test stopping an event set without starting one", test_perfmonstop_nostart, 1},
+    {"Test perfmon finalization", test_perfmonfinalize, 1},
+    {"Test perfmon result without initialization", test_perfmonresult_noinit, 1},
+    {"Test perfmon result without adding one", test_perfmonresult_noadd, 1},
+    {"Test perfmon result without setting up one", test_perfmonresult_nosetup, 1},
+    {"Test perfmon result without starting", test_perfmonresult_nostart, 1},
+    {"Test perfmon result without stopping", test_perfmonresult_nostop, 1},
+    {"Test perfmon result", test_perfmonresult, 1},
+    {"Test perfmon last result without initialization", test_perfmonlastresult_noinit, 1},
+    {"Test perfmon last result without adding one", test_perfmonlastresult_noadd, 1},
+    {"Test perfmon last result without setting up one", test_perfmonlastresult_nosetup, 1},
+    {"Test perfmon last result without starting", test_perfmonlastresult_nostart, 1},
+    {"Test perfmon last result without stopping", test_perfmonlastresult_nostop, 1},
+    {"Test perfmon last result", test_perfmonlastresult, 1},
+    {"Test initialization of timer module", test_timerinit, 1},
+    {"Test printing time without initialization", test_timerprint_noinit, 1},
+    {"Test printing time", test_timerprint, 1},
+    {"Test timer module finalization", test_timerfinalize, 1},
+    {"Test printing time for started clock", test_timerprint_start, 1},
+    {"Test printing time for started/stopped clock", test_timerprint_stop, 1},
+    {"Test reading cpu clock without initialization", test_timercpuclock_noinit, 1},
+    {"Test reading cpu clock", test_timercpuclock, 1},
+    {"Test reading baseline without initialization", test_timerbaseline_noinit, 1},
+    {"Test reading baseline", test_timerbaseline, 1},
+    {"Test sleeping with timer module without initialization", test_timersleep_noinit, 1},
+    {"Test sleeping with timer module", test_timersleep, 1},
+    {"Test perfmon metric without initialization", test_perfmonmetric_noinit, 1},
+    {"Test perfmon metric without adding one", test_perfmonmetric_noadd, 1},
+    {"Test perfmon metric without setting up one", test_perfmonmetric_nosetup, 1},
+    {"Test perfmon metric without starting", test_perfmonmetric_nostart, 1},
+    {"Test perfmon metric without stopping", test_perfmonmetric_nostop, 1},
+    {"Test perfmon metric", test_perfmonmetric_ok, 1},
+    {"Test perfmon last metric without initialization", test_perfmonlastmetric_noinit, 1},
+    {"Test perfmon last metric without adding one", test_perfmonlastmetric_noadd, 1},
+    {"Test perfmon last metric without setting up one", test_perfmonlastmetric_nosetup, 1},
+    {"Test perfmon last metric without starting", test_perfmonlastmetric_nostart, 1},
+    {"Test perfmon last metric without stopping", test_perfmonlastmetric_nostop, 1},
+    {"Test perfmon last metric", test_perfmonlastmetric_ok, 1},
+    {"Test cpustring with logical input", test_cpustring_logical, 1},
+    {"Test cpustring with physical input", test_cpustring_physical, 1},
+    {"Test cpustring with expression input", test_cpustring_expression, 1},
+    {"Test cpustring with scatter input", test_cpustring_scatter, 1},
+    {"Test cpustring with combined input", test_cpustring_combined, 1},
+    {NULL, NULL, 0},
+};
+
+/*
+ * Test driver: runs the entries of testlist in order until the NULL
+ * terminator. Prints each test name followed by OK or FAILED, stops at the
+ * first test whose return value differs from the expected one (exit code
+ * 1), and exits with 0 once all tests matched.
+ */
+int main()
+{
+    int idx;
+    //fclose(stderr);
+    if (verbose > 0)
+        perfmon_setVerbosity(3);
+    for (idx = 0; testlist[idx].testfunc != NULL; idx++)
+    {
+        printf("%s:\t", testlist[idx].testname);
+        /* In verbose mode the test's own output starts on a fresh line. */
+        if (verbose > 0)
+            printf("\n");
+        if (testlist[idx].testfunc() == testlist[idx].result)
+        {
+            printf("OK\n");
+            continue;
+        }
+        printf("FAILED\n");
+        return 1;
+    }
+    printf("All tests completed successfully.\n");
+    return 0;
+}
diff --git a/test/test-msr-access.c b/test/test-msr-access.c
new file mode 100644
index 0000000..1fea1ec
--- /dev/null
+++ b/test/test-msr-access.c
@@ -0,0 +1,101 @@
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+static char msr_name[] = "/dev/cpu/0/msr";
+static int msr_fd;
+
+/* Check that msr_name is readable and writable: returns 0 if so, 1 after logging if not. */
+int check_msr()
+{
+    if (access(msr_name, R_OK|W_OK) == 0)
+        return 0;
+    /* access() failed; errno carries the reason reported below. */
+    fprintf(stderr,"Unable to access MSR device %s: %s\n", msr_name, strerror(errno));
+    return 1;
+}
+
+/* Open msr_name for reading and writing and store the descriptor in msr_fd. */
+/* Returns 0 on success, 1 (after logging the errno text) when open() fails. */
+int open_msr()
+{
+    msr_fd = open(msr_name, O_RDWR);
+    if (msr_fd >= 0)
+        return 0;
+    fprintf(stderr,"Cannot open MSR device %s: %s\n", msr_name, strerror(errno));
+    return 1;
+}
+
+/* Close msr_fd when it holds a positive descriptor; always returns 0. */
+int close_msr()
+{
+    /* NOTE(review): descriptor 0 counts as "not open" here and is never closed. */
+    if (msr_fd > 0)
+        close(msr_fd);
+    return 0;
+}
+
+/* Read 8 bytes from the MSR device, passing register 0x38D as the file offset. */
+/* Returns 0 on a full read; 1 if msr_fd is not positive, pread() fails or the read is short. */
+int read_msr()
+{
+    ssize_t ret;
+    uint64_t data = 0;
+    uint32_t reg = 0x38D;
+    if (msr_fd <= 0)
+        return 1;
+    ret = pread(msr_fd, &data, sizeof(uint64_t), reg);
+    if (ret < 0)
+    {
+        fprintf(stderr, "Cannot read register 0x%x at MSR %s: %s\n", reg, msr_name, strerror(errno));
+        return 1;
+    }
+    if (ret != (ssize_t)sizeof(uint64_t))
+    {
+        fprintf(stderr, "Incomplete read on register 0x%x at MSR %s: Only %zd bytes\n", reg, msr_name, ret);
+        return 1;
+    }
+    return 0;
+}
+
+/* Write an 8-byte zero to the MSR device, passing register 0x38D as the file offset. */
+/* Returns 0 on a full write; 1 if msr_fd is not positive, pwrite() fails or the write is short. */
+int write_msr()
+{
+    ssize_t ret;
+    uint64_t data = 0;
+    uint32_t reg = 0x38D;
+    if (msr_fd <= 0)
+        return 1;
+    ret = pwrite(msr_fd, &data, sizeof(uint64_t), reg);
+    if (ret < 0)
+    {
+        fprintf(stderr, "Cannot write register 0x%x at MSR %s: %s\n", reg, msr_name, strerror(errno));
+        return 1;
+    }
+    if (ret != (ssize_t)sizeof(uint64_t))
+    {
+        fprintf(stderr, "Incomplete write on register 0x%x at MSR %s: Only %zd bytes\n", reg, msr_name, ret);
+        return 1;
+    }
+    return 0;
+}
+
+/* Check, open, read, write and close the MSR device; returns 0 only if every step succeeds. */
+int main()
+{
+    if (check_msr()) return 1;
+    if (open_msr()) return 1;
+    /* Close the descriptor on a failed transfer too, so it is not leaked. */
+    if (read_msr() || write_msr()) { close_msr(); return 1; }
+    if (close_msr()) return 1;
+    printf("All OK!\n");
+    return 0;
+}
diff --git a/test/testTBB.cc b/test/testTBB.cc
new file mode 100644
index 0000000..887400f
--- /dev/null
+++ b/test/testTBB.cc
@@ -0,0 +1,67 @@
+/*
+    File: testTBB.cc
+    Author: timday (stackoverflow)
+    Source: http://stackoverflow.com/questions/10607215/simplest-tbb-example
+
+    Extended by Thomas Roehl to do LIKWID Marker API calls and print the CPU for
+    the threads instead of 'n'
+*/
+
+#include "tbb/blocked_range.h"
+#include "tbb/parallel_for.h"
+#include "tbb/task_scheduler_init.h"
+#include <iostream>
+#include <vector>
+
+// Added by Thomas Roehl
+#include <sched.h>
+#include <likwid.h>
+
+
+// One unit of work for the demo: runs a counting loop, then prints the current CPU.
+struct mytask {
+  // Stores the task index; operator() does not read it.
+  mytask(size_t n) : _n(n) {}
+  // Count to ten million, then write "[<cpu>]" to stderr for the calling thread.
+  void operator()() {
+    int spins = 0;
+    while (spins < 10000000) ++spins;  // Deliberately run slow
+    std::cerr << "[" << sched_getcpu() << "]";
+  }
+  size_t _n;
+};
+
+// Body object handed to tbb::parallel_for: runs the tasks of one index sub-range.
+struct executor
+{
+  // Keeps a reference to the caller's task vector (no copy is made).
+  executor(std::vector<mytask>& t) : _tasks(t) {}
+  // Splitting constructor: the new executor refers to the same task vector.
+  executor(executor& e,tbb::split) : _tasks(e._tasks) {}
+  // Invoke every task in r between the "TBB" LIKWID marker start and stop calls.
+  void operator()(const tbb::blocked_range<size_t>& r) const {
+    const size_t last = r.end();
+    LIKWID_MARKER_START("TBB");
+    for (size_t idx = r.begin(); idx != last; ++idx)
+      _tasks[idx]();
+    LIKWID_MARKER_STOP("TBB");
+  }
+
+  std::vector<mytask>& _tasks;
+};
+
+// Build 1000 tasks, run them through tbb::parallel_for, then close the LIKWID markers.
+int main(int,char**) {
+  tbb::task_scheduler_init init;  // Automatic number of threads
+  LIKWID_MARKER_INIT;
+
+  std::vector<mytask> tasks;
+  for (size_t n = 0; n != 1000; ++n)
+    tasks.push_back(mytask(n));
+  executor exec(tasks);
+  const tbb::blocked_range<size_t> all_tasks(0, tasks.size());
+  tbb::parallel_for(all_tasks, exec);
+  std::cerr << std::endl;
+  LIKWID_MARKER_CLOSE;
+  return 0;
+}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/likwid/likwid.git



More information about the Likwid-commit mailing list